0

The following code runs successfully but the indexer execution history shows the warning:

Cannot iterate over non-array '/document/contentVector'.
Could not map output field 'contentVector' to search index. Check the 'outputFieldMappings' property of your indexer.

Based on the split skill and OpenAI embedding skill docs (https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-azure-openai-embedding , https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-textsplit), I am quite sure I've configured the inputs, outputs, and field mappings correctly. When retrieving a document, the chunks field is correctly chunked; however, contentVector is an empty list [].

import os
from pprint import pprint
from tqdm import tqdm
import time
import json
from dotenv import load_dotenv
from lxml import etree
from bs4 import BeautifulSoup
from typing import List, Dict, Collection
import uuid
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SimpleField,
    SearchableField,
    SearchFieldDataType,
    SearchField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
    _edm,
    AzureOpenAIEmbeddingSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchIndexerSkillset,
    SearchIndexerSkill,
    SearchIndexer,
    SplitSkill,
    SearchIndexerDataSourceConnection,
    SearchIndexerDataContainer,
    FieldMapping,
    IndexingParameters,
    IndexingParametersConfiguration,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError
from langchain.vectorstores import AzureSearch
from langchain.retrievers import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from azure.search.documents import IndexDocumentsBatch
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient



# Load service endpoints and secrets from a local .env file into the process
# environment, then bind them to module-level constants used throughout.
load_dotenv()
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_KEY = os.getenv("AZURE_SEARCH_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_BLOB_URL = os.getenv("AZURE_BLOB_URL")
AZURE_BLOB_CONN_STRING = os.getenv("AZURE_BLOB_CONN_STRING")
AZURE_BLOB_ACC_KEY = os.getenv("AZURE_BLOB_ACC_KEY")
# NOTE(review): os.getenv returns None for any missing variable — the Azure
# clients below will fail later with a less obvious error; verify .env is complete.



"""
Create index
Create chunking and embedding skills
Index data
"""

def ta_create_skillset():
    """Build and register the chunking + embedding skillset on the search service.

    Side effects: creates/updates the skillset 'cf-chunk-embed-skillset' via the
    Search service REST API and prints a confirmation line.
    """
    # Page-mode splitter: 1000-char chunks with 100 chars of overlap, read from
    # the document body and emitted as the 'chunks' enrichment.
    chunker = SplitSkill(
        name="cf-textsplit-skill-1k",
        text_split_mode="pages",
        maximum_page_length=1000,
        page_overlap_length=100,
        inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
        outputs=[OutputFieldMappingEntry(name="textItems", target_name="chunks")],
    )

    # Embed every chunk with the Azure OpenAI 'text-embedding-3-large'
    # deployment (3072-dim vectors), writing 'contentVector' per chunk.
    embedder = AzureOpenAIEmbeddingSkill(
        name="cf-embedding-skill-large",
        context="/document/chunks/*",
        inputs=[InputFieldMappingEntry(name="text", source="/document/chunks/*")],
        outputs=[OutputFieldMappingEntry(name="embedding", target_name="contentVector")],
        resource_url=AZURE_OPENAI_ENDPOINT,
        api_key=AZURE_OPENAI_API_KEY,
        deployment_name="text-embedding-3-large",
        model_name="text-embedding-3-large",
        dimensions=3072,
    )

    bundle = SearchIndexerSkillset(
        name="cf-chunk-embed-skillset",
        description="Skillset for chunking and Azure OpenAI embeddings",
        skills=[chunker, embedder],
    )

    client = SearchIndexerClient(
        endpoint=AZURE_SEARCH_ENDPOINT,
        credential=AzureKeyCredential(AZURE_SEARCH_KEY),
    )
    client.create_or_update_skillset(bundle)

    skill_names = ", ".join(s.name for s in bundle.skills)
    print(f"Skillset {bundle.name} with skills: {skill_names} created.")


def ta_create_index(index_name):
    """Create or update a search index with keyword fields and vector search.

    Args:
        index_name: Name of the Azure AI Search index to create or update.

    Side effects: creates/updates the index on the service and prints a
    confirmation line.
    """
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="title", type=SearchFieldDataType.String),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SearchableField(name="chunks", collection=True, type=SearchFieldDataType.String),
        SimpleField(name="location", type=SearchFieldDataType.String, filterable=True),
        SimpleField(name="document_type", type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="jurisdiction", collection=True, type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="category", collection=True, type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="summary", type=SearchFieldDataType.String),
        SearchableField(name="abstract", type=SearchFieldDataType.String),
        # FIX: a vector field must be a SearchField of Collection(Single) with
        # vector_search_dimensions + profile. Declaring it as
        # SearchableField(..., type=Single, collection=True) yields the
        # "FieldNotSearchable: ... not a vector field" error at query time.
        SearchField(
            name="contentVector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=3072,
            vector_search_profile_name="cf-vector",
        ),
    ]

    # Query-time vectorizer so VectorizableTextQuery text is embedded with the
    # same model/deployment used at indexing time.
    vectorizer = AzureOpenAIVectorizer(
        vectorizer_name="cf-vectorizer",
        parameters=AzureOpenAIVectorizerParameters(
            resource_url=AZURE_OPENAI_ENDPOINT,
            deployment_name="text-embedding-3-large",
            api_key=AZURE_OPENAI_API_KEY,
            model_name="text-embedding-3-large"
        )
    )

    vector_search = VectorSearch(
        profiles=[
            VectorSearchProfile(name="cf-vector",
                                algorithm_configuration_name="vector-config",
                                vectorizer_name="cf-vectorizer"
            )
        ],
        algorithms=[
            HnswAlgorithmConfiguration(
                name="vector-config",
                parameters={
                    "m": 4,
                    "efConstruction": 400,
                    "efSearch": 500,
                    "metric": "cosine"
                }
            )
        ],
        vectorizers=[vectorizer]
    )

    index_client = SearchIndexClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=AzureKeyCredential(AZURE_SEARCH_KEY))

    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
    index_client.create_or_update_index(index)
    print(f"Index '{index_name}' created or updated with vector search capability.")





# --- One-shot setup: create the skillset and index, then wire up the indexer ---
ta_create_skillset()
ta_create_index("cf-rag-index")


# Blob container the indexer pulls source documents from.
indexer_client = SearchIndexerClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=AzureKeyCredential(AZURE_SEARCH_KEY))
data_source_conn = SearchIndexerDataSourceConnection(
    name="cf-ta-blob-conn",
    connection_string=AZURE_BLOB_CONN_STRING,
    type="azureblob",
    container=SearchIndexerDataContainer(name="cf-ta-container")
)
indexer_client.create_or_update_data_source_connection(data_source_conn)
# Indexer: runs the skillset on each blob (parsed as JSON) and maps enriched
# output back onto index fields.
# NOTE(review): '/document/contentVector/*' assumes the embeddings live at the
# document root, but the embedding skill (context '/document/chunks/*') writes
# them per chunk at /document/chunks/*/contentVector — presumably the cause of
# the "Cannot iterate over non-array '/document/contentVector'" warning; confirm
# against the skillset definition.
indexer = SearchIndexer(
    name="cf-ta-indexer",
    data_source_name="cf-ta-blob-conn",
    target_index_name="cf-rag-index",
    skillset_name="cf-chunk-embed-skillset",
    output_field_mappings=[
        FieldMapping(source_field_name="/document/chunks", target_field_name="chunks"),
        FieldMapping(source_field_name="/document/contentVector/*", target_field_name="contentVector")
    ],
    parameters={"configuration": {"parsing_mode":"json"}}
)
indexer_client.create_or_update_indexer(indexer)
indexer_client.run_indexer(name="cf-ta-indexer")


# Quick top-level status check (e.g. 'running' / 'success').
indexer_status = indexer_client.get_indexer_status("cf-ta-indexer")
print(indexer_status.status)


def print_execution_history(indexer_client, indexer_name):
    """Print every error and warning from an indexer's execution history.

    Args:
        indexer_client: Client exposing ``get_indexer_status(name)`` (e.g. a
            ``SearchIndexerClient``).
        indexer_name: Name of the indexer whose history is printed.
    """
    indexer_status = indexer_client.get_indexer_status(indexer_name)
    for execution in indexer_status.execution_history:
        # FIX: the original printed only the first error/warning of each run
        # ([0]); report all of them. `or []` also tolerates a None list.
        for error in execution.errors or []:
            print(error.details)
            print(error.error_message)
            print("-" * 10)
        for warning in execution.warnings or []:
            print(warning.details)
            print(warning.message)
            print("-" * 10)

# Dump any recorded errors/warnings for the indexer run.
print_execution_history(indexer_client, "cf-ta-indexer")


# Sanity-check the indexed content: pull one document and inspect the chunk
# boundary — the tail of chunk 0 should overlap the head of chunk 1.
search_client = SearchClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    index_name="cf-rag-index",
    credential=AzureKeyCredential(AZURE_SEARCH_KEY),
)
first_doc = next(iter(search_client.search("*", top=1)))
print(first_doc["chunks"][0][-100:])
print(first_doc["chunks"][1][:100])
pprint(first_doc)

The following vector search returns the error:

Message: The field 'contentVector' in the vector field list is not a vector field.
Parameter name: vector.fields
Exception Details:  (FieldNotSearchable) The field 'contentVector' in the vector field list is not a vector field.
    Code: FieldNotSearchable
    Message: The field 'contentVector' in the vector field list is not a vector field.
results = search_client.search(
    select="title,chunks",
    vector_queries=[VectorizableTextQuery(
        text=myquery,
        k_nearest_neighbors=3,
        fields="contentVector"
    )]
)
for r in results:
    pprint(r)

Changing the vector field to

SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, vector_search_dimensions=3072, vector_search_profile_name="cf-vector")

actually gets rid of the above error however the search result is empty and another error is returned:

There's a mismatch in vector dimensions. The vector field 'contentVector', with dimension of '3072', expects a length of '3072'. However, the provided vector has a length of '0'. Please ensure that the vector length matches the expected length of the vector field. Read the following documentation for more details: https://learn.microsoft.com/en-us/azure/search/vector-search-how-to-configure-compression-storage.
Could not index document because some of the data in the document was not valid.
----------
Cannot iterate over non-array '/document/contentVector'.
Could not map output field 'contentVector' to search index. Check the 'outputFieldMappings' property of your indexer.

1 Answer 1

0

I had to change a few things, but it is working as expected now.

Firstly, I had to add index projections to the skillset and explicitly map all the fields I want to populate in the search index. Note that contentVector does not show up in the search index because it is not retrievable.

skillset = SearchIndexerSkillset(
        name="cf-chunk-embed-skillset",
        description="Skillset for chunking and Azure OpenAI embeddings",
        skills=[split_skill, embedding_skill],
        index_projection=SearchIndexerIndexProjection(
            selectors=[
                SearchIndexerIndexProjectionSelector(
                    target_index_name="cf-rag-index",
                    parent_key_field_name="parent_id",
                    source_context="/document/chunks/*",
                    mappings=[
                        InputFieldMappingEntry(name="chunk", source="/document/chunks/*"),
                        InputFieldMappingEntry(name="contentVector", source="/document/chunks/*/contentVector"),
                        InputFieldMappingEntry(name="title", source="/document/title"),
                        InputFieldMappingEntry(name="abstract", source="/document/abstract"),
                        InputFieldMappingEntry(name="summary", source="/document/summary"),
                        InputFieldMappingEntry(name="jurisdiction", source="/document/jurisdiction"),
                        InputFieldMappingEntry(name="category", source="/document/category"),
                        InputFieldMappingEntry(name="location", source="/document/location"),
                        InputFieldMappingEntry(name="document_type", source="/document/document_type"),
                    ]
                )
            ],
            parameters=SearchIndexerIndexProjectionsParameters(
                projection_mode="skipIndexingParentDocuments"
            )
        )
    )

I also needed to create a new parent_id field and modify the key field to use the keyword analyzer.

fields = [
        SearchField(name="id", type=SearchFieldDataType.String, key=True, analyzer_name="keyword"),
        SimpleField(name="parent_id", type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="title", type=SearchFieldDataType.String),
        # SearchableField(name="content", type=SearchFieldDataType.String),
        SearchableField(name="chunk", type=SearchFieldDataType.String),
        SimpleField(name="location", type=SearchFieldDataType.String, filterable=True),
        SimpleField(name="document_type", type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="jurisdiction", collection=True, type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="category", collection=True, type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="summary", type=SearchFieldDataType.String),
        SearchableField(name="abstract", type=SearchFieldDataType.String),
        SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, vector_search_dimensions=3072, vector_search_profile_name="cf-vector"),
    ]

Lastly, you can remove the output field mappings from the search indexer. It didn't seem to do anything when the index projection is set on the skillset.

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.