
I am trying to populate a parsedDate field in an Azure AI Search index so that I can later create a scoring profile (using freshness) to improve the search results.

I defined the index, skillset and indexer according to the documentation, but when I run the indexer the field is always null.

This is how I defined the skill in my skillset:


{
  "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
  "name": "#0",
  "description": "A custom skill that parses dates from file names",
  "uri": "https://az-function.azurewebsites.net/api/custom-skill-date-extraction?code=ABC",
  "httpMethod": "POST",
  "timeout": "PT30S",
  "batchSize": 1,
  "context": "/document",
  "inputs": [
    {
      "name": "fileName",
      "source": "/document/metadata_storage_path"
    }
  ],
  "outputs": [
    {
      "name": "parsedDate",
      "targetName": "parsedDate"
    }
  ]
}
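For reference, the endpoint behind such a skill must follow the custom Web API skill contract: one output record per input record, matched by recordId, with the outputs under data. A minimal sketch of that logic, assuming dates appear as YYYY-MM-DD in the file name (the regex convention and helper names here are hypothetical, not from the question):

```python
import re

def extract_parsed_date(file_name):
    # Look for a YYYY-MM-DD date anywhere in the file name
    # (hypothetical naming convention).
    match = re.search(r"\d{4}-\d{2}-\d{2}", file_name)
    if match is None:
        return None
    # Return an ISO 8601 value compatible with Edm.DateTimeOffset.
    return match.group(0) + "T00:00:00Z"

def handle_skill_request(body):
    # Custom Web API skill contract: one output record per input record,
    # matched by recordId, with the skill outputs under "data".
    results = []
    for record in body.get("values", []):
        file_name = record["data"].get("fileName", "")
        results.append({
            "recordId": record["recordId"],
            "data": {"parsedDate": extract_parsed_date(file_name)},
        })
    return {"values": results}
```

The key under data ("parsedDate") must match the output name declared in the skillset.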

This is how I defined the index:

def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')

    # Environment Variables
    endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
    endpoint_openai = os.environ["AZURE_OPENAI_ENDPOINT"]
    deployment_id = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_ID"]
    credential_search = AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"]) if os.environ["AZURE_SEARCH_ADMIN_KEY"] else DefaultAzureCredential()
    credential = DefaultAzureCredential()
    customer = readRequestBody(req)
    _, datasource_name = utils.getStorageAccountInfo(customer, credential)
    index_name = utils.get_index_name(datasource_name)

    # Logic for creating a search index
    try:
        index_client = SearchIndexClient(endpoint=endpoint, credential=credential_search)
        fields = [
            SearchField(name="parent_id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
            SearchField(name="parsedDate", type=SearchFieldDataType.DateTimeOffset, sortable=True, filterable=True, facetable=True),
            SearchField(name="title", type=SearchFieldDataType.String),
            SearchField(name="chunk_id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True, analyzer_name="keyword"),
            SearchField(name="chunk", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False),
            SearchField(name="vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
        ]

        vector_search = VectorSearch(
            algorithms=[
                HnswAlgorithmConfiguration(
                    name="myHnsw",
                    parameters=HnswParameters(
                        m=4,
                        ef_construction=400,
                        ef_search=500,
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
                ExhaustiveKnnAlgorithmConfiguration(
                    name="myExhaustiveKnn",
                    parameters=ExhaustiveKnnParameters(
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
            ],
            profiles=[
                VectorSearchProfile(
                    name="myHnswProfile",
                    algorithm_configuration_name="myHnsw",
                    vectorizer="myOpenAI",
                ),
                VectorSearchProfile(
                    name="myExhaustiveKnnProfile",
                    algorithm_configuration_name="myExhaustiveKnn",
                    vectorizer="myOpenAI",
                ),
            ],
            vectorizers=[
                AzureOpenAIVectorizer(
                    name="myOpenAI",
                    kind="azureOpenAI",
                    azure_open_ai_parameters=AzureOpenAIParameters(
                        resource_uri=endpoint_openai,
                        deployment_id=deployment_id,
                        # api_key expects a string key, not a credential object;
                        # the env var name here is an assumption.
                        api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
                    ),
                ),
            ],
        )
        semantic_search = SemanticSearch(configurations=[SemanticConfiguration(
            name="my-semantic-config",
            prioritized_fields=SemanticPrioritizedFields(content_fields=[SemanticField(field_name="chunk"),SemanticField(field_name="title")] )
        )])

        index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
        result = index_client.create_or_update_index(index)
        return func.HttpResponse(f"{result.name} created", status_code=200)
    except Exception as e:
        return func.HttpResponse(f"Failed to create or update the index. Error: {str(e)}", status_code=500)
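For the freshness goal mentioned at the top, the index definition would eventually also need a scoring profile on parsedDate. A sketch of the REST-style fragment (profile name, boost, and boosting duration are illustrative values, not part of the code above):

```json
"scoringProfiles": [
  {
    "name": "freshness-boost",
    "functions": [
      {
        "type": "freshness",
        "fieldName": "parsedDate",
        "interpolation": "linear",
        "boost": 5,
        "freshness": { "boostingDuration": "P365D" }
      }
    ]
  }
]
```

The same profile can be expressed in the Python SDK with ScoringProfile and FreshnessScoringFunction and passed to SearchIndex via scoring_profiles.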

And finally, how I configured the indexer:

def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')
    
    # Environment Variables
    endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
    credential_search = AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"])
    customer = readRequestBody(req)
    credential = DefaultAzureCredential()
    _, data_source_name= utils.getStorageAccountInfo(customer, credential)
    index_name = utils.get_index_name(data_source_name)
    skillset_name = utils.get_skillset_name(data_source_name)

    
    # Indexer creation logic
    try:
        indexer_name = f"{data_source_name}-indexer"
        indexer = SearchIndexer(
            name=indexer_name,
            description="Indexer to index documents and generate embeddings",
            skillset_name=skillset_name,
            target_index_name=index_name,
            data_source_name=data_source_name,
            field_mappings=[FieldMapping(source_field_name="metadata_storage_name", target_field_name="title"),
                            FieldMapping(source_field_name="parsedDate", target_field_name="parsedDate")],
            parameters=IndexingParameters(
                configuration={
                    "dataToExtract": "contentAndMetadata",
                    "imageAction": "generateNormalizedImages"
                }
            )
        )
        
        indexer_client = SearchIndexerClient(endpoint, credential_search)
        indexer_result = indexer_client.create_or_update_indexer(indexer)
        
        # Run the indexer
        indexer_client.run_indexer(indexer_name)
        message = f'{indexer_name} is created and running. If queries return no results, please wait a bit and try again.'
        logging.info(message)
        return func.HttpResponse(message, status_code=200)
    except Exception as e:
        error_message = f"Failed to create or run the indexer. Error: {str(e)}"
        logging.error(error_message)
        return func.HttpResponse(error_message, status_code=500)

I have also tried setting output_field_mappings on the indexer, but to no avail.

Any suggestion would be great.

Azure AI Search

  • What is the output of your custom Web API skill? It should return a date matching the DateTimeOffset data type, and you need to add the mappings in output_field_mappings. Commented Apr 22, 2024 at 6:40

1 Answer


You will receive null values for one of the following reasons:

  1. Improper outputFieldMappings in the indexer, such as a name mismatch between the skillset output targetName and the sourceFieldName in the indexer definition.
  2. Empty outputFieldMappings.
  3. A mismatch between the output name in the skillset and the key returned in the data object by the custom Web API.

Example: if parsedDate is the output name in the skillset but your function returns data like the example below (note the lowercase parsedate key), you will get a null value. The key under data must match the output name in the skillset exactly.

{
  "values": [
    {
      "recordId": "0",
      "data": {
        "parsedate": "2015-01-01T00:00:00.000Z"
      }
    }
  ]
}

Next, use the following code in your Azure function when returning values:

    req_body = req.get_json()
    values = req_body.get('values')
    res = []
    for record in values:
        tmp = record
        # The key under 'data' must match the skillset output name exactly.
        tmp['data'] = {'parsedDate': "parsed_date_from_path"}  # example: 2015-01-01T00:00:00.000Z
        res.append(tmp)
    if res:
        return func.HttpResponse(json.dumps({"values": res}), mimetype="application/json")

Configure outputFieldMappings:

"outputs": [
        {
          "name": "parsedDate",
          "targetName": "parsedDate"
        }]

For the above output in the skillset, you need to provide outputFieldMappings like this, where sourceFieldName is the skillset output path and targetFieldName is the target index field name:

"outputFieldMappings": [
    {
      "sourceFieldName": "/document/parsedDate",
      "targetFieldName": "parsedDate"
    }
  ]

Or in code:

indexer_name = "vs-code-2-indexer"
indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    skillset_name="skillset1712921532571",
    target_index_name="vs-code-2",
    data_source_name="hotels-sample",
    output_field_mappings=[FieldMapping(source_field_name="/document/parsedDate", target_field_name="parsedDate")]
)

If the above solution doesn't work, delete the current index, re-create it with the same definition, and then reset and run the indexer with the outputFieldMappings above.

[Screenshot: Indexer Configuration]
