I am using Azure AI Search to chunk and read text files, but the target field values in the index are always NULL.
I reduced the setup to a simplified example that maps only two fields: metadata_storage_name to title, and metadata_storage_path to source.
Warning
This warning appears after running the indexer:
Message: Could not generate projection from input '/document'. Check the 'source' or 'sourceContext' property of your projection in your skillset. =$(/document) ?map { "source": $(/document/blob_path), "title": $(/document/blob_name) }
Details:
Missing or empty value '/document/blob_name'.
Skillset
{
"@odata.etag": "<REDACTED>",
"name": "<REDACTED>",
"description": "Text extract",
"skills": [
{
"@odata.type": "#Microsoft.Skills.Util.DocumentExtractionSkill",
"name": "extract-text",
"context": "/document",
"parsingMode": "text",
"dataToExtract": "contentAndMetadata",
"inputs": [
{
"name": "file_data",
"source": "/document/file_data",
"inputs": []
}
],
"outputs": [
{
"name": "content",
"targetName": "raw_content"
},
{
"name": "metadata_storage_name",
"targetName": "blob_name"
},
{
"name": "metadata_storage_path",
"targetName": "blob_path"
}
],
"configuration": {}
}
],
"indexProjections": {
"selectors": [
{
"targetIndexName": "<redacted>",
"parentKeyFieldName": "parent_id",
"sourceContext": "/document",
"mappings": [
{
"name": "title",
"source": "/document/blob_name",
"inputs": []
},
{
"name": "source",
"source": "/document/blob_path",
"inputs": []
}
]
}
],
"parameters": {
"projectionMode": "skipIndexingParentDocuments"
}
}
}
Index
{
"@odata.etag": "<REDACTED>",
"name": "<REDACTED>",
"fields": [
{
"name": "id",
"type": "Edm.String",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": true,
"analyzer": "keyword",
"synonymMaps": []
},
{
"name": "title",
"type": "Edm.String",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": true,
"facetable": false,
"key": false,
"analyzer": "standard.lucene",
"synonymMaps": []
},
{
"name": "source",
"type": "Edm.String",
"searchable": false,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": true,
"facetable": false,
"key": false,
"synonymMaps": []
},
{
"name": "parent_id",
"type": "Edm.String",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"analyzer": "keyword",
"synonymMaps": []
}
],
"similarity": {
"@odata.type": "#Microsoft.Azure.Search.BM25Similarity"
},
"vectorSearch": {
"algorithms": [
{
"name": "<redacted>",
"kind": "hnsw",
"hnswParameters": {
"metric": "cosine",
"m": 4,
"efConstruction": 400,
"efSearch": 500
}
}
],
"profiles": [
{
"name": "<REDACTED>",
"algorithm": "<REDACTED>"
}
]
}
}
Indexer
{
"@odata.context": "<redacted>/$metadata#indexers/$entity",
"@odata.etag": "<redacted>",
"name": "<redacted>",
"dataSourceName": "<redacted>",
"skillsetName": "<redacted>",
"targetIndexName": "<redacted>",
"parameters": {
"batchSize": 50,
"maxFailedItems": 0,
"maxFailedItemsPerBatch": 0,
"configuration": {
"dataToExtract": "contentAndMetadata",
"parsingMode": "text",
"allowSkillsetToReadFileData": true,
"indexedFileNameExtensions": ".md",
"excludedFileNameExtensions": "",
"imageAction": "none",
"pdfTextRotationAlgorithm": "none"
}
},
"fieldMappings": [
{
"sourceFieldName": "metadata_storage_path",
"targetFieldName": "id",
"mappingFunction": {
"name": "base64Encode",
"parameters": null
}
}
],
"outputFieldMappings": [],
"cache": {
"id": "<redacted>",
"enableReprocessing": true,
"storageConnectionString": "<redacted>",
"identity": null
}
}
Example file 01_keywords.md
# ContentTaxonomy/Keywords
# Variant 1 (v2025-10-06)
#
# keyword- and alias lists
## topic1
- Keywords: lorem, ipsum, dolor, sit, amet
## topic2
- Keywords: lorem, ipsum, dolor, sit, amet
Search Explorer (NULL value result)
JSON-Search-Query:
{
"search": "*",
"select": "*",
"count": true
}
RESULT:
{
"@odata.context": "<redacted>/$metadata#docs(*)",
"@odata.count": 5,
"value": [
{
"@search.score": 1,
"id": "<redacted (valid base64)>",
"title": null,
"source": null,
"parent_id": "<redacted (valid base64)>"
},
// ...
Question: Why are the fields title and source always NULL?
What I tried:
- resetting the Indexer cache
- using "targetName": "/document/blob_name" in the skillset (outputs)
- The run is processing 5 documents