The following should do the trick if I understand your question correctly:
// Removes items with a duplicate "hash" from each document's "items" array.
// The stages are passed as ONE pipeline array: that is the documented
// aggregate(pipeline, options) signature and works in the shell and in drivers,
// whereas passing every stage as a separate argument is a shell-only convenience.
// NOTE: $group does not guarantee any output order, so the rebuilt "items"
// array may list the surviving items in a different order than the original.
collection.aggregate([
  {
    $unwind: "$items", // flatten the items array: one document per item
  },
  {
    $group: {
      // one group per original document and hash value
      "_id": { "_id": "$_id", "clear_number": "$clear_number", "group_id": "$group_id", "hash": "$items.hash" },
      "items": { $first: "$items" }, // keep only the first of all matching items per group
    },
  },
  {
    $group: {
      // group again per original document, this time without the hash
      "_id": { "_id": "$_id._id", "clear_number": "$_id.clear_number", "group_id": "$_id.group_id" },
      "items": { $push: "$items" }, // collect the surviving items back into an "items" array
    },
  },
  {
    // restore the original document layout by lifting the fields out of the compound _id
    $project: {
      "_id": "$_id._id",
      "clear_number": "$_id.clear_number",
      "group_id": "$_id.group_id",
      "items": "$items",
    },
  },
]);
In response to your comment, I would suggest the following query to get the list of all document ids that contain duplicate hashes:
// Lists the _id of every document whose "items" array contains duplicate hashes.
// The stages are passed as ONE pipeline array: that is the documented
// aggregate(pipeline, options) signature and works in the shell and in drivers,
// whereas passing every stage as a separate argument is a shell-only convenience.
collection.aggregate([
  {
    $addFields: {
      // union of two one-element arrays: collapses to a single element when
      // both counts are equal, keeps two elements when they differ
      "hashes": {
        $setUnion: [
          [ { $size: "$items.hash" } ], // total number of hashes
          [ { $size: { $setUnion: "$items.hash" } } ], // number of distinct hashes
        ],
      },
    },
  },
  {
    $match: {
      // a second element (index 1) only exists when the total and the distinct
      // count differ, i.e. when the document contains duplicate hashes
      "hashes.1": { $exists: true },
    },
  },
  {
    $project: { _id: 1 }, // only return the _id field
  },
]);
There might be different approaches, but this one seems pretty straightforward:
Basically, in the $addFields part, for each document, we first create an array consisting of two numbers:
- the total number of hashes
- the number of distinct hashes
Then we pass these two numbers through a $setUnion, which removes duplicate values. After this step, the array contains
- either two different numbers, in which case the hash field does contain duplicates,
- or only one element, in which case the number of distinct hashes equals the total number of hashes (so there are no duplicates).
We can check if there are two items in the array by testing if the element at position 1 (arrays are zero-based!) exists. That's what the $match stage does.
And the final $project stage is just to limit the output to the _id field only.