MongoDB 聚合查询以获取记录中每个实例的唯一元素列表和计数
MongoDB aggregate query to get the Unique element list and count of every instance in record
我有 2 个 collection,如下所示。
数据 1:
{ "_id" : , "timestamp" : ISODate("2016-01-05T07:42:37.312Z"), "Prof_Name" : "Jack ", "SUBJECT" : "Maths, Chemistry, Machinery1, Ele1" }
{ "_id" : , "timestamp" : ISODate("2016-01-05T07:42:37.312Z"), "Prof_Name" : "Mac", "SUBJECT" : "Chemistry, CS, German" }
数据2:
{ "_id" : ObjectId(""), "timestamp" : ISODate("2016-08-05T07:42:37.312Z"), "SUBJECT_ID" : "Maths", "ID" : "OI-12", "Rating" : 6, "UUID" : 8123 }
{ "_id" : ObjectId(""), "timestamp" : ISODate("2017-09-05T07:42:37.312Z"), "SUBJECT_ID" : "Maths, Machinery1, German", "ID" : "OI-134", "Rating" : 6, "UUID" : 8123 }
{ "_id" : ObjectId(""), "timestamp" : ISODate("2016-01-05T07:42:37.312Z"), "SUBJECT_ID" : "Machinery1, Maths, French, German", "ID" : "OI-32", "Rating" : 3, "UUID" : 8123 }
{ "_id" : ObjectId(""), "timestamp" : ISODate("2016-01-05T07:42:37.312Z"), "SUBJECT_ID" : "CS, Chemistry", "ID" : "OI-36", "Rating" : , "UUID" : 8124 }
我想针对时间戳在 2016 年 1 月到 2016 年 11 月之间的记录生成第 3 个 collection:对每个 Prof_Name 以及 "data1" 的 SUBJECT 中的每个主题,检查它是否存在于 "data2" 中,并得到 UUID,且 UUID 计数为 1;如果在下一条记录中发现相同的主题,则使 UUID 计数 = 2,依此类推。这是我希望得到的 collection 的样子:
数据 3:
{ "_id" : ,
"Prof_Name" : "Jack",
"Subjects_list" : [ "Maths", "Chemistry", "Machinery1"],
"UUID_list" : [8123, 8124 ],
"UUID_count" : 3, // Because UUID 8123 has present in 2 records which comes under 2016 timestamp
"subject_count" : 3 } // Ele1 is not mentioned because it has not been seen in any of the data2 record
{ "_id" : ,
"Prof_Name" : "Mac",
"Subjects_list" : [ "CS"],
"UUID_list" : [8124 ],
"UUID_count" : 1, // Because UUID 8123 has present in 2 records which comes under 2016 timestamp
"subject_count" : 1 }
我的聚合查询是:
// Asker's pipeline: builds the "data3" collection from data1, looking up UUIDs in data2.
// Note: no stage here filters on "timestamp", so data2 records from every date are considered.
db.data1.aggregate([
// Stage 1: turn the comma-separated SUBJECT string into an array (split on ", ").
{
"$addFields": {
"SUBJECT": {
"$split": [
"$SUBJECT",
", "
]
}
}
},
// Stage 2: emit one document per individual subject.
{
"$unwind": "$SUBJECT"
},
// Stage 3: for the current subject, collect the UUID of every data2 record
// whose split SUBJECT_ID list contains that subject.
{
"$lookup": {
"from": "data2",
"let": {
"subject": "$SUBJECT"
},
"pipeline": [
// Split data2's comma-separated SUBJECT_ID the same way as SUBJECT above.
{
"$addFields": {
"SUBJECT_ID": {
"$split": [
"$SUBJECT_ID",
", "
]
}
}
},
// Keep only data2 records that list the outer document's subject.
{
"$match": {
"$expr": {
"$in": [
"$$subject",
"$SUBJECT_ID"
]
}
}
},
// Only the UUID is carried back to the outer pipeline.
{
"$project": {
"UUID": 1,
"_id": 0
}
}
],
"as": "ref_data"
}
},
// Stage 4: one document per matched UUID; subjects with no match are kept
// (preserveNullAndEmptyArrays), so they still reach the $group below.
{
"$unwind": {
"path": "$ref_data",
"preserveNullAndEmptyArrays": true
}
},
// Stage 5: per professor, gather the distinct subjects and distinct UUIDs.
// Because unmatched subjects were preserved above, subjects_list also contains them.
{
"$group": {
"_id": "$Prof_Name",
"subjects_list": {
"$addToSet": "$SUBJECT"
},
"UUID_list": {
"$addToSet": "$ref_data.UUID"
}
}
},
// Stage 6: copy the group key into Prof_Name and add the two counts.
// UUID_count is the size of the distinct UUID set, not the number of matching records.
{
"$addFields": {
"Prof_Name": "$_id",
"UUID_count": {
"$size": "$UUID_list"
},
"subject_count": {
"$size": "$subjects_list"
}
}
},
// Stage 7: drop the group key from the output.
{
"$project": {
"_id": 0
}
},
// Stage 8: write the result to the data3 collection.
{
"$out": "data3"
}
])
这个查询需要做哪些修改才能得到上面提到的collection data3,主要是UUID_list and UUID-count and Subject_list.
另外还想知道,如何在下面的聚合查询中按给定的年份和月份(而不是完整的 ISO 日期)来匹配记录的时间戳。
试过这个:
{ "$project": {"year":{"$year":"$timestamp"},"month":{"$month":"$timestamp"}}},{ "$match":{"year" :"2016","month": "01"}}
但没有成功。
您可以通过将主题从逗号分隔值更改为数据库中的数组来简化聚合。
例如 "SUBJECT" : ["Maths", "Chemistry", "Machinery1", "Ele1"]
您可以使用以下聚合。
// Builds "data3" from data1/data2 when SUBJECT and SUBJECT_ID are stored as arrays.
db.data1.aggregate([
  // Join each data1 document with every data2 record that shares at least one subject.
  {"$lookup": {
    "from": "data2",
    "localField": "SUBJECT",
    "foreignField": "SUBJECT_ID",
    "as": "ref_data"
  }},
  // One document per (professor, matching data2 record) pair.
  {"$unwind": {"path": "$ref_data", "preserveNullAndEmptyArrays": true}},
  // Keep only data2 records from January 2016 through November 2016.
  // The half-open upper bound (< 1 December) covers all of November up to the last
  // millisecond without having to spell out the month's last day and time.
  // Documents without a ref_data.timestamp do not satisfy this range and are dropped.
  {"$match": {"ref_data.timestamp": {
    "$gte": ISODate("2016-01-01T00:00:00.000Z"),
    "$lt": ISODate("2016-12-01T00:00:00.000Z")
  }}},
  // Reduce SUBJECT to the subjects common to the professor and the data2 record.
  {"$addFields": {"SUBJECT": {"$setIntersection": ["$SUBJECT", "$ref_data.SUBJECT_ID"]}}},
  {"$unwind": "$SUBJECT"},
  // Distinct (professor, UUID, subject) combinations.
  {"$group": {
    "_id": {
      "Prof_Name": "$Prof_Name",
      "UUID": "$ref_data.UUID",
      "SUBJECT": "$SUBJECT"
    }
  }},
  // Per professor: count the distinct (UUID, subject) combinations, list their
  // subjects, and collect the distinct UUIDs.
  {"$group": {
    "_id": "$_id.Prof_Name",
    "UUID_count": {"$sum": 1},
    "subjects_list": {"$push": "$_id.SUBJECT"},
    "UUID_distinct_list": {"$addToSet": "$_id.UUID"}
  }},
  // Copy the group key into Prof_Name and add the list sizes.
  {"$addFields": {
    "Prof_Name": "$_id",
    "UUID_distinct_count": {"$size": "$UUID_distinct_list"},
    "subject_count": {"$size": "$subjects_list"}
  }},
  // Exclude the group key from the final output.
  {"$project": {"_id": 0}},
  // Write the result to the data3 collection.
  {"$out": "data3"}
])
无需修改架构即可使用以下聚合查询。
// Builds "data3" without changing the schema: SUBJECT / SUBJECT_ID stay comma-separated
// strings and are split on ", " inside the pipeline.
db.data1.aggregate([
  {"$lookup": {
    "from": "data2",
    // Pass the professor's subjects (as an array) into the sub-pipeline.
    "let": {"subject": {"$split": ["$SUBJECT", ", "]}},
    "pipeline": [
      // Match data2 records by year and month of their timestamp.
      // The operator must be "$expr" (with the dollar sign) so that $year/$month
      // are evaluated as aggregation expressions; they yield numbers, so the
      // comparison values are the numbers 2016 and 1, not strings.
      {"$match": {"$expr": {"$and": [
        {"$eq": [{"$year": "$timestamp"}, 2016]},
        {"$eq": [{"$month": "$timestamp"}, 1]}
      ]}}},
      // Split SUBJECT_ID and attach the professor's subject array to each record.
      {"$addFields": {"SUBJECT_ID": {"$split": ["$SUBJECT_ID", ", "]}, "SUBJECT": "$$subject"}},
      // One document per (data2 record, professor subject) pair ...
      {"$unwind": "$SUBJECT"},
      // ... keeping only pairs where the record actually lists that subject.
      {"$match": {"$expr": {"$in": ["$SUBJECT", "$SUBJECT_ID"]}}},
      {"$facet": {
        // Number of distinct matching data2 records (grouped by record _id and UUID).
        "UUID": [
          {"$group": {"_id": {"id": "$_id", "UUID": "$UUID"}}},
          {"$count": "UUID_Count"}
        ],
        // Distinct matched subjects and distinct UUIDs, plus their sizes.
        "REST": [
          {"$group": {"_id": null, "subjects_list": {"$addToSet": "$SUBJECT"}, "UUID_distinct_list": {"$addToSet": "$UUID"}}},
          {"$addFields": {"subject_count": {"$size": "$subjects_list"}, "UUID_distinct_count": {"$size": "$UUID_distinct_list"}}},
          {"$project": {"_id": 0}}
        ]
      }},
      // Merge the single result of each facet into one flat document.
      {"$replaceRoot": {"newRoot": {"$mergeObjects": [{"$arrayElemAt": ["$UUID", 0]}, {"$arrayElemAt": ["$REST", 0]}]}}}
    ],
    "as": "ref_data"
  }},
  {"$unwind": {"path": "$ref_data", "preserveNullAndEmptyArrays": true}},
  // Attach the professor name to the summary and promote it to the document root.
  {"$addFields": {"ref_data.Prof_Name": "$Prof_Name"}},
  {"$replaceRoot": {"newRoot": "$ref_data"}},
  // Write the result to the data3 collection.
  {"$out": "data3"}
])
我有 2 个 collection,如下所示。
数据 1:
{ "_id" : , "timestamp" : ISODate("2016-01-05T07:42:37.312Z"), "Prof_Name" : "Jack ", "SUBJECT" : "Maths, Chemistry, Machinery1, Ele1" }
{ "_id" : , "timestamp" : ISODate("2016-01-05T07:42:37.312Z"), "Prof_Name" : "Mac", "SUBJECT" : "Chemistry, CS, German" }
数据2:
{ "_id" : ObjectId(""), "timestamp" : ISODate("2016-08-05T07:42:37.312Z"), "SUBJECT_ID" : "Maths", "ID" : "OI-12", "Rating" : 6, "UUID" : 8123 }
{ "_id" : ObjectId(""), "timestamp" : ISODate("2017-09-05T07:42:37.312Z"), "SUBJECT_ID" : "Maths, Machinery1, German", "ID" : "OI-134", "Rating" : 6, "UUID" : 8123 }
{ "_id" : ObjectId(""), "timestamp" : ISODate("2016-01-05T07:42:37.312Z"), "SUBJECT_ID" : "Machinery1, Maths, French, German", "ID" : "OI-32", "Rating" : 3, "UUID" : 8123 }
{ "_id" : ObjectId(""), "timestamp" : ISODate("2016-01-05T07:42:37.312Z"), "SUBJECT_ID" : "CS, Chemistry", "ID" : "OI-36", "Rating" : , "UUID" : 8124 }
我想针对时间戳在 2016 年 1 月到 2016 年 11 月之间的记录生成第 3 个 collection:对每个 Prof_Name 以及 "data1" 的 SUBJECT 中的每个主题,检查它是否存在于 "data2" 中,并得到 UUID,且 UUID 计数为 1;如果在下一条记录中发现相同的主题,则使 UUID 计数 = 2,依此类推。这是我希望得到的 collection 的样子:
数据 3:
{ "_id" : ,
"Prof_Name" : "Jack",
"Subjects_list" : [ "Maths", "Chemistry", "Machinery1"],
"UUID_list" : [8123, 8124 ],
"UUID_count" : 3, // Because UUID 8123 has present in 2 records which comes under 2016 timestamp
"subject_count" : 3 } // Ele1 is not mentioned because it has not been seen in any of the data2 record
{ "_id" : ,
"Prof_Name" : "Mac",
"Subjects_list" : [ "CS"],
"UUID_list" : [8124 ],
"UUID_count" : 1, // Because UUID 8123 has present in 2 records which comes under 2016 timestamp
"subject_count" : 1 }
我的聚合查询是:
// Asker's pipeline: builds the "data3" collection from data1, looking up UUIDs in data2.
// Note: no stage here filters on "timestamp", so data2 records from every date are considered.
db.data1.aggregate([
// Stage 1: turn the comma-separated SUBJECT string into an array (split on ", ").
{
"$addFields": {
"SUBJECT": {
"$split": [
"$SUBJECT",
", "
]
}
}
},
// Stage 2: emit one document per individual subject.
{
"$unwind": "$SUBJECT"
},
// Stage 3: for the current subject, collect the UUID of every data2 record
// whose split SUBJECT_ID list contains that subject.
{
"$lookup": {
"from": "data2",
"let": {
"subject": "$SUBJECT"
},
"pipeline": [
// Split data2's comma-separated SUBJECT_ID the same way as SUBJECT above.
{
"$addFields": {
"SUBJECT_ID": {
"$split": [
"$SUBJECT_ID",
", "
]
}
}
},
// Keep only data2 records that list the outer document's subject.
{
"$match": {
"$expr": {
"$in": [
"$$subject",
"$SUBJECT_ID"
]
}
}
},
// Only the UUID is carried back to the outer pipeline.
{
"$project": {
"UUID": 1,
"_id": 0
}
}
],
"as": "ref_data"
}
},
// Stage 4: one document per matched UUID; subjects with no match are kept
// (preserveNullAndEmptyArrays), so they still reach the $group below.
{
"$unwind": {
"path": "$ref_data",
"preserveNullAndEmptyArrays": true
}
},
// Stage 5: per professor, gather the distinct subjects and distinct UUIDs.
// Because unmatched subjects were preserved above, subjects_list also contains them.
{
"$group": {
"_id": "$Prof_Name",
"subjects_list": {
"$addToSet": "$SUBJECT"
},
"UUID_list": {
"$addToSet": "$ref_data.UUID"
}
}
},
// Stage 6: copy the group key into Prof_Name and add the two counts.
// UUID_count is the size of the distinct UUID set, not the number of matching records.
{
"$addFields": {
"Prof_Name": "$_id",
"UUID_count": {
"$size": "$UUID_list"
},
"subject_count": {
"$size": "$subjects_list"
}
}
},
// Stage 7: drop the group key from the output.
{
"$project": {
"_id": 0
}
},
// Stage 8: write the result to the data3 collection.
{
"$out": "data3"
}
])
这个查询需要做哪些修改才能得到上面提到的collection data3,主要是UUID_list and UUID-count and Subject_list.
另外还想知道,如何在下面的聚合查询中按给定的年份和月份(而不是完整的 ISO 日期)来匹配记录的时间戳。
试过这个:
{ "$project": {"year":{"$year":"$timestamp"},"month":{"$month":"$timestamp"}}},{ "$match":{"year" :"2016","month": "01"}}
但没有成功。
您可以通过将主题从逗号分隔值更改为数据库中的数组来简化聚合。
例如 "SUBJECT" : ["Maths", "Chemistry", "Machinery1", "Ele1"]
您可以使用以下聚合。
// Builds "data3" from data1/data2 when SUBJECT and SUBJECT_ID are stored as arrays.
db.data1.aggregate([
  // Join each data1 document with every data2 record that shares at least one subject.
  {"$lookup": {
    "from": "data2",
    "localField": "SUBJECT",
    "foreignField": "SUBJECT_ID",
    "as": "ref_data"
  }},
  // One document per (professor, matching data2 record) pair.
  {"$unwind": {"path": "$ref_data", "preserveNullAndEmptyArrays": true}},
  // Keep only data2 records from January 2016 through November 2016.
  // The half-open upper bound (< 1 December) covers all of November up to the last
  // millisecond without having to spell out the month's last day and time.
  // Documents without a ref_data.timestamp do not satisfy this range and are dropped.
  {"$match": {"ref_data.timestamp": {
    "$gte": ISODate("2016-01-01T00:00:00.000Z"),
    "$lt": ISODate("2016-12-01T00:00:00.000Z")
  }}},
  // Reduce SUBJECT to the subjects common to the professor and the data2 record.
  {"$addFields": {"SUBJECT": {"$setIntersection": ["$SUBJECT", "$ref_data.SUBJECT_ID"]}}},
  {"$unwind": "$SUBJECT"},
  // Distinct (professor, UUID, subject) combinations.
  {"$group": {
    "_id": {
      "Prof_Name": "$Prof_Name",
      "UUID": "$ref_data.UUID",
      "SUBJECT": "$SUBJECT"
    }
  }},
  // Per professor: count the distinct (UUID, subject) combinations, list their
  // subjects, and collect the distinct UUIDs.
  {"$group": {
    "_id": "$_id.Prof_Name",
    "UUID_count": {"$sum": 1},
    "subjects_list": {"$push": "$_id.SUBJECT"},
    "UUID_distinct_list": {"$addToSet": "$_id.UUID"}
  }},
  // Copy the group key into Prof_Name and add the list sizes.
  {"$addFields": {
    "Prof_Name": "$_id",
    "UUID_distinct_count": {"$size": "$UUID_distinct_list"},
    "subject_count": {"$size": "$subjects_list"}
  }},
  // Exclude the group key from the final output.
  {"$project": {"_id": 0}},
  // Write the result to the data3 collection.
  {"$out": "data3"}
])
无需修改架构即可使用以下聚合查询。
// Builds "data3" without changing the schema: SUBJECT / SUBJECT_ID stay comma-separated
// strings and are split on ", " inside the pipeline.
db.data1.aggregate([
  {"$lookup": {
    "from": "data2",
    // Pass the professor's subjects (as an array) into the sub-pipeline.
    "let": {"subject": {"$split": ["$SUBJECT", ", "]}},
    "pipeline": [
      // Match data2 records by year and month of their timestamp.
      // The operator must be "$expr" (with the dollar sign) so that $year/$month
      // are evaluated as aggregation expressions; they yield numbers, so the
      // comparison values are the numbers 2016 and 1, not strings.
      {"$match": {"$expr": {"$and": [
        {"$eq": [{"$year": "$timestamp"}, 2016]},
        {"$eq": [{"$month": "$timestamp"}, 1]}
      ]}}},
      // Split SUBJECT_ID and attach the professor's subject array to each record.
      {"$addFields": {"SUBJECT_ID": {"$split": ["$SUBJECT_ID", ", "]}, "SUBJECT": "$$subject"}},
      // One document per (data2 record, professor subject) pair ...
      {"$unwind": "$SUBJECT"},
      // ... keeping only pairs where the record actually lists that subject.
      {"$match": {"$expr": {"$in": ["$SUBJECT", "$SUBJECT_ID"]}}},
      {"$facet": {
        // Number of distinct matching data2 records (grouped by record _id and UUID).
        "UUID": [
          {"$group": {"_id": {"id": "$_id", "UUID": "$UUID"}}},
          {"$count": "UUID_Count"}
        ],
        // Distinct matched subjects and distinct UUIDs, plus their sizes.
        "REST": [
          {"$group": {"_id": null, "subjects_list": {"$addToSet": "$SUBJECT"}, "UUID_distinct_list": {"$addToSet": "$UUID"}}},
          {"$addFields": {"subject_count": {"$size": "$subjects_list"}, "UUID_distinct_count": {"$size": "$UUID_distinct_list"}}},
          {"$project": {"_id": 0}}
        ]
      }},
      // Merge the single result of each facet into one flat document.
      {"$replaceRoot": {"newRoot": {"$mergeObjects": [{"$arrayElemAt": ["$UUID", 0]}, {"$arrayElemAt": ["$REST", 0]}]}}}
    ],
    "as": "ref_data"
  }},
  {"$unwind": {"path": "$ref_data", "preserveNullAndEmptyArrays": true}},
  // Attach the professor name to the summary and promote it to the document root.
  {"$addFields": {"ref_data.Prof_Name": "$Prof_Name"}},
  {"$replaceRoot": {"newRoot": "$ref_data"}},
  // Write the result to the data3 collection.
  {"$out": "data3"}
])