MongoDB 聚合时如何添加附加信息?
MongoDB: How to add additional information when aggregating?
我是初学者,我为我的管道写了下面这段代码,但我想在输出中添加其他信息,比如屏幕名称(screen name)或推文数量。我试图在 $group 下添加这些字段,但每次都出现语法错误。
这是我的管道:
def make_pipeline():
    """Build the aggregation pipeline that finds the most-followed user.

    The stages, in execution order:
      1. ``$match``  - keep documents whose ``user.statuses_count`` is greater
         than 99 and whose ``user.time_zone`` equals ``"Brasilia"``.
      2. ``$group``  - group by ``user.id`` and keep the largest
         ``user.followers_count`` seen in each group as ``followers``.
      3. ``$sort``   - order the groups by ``followers``, highest first.
      4. ``$limit``  - keep only the first group.

    Returns:
        list: the aggregation stages as a list of dicts.
    """
    # complete the aggregation pipeline
    pipeline = [
        {
            '$match': {
                "user.statuses_count": {"$gt": 99},
                "user.time_zone": "Brasilia"
            }
        },
        {
            "$group": {
                "_id": "$user.id",
                "followers": {"$max": "$user.followers_count"}
            }
        },
        {
            "$sort": {"followers": -1}
        },
        {
            "$limit": 1
        }
    ]
    # Hand the stages back to the caller; without this the function returns None.
    return pipeline
我在这个例子中使用它:
{
"_id" : ObjectId("5304e2e3cc9e684aa98bef97"),
"text" : "First week of school is over :P",
"in_reply_to_status_id" : null,
"retweet_count" : null,
"contributors" : null,
"created_at" : "Thu Sep 02 18:11:25 +0000 2010",
"geo" : null,
"source" : "web",
"coordinates" : null,
"in_reply_to_screen_name" : null,
"truncated" : false,
"entities" : {
"user_mentions" : [ ],
"urls" : [ ],
"hashtags" : [ ]
},
"retweeted" : false,
"place" : null,
"user" : {
"friends_count" : 145,
"profile_sidebar_fill_color" : "E5507E",
"location" : "Ireland :)",
"verified" : false,
"follow_request_sent" : null,
"favourites_count" : 1,
"profile_sidebar_border_color" : "CC3366",
"profile_image_url" : "http://a1.twimg.com/profile_images/1107778717/phpkHoxzmAM_normal.jpg",
"geo_enabled" : false,
"created_at" : "Sun May 03 19:51:04 +0000 2009",
"description" : "",
"time_zone" : null,
"url" : null,
"screen_name" : "Catherinemull",
"notifications" : null,
"profile_background_color" : "FF6699",
"listed_count" : 77,
"lang" : "en",
"profile_background_image_url" : "http://a3.twimg.com/profile_background_images/138228501/149174881-8cd806890274b828ed56598091c84e71_4c6fd4d8-full.jpg",
"statuses_count" : 2475,
"following" : null,
"profile_text_color" : "362720",
"protected" : false,
"show_all_inline_media" : false,
"profile_background_tile" : true,
"name" : "Catherine Mullane",
"contributors_enabled" : false,
"profile_link_color" : "B40B43",
"followers_count" : 169,
"id" : 37486277,
"profile_use_background_image" : true,
"utc_offset" : null
},
"favorited" : false,
"in_reply_to_user_id" : null,
"id" : NumberLong("22819398300")
}
可以在聚合管道中使用 $first 累加器,查询如下(其后的第二个查询则改用 $sort、$limit 和 $project,只取粉丝数最多的那条文档):
// For every matching user, report the followers count together with extra
// fields (screen name, retweet count) carried along by $first accumulators.
db.collectionName.aggregate([
  // Keep documents whose user has more than 99 statuses and the
  // "Brasilia" time zone.
  {
    "$match": {
      "user.statuses_count": { "$gt": 99 },
      "user.time_zone": "Brasilia"
    }
  },
  // Order by followers count, highest first, before grouping.
  {
    "$sort": { "user.followers_count": -1 }
  },
  // One output document per user id; $first takes each value from the first
  // document of the group in the sorted order above.
  {
    "$group": {
      "_id": "$user.id",
      "followers": { "$first": "$user.followers_count" },
      "screen_name": { "$first": "$user.screen_name" },
      "retweet_count": { "$first": "$retweet_count" }
    }
  }
])
// Return the single matching document with the highest followers count,
// reshaped to expose only the fields of interest.
db.collectionName.aggregate([
  // Keep documents whose user has more than 99 statuses and the
  // "Brasilia" time zone.
  {
    "$match": {
      "user.statuses_count": { "$gt": 99 },
      "user.time_zone": "Brasilia"
    }
  },
  // Highest followers count first.
  {
    "$sort": { "user.followers_count": -1 }
  },
  // Keep only the first document of the sorted stream.
  {
    "$limit": 1
  },
  // Pick the output fields from that document.
  {
    "$project": {
      "userId": "$user.id",
      "screen_name": "$user.screen_name",
      "retweet_count": "$retweet_count"
    }
  }
]).pretty()
以下聚合管道使用 $$ROOT 系统变量,它引用根文档,即 $group 聚合管道阶段当前正在处理的顶层文档。
该文档通过 $addToSet 运算符被添加到一个数组中。
在随后的管道阶段中,可以先用 $unwind 展开该数组,再通过 $project 运算符取出所需字段并修改输出文档的形式:
// Find the top user by followers count while keeping access to the full
// source documents through $$ROOT.
db.tweet.aggregate([
  // Keep documents whose user has at least 100 statuses and the
  // "Brasilia" time zone.
  {
    "$match": {
      "user.statuses_count": { "$gte": 100 },
      "user.time_zone": "Brasilia"
    }
  },
  // Per user id: the largest followers count, plus the set of complete
  // source documents ($$ROOT) that fell into the group.
  {
    "$group": {
      "_id": "$user.id",
      "max_followers": { "$max": "$user.followers_count" },
      "data": { "$addToSet": "$$ROOT" }
    }
  },
  // Emit one document per element of the collected "data" array.
  { "$unwind": "$data" },
  // Reshape: take the extra fields from the embedded source document.
  {
    "$project": {
      "_id": "$data._id",
      "followers": "$max_followers",
      "screen_name": "$data.user.screen_name",
      "tweets": "$data.user.statuses_count"
    }
  },
  // Highest followers count first, then keep a single document.
  { "$sort": { "followers": -1 } },
  { "$limit": 1 }
])
以下管道也能得到相同的结果,但没有使用 $group 运算符:
# Aggregation stages, in execution order, that pick the document with the
# highest followers count without using a $group stage.
pipeline = [
    # Keep documents whose user has at least 100 statuses and the
    # "Brasilia" time zone.
    {"$match": {
        "user.statuses_count": {"$gte": 100},
        "user.time_zone": "Brasilia",
    }},
    # Expose the fields of interest under short top-level names.
    {"$project": {
        "followers": "$user.followers_count",
        "screen_name": "$user.screen_name",
        "tweets": "$user.statuses_count",
    }},
    # Highest followers count first.
    {"$sort": {"followers": -1}},
    # Keep only the first document.
    {"$limit": 1},
]
Pymongo 输出:
{u'ok': 1.0,
u'result': [{u'_id': ObjectId('5304e2d34149692bc5172729'),
u'followers': 17209,
u'screen_name': u'AndreHenning',
u'tweets': 8219}]}
我是初学者,我为我的管道写了下面这段代码,但我想在输出中添加其他信息,比如屏幕名称(screen name)或推文数量。我试图在 $group 下添加这些字段,但每次都出现语法错误。
这是我的管道:
def make_pipeline():
    """Build the aggregation pipeline that finds the most-followed user.

    The stages, in execution order:
      1. ``$match``  - keep documents whose ``user.statuses_count`` is greater
         than 99 and whose ``user.time_zone`` equals ``"Brasilia"``.
      2. ``$group``  - group by ``user.id`` and keep the largest
         ``user.followers_count`` seen in each group as ``followers``.
      3. ``$sort``   - order the groups by ``followers``, highest first.
      4. ``$limit``  - keep only the first group.

    Returns:
        list: the aggregation stages as a list of dicts.
    """
    # complete the aggregation pipeline
    pipeline = [
        {
            '$match': {
                "user.statuses_count": {"$gt": 99},
                "user.time_zone": "Brasilia"
            }
        },
        {
            "$group": {
                "_id": "$user.id",
                "followers": {"$max": "$user.followers_count"}
            }
        },
        {
            "$sort": {"followers": -1}
        },
        {
            "$limit": 1
        }
    ]
    # Hand the stages back to the caller; without this the function returns None.
    return pipeline
我在这个例子中使用它:
{
"_id" : ObjectId("5304e2e3cc9e684aa98bef97"),
"text" : "First week of school is over :P",
"in_reply_to_status_id" : null,
"retweet_count" : null,
"contributors" : null,
"created_at" : "Thu Sep 02 18:11:25 +0000 2010",
"geo" : null,
"source" : "web",
"coordinates" : null,
"in_reply_to_screen_name" : null,
"truncated" : false,
"entities" : {
"user_mentions" : [ ],
"urls" : [ ],
"hashtags" : [ ]
},
"retweeted" : false,
"place" : null,
"user" : {
"friends_count" : 145,
"profile_sidebar_fill_color" : "E5507E",
"location" : "Ireland :)",
"verified" : false,
"follow_request_sent" : null,
"favourites_count" : 1,
"profile_sidebar_border_color" : "CC3366",
"profile_image_url" : "http://a1.twimg.com/profile_images/1107778717/phpkHoxzmAM_normal.jpg",
"geo_enabled" : false,
"created_at" : "Sun May 03 19:51:04 +0000 2009",
"description" : "",
"time_zone" : null,
"url" : null,
"screen_name" : "Catherinemull",
"notifications" : null,
"profile_background_color" : "FF6699",
"listed_count" : 77,
"lang" : "en",
"profile_background_image_url" : "http://a3.twimg.com/profile_background_images/138228501/149174881-8cd806890274b828ed56598091c84e71_4c6fd4d8-full.jpg",
"statuses_count" : 2475,
"following" : null,
"profile_text_color" : "362720",
"protected" : false,
"show_all_inline_media" : false,
"profile_background_tile" : true,
"name" : "Catherine Mullane",
"contributors_enabled" : false,
"profile_link_color" : "B40B43",
"followers_count" : 169,
"id" : 37486277,
"profile_use_background_image" : true,
"utc_offset" : null
},
"favorited" : false,
"in_reply_to_user_id" : null,
"id" : NumberLong("22819398300")
}
可以在聚合管道中使用 $first 累加器,查询如下(其后的第二个查询则改用 $sort、$limit 和 $project,只取粉丝数最多的那条文档):
// For every matching user, report the followers count together with extra
// fields (screen name, retweet count) carried along by $first accumulators.
db.collectionName.aggregate([
  // Keep documents whose user has more than 99 statuses and the
  // "Brasilia" time zone.
  {
    "$match": {
      "user.statuses_count": { "$gt": 99 },
      "user.time_zone": "Brasilia"
    }
  },
  // Order by followers count, highest first, before grouping.
  {
    "$sort": { "user.followers_count": -1 }
  },
  // One output document per user id; $first takes each value from the first
  // document of the group in the sorted order above.
  {
    "$group": {
      "_id": "$user.id",
      "followers": { "$first": "$user.followers_count" },
      "screen_name": { "$first": "$user.screen_name" },
      "retweet_count": { "$first": "$retweet_count" }
    }
  }
])
// Return the single matching document with the highest followers count,
// reshaped to expose only the fields of interest.
db.collectionName.aggregate([
  // Keep documents whose user has more than 99 statuses and the
  // "Brasilia" time zone.
  {
    "$match": {
      "user.statuses_count": { "$gt": 99 },
      "user.time_zone": "Brasilia"
    }
  },
  // Highest followers count first.
  {
    "$sort": { "user.followers_count": -1 }
  },
  // Keep only the first document of the sorted stream.
  {
    "$limit": 1
  },
  // Pick the output fields from that document.
  {
    "$project": {
      "userId": "$user.id",
      "screen_name": "$user.screen_name",
      "retweet_count": "$retweet_count"
    }
  }
]).pretty()
以下聚合管道使用 $$ROOT 系统变量,它引用根文档,即 $group 聚合管道阶段当前正在处理的顶层文档。
该文档通过 $addToSet 运算符被添加到一个数组中。
在随后的管道阶段中,可以先用 $unwind 展开该数组,再通过 $project 运算符取出所需字段并修改输出文档的形式:
// Find the top user by followers count while keeping access to the full
// source documents through $$ROOT.
db.tweet.aggregate([
  // Keep documents whose user has at least 100 statuses and the
  // "Brasilia" time zone.
  {
    "$match": {
      "user.statuses_count": { "$gte": 100 },
      "user.time_zone": "Brasilia"
    }
  },
  // Per user id: the largest followers count, plus the set of complete
  // source documents ($$ROOT) that fell into the group.
  {
    "$group": {
      "_id": "$user.id",
      "max_followers": { "$max": "$user.followers_count" },
      "data": { "$addToSet": "$$ROOT" }
    }
  },
  // Emit one document per element of the collected "data" array.
  { "$unwind": "$data" },
  // Reshape: take the extra fields from the embedded source document.
  {
    "$project": {
      "_id": "$data._id",
      "followers": "$max_followers",
      "screen_name": "$data.user.screen_name",
      "tweets": "$data.user.statuses_count"
    }
  },
  // Highest followers count first, then keep a single document.
  { "$sort": { "followers": -1 } },
  { "$limit": 1 }
])
以下管道也能得到相同的结果,但没有使用 $group 运算符:
# Aggregation stages, in execution order, that pick the document with the
# highest followers count without using a $group stage.
pipeline = [
    # Keep documents whose user has at least 100 statuses and the
    # "Brasilia" time zone.
    {"$match": {
        "user.statuses_count": {"$gte": 100},
        "user.time_zone": "Brasilia",
    }},
    # Expose the fields of interest under short top-level names.
    {"$project": {
        "followers": "$user.followers_count",
        "screen_name": "$user.screen_name",
        "tweets": "$user.statuses_count",
    }},
    # Highest followers count first.
    {"$sort": {"followers": -1}},
    # Keep only the first document.
    {"$limit": 1},
]
Pymongo 输出:
{u'ok': 1.0,
u'result': [{u'_id': ObjectId('5304e2d34149692bc5172729'),
u'followers': 17209,
u'screen_name': u'AndreHenning',
u'tweets': 8219}]}