python mongodb $match 和 $group
python mongodb $match and $group
我想编写一个简单的查询,在时区为 "Brasilia" 且已发推文 100 条或以上的用户中,找出关注者最多的那一位。
这是我写的管道代码:
pipeline = [{'$match':{"user.statuses_count":{"$gt":99},"user.time_zone":"Brasilia"}},
{"$group":{"_id": "$user.followers_count","count" :{"$sum":1}}},
{"$sort":{"count":-1}} ]
这段代码是我根据一道练习题改编而来的。
下面是题目给出的文档结构示例:
{
"_id" : ObjectId("5304e2e3cc9e684aa98bef97"),
"text" : "First week of school is over :P",
"in_reply_to_status_id" : null,
"retweet_count" : null,
"contributors" : null,
"created_at" : "Thu Sep 02 18:11:25 +0000 2010",
"geo" : null,
"source" : "web",
"coordinates" : null,
"in_reply_to_screen_name" : null,
"truncated" : false,
"entities" : {
"user_mentions" : [ ],
"urls" : [ ],
"hashtags" : [ ]
},
"retweeted" : false,
"place" : null,
"user" : {
"friends_count" : 145,
"profile_sidebar_fill_color" : "E5507E",
"location" : "Ireland :)",
"verified" : false,
"follow_request_sent" : null,
"favourites_count" : 1,
"profile_sidebar_border_color" : "CC3366",
"profile_image_url" : "http://a1.twimg.com/profile_images/1107778717/phpkHoxzmAM_normal.jpg",
"geo_enabled" : false,
"created_at" : "Sun May 03 19:51:04 +0000 2009",
"description" : "",
"time_zone" : null,
"url" : null,
"screen_name" : "Catherinemull",
"notifications" : null,
"profile_background_color" : "FF6699",
"listed_count" : 77,
"lang" : "en",
"profile_background_image_url" : "http://a3.twimg.com/profile_background_images/138228501/149174881-8cd806890274b828ed56598091c84e71_4c6fd4d8-full.jpg",
"statuses_count" : 2475,
"following" : null,
"profile_text_color" : "362720",
"protected" : false,
"show_all_inline_media" : false,
"profile_background_tile" : true,
"name" : "Catherine Mullane",
"contributors_enabled" : false,
"profile_link_color" : "B40B43",
"followers_count" : 169,
"id" : 37486277,
"profile_use_background_image" : true,
"utc_offset" : null
},
"favorited" : false,
"in_reply_to_user_id" : null,
"id" : NumberLong("22819398300")
}
有人能发现我的错误吗?
假设您有几个构成最小测试用例的示例文档。在 mongo shell 中将这些测试文档插入集合:
db.collection.insert([
{
"_id" : ObjectId("5304e2e3cc9e684aa98bef97"),
"user" : {
"friends_count" : 145,
"statuses_count" : 457,
"screen_name" : "Catherinemull",
"time_zone" : "Brasilia",
"followers_count" : 169,
"id" : 37486277
},
"id" : NumberLong(22819398300)
},
{
"_id" : ObjectId("52fd2490bac3fa1975477702"),
"user" : {
"friends_count" : 145,
"statuses_count" : 12334,
"time_zone" : "Brasilia",
"screen_name" : "marble",
"followers_count" : 2597,
"id" : 37486278
},
"id" : NumberLong(22819398301)
}])
您原来的管道按 followers_count 分组并按 count 排序,得到的是出现次数最多的关注者数量,而不是关注者最多的用户。
要找出时区为 "Brasilia"、发推文 100 条或以上且关注者最多的用户,下面的管道无需使用 $group 运算符即可得到预期结果:
pipeline = [
{
"$match": {
"user.statuses_count": {
"$gt":99
},
"user.time_zone": "Brasilia"
}
},
{
"$project": {
"followers": "$user.followers_count",
"screen_name": "$user.screen_name",
"tweets": "$user.statuses_count"
}
},
{
"$sort": {
"followers": -1
}
},
{"$limit" : 1}
]
Pymongo 输出:
{u'ok': 1.0,
u'result': [{u'_id': ObjectId('52fd2490bac3fa1975477702'),
u'followers': 2597,
u'screen_name': u'marble',
u'tweets': 12334}]}
以下聚合管道也能得到所需的结果。管道的第一阶段是 $match 运算符,它筛选出用户时区字段值为 "Brasilia" 且推文数(由 statuses_count 字段表示)大于或等于 100 的文档,后一个条件通过 $gte 比较运算符来匹配。
第二个管道阶段是 $group 运算符,它按指定的标识符表达式(即 $user.id 字段)对过滤后的文档进行分组,并对每个组的 $user.followers_count 字段应用累加器表达式 $max,以获取每个用户的最大关注者数量。系统变量 $$ROOT 引用根文档,即当前正在 $group 聚合管道阶段中处理的顶级文档;它被添加到一个额外的数组字段 data 中供后续阶段使用,这是通过 $addToSet 累加器运算符实现的。
下一个管道阶段 $unwind 为 data 数组中的每个元素输出一个文档,以供下一步处理。
接下来的管道阶段 $project 通过添加取自上一阶段的值的新字段来转换流中的每个文档。
最后两个管道阶段 $sort 和 $limit 按指定的排序键 followers 对文档流进行降序排序,并只返回一个文档,即关注者最多的用户。
您最终的聚合管道应如下所示:
db.collection.aggregate([
{
'$match': {
"user.statuses_count": { "$gte": 100 },
"user.time_zone": "Brasilia"
}
},
{
"$group": {
"_id": "$user.id",
"max_followers": { "$max": "$user.followers_count" },
"data": { "$addToSet": "$$ROOT" }
}
},
{
"$unwind": "$data"
},
{
"$project": {
"_id": "$data._id",
"followers": "$max_followers",
"screen_name": "$data.user.screen_name",
"tweets": "$data.user.statuses_count"
}
},
{
"$sort": { "followers": -1 }
},
{
"$limit" : 1
}
])
在 Robomongo 中执行此操作会得到结果
/* 0 */
{
"result" : [
{
"_id" : ObjectId("52fd2490bac3fa1975477702"),
"followers" : 2597,
"screen_name" : "marble",
"tweets" : 12334
}
],
"ok" : 1
}
在python中,实现应该基本相同:
>>> pipeline = [
... {"$match": {"user.statuses_count": {"$gte":100 }, "user.time_zone": "Brasilia"}},
... {"$group": {"_id": "$user.id","max_followers": { "$max": "$user.followers_count" },"data": { "$addToSet": "$$ROOT" }}},
... {"$unwind": "$data"},
... {"$project": {"_id": "$data._id","followers": "$max_followers","screen_name": "$data.user.screen_name","tweets": "$data.user.statuses_count"}},
... {"$sort": { "followers": -1 }},
... {"$limit" : 1}
... ]
>>>
>>> for doc in collection.aggregate(pipeline):
... print(doc)
...
{u'tweets': 12334.0, u'_id': ObjectId('52fd2490bac3fa1975477702'), u'followers': 2597.0, u'screen_name': u'marble'}
>>>
其中 pipeline 的定义为:
pipeline = [
{"$match": {"user.statuses_count": {"$gte":100 }, "user.time_zone": "Brasilia"}},
{"$group": {"_id": "$user.id","max_followers": { "$max": "$user.followers_count" },"data": { "$addToSet": "$$ROOT" }}},
{"$unwind": "$data"},
{"$project": {"_id": "$data._id","followers": "$max_followers","screen_name": "$data.user.screen_name","tweets": "$data.user.statuses_count"}},
{"$sort": { "followers": -1 }},
{"$limit" : 1}
]
我想编写一个简单的查询,在时区为 "Brasilia" 且已发推文 100 条或以上的用户中,找出关注者最多的那一位。
这是我写的管道代码:
pipeline = [{'$match':{"user.statuses_count":{"$gt":99},"user.time_zone":"Brasilia"}},
{"$group":{"_id": "$user.followers_count","count" :{"$sum":1}}},
{"$sort":{"count":-1}} ]
这段代码是我根据一道练习题改编而来的。
下面是题目给出的文档结构示例:
{
"_id" : ObjectId("5304e2e3cc9e684aa98bef97"),
"text" : "First week of school is over :P",
"in_reply_to_status_id" : null,
"retweet_count" : null,
"contributors" : null,
"created_at" : "Thu Sep 02 18:11:25 +0000 2010",
"geo" : null,
"source" : "web",
"coordinates" : null,
"in_reply_to_screen_name" : null,
"truncated" : false,
"entities" : {
"user_mentions" : [ ],
"urls" : [ ],
"hashtags" : [ ]
},
"retweeted" : false,
"place" : null,
"user" : {
"friends_count" : 145,
"profile_sidebar_fill_color" : "E5507E",
"location" : "Ireland :)",
"verified" : false,
"follow_request_sent" : null,
"favourites_count" : 1,
"profile_sidebar_border_color" : "CC3366",
"profile_image_url" : "http://a1.twimg.com/profile_images/1107778717/phpkHoxzmAM_normal.jpg",
"geo_enabled" : false,
"created_at" : "Sun May 03 19:51:04 +0000 2009",
"description" : "",
"time_zone" : null,
"url" : null,
"screen_name" : "Catherinemull",
"notifications" : null,
"profile_background_color" : "FF6699",
"listed_count" : 77,
"lang" : "en",
"profile_background_image_url" : "http://a3.twimg.com/profile_background_images/138228501/149174881-8cd806890274b828ed56598091c84e71_4c6fd4d8-full.jpg",
"statuses_count" : 2475,
"following" : null,
"profile_text_color" : "362720",
"protected" : false,
"show_all_inline_media" : false,
"profile_background_tile" : true,
"name" : "Catherine Mullane",
"contributors_enabled" : false,
"profile_link_color" : "B40B43",
"followers_count" : 169,
"id" : 37486277,
"profile_use_background_image" : true,
"utc_offset" : null
},
"favorited" : false,
"in_reply_to_user_id" : null,
"id" : NumberLong("22819398300")
}
有人能发现我的错误吗?
假设您有几个构成最小测试用例的示例文档。在 mongo shell 中将这些测试文档插入集合:
db.collection.insert([
{
"_id" : ObjectId("5304e2e3cc9e684aa98bef97"),
"user" : {
"friends_count" : 145,
"statuses_count" : 457,
"screen_name" : "Catherinemull",
"time_zone" : "Brasilia",
"followers_count" : 169,
"id" : 37486277
},
"id" : NumberLong(22819398300)
},
{
"_id" : ObjectId("52fd2490bac3fa1975477702"),
"user" : {
"friends_count" : 145,
"statuses_count" : 12334,
"time_zone" : "Brasilia",
"screen_name" : "marble",
"followers_count" : 2597,
"id" : 37486278
},
"id" : NumberLong(22819398301)
}])
您原来的管道按 followers_count 分组并按 count 排序,得到的是出现次数最多的关注者数量,而不是关注者最多的用户。
要找出时区为 "Brasilia"、发推文 100 条或以上且关注者最多的用户,下面的管道无需使用 $group 运算符即可得到预期结果:
pipeline = [
{
"$match": {
"user.statuses_count": {
"$gt":99
},
"user.time_zone": "Brasilia"
}
},
{
"$project": {
"followers": "$user.followers_count",
"screen_name": "$user.screen_name",
"tweets": "$user.statuses_count"
}
},
{
"$sort": {
"followers": -1
}
},
{"$limit" : 1}
]
Pymongo 输出:
{u'ok': 1.0,
u'result': [{u'_id': ObjectId('52fd2490bac3fa1975477702'),
u'followers': 2597,
u'screen_name': u'marble',
u'tweets': 12334}]}
以下聚合管道也能得到所需的结果。管道的第一阶段是 $match 运算符,它筛选出用户时区字段值为 "Brasilia" 且推文数(由 statuses_count 字段表示)大于或等于 100 的文档,后一个条件通过 $gte 比较运算符来匹配。
第二个管道阶段是 $group 运算符,它按指定的标识符表达式(即 $user.id 字段)对过滤后的文档进行分组,并对每个组的 $user.followers_count 字段应用累加器表达式 $max,以获取每个用户的最大关注者数量。系统变量 $$ROOT 引用根文档,即当前正在 $group 聚合管道阶段中处理的顶级文档;它被添加到一个额外的数组字段 data 中供后续阶段使用,这是通过 $addToSet 累加器运算符实现的。
下一个管道阶段 $unwind 为 data 数组中的每个元素输出一个文档,以供下一步处理。
接下来的管道阶段 $project 通过添加取自上一阶段的值的新字段来转换流中的每个文档。
最后两个管道阶段 $sort 和 $limit 按指定的排序键 followers 对文档流进行降序排序,并只返回一个文档,即关注者最多的用户。
您最终的聚合管道应如下所示:
db.collection.aggregate([
{
'$match': {
"user.statuses_count": { "$gte": 100 },
"user.time_zone": "Brasilia"
}
},
{
"$group": {
"_id": "$user.id",
"max_followers": { "$max": "$user.followers_count" },
"data": { "$addToSet": "$$ROOT" }
}
},
{
"$unwind": "$data"
},
{
"$project": {
"_id": "$data._id",
"followers": "$max_followers",
"screen_name": "$data.user.screen_name",
"tweets": "$data.user.statuses_count"
}
},
{
"$sort": { "followers": -1 }
},
{
"$limit" : 1
}
])
在 Robomongo 中执行此操作会得到结果
/* 0 */
{
"result" : [
{
"_id" : ObjectId("52fd2490bac3fa1975477702"),
"followers" : 2597,
"screen_name" : "marble",
"tweets" : 12334
}
],
"ok" : 1
}
在python中,实现应该基本相同:
>>> pipeline = [
... {"$match": {"user.statuses_count": {"$gte":100 }, "user.time_zone": "Brasilia"}},
... {"$group": {"_id": "$user.id","max_followers": { "$max": "$user.followers_count" },"data": { "$addToSet": "$$ROOT" }}},
... {"$unwind": "$data"},
... {"$project": {"_id": "$data._id","followers": "$max_followers","screen_name": "$data.user.screen_name","tweets": "$data.user.statuses_count"}},
... {"$sort": { "followers": -1 }},
... {"$limit" : 1}
... ]
>>>
>>> for doc in collection.aggregate(pipeline):
... print(doc)
...
{u'tweets': 12334.0, u'_id': ObjectId('52fd2490bac3fa1975477702'), u'followers': 2597.0, u'screen_name': u'marble'}
>>>
其中 pipeline 的定义为:
pipeline = [
{"$match": {"user.statuses_count": {"$gte":100 }, "user.time_zone": "Brasilia"}},
{"$group": {"_id": "$user.id","max_followers": { "$max": "$user.followers_count" },"data": { "$addToSet": "$$ROOT" }}},
{"$unwind": "$data"},
{"$project": {"_id": "$data._id","followers": "$max_followers","screen_name": "$data.user.screen_name","tweets": "$data.user.statuses_count"}},
{"$sort": { "followers": -1 }},
{"$limit" : 1}
]