mongo-db 中对 mapReduce 查询的错误响应
Incorrect response to mapReduce query in mongo-db
我的 collection 中有 1000 条用户记录,其中 459 条文档的性别为男性,其余为女性
//document structure
> db.user_details.find().pretty()
{
"_id" : ObjectId("557e610d626754910f0974a4"),
"id" : 0,
"name" : "Leanne Flinn",
"email" : "leanne.flinn@unilogic.com",
"work" : "Unilogic",
"dob" : "Fri Jun 11 1965 20:50:58 GMT+0530 (IST)",
"age" : 5,
"gender" : "female",
"salary" : 35696,
"hobbies" : "Acrobatics,Meditation,Music"
}
{
"_id" : ObjectId("557e610d626754910f0974a5"),
"id" : 1,
"name" : "Edward Young",
"email" : "edward.young@solexis.com",
"work" : "Solexis",
"dob" : "Wed Feb 12 1941 16:45:53 GMT+0530 (IST)",
"age" : 1,
"gender" : "female",
"salary" : 72291,
"hobbies" : "Acrobatics,Meditation,Music"
}
{
"_id" : ObjectId("557e610d626754910f0974a6"),
"id" : 2,
"name" : "Haydee Milligan",
"email" : "haydee.milligan@dalserve.com",
"work" : "Dalserve",
"dob" : "Tue Sep 13 1994 13:45:04 GMT+0530 (IST)",
"age" : 17,
"gender" : "male",
"salary" : 20026,
"hobbies" : "Papier-Mache"
}
{
"_id" : ObjectId("557e610d626754910f0974a7"),
"id" : 3,
"name" : "Lyle Keesee",
"email" : "lyle.keesee@terrasys.com",
"work" : "Terrasys",
"dob" : "Tue Apr 25 1922 13:39:46 GMT+0530 (IST)",
"age" : 79,
"gender" : "female",
"salary" : 48032,
"hobbies" : "Acrobatics,Meditation,Music"
}
{
"_id" : ObjectId("557e610d626754910f0974a8"),
"id" : 4,
"name" : "Shea Mercer",
"email" : "shea.mercer@pancast.com",
"work" : "Pancast",
"dob" : "Mon Apr 08 1935 06:10:30 GMT+0530 (IST)",
"age" : 51,
"gender" : "male",
"salary" : 31511,
"hobbies" : "Acrobatics,Photography,Papier-Mache"
}
每个性别的用户数量
> db.user_details.find({gender:'male'}).count()
459
>
> db.user_details.find({gender:'female'}).count()
541
> db.user_details.find({name:{$ne:null}}).count()
1000
> db.user_details.find({age:{$ne:null}}).count()
1000
Map reduce 代码
// Map step: reads fields from the current document (`this`) and emits its
// gender as the key, with a { name, age } object as the value.
mapper = function(){
emit(this.gender, {name:this.name,age:this.age})
}
// Reduce step: returns how many elements `users` contains for the given key.
// NOTE: the return value is a plain number, while the mapper emits
// { name, age } objects. If MongoDB passes an earlier reduce result back in as
// one element of `users`, the loop below counts it as 1, so the final value is
// not the total number of documents for the key.
reducer = function(gender, users){
var res = 0;
users.forEach(function(user){
// Every element adds exactly one, regardless of what it contains.
res = res + 1
})
return res;
}
db.user_details.mapReduce(mapper, reducer, {out: {inline:1}})
为什么 map reduce 的结果加起来只有 102 个文档(56 + 46)?它应该分别包含 459 个男性和 541 个女性,不是吗?
// Map reduce result
{
"results" : [
{
"_id" : "female",
"value" : 56
},
{
"_id" : "male",
"value" : 46
}
],
"timeMillis" : 45,
"counts" : {
"input" : 1000,
"emit" : 1000,
"reduce" : 20,
"output" : 2
},
"ok" : 1
}
注意:我知道这不是使用 map reduce 的正确方法,实际上我在 map reduce 中还遇到了一些更棘手的问题。一旦这个问题得到解答,我就可以解决那些问题了。
这可能是错误的。
users.forEach(function(user){
res = res + 1
})
试试这个,
// Reduce that adds up the values for a key with the mongo shell's Array.sum.
// NOTE(review): this yields a per-key count only if the map function emits
// numbers (e.g. 1 per document) — confirm against the mapper in use.
function(gender, users){
return Array.sum( users)
}
这里的问题是您忽略了 mapReduce 工作原理的核心概念之一。相关文档对此的解释如下:
- MongoDB can invoke the reduce function more than once for the same key. In this case, the previous output from the reduce function for that key will become one of the input values to the next reduce function invocation for that key.
然后稍后:
- the type of the return object must be identical to the type of the value emitted by the map function
这两句话的意思是:mapper 发出的值与 reducer 返回的值必须具有完全相同的结构(签名),因为 reduce 过程确实会被"多次"(multiple times)调用。
这就是 mapReduce 处理大数据的方式:它不一定一次处理完给定 "key" 的所有值,而是按 "chunks"(分块)增量处理:
因此,如果您在输出中想要的只是一个 "number",那么您 "emit" 也只是一个 "number":
// Total age per gender: the map step emits a bare number and the reduce step
// returns a number as well, so a partial result can safely be reduced again.
db.collection.mapReduce(
    function () {
        emit(this.gender, this.age);
    },
    function (gender, ages) {
        return Array.sum(ages);
    },
    { "out": { "inline": 1 } }
);
或每种类型 "count":
// Document count per gender: each document contributes 1, and summing those
// ones (or previously summed partial counts) gives the total for the key.
db.collection.mapReduce(
    function () {
        emit(this.gender, 1);
    },
    function (gender, counts) {
        return Array.sum(counts);
    },
    { "out": { "inline": 1 } }
);
重点是"输出的结构必须与输入的结构相同"(you need to put out the same as what you put in),因为输出很可能"再次作为输入"(go back in again)。因此,无论您想收集什么数据,mapper 发出的值和 reducer 返回的值的结构必须相同。
reduce 函数有错误。
MongoDB 可能会针对同一个 key 多次调用 reduce 函数,因此在您的 reduce 代码中,之前的结果被覆盖了。
同样,在 map 函数中,您发出的是结构为 { user, age } 的文档,但在 reduce 函数中,您返回的却是一个计数。
/**
 * Reduce step that accumulates a per-gender total of users and ages.
 *
 * MongoDB may call reduce several times for one key and pass an earlier
 * result back in as an element of `doc`, so the totals are built from the
 * `user` and `age` fields of each element rather than from `doc.length`.
 * The mapper should therefore emit values shaped like the return value,
 * e.g. emit(this.gender, { user: 1, age: this.age }).
 *
 * @param {string} gender - Key emitted by the mapper (not used in the body).
 * @param {Array<{user?: number, age: number}>} doc - Values emitted for the
 *     key and/or results of previous reduce calls for the same key.
 * @returns {{user: number, age: number}} User count and summed age.
 */
var reduce = function(gender, doc) {
    // Declared with var so the accumulator does not leak as a global.
    var reducedVal = { user: 0, age: 0 };
    for (var idx = 0; idx < doc.length; idx++) {
        var value = doc[idx];
        // A raw mapper value without a numeric `user` field stands for one user;
        // a previously reduced value carries the count it already accumulated.
        reducedVal.user += (typeof value.user === 'number') ? value.user : 1;
        reducedVal.age += value.age;
    }
    return reducedVal;
};
请同时检查以下 link:
http://thejackalofjavascript.com/mapreduce-in-mongodb/
下面是使用 mapReduce() 按性别统计用户数量的正确方法:
// Count users per gender and store the result in the "genderCount" collection.
db.yourCollectionName.mapReduce(
    function () {
        // One emitted 1 per document, keyed by gender.
        emit(this.gender, 1);
    },
    function (gender, counts) {
        return Array.sum(counts);
    },
    { out: "genderCount" }
);
// Read back the per-gender totals written by the job above.
db.genderCount.find();
我的 collection 中有 1000 条用户记录,其中 459 条文档的性别为男性,其余为女性
//document structure
> db.user_details.find().pretty()
{
"_id" : ObjectId("557e610d626754910f0974a4"),
"id" : 0,
"name" : "Leanne Flinn",
"email" : "leanne.flinn@unilogic.com",
"work" : "Unilogic",
"dob" : "Fri Jun 11 1965 20:50:58 GMT+0530 (IST)",
"age" : 5,
"gender" : "female",
"salary" : 35696,
"hobbies" : "Acrobatics,Meditation,Music"
}
{
"_id" : ObjectId("557e610d626754910f0974a5"),
"id" : 1,
"name" : "Edward Young",
"email" : "edward.young@solexis.com",
"work" : "Solexis",
"dob" : "Wed Feb 12 1941 16:45:53 GMT+0530 (IST)",
"age" : 1,
"gender" : "female",
"salary" : 72291,
"hobbies" : "Acrobatics,Meditation,Music"
}
{
"_id" : ObjectId("557e610d626754910f0974a6"),
"id" : 2,
"name" : "Haydee Milligan",
"email" : "haydee.milligan@dalserve.com",
"work" : "Dalserve",
"dob" : "Tue Sep 13 1994 13:45:04 GMT+0530 (IST)",
"age" : 17,
"gender" : "male",
"salary" : 20026,
"hobbies" : "Papier-Mache"
}
{
"_id" : ObjectId("557e610d626754910f0974a7"),
"id" : 3,
"name" : "Lyle Keesee",
"email" : "lyle.keesee@terrasys.com",
"work" : "Terrasys",
"dob" : "Tue Apr 25 1922 13:39:46 GMT+0530 (IST)",
"age" : 79,
"gender" : "female",
"salary" : 48032,
"hobbies" : "Acrobatics,Meditation,Music"
}
{
"_id" : ObjectId("557e610d626754910f0974a8"),
"id" : 4,
"name" : "Shea Mercer",
"email" : "shea.mercer@pancast.com",
"work" : "Pancast",
"dob" : "Mon Apr 08 1935 06:10:30 GMT+0530 (IST)",
"age" : 51,
"gender" : "male",
"salary" : 31511,
"hobbies" : "Acrobatics,Photography,Papier-Mache"
}
每个性别的用户数量
> db.user_details.find({gender:'male'}).count()
459
>
> db.user_details.find({gender:'female'}).count()
541
> db.user_details.find({name:{$ne:null}}).count()
1000
> db.user_details.find({age:{$ne:null}}).count()
1000
Map reduce 代码
// Map step: reads fields from the current document (`this`) and emits its
// gender as the key, with a { name, age } object as the value.
mapper = function(){
emit(this.gender, {name:this.name,age:this.age})
}
// Reduce step: returns how many elements `users` contains for the given key.
// NOTE: the return value is a plain number, while the mapper emits
// { name, age } objects. If MongoDB passes an earlier reduce result back in as
// one element of `users`, the loop below counts it as 1, so the final value is
// not the total number of documents for the key.
reducer = function(gender, users){
var res = 0;
users.forEach(function(user){
// Every element adds exactly one, regardless of what it contains.
res = res + 1
})
return res;
}
db.user_details.mapReduce(mapper, reducer, {out: {inline:1}})
为什么 map reduce 的结果加起来只有 102 个文档(56 + 46)?它应该分别包含 459 个男性和 541 个女性,不是吗?
// Map reduce result
{
"results" : [
{
"_id" : "female",
"value" : 56
},
{
"_id" : "male",
"value" : 46
}
],
"timeMillis" : 45,
"counts" : {
"input" : 1000,
"emit" : 1000,
"reduce" : 20,
"output" : 2
},
"ok" : 1
}
注意:我知道这不是使用 map reduce 的正确方法,实际上我在 map reduce 中还遇到了一些更棘手的问题。一旦这个问题得到解答,我就可以解决那些问题了。
这可能是错误的。
users.forEach(function(user){
res = res + 1
})
试试这个,
// Reduce that adds up the values for a key with the mongo shell's Array.sum.
// NOTE(review): this yields a per-key count only if the map function emits
// numbers (e.g. 1 per document) — confirm against the mapper in use.
function(gender, users){
return Array.sum( users)
}
这里的问题是您忽略了 mapReduce 工作原理的核心概念之一。相关文档对此的解释如下:
- MongoDB can invoke the reduce function more than once for the same key. In this case, the previous output from the reduce function for that key will become one of the input values to the next reduce function invocation for that key.
然后稍后:
- the type of the return object must be identical to the type of the value emitted by the map function
这两句话的意思是:mapper 发出的值与 reducer 返回的值必须具有完全相同的结构(签名),因为 reduce 过程确实会被"多次"(multiple times)调用。
这就是 mapReduce 处理大数据的方式:它不一定一次处理完给定 "key" 的所有值,而是按 "chunks"(分块)增量处理:
因此,如果您在输出中想要的只是一个 "number",那么您 "emit" 也只是一个 "number":
// Total age per gender: the map step emits a bare number and the reduce step
// returns a number as well, so a partial result can safely be reduced again.
db.collection.mapReduce(
    function () {
        emit(this.gender, this.age);
    },
    function (gender, ages) {
        return Array.sum(ages);
    },
    { "out": { "inline": 1 } }
);
或每种类型 "count":
// Document count per gender: each document contributes 1, and summing those
// ones (or previously summed partial counts) gives the total for the key.
db.collection.mapReduce(
    function () {
        emit(this.gender, 1);
    },
    function (gender, counts) {
        return Array.sum(counts);
    },
    { "out": { "inline": 1 } }
);
重点是"输出的结构必须与输入的结构相同"(you need to put out the same as what you put in),因为输出很可能"再次作为输入"(go back in again)。因此,无论您想收集什么数据,mapper 发出的值和 reducer 返回的值的结构必须相同。
reduce 函数有错误。
MongoDB 可能会针对同一个 key 多次调用 reduce 函数,因此在您的 reduce 代码中,之前的结果被覆盖了。
同样,在 map 函数中,您发出的是结构为 { user, age } 的文档,但在 reduce 函数中,您返回的却是一个计数。
/**
 * Reduce step that accumulates a per-gender total of users and ages.
 *
 * MongoDB may call reduce several times for one key and pass an earlier
 * result back in as an element of `doc`, so the totals are built from the
 * `user` and `age` fields of each element rather than from `doc.length`.
 * The mapper should therefore emit values shaped like the return value,
 * e.g. emit(this.gender, { user: 1, age: this.age }).
 *
 * @param {string} gender - Key emitted by the mapper (not used in the body).
 * @param {Array<{user?: number, age: number}>} doc - Values emitted for the
 *     key and/or results of previous reduce calls for the same key.
 * @returns {{user: number, age: number}} User count and summed age.
 */
var reduce = function(gender, doc) {
    // Declared with var so the accumulator does not leak as a global.
    var reducedVal = { user: 0, age: 0 };
    for (var idx = 0; idx < doc.length; idx++) {
        var value = doc[idx];
        // A raw mapper value without a numeric `user` field stands for one user;
        // a previously reduced value carries the count it already accumulated.
        reducedVal.user += (typeof value.user === 'number') ? value.user : 1;
        reducedVal.age += value.age;
    }
    return reducedVal;
};
请同时检查以下 link:
http://thejackalofjavascript.com/mapreduce-in-mongodb/
下面是使用 mapReduce() 按性别统计用户数量的正确方法:
// Count users per gender and store the result in the "genderCount" collection.
db.yourCollectionName.mapReduce(
    function () {
        // One emitted 1 per document, keyed by gender.
        emit(this.gender, 1);
    },
    function (gender, counts) {
        return Array.sum(counts);
    },
    { out: "genderCount" }
);
// Read back the per-gender totals written by the job above.
db.genderCount.find();