绕过 16MB BSON 限制以删除多个文档
Workaround 16MB BSON limit to delete multiple documents
我有这样的 MongoDB 数据
请看最后一个字段 time:如您所见,其中有一些"重复"数据,已用颜色标记。
对于小型数据库,我可以使用以下代码删除重复值
// De-duplicate the "light" collection: keep one document per group and
// remove every other document.
//
// Step 1: group documents that share index/unit/min/max/node and whose
// "time" falls in the same minute (year + dayOfYear + hour + minute).
var cursor = db.getCollection("light").aggregate([
{$group : {
"_id": {
index: "$index",
unit: "$unit",
min: "$min",
max: "$max",
node: "$node",
year: { "$year": "$time" },
dayOfYear: { "$dayOfYear": "$time" },
hour: { "$hour": "$time" },
minute: { "$minute": "$time" }
},
// The _id of the last document seen in each group is the one to keep.
// NOTE(review): there is no $sort before this stage, so which document
// counts as "last" is presumably not guaranteed — confirm.
_id_not_delete: { $last: "$_id" }
}}
],
{
// Let the aggregation use temporary files on disk.
"allowDiskUse" : true
}
)
// Step 2: collect every _id to keep (one per group) into a single list.
var ids_not_delete = cursor.map(function (doc) { return doc._id_not_delete; });
// Step 3: remove every document whose _id is NOT in the keep-list.
// The whole keep-list is embedded in this one query document, so the query
// itself is subject to the 16 MB maximum BSON document size.
db.getCollection("light").remove({"_id": { "$nin": ids_not_delete }});
但是我的数据库有超过2000万条记录,所以我收到这个错误
E QUERY [js] Error: Converting from JavaScript to BSON failed: Object size 23146644 exceeds limit of 16793600 bytes. :
Bulk/addToOperationsList@src/mongo/shell/bulk_api.js:611:28
Bulk/findOperations.remove@src/mongo/shell/bulk_api.js:743:24
DBCollection.prototype.remove@src/mongo/shell/collection.js:404:13
@(shell):1:1
我知道根本原因是
The maximum BSON document size is 16 megabytes
我想我应该更改下面的代码,但我没有任何好的解决方案。
var ids_not_delete = cursor.map(function (doc) { return doc._id_not_delete; });
您有什么想法可以优化我的代码吗?
集合中的示例文档:
{
"_id" : ObjectId("5be22d5808c08300545effee"),
"index" : "LIGHT",
"unit" : "LUX",
"min" : NumberInt(5),
"max" : NumberInt(6),
"avg" : 5.5,
"node" : "TH",
"time" : ISODate("2018-11-07T00:10:00.091+0000")
},
{
"_id" : ObjectId("5be22b0052122e0047c3467c"),
"index" : "LIGHT",
"unit" : "LUX",
"min" : NumberInt(3),
"max" : NumberInt(5),
"avg" : NumberInt(4),
"node" : "TH",
"time" : ISODate("2018-11-07T00:00:00.204+0000")
},
{
"_id" : ObjectId("5be22b0008c08300545eff79"),
"index" : "LIGHT",
"unit" : "LUX",
"min" : NumberInt(3),
"max" : NumberInt(5),
"avg" : NumberInt(4),
"node" : "TH",
"time" : ISODate("2018-11-07T00:00:00.081+0000")
}
MongoDB shell 版本 v4.0.2
MongoDB 4.0.0
您可以把聚合反过来:让它选出要删除的 ID,而不是要保留的 ID:
// Build one { deleteOne: { filter: { _id: ... } } } operation per duplicate
// document. Each operation is a tiny document of its own, so no single
// query document has to carry the whole id list (unlike one big "$nin").
const toDelete = db.getCollection("light").aggregate(
    [
        // Group documents that share index/unit/min/max/node and whose
        // "time" falls in the same minute.
        {
            $group: {
                _id: {
                    index: "$index",
                    unit: "$unit",
                    min: "$min",
                    max: "$max",
                    node: "$node",
                    year: { $year: "$time" },
                    dayOfYear: { $dayOfYear: "$time" },
                    hour: { $hour: "$time" },
                    minute: { $minute: "$time" }
                },
                // Collect the _id of every document in the group.
                ids: { $push: "$_id" }
            }
        },
        // Skip the first _id of each group (that document is kept) and take
        // up to 10000 of the remaining ones; 10000 is only an upper bound on
        // the number of duplicates per group. Groups with a single document
        // yield an empty array here.
        { $project: { _id: { $slice: ["$ids", 1, 10000] } } },
        // One output document per _id to delete; groups with an empty array
        // produce no output.
        { $unwind: "$_id" },
        // Reshape each _id into the operation format bulkWrite expects.
        { $project: { _id: 0, deleteOne: { filter: { _id: "$_id" } } } }
    ],
    // Grouping the whole collection can exceed the in-memory limit of a
    // pipeline stage; allow it to spill to temporary files on disk.
    { allowDiskUse: true }
).toArray();
这里的 10,000 是任何足够大的数字,明显大于组内重复项的预期数量。
那么你可以使用bulkWrite:
// Run the collected delete operations. Skip the call when there is nothing
// to delete: an empty operations list is rejected by some shell/driver
// versions (NOTE(review): confirm for the shell version in use).
if (toDelete.length > 0) {
    // Every operation targets a distinct _id, so execution order does not
    // matter; unordered execution keeps going past an individual failure.
    db.getCollection("light").bulkWrite(toDelete, { ordered: false });
}
驱动程序会把数组拆分成多个批次,每批最多包含 100,000 个删除操作。
我有这样的 MongoDB 数据
请看最后一个字段 time:如您所见,其中有一些"重复"数据,已用颜色标记。
对于小型数据库,我可以使用以下代码删除重复值
// De-duplicate the "light" collection: keep one document per group and
// remove every other document.
//
// Step 1: group documents that share index/unit/min/max/node and whose
// "time" falls in the same minute (year + dayOfYear + hour + minute).
var cursor = db.getCollection("light").aggregate([
{$group : {
"_id": {
index: "$index",
unit: "$unit",
min: "$min",
max: "$max",
node: "$node",
year: { "$year": "$time" },
dayOfYear: { "$dayOfYear": "$time" },
hour: { "$hour": "$time" },
minute: { "$minute": "$time" }
},
// The _id of the last document seen in each group is the one to keep.
// NOTE(review): there is no $sort before this stage, so which document
// counts as "last" is presumably not guaranteed — confirm.
_id_not_delete: { $last: "$_id" }
}}
],
{
// Let the aggregation use temporary files on disk.
"allowDiskUse" : true
}
)
// Step 2: collect every _id to keep (one per group) into a single list.
var ids_not_delete = cursor.map(function (doc) { return doc._id_not_delete; });
// Step 3: remove every document whose _id is NOT in the keep-list.
// The whole keep-list is embedded in this one query document, so the query
// itself is subject to the 16 MB maximum BSON document size.
db.getCollection("light").remove({"_id": { "$nin": ids_not_delete }});
但是我的数据库有超过2000万条记录,所以我收到这个错误
E QUERY [js] Error: Converting from JavaScript to BSON failed: Object size 23146644 exceeds limit of 16793600 bytes. :
Bulk/addToOperationsList@src/mongo/shell/bulk_api.js:611:28
Bulk/findOperations.remove@src/mongo/shell/bulk_api.js:743:24
DBCollection.prototype.remove@src/mongo/shell/collection.js:404:13
@(shell):1:1
我知道根本原因是
The maximum BSON document size is 16 megabytes
我想我应该更改下面的代码,但我没有任何好的解决方案。
var ids_not_delete = cursor.map(function (doc) { return doc._id_not_delete; });
您有什么想法可以优化我的代码吗?
集合中的示例文档:
{
"_id" : ObjectId("5be22d5808c08300545effee"),
"index" : "LIGHT",
"unit" : "LUX",
"min" : NumberInt(5),
"max" : NumberInt(6),
"avg" : 5.5,
"node" : "TH",
"time" : ISODate("2018-11-07T00:10:00.091+0000")
},
{
"_id" : ObjectId("5be22b0052122e0047c3467c"),
"index" : "LIGHT",
"unit" : "LUX",
"min" : NumberInt(3),
"max" : NumberInt(5),
"avg" : NumberInt(4),
"node" : "TH",
"time" : ISODate("2018-11-07T00:00:00.204+0000")
},
{
"_id" : ObjectId("5be22b0008c08300545eff79"),
"index" : "LIGHT",
"unit" : "LUX",
"min" : NumberInt(3),
"max" : NumberInt(5),
"avg" : NumberInt(4),
"node" : "TH",
"time" : ISODate("2018-11-07T00:00:00.081+0000")
}
MongoDB shell 版本 v4.0.2
MongoDB 4.0.0
您可以把聚合反过来:让它选出要删除的 ID,而不是要保留的 ID:
// Build one { deleteOne: { filter: { _id: ... } } } operation per duplicate
// document. Each operation is a tiny document of its own, so no single
// query document has to carry the whole id list (unlike one big "$nin").
const toDelete = db.getCollection("light").aggregate(
    [
        // Group documents that share index/unit/min/max/node and whose
        // "time" falls in the same minute.
        {
            $group: {
                _id: {
                    index: "$index",
                    unit: "$unit",
                    min: "$min",
                    max: "$max",
                    node: "$node",
                    year: { $year: "$time" },
                    dayOfYear: { $dayOfYear: "$time" },
                    hour: { $hour: "$time" },
                    minute: { $minute: "$time" }
                },
                // Collect the _id of every document in the group.
                ids: { $push: "$_id" }
            }
        },
        // Skip the first _id of each group (that document is kept) and take
        // up to 10000 of the remaining ones; 10000 is only an upper bound on
        // the number of duplicates per group. Groups with a single document
        // yield an empty array here.
        { $project: { _id: { $slice: ["$ids", 1, 10000] } } },
        // One output document per _id to delete; groups with an empty array
        // produce no output.
        { $unwind: "$_id" },
        // Reshape each _id into the operation format bulkWrite expects.
        { $project: { _id: 0, deleteOne: { filter: { _id: "$_id" } } } }
    ],
    // Grouping the whole collection can exceed the in-memory limit of a
    // pipeline stage; allow it to spill to temporary files on disk.
    { allowDiskUse: true }
).toArray();
这里的 10,000 是任何足够大的数字,明显大于组内重复项的预期数量。
那么你可以使用bulkWrite:
// Run the collected delete operations. Skip the call when there is nothing
// to delete: an empty operations list is rejected by some shell/driver
// versions (NOTE(review): confirm for the shell version in use).
if (toDelete.length > 0) {
    // Every operation targets a distinct _id, so execution order does not
    // matter; unordered execution keeps going past an individual failure.
    db.getCollection("light").bulkWrite(toDelete, { ordered: false });
}
驱动程序会把数组拆分成多个批次,每批最多包含 100,000 个删除操作。