pymongo 在大 collection 上更新相同的字段值
pymongo update same field value on large collection
我的 mongo collection 中有超过 1000 万个文档。其中大约 600 万个文档的 variants.image_url 字段缺少图像文件扩展名 .jpg,我想给这个字段补上缺失的扩展名 .jpg。
我先运行 find 查询找出所有这些文档,然后用 update 查询逐个更新,但是这样很慢。我该如何优化它?
示例:
{ "variants" : [ { "image_url" : "http://assets.myassets.com/assets/images/2020/3/5/158642332113146/Arrow-ShirtsFossil-Smart-WatchesLee-Cooper-Formal-ShoesRoadster-Jeans" } ] }
将更改为(在末尾补上 .jpg):
{ "variants" : [ { "image_url" : "http://assets.myassets.com/assets/images/2020/3/5/158642332113146/Arrow-ShirtsFossil-Smart-WatchesLee-Cooper-Formal-ShoesRoadster-Jeans.jpg" } ] }
# query through all where verion in not v2 and return only variants.image_url
cursor = collection.find({"version": {"$ne": "v2"}}, {"variants.image_url": 1, "_id": 0})
modified_count = 0
for record in cursor:
modified_count = modified_count + update_image_url(record)
return modified_count
def update_image_url(record):
    """Append ".jpg" to each ``variants[i].image_url`` of *record* that lacks it.

    Every variant whose URL does not already end in ".jpg" / ".JPG" is
    collected into a single ``$set`` and written with one ``update_one`` call,
    so later variants are no longer skipped after the first one is handled.

    Relies on a ``collection`` object (a PyMongo collection) being in scope.

    Args:
        record: A projected document from the caller's ``find``; expected to
            look like ``{"variants": [{"image_url": ...}, ...]}``.

    Returns:
        The number of documents modified by the update, or 0 when no URL
        needed the extension (never ``None``, so callers can sum results).
    """
    pending = {}
    match_url = None
    for idx, variant in enumerate(record.get("variants") or []):
        url = variant.get("image_url")
        if not isinstance(url, str):
            # Missing or non-text URL: nothing sensible to append to.
            continue
        if url.endswith((".jpg", ".JPG")):
            print(".jpg or .JPG extension present. skipping")
            continue
        if match_url is None:
            match_url = url
        pending["variants." + str(idx) + ".image_url"] = url + ".jpg"

    if not pending:
        return 0

    # The caller's projection excludes _id, so the document is located by one
    # of its current URLs.
    # NOTE(review): unless variants.image_url is indexed this filter cannot
    # use an index, and a duplicate URL in another document could be matched
    # instead - confirm URLs are unique or filter on _id.
    query = {"variants.image_url": {"$eq": match_url}}
    # Collection.update() is deprecated and was removed in PyMongo 4;
    # update_one() is the single-document replacement.
    update_result = collection.update_one(query, {"$set": pending})
    print(update_result.modified_count, "nModified documents updated.")
    return update_result.modified_count
我通过优化第二部分(即 update 查询)解决了这个问题,把每次更新的查找降到了约 O(1) 时间。我不再按 variants.image_url 查询,而是按 _id 查询,因为 _id 字段默认带有索引,查找几乎是常数时间。
# return _id as well
cursor = collection.find({"version": {"$ne": "v2"}}, {"variants.image_url": 1})
modified_count = 0
for record in cursor:
modified_count = modified_count + update_image_url(record)
return modified_count
def update_image_url(record):
    """Append ".jpg" to each ``variants[i].image_url`` of *record* that lacks it.

    Only the ``variants`` array is inspected; the record's ``_id`` is used
    solely as the update filter. (Iterating over every key of the record, as
    before, would try to enumerate the ``_id`` value and fail.) All variants
    needing the suffix are written with a single ``update_one`` call.

    Relies on a ``collection`` object (a PyMongo collection) being in scope.

    Args:
        record: A projected document from the caller's ``find``; expected to
            look like ``{"_id": ..., "variants": [{"image_url": ...}, ...]}``.

    Returns:
        The number of documents modified by the update, or 0 when no URL
        needed the extension (never ``None``, so callers can sum results).

    Raises:
        KeyError: If an update is needed but *record* has no ``"_id"`` key.
    """
    pending = {}
    for idx, variant in enumerate(record.get("variants") or []):
        url = variant.get("image_url")
        if not isinstance(url, str):
            # Missing or non-text URL: nothing sensible to append to.
            continue
        if url.endswith((".jpg", ".JPG")):
            print(".jpg or .JPG extension present. skipping")
            continue
        pending["variants." + str(idx) + ".image_url"] = url + ".jpg"

    if not pending:
        return 0

    # Filter on _id so the lookup can use the _id index instead of matching
    # on the URL text.
    query = {"_id": {"$eq": record["_id"]}}
    # Collection.update() is deprecated and was removed in PyMongo 4;
    # update_one() is the single-document replacement.
    update_result = collection.update_one(query, {"$set": pending})
    print(update_result.modified_count, "nModified documents updated.")
    return update_result.modified_count
我的 mongo collection 中有超过 1000 万个文档。其中大约 600 万个文档的 variants.image_url 字段缺少图像文件扩展名 .jpg,我想给这个字段补上缺失的扩展名 .jpg。
我先运行 find 查询找出所有这些文档,然后用 update 查询逐个更新,但是这样很慢。我该如何优化它?
示例:
{ "variants" : [ { "image_url" : "http://assets.myassets.com/assets/images/2020/3/5/158642332113146/Arrow-ShirtsFossil-Smart-WatchesLee-Cooper-Formal-ShoesRoadster-Jeans" } ] }
将更改为(在末尾补上 .jpg):
{ "variants" : [ { "image_url" : "http://assets.myassets.com/assets/images/2020/3/5/158642332113146/Arrow-ShirtsFossil-Smart-WatchesLee-Cooper-Formal-ShoesRoadster-Jeans.jpg" } ] }
# query through all where verion in not v2 and return only variants.image_url
cursor = collection.find({"version": {"$ne": "v2"}}, {"variants.image_url": 1, "_id": 0})
modified_count = 0
for record in cursor:
modified_count = modified_count + update_image_url(record)
return modified_count
def update_image_url(record):
    """Append ".jpg" to each ``variants[i].image_url`` of *record* that lacks it.

    Every variant whose URL does not already end in ".jpg" / ".JPG" is
    collected into a single ``$set`` and written with one ``update_one`` call,
    so later variants are no longer skipped after the first one is handled.

    Relies on a ``collection`` object (a PyMongo collection) being in scope.

    Args:
        record: A projected document from the caller's ``find``; expected to
            look like ``{"variants": [{"image_url": ...}, ...]}``.

    Returns:
        The number of documents modified by the update, or 0 when no URL
        needed the extension (never ``None``, so callers can sum results).
    """
    pending = {}
    match_url = None
    for idx, variant in enumerate(record.get("variants") or []):
        url = variant.get("image_url")
        if not isinstance(url, str):
            # Missing or non-text URL: nothing sensible to append to.
            continue
        if url.endswith((".jpg", ".JPG")):
            print(".jpg or .JPG extension present. skipping")
            continue
        if match_url is None:
            match_url = url
        pending["variants." + str(idx) + ".image_url"] = url + ".jpg"

    if not pending:
        return 0

    # The caller's projection excludes _id, so the document is located by one
    # of its current URLs.
    # NOTE(review): unless variants.image_url is indexed this filter cannot
    # use an index, and a duplicate URL in another document could be matched
    # instead - confirm URLs are unique or filter on _id.
    query = {"variants.image_url": {"$eq": match_url}}
    # Collection.update() is deprecated and was removed in PyMongo 4;
    # update_one() is the single-document replacement.
    update_result = collection.update_one(query, {"$set": pending})
    print(update_result.modified_count, "nModified documents updated.")
    return update_result.modified_count
我通过优化第二部分(即 update 查询)解决了这个问题,把每次更新的查找降到了约 O(1) 时间。我不再按 variants.image_url 查询,而是按 _id 查询,因为 _id 字段默认带有索引,查找几乎是常数时间。
# return _id as well
cursor = collection.find({"version": {"$ne": "v2"}}, {"variants.image_url": 1})
modified_count = 0
for record in cursor:
modified_count = modified_count + update_image_url(record)
return modified_count
def update_image_url(record):
    """Append ".jpg" to each ``variants[i].image_url`` of *record* that lacks it.

    Only the ``variants`` array is inspected; the record's ``_id`` is used
    solely as the update filter. (Iterating over every key of the record, as
    before, would try to enumerate the ``_id`` value and fail.) All variants
    needing the suffix are written with a single ``update_one`` call.

    Relies on a ``collection`` object (a PyMongo collection) being in scope.

    Args:
        record: A projected document from the caller's ``find``; expected to
            look like ``{"_id": ..., "variants": [{"image_url": ...}, ...]}``.

    Returns:
        The number of documents modified by the update, or 0 when no URL
        needed the extension (never ``None``, so callers can sum results).

    Raises:
        KeyError: If an update is needed but *record* has no ``"_id"`` key.
    """
    pending = {}
    for idx, variant in enumerate(record.get("variants") or []):
        url = variant.get("image_url")
        if not isinstance(url, str):
            # Missing or non-text URL: nothing sensible to append to.
            continue
        if url.endswith((".jpg", ".JPG")):
            print(".jpg or .JPG extension present. skipping")
            continue
        pending["variants." + str(idx) + ".image_url"] = url + ".jpg"

    if not pending:
        return 0

    # Filter on _id so the lookup can use the _id index instead of matching
    # on the URL text.
    query = {"_id": {"$eq": record["_id"]}}
    # Collection.update() is deprecated and was removed in PyMongo 4;
    # update_one() is the single-document replacement.
    update_result = collection.update_one(query, {"$set": pending})
    print(update_result.modified_count, "nModified documents updated.")
    return update_result.modified_count