返回在两个日期之间创建的 GCP 对象
Return gcp objects created between two dates
这是 python 查找在两个日期之间创建的 blob 存储对象的最有效方法吗?
def objects_in_date_range(store, start_date, end_date):
    """Yield the names of blobs in a bucket created within a date window.

    The window is half-open: a blob is reported when
    ``start_date <= blob.time_created < end_date``.

    Args:
        store: Bucket identifier handed to ``storage.Client.get_bucket``.
        start_date: Inclusive lower bound on the creation time.
        end_date: Exclusive upper bound on the creation time.

    Yields:
        The ``name`` of every listed blob whose ``time_created`` falls
        inside the window, in the order the listing returns them.
    """
    client = storage.Client()
    blobs = client.list_blobs(client.get_bucket(store))
    for blob in blobs:
        name = blob.name
        created = blob.time_created
        # NOTE(review): the bounds must be comparable with
        # ``blob.time_created`` (presumably timezone-aware) - confirm at callers.
        in_window = created >= start_date and created < end_date
        if in_window:
            yield name
我用一个随机打乱的列表测试了 5 种不同的写法。
TL;DR:最多只能通过去掉中间变量 name = blob.name
和 created = blob.time_created
获得一点提速。除此之外,你的代码已经足够快了。
import random
import time
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import List
@dataclass
class FakeBlob:
    """Lightweight stand-in for a storage blob, used by the benchmark below."""

    # Blob name; this is the value the benchmark functions report.
    name: str
    # Creation timestamp that the benchmark functions compare to the window.
    time_created: datetime
def func01(bucket_list: List[FakeBlob], start_date: datetime, end_date: datetime):
    """Baseline variant: copy both attributes to locals, compare with ``and``.

    Yields the ``name`` of each blob whose ``time_created`` lies in the
    half-open window ``[start_date, end_date)``, in input order.
    """
    for entry in bucket_list:
        name = entry.name
        created = entry.time_created
        # Two separate comparisons joined by ``and`` - this is the form
        # being measured, so it is kept as-is.
        if created >= start_date and created < end_date:
            yield name
def func02(bucket_list: List[FakeBlob], start_date: datetime, end_date: datetime):
    """Variant 2: copy both attributes to locals, use a chained comparison.

    Yields the ``name`` of each blob whose ``time_created`` lies in the
    half-open window ``[start_date, end_date)``, in input order.
    """
    for entry in bucket_list:
        name = entry.name
        created = entry.time_created
        # Single chained comparison - the form being measured here.
        if start_date <= created < end_date:
            yield name
def func03(bucket_list: List[FakeBlob], start_date: datetime, end_date: datetime):
    """Variant 3: no local copies; read the blob attributes directly.

    Yields the ``name`` of each blob whose ``time_created`` lies in the
    half-open window ``[start_date, end_date)``, in input order.
    """
    for entry in bucket_list:
        # Attributes are accessed in place - the form being measured here.
        if start_date <= entry.time_created < end_date:
            yield entry.name
def func04(bucket_list: List[FakeBlob], start_date: datetime, end_date: datetime):
    """Variant 4: build the whole result eagerly with a list comprehension.

    Returns a list of the ``name`` of each blob whose ``time_created`` lies
    in the half-open window ``[start_date, end_date)``, in input order.
    """
    return [
        entry.name
        for entry in bucket_list
        if start_date <= entry.time_created < end_date
    ]
def func05(bucket_list: List[FakeBlob], start_date: datetime, end_date: datetime):
    """Variant 5: select blobs with ``filter`` and a lambda predicate.

    Yields the ``name`` of each blob whose ``time_created`` lies in the
    half-open window ``[start_date, end_date)``, in input order.

    The name (not the blob object) is yielded so that this variant produces
    the same output as func01-func04 and the timings compare like with like.
    """
    matches = filter(
        lambda blob: start_date <= blob.time_created < end_date,
        bucket_list,
    )
    for blob in matches:
        yield blob.name
# Benchmark driver: build a large shuffled list of fake blobs whose creation
# times are one hour apart, then time each candidate on the same window.
NUM_BLOBS = 10000000

# NOTE: datetime.utcnow() is deprecated since Python 3.12. A naive timestamp
# is sufficient here because every value below is derived from this one.
now = datetime.utcnow()
fake_blob_list = [
    FakeBlob(name=f"foo{i}", time_created=now + timedelta(hours=i))
    for i in range(NUM_BLOBS)
]
random.shuffle(fake_blob_list)

# Compute the window once, outside the timed region.
window_start = now + timedelta(hours=4000000)
window_end = now + timedelta(hours=6000000)

for func in [func01, func02, func03, func04, func05]:
    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    start_time = time.perf_counter()
    # list() drains generators and copies the list func04 returns.
    list(func(fake_blob_list, window_start, window_end))
    print(f"{func.__name__} - {time.perf_counter() - start_time} seconds")
第 1 次尝试,10000000 个元素:
func01 - 3.8724558353424072 seconds
func02 - 3.9978816509246826 seconds
func03 - 3.567375421524048 seconds
func04 - 3.6989762783050537 seconds
func05 - 4.644185304641724 seconds
第 2 次尝试,10000000 个元素:
func01 - 3.899850606918335 seconds
func02 - 4.075393199920654 seconds
func03 - 3.6028025150299072 seconds
func04 - 3.732541084289551 seconds
func05 - 4.7251410484313965 seconds
用较少数量的元素列表进行的测试得到了类似的结果。
这是 python 查找在两个日期之间创建的 blob 存储对象的最有效方法吗?
def objects_in_date_range(store, start_date, end_date):
    """Yield the names of blobs in a bucket created within a date window.

    The window is half-open: a blob is reported when
    ``start_date <= blob.time_created < end_date``.

    Args:
        store: Bucket identifier handed to ``storage.Client.get_bucket``.
        start_date: Inclusive lower bound on the creation time.
        end_date: Exclusive upper bound on the creation time.

    Yields:
        The ``name`` of every listed blob whose ``time_created`` falls
        inside the window, in the order the listing returns them.
    """
    client = storage.Client()
    blobs = client.list_blobs(client.get_bucket(store))
    for blob in blobs:
        name = blob.name
        created = blob.time_created
        # NOTE(review): the bounds must be comparable with
        # ``blob.time_created`` (presumably timezone-aware) - confirm at callers.
        in_window = created >= start_date and created < end_date
        if in_window:
            yield name
我用一个随机打乱的列表测试了 5 种不同的写法。
TL;DR:最多只能通过去掉中间变量 name = blob.name
和 created = blob.time_created
获得一点提速。除此之外,你的代码已经足够快了。
import random
import time
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import List
@dataclass
class FakeBlob:
    """Lightweight stand-in for a storage blob, used by the benchmark below."""

    # Blob name; this is the value the benchmark functions report.
    name: str
    # Creation timestamp that the benchmark functions compare to the window.
    time_created: datetime
def func01(bucket_list: List[FakeBlob], start_date: datetime, end_date: datetime):
    """Baseline variant: copy both attributes to locals, compare with ``and``.

    Yields the ``name`` of each blob whose ``time_created`` lies in the
    half-open window ``[start_date, end_date)``, in input order.
    """
    for entry in bucket_list:
        name = entry.name
        created = entry.time_created
        # Two separate comparisons joined by ``and`` - this is the form
        # being measured, so it is kept as-is.
        if created >= start_date and created < end_date:
            yield name
def func02(bucket_list: List[FakeBlob], start_date: datetime, end_date: datetime):
    """Variant 2: copy both attributes to locals, use a chained comparison.

    Yields the ``name`` of each blob whose ``time_created`` lies in the
    half-open window ``[start_date, end_date)``, in input order.
    """
    for entry in bucket_list:
        name = entry.name
        created = entry.time_created
        # Single chained comparison - the form being measured here.
        if start_date <= created < end_date:
            yield name
def func03(bucket_list: List[FakeBlob], start_date: datetime, end_date: datetime):
    """Variant 3: no local copies; read the blob attributes directly.

    Yields the ``name`` of each blob whose ``time_created`` lies in the
    half-open window ``[start_date, end_date)``, in input order.
    """
    for entry in bucket_list:
        # Attributes are accessed in place - the form being measured here.
        if start_date <= entry.time_created < end_date:
            yield entry.name
def func04(bucket_list: List[FakeBlob], start_date: datetime, end_date: datetime):
    """Variant 4: build the whole result eagerly with a list comprehension.

    Returns a list of the ``name`` of each blob whose ``time_created`` lies
    in the half-open window ``[start_date, end_date)``, in input order.
    """
    return [
        entry.name
        for entry in bucket_list
        if start_date <= entry.time_created < end_date
    ]
def func05(bucket_list: List[FakeBlob], start_date: datetime, end_date: datetime):
    """Variant 5: select blobs with ``filter`` and a lambda predicate.

    Yields the ``name`` of each blob whose ``time_created`` lies in the
    half-open window ``[start_date, end_date)``, in input order.

    The name (not the blob object) is yielded so that this variant produces
    the same output as func01-func04 and the timings compare like with like.
    """
    matches = filter(
        lambda blob: start_date <= blob.time_created < end_date,
        bucket_list,
    )
    for blob in matches:
        yield blob.name
# Benchmark driver: build a large shuffled list of fake blobs whose creation
# times are one hour apart, then time each candidate on the same window.
NUM_BLOBS = 10000000

# NOTE: datetime.utcnow() is deprecated since Python 3.12. A naive timestamp
# is sufficient here because every value below is derived from this one.
now = datetime.utcnow()
fake_blob_list = [
    FakeBlob(name=f"foo{i}", time_created=now + timedelta(hours=i))
    for i in range(NUM_BLOBS)
]
random.shuffle(fake_blob_list)

# Compute the window once, outside the timed region.
window_start = now + timedelta(hours=4000000)
window_end = now + timedelta(hours=6000000)

for func in [func01, func02, func03, func04, func05]:
    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    start_time = time.perf_counter()
    # list() drains generators and copies the list func04 returns.
    list(func(fake_blob_list, window_start, window_end))
    print(f"{func.__name__} - {time.perf_counter() - start_time} seconds")
第 1 次尝试,10000000 个元素:
func01 - 3.8724558353424072 seconds
func02 - 3.9978816509246826 seconds
func03 - 3.567375421524048 seconds
func04 - 3.6989762783050537 seconds
func05 - 4.644185304641724 seconds
第 2 次尝试,10000000 个元素:
func01 - 3.899850606918335 seconds
func02 - 4.075393199920654 seconds
func03 - 3.6028025150299072 seconds
func04 - 3.732541084289551 seconds
func05 - 4.7251410484313965 seconds
用较少数量的元素列表进行的测试得到了类似的结果。