使用 peewee 在 postgres 中迭代超过 1k+ 行时的开销
Overhead when iterating over 1k+ rows in postgres using peewee
在遍历 postgres table 时,我看到莫名其妙的大开销。
我分析了代码,还使用 SQLAlchemy
进行了冒烟测试,以确保它不是慢速连接或底层驱动程序 (psycopg2
)。
针对一个约有 100 万条记录的 postgres table 运行以下脚本,但每次只获取其中的一小部分。
import time
import peewee
import sqlalchemy
from playhouse import postgres_ext
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.engine.url import URL as AlchemyURL
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker as alchemy_sessionmaker
# Connection settings (redacted in the original post).
user = 'XXX'
password = 'XXX'
database = 'XXX'
host = 'XXX'
port = 5432
# Table to benchmark against and number of rows each query fetches.
table = 'person'
limit = 1000
# Peewee database handle. Server-side cursors are enabled — that is the
# code path being benchmarked; hstore registration is skipped as unused.
peewee_db = postgres_ext.PostgresqlExtDatabase(
    database=database,
    host=host, port=port,
    user=user, password=password,
    use_speedups=True,
    server_side_cursors=True,
    register_hstore=False,
)
# Equivalent SQLAlchemy engine + session against the same database,
# used as the comparison baseline.
alchemy_engine = sqlalchemy.create_engine(AlchemyURL('postgresql', username=user, password=password,
                                                     database=database, host=host, port=port))
alchemy_session = alchemy_sessionmaker(bind=alchemy_engine)()
# Peewee ORM mapping of the `person` table.
class PeeweePerson(peewee.Model):
    class Meta:
        database = peewee_db
        db_table = table
    # Primary key is a 64-char string here; the SQLAlchemy model declares
    # it as Integer — NOTE(review): one of the two is presumably wrong,
    # confirm against the actual schema.
    id = peewee.CharField(primary_key=True, max_length=64)
    # JSONB payload column with a GIN index.
    data = postgres_ext.BinaryJSONField(index=True, index_type='GIN')
# SQLAlchemy ORM mapping of the same `person` table.
class SQLAlchemyPerson(declarative_base()):
    __tablename__ = table
    # NOTE(review): declared Integer here but CharField(max_length=64) in
    # the peewee model — verify which matches the real schema.
    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    data = sqlalchemy.Column(JSONB)
def run_raw_query():
    """Fetch the newest `limit` ids via a raw SQL cursor (baseline timing).

    Returns:
        list: cursor rows, each a 1-tuple containing an id.
    """
    # Pass LIMIT as a bind parameter instead of interpolating it into the
    # SQL text. The table name cannot be a bind parameter, but it is a
    # module-level constant, not untrusted input.
    cursor = peewee_db.execute_sql(
        f"SELECT id FROM {table} ORDER BY id DESC LIMIT %s", (limit,))
    return list(cursor)
def run_peewee_query():
    """Fetch the newest `limit` ids through the peewee ORM as tuples."""
    newest_first = (PeeweePerson
                    .select(PeeweePerson.id)
                    .order_by(PeeweePerson.id.desc())
                    .limit(limit))
    # .tuples() yields plain tuples instead of model instances.
    return [row for row in newest_first.tuples()]
def run_sqlalchemy_query():
    """Fetch the newest `limit` ids through the SQLAlchemy ORM."""
    ordering = sqlalchemy.desc(SQLAlchemyPerson.id)
    rows = (alchemy_session
            .query(SQLAlchemyPerson.id)
            .order_by(ordering)
            .limit(limit))
    return [row for row in rows]
if __name__ == '__main__':
t0 = time.time()
raw_result = run_raw_query()
t1 = time.time()
print(f'Raw: {t1 - t0}')
t2 = time.time()
sqlalchemy_result = run_sqlalchemy_query()
t3 = time.time()
print(f'SQLAlchemy: {t3 - t2}')
t4 = time.time()
peewee_result = run_peewee_query()
t5 = time.time()
print(f'peewee: {t5 - t4}')
assert raw_result == sqlalchemy_result == peewee_result
limit = 1000:
Raw: 0.02643609046936035
SQLAlchemy: 0.03697466850280762
peewee: 1.0509874820709229
limit = 10000:
Raw: 0.15931344032287598
SQLAlchemy: 0.07229042053222656
peewee: 10.82826042175293
两个示例都使用服务器端游标。
我对此做了简单的性能分析,看起来 95% 以上的时间都花在了调用 cursor.fetchone 上
https://github.com/coleifer/peewee/blob/d8e34b0682d87bd56c1a3636445d9c0fccf2b1e2/peewee.py#L2340
知道怎么回事吗?
这似乎与 Peewee 2.x 中服务器端游标的实现效率低下有关。具体来说,我认为这是因为 peewee 的游标包装器使用 .fetchone() db-api 而不是获取许多行。 3.0a 有一个应该更快的新实现:https://github.com/coleifer/peewee/commit/0ae17c519475c935d9db3c338f36ef058a3f879c
此外,在 2.x 中使用客户端游标不存在这些效率问题,因此可以暂时作为解决方法。
在遍历 postgres table 时,我看到莫名其妙的大开销。
我分析了代码,还使用 SQLAlchemy
进行了冒烟测试,以确保它不是慢速连接或底层驱动程序 (psycopg2
)。
针对一个约有 100 万条记录的 postgres table 运行以下脚本,但每次只获取其中的一小部分。
import time
import peewee
import sqlalchemy
from playhouse import postgres_ext
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.engine.url import URL as AlchemyURL
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker as alchemy_sessionmaker
# Connection settings (redacted in the original post).
user = 'XXX'
password = 'XXX'
database = 'XXX'
host = 'XXX'
port = 5432
# Table to benchmark against and number of rows each query fetches.
table = 'person'
limit = 1000
# Peewee database handle. Server-side cursors are enabled — that is the
# code path being benchmarked; hstore registration is skipped as unused.
peewee_db = postgres_ext.PostgresqlExtDatabase(
    database=database,
    host=host, port=port,
    user=user, password=password,
    use_speedups=True,
    server_side_cursors=True,
    register_hstore=False,
)
# Equivalent SQLAlchemy engine + session against the same database,
# used as the comparison baseline.
alchemy_engine = sqlalchemy.create_engine(AlchemyURL('postgresql', username=user, password=password,
                                                     database=database, host=host, port=port))
alchemy_session = alchemy_sessionmaker(bind=alchemy_engine)()
# Peewee ORM mapping of the `person` table.
class PeeweePerson(peewee.Model):
    class Meta:
        database = peewee_db
        db_table = table
    # Primary key is a 64-char string here; the SQLAlchemy model declares
    # it as Integer — NOTE(review): one of the two is presumably wrong,
    # confirm against the actual schema.
    id = peewee.CharField(primary_key=True, max_length=64)
    # JSONB payload column with a GIN index.
    data = postgres_ext.BinaryJSONField(index=True, index_type='GIN')
# SQLAlchemy ORM mapping of the same `person` table.
class SQLAlchemyPerson(declarative_base()):
    __tablename__ = table
    # NOTE(review): declared Integer here but CharField(max_length=64) in
    # the peewee model — verify which matches the real schema.
    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    data = sqlalchemy.Column(JSONB)
def run_raw_query():
    """Fetch the newest `limit` ids via a raw SQL cursor (baseline timing).

    Returns:
        list: cursor rows, each a 1-tuple containing an id.
    """
    # Pass LIMIT as a bind parameter instead of interpolating it into the
    # SQL text. The table name cannot be a bind parameter, but it is a
    # module-level constant, not untrusted input.
    cursor = peewee_db.execute_sql(
        f"SELECT id FROM {table} ORDER BY id DESC LIMIT %s", (limit,))
    return list(cursor)
def run_peewee_query():
    """Fetch the newest `limit` ids through the peewee ORM as tuples."""
    newest_first = (PeeweePerson
                    .select(PeeweePerson.id)
                    .order_by(PeeweePerson.id.desc())
                    .limit(limit))
    # .tuples() yields plain tuples instead of model instances.
    return [row for row in newest_first.tuples()]
def run_sqlalchemy_query():
    """Fetch the newest `limit` ids through the SQLAlchemy ORM."""
    ordering = sqlalchemy.desc(SQLAlchemyPerson.id)
    rows = (alchemy_session
            .query(SQLAlchemyPerson.id)
            .order_by(ordering)
            .limit(limit))
    return [row for row in rows]
if __name__ == '__main__':
t0 = time.time()
raw_result = run_raw_query()
t1 = time.time()
print(f'Raw: {t1 - t0}')
t2 = time.time()
sqlalchemy_result = run_sqlalchemy_query()
t3 = time.time()
print(f'SQLAlchemy: {t3 - t2}')
t4 = time.time()
peewee_result = run_peewee_query()
t5 = time.time()
print(f'peewee: {t5 - t4}')
assert raw_result == sqlalchemy_result == peewee_result
limit = 1000:
Raw: 0.02643609046936035
SQLAlchemy: 0.03697466850280762
peewee: 1.0509874820709229
limit = 10000:
Raw: 0.15931344032287598
SQLAlchemy: 0.07229042053222656
peewee: 10.82826042175293
两个示例都使用服务器端游标。
我对此做了简单的性能分析,看起来 95% 以上的时间都花在了调用 cursor.fetchone 上
https://github.com/coleifer/peewee/blob/d8e34b0682d87bd56c1a3636445d9c0fccf2b1e2/peewee.py#L2340
知道怎么回事吗?
这似乎与 Peewee 2.x 中服务器端游标的实现效率低下有关。具体来说,我认为这是因为 peewee 的游标包装器使用 .fetchone() db-api 而不是获取许多行。 3.0a 有一个应该更快的新实现:https://github.com/coleifer/peewee/commit/0ae17c519475c935d9db3c338f36ef058a3f879c
此外,在 2.x 中使用客户端游标不存在这些效率问题,因此可以暂时作为解决方法。