python open() 与 gzip.open() 和文件模式
python open() vs gzip.open() and file mode
为什么使用 open()
与官方 gzip module 的 gzip.open()
时文件模式不同?
Python 2.7 Linux.
在已经打开的文件句柄上使用 GzipFile
时会发生同样的事情。
我以为它应该是透明的,那为什么我看到的是数字模式而不是 rb
/ wb
?
测试脚本
#!/usr/bin/env python
"""
Write one file to another, with optional gzip on both sides.
Usage:
gzipcat.py <input file> <output file>
Examples:
gzipcat.py /etc/passwd passwd.bak.gz
gzipcat.py passwd.bak.gz passwd.bak
"""
import sys
import gzip


def _open_maybe_gzip(path, mode):
    """Open *path* with gzip.open() when it ends in '.gz', else with open()."""
    opener = gzip.open if path.endswith('.gz') else open
    return opener(path, mode)


def main(argv=None):
    """Copy the input file to the output file and report both file modes.

    Args:
        argv: Argument list shaped like ``sys.argv`` (program name, input
            path, output path).  Defaults to ``sys.argv`` when omitted.

    Exits with the module usage text when fewer than two paths are given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit(__doc__)

    # Context managers guarantee both handles are closed, so the gzip
    # output is completely flushed even if the copy loop raises.
    with _open_maybe_gzip(argv[1], 'rb') as ifd:
        with _open_maybe_gzip(argv[2], 'wb') as ofd:
            # GzipFile exposes an integer ``mode`` while plain files expose
            # the mode string; printing both is the point of this script.
            ifm = getattr(ifd, 'mode', None)
            ofm = getattr(ofd, 'mode', None)
            print('input file mode: {}, output file mode: {}'.format(ifm, ofm))
            for ifl in ifd:
                ofd.write(ifl)


if __name__ == '__main__':
    main()
测试脚本输出
$ python gzipcat.py /etc/passwd passwd.bak
input file mode: rb, output file mode: wb
$ python gzipcat.py /etc/passwd passwd.bak.gz
input file mode: rb, output file mode: 2
$ python gzipcat.py passwd.bak.gz passwd.txt
input file mode: 1, output file mode: wb
$ python gzipcat.py passwd.bak.gz passwd.txt.gz
input file mode: 1, output file mode: 2
第二个问题:这背后有什么充分的理由,还是只是 gzip 模块中的一个遗漏/未处理的案例?
背景
我的实际用例是 Google BigQuery 加载器,它要求模式为 rb
,然后才能将其用作数据源。回溯如下。但是我在上面准备了最小的测试用例,以使这个问题更具可读性。
# python -c 'import etl; etl.job001()'
Starting job001.
Processing table: reviews.
Extracting reviews, time range [2018-04-07 17:01:38.172129+00:00, 2018-04-07 18:09:50.763283)
Extracted 24 rows to reviews.tmp.gz in 2 s (8 rows/s).
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "etl.py", line 920, in wf_dimension_tables
ts_end=ts_end)
File "etl.py", line 680, in map_table_delta
rewrite=True
File "etl.py", line 624, in bq_load_csv
job_config=job_config)
File "/usr/lib/python2.7/site-packages/google/cloud/bigquery/client.py", line 797, in load_table_from_file
_check_mode(file_obj)
File "/usr/lib/python2.7/site-packages/google/cloud/bigquery/client.py", line 1419, in _check_mode
"Cannot upload files opened in text mode: use "
ValueError: Cannot upload files opened in text mode: use open(filename, mode='rb') or open(filename, mode='r+b')
这里是 使用 文件句柄的 bigquery API 调用:
def bq_load_csv(dataset_id, table_id, fileobj):
    """Load CSV data from an open file object into a BigQuery table.

    Relies on a ``bigquery`` name being in scope in the enclosing module.

    Args:
        dataset_id: Identifier passed to ``client.dataset()``.
        table_id: Identifier passed to ``dataset_ref.table()``.
        fileobj: File object handed to ``client.load_table_from_file()``.

    Returns:
        Whatever ``job.result()`` returns once the load job has completed.
    """
    client = bigquery.Client()
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)

    job_config = bigquery.LoadJobConfig()
    job_config.source_format = 'text/csv'
    job_config.field_delimiter = ','
    job_config.skip_leading_rows = 0
    job_config.allow_quoted_newlines = True
    job_config.max_bad_records = 0

    job = client.load_table_from_file(
        fileobj,
        table_ref,
        job_config=job_config)
    res = job.result()  # Waits for job to complete
    return res
更新
此问题已在 python bigquery 客户端 1.5.0 中修复。
感谢提交错误报告的@a-queue,并感谢 Google 实际修复它的开发人员。
处理此问题的正确方法是:分别在 Python 和 Google Cloud Client Library for Python 各自的问题跟踪器中提交 issue。
解决方法
您可以将 google.cloud.bigquery.client
中的 _check_mode
函数替换为接受 1
和 2
,就像我在下面所做的那样。我试过 运行 这段代码并且有效:
import gzip
from google.cloud import bigquery
def _check_mode(stream):
mode = getattr(stream, 'mode', None)
if mode is not None and mode not in ('rb', 'r+b', 'rb+', 1, 2):
raise ValueError(
"Cannot upload files opened in text mode: use "
"open(filename, mode='rb') or open(filename, mode='r+b')")
bigquery.client._check_mode = _check_mode
#...
def bq_load_csv(dataset_id, table_id, fileobj):
#...
解释
google-cloud-python
回溯显示,最后失败的是 google/cloud/bigquery/client.py 中的函数 _check_mode:
if mode is not None and mode not in ('rb', 'r+b', 'rb+'):
raise ValueError(
"Cannot upload files opened in text mode: use "
"open(filename, mode='rb') or open(filename, mode='r+b')")
gzip.py
在 gzip 库中,GzipFile 类的 __init__ 函数里可以看到:变量 mode 被传入该函数,但并没有原样赋给 self.mode,而是用来决定赋给 self.mode 的一个整数:
READ, WRITE = 1, 2 #line 18
...
class GzipFile(_compression.BaseStream):
...
def __init__(self, filename=None, mode=None,
...
elif mode.startswith(('w', 'a', 'x')): #line 179
self.mode = WRITE
根据 git blame,第 18 行是 21 年前修改的,而第 180 行 self.mode = WRITE 是 20 年前修改的。
为什么使用 open()
与官方 gzip module 的 gzip.open()
时文件模式不同?
Python 2.7 Linux.
在已经打开的文件句柄上使用 GzipFile
时会发生同样的事情。
我以为它应该是透明的,那为什么我看到的是数字模式而不是 rb
/ wb
?
测试脚本
#!/usr/bin/env python
"""
Write one file to another, with optional gzip on both sides.
Usage:
gzipcat.py <input file> <output file>
Examples:
gzipcat.py /etc/passwd passwd.bak.gz
gzipcat.py passwd.bak.gz passwd.bak
"""
import sys
import gzip


def _open_maybe_gzip(path, mode):
    """Open *path* with gzip.open() when it ends in '.gz', else with open()."""
    opener = gzip.open if path.endswith('.gz') else open
    return opener(path, mode)


def main(argv=None):
    """Copy the input file to the output file and report both file modes.

    Args:
        argv: Argument list shaped like ``sys.argv`` (program name, input
            path, output path).  Defaults to ``sys.argv`` when omitted.

    Exits with the module usage text when fewer than two paths are given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit(__doc__)

    # Context managers guarantee both handles are closed, so the gzip
    # output is completely flushed even if the copy loop raises.
    with _open_maybe_gzip(argv[1], 'rb') as ifd:
        with _open_maybe_gzip(argv[2], 'wb') as ofd:
            # GzipFile exposes an integer ``mode`` while plain files expose
            # the mode string; printing both is the point of this script.
            ifm = getattr(ifd, 'mode', None)
            ofm = getattr(ofd, 'mode', None)
            print('input file mode: {}, output file mode: {}'.format(ifm, ofm))
            for ifl in ifd:
                ofd.write(ifl)


if __name__ == '__main__':
    main()
测试脚本输出
$ python gzipcat.py /etc/passwd passwd.bak
input file mode: rb, output file mode: wb
$ python gzipcat.py /etc/passwd passwd.bak.gz
input file mode: rb, output file mode: 2
$ python gzipcat.py passwd.bak.gz passwd.txt
input file mode: 1, output file mode: wb
$ python gzipcat.py passwd.bak.gz passwd.txt.gz
input file mode: 1, output file mode: 2
第二个问题:这背后有什么充分的理由,还是只是 gzip 模块中的一个遗漏/未处理的案例?
背景
我的实际用例是 Google BigQuery 加载器,它要求模式为 rb
,然后才能将其用作数据源。回溯如下。但是我在上面准备了最小的测试用例,以使这个问题更具可读性。
# python -c 'import etl; etl.job001()'
Starting job001.
Processing table: reviews.
Extracting reviews, time range [2018-04-07 17:01:38.172129+00:00, 2018-04-07 18:09:50.763283)
Extracted 24 rows to reviews.tmp.gz in 2 s (8 rows/s).
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "etl.py", line 920, in wf_dimension_tables
ts_end=ts_end)
File "etl.py", line 680, in map_table_delta
rewrite=True
File "etl.py", line 624, in bq_load_csv
job_config=job_config)
File "/usr/lib/python2.7/site-packages/google/cloud/bigquery/client.py", line 797, in load_table_from_file
_check_mode(file_obj)
File "/usr/lib/python2.7/site-packages/google/cloud/bigquery/client.py", line 1419, in _check_mode
"Cannot upload files opened in text mode: use "
ValueError: Cannot upload files opened in text mode: use open(filename, mode='rb') or open(filename, mode='r+b')
这里是 使用 文件句柄的 bigquery API 调用:
def bq_load_csv(dataset_id, table_id, fileobj):
    """Load CSV data from an open file object into a BigQuery table.

    Relies on a ``bigquery`` name being in scope in the enclosing module.

    Args:
        dataset_id: Identifier passed to ``client.dataset()``.
        table_id: Identifier passed to ``dataset_ref.table()``.
        fileobj: File object handed to ``client.load_table_from_file()``.

    Returns:
        Whatever ``job.result()`` returns once the load job has completed.
    """
    client = bigquery.Client()
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)

    job_config = bigquery.LoadJobConfig()
    job_config.source_format = 'text/csv'
    job_config.field_delimiter = ','
    job_config.skip_leading_rows = 0
    job_config.allow_quoted_newlines = True
    job_config.max_bad_records = 0

    job = client.load_table_from_file(
        fileobj,
        table_ref,
        job_config=job_config)
    res = job.result()  # Waits for job to complete
    return res
更新
此问题已在 python bigquery 客户端 1.5.0 中修复。 感谢提交错误报告的@a-queue,并感谢 Google 实际修复它的开发人员。
处理此问题的正确方法是:分别在 Python 和 Google Cloud Client Library for Python 各自的问题跟踪器中提交 issue。
解决方法
您可以将 google.cloud.bigquery.client
中的 _check_mode
函数替换为接受 1
和 2
,就像我在下面所做的那样。我试过 运行 这段代码并且有效:
import gzip
from google.cloud import bigquery
def _check_mode(stream):
mode = getattr(stream, 'mode', None)
if mode is not None and mode not in ('rb', 'r+b', 'rb+', 1, 2):
raise ValueError(
"Cannot upload files opened in text mode: use "
"open(filename, mode='rb') or open(filename, mode='r+b')")
bigquery.client._check_mode = _check_mode
#...
def bq_load_csv(dataset_id, table_id, fileobj):
#...
解释
google-cloud-python
回溯显示,最后失败的是 google/cloud/bigquery/client.py 中的函数 _check_mode:
if mode is not None and mode not in ('rb', 'r+b', 'rb+'):
raise ValueError(
"Cannot upload files opened in text mode: use "
"open(filename, mode='rb') or open(filename, mode='r+b')")
gzip.py
在 gzip 库中,GzipFile 类的 __init__ 函数里可以看到:变量 mode 被传入该函数,但并没有原样赋给 self.mode,而是用来决定赋给 self.mode 的一个整数:
READ, WRITE = 1, 2 #line 18
...
class GzipFile(_compression.BaseStream):
...
def __init__(self, filename=None, mode=None,
...
elif mode.startswith(('w', 'a', 'x')): #line 179
self.mode = WRITE
根据 git blame,第 18 行是 21 年前修改的,而第 180 行 self.mode = WRITE 是 20 年前修改的。