在 Python 中读取大型 JSON 文件 (raw_decode)
reading large JSON file in Python (raw_decode)
我正在尝试读取 Python 中的大型 JSON 文件 (data.json)。因为JSON文件有多个JSON对象,而Python会创建多个字典(字典数量未知),所以我使用了decoder.raw_decode()和generator。
以下是代码:
import json
import pprint
import io
import pprint
def parse():
    """Generator intended to yield each JSON value stored in ``data.json``.

    The open file object itself is passed to ``raw_decode()``. A
    ``ValueError`` raised while decoding is printed and ends the generator.
    """
    with open('data.json', encoding='utf-8') as jfile:
        try:
            while True:
                decoder = json.JSONDecoder()
                obj, idx = decoder.raw_decode(jfile)
                yield obj
        except ValueError as err:
            print(err)
        else:
            # Runs only if the loop above finishes without raising.
            print("aha")
def main():
    """Pull items from ``parse()`` one at a time and print each of them.

    When the generator is exhausted the ``StopIteration`` instance is
    printed and the loop ends.
    """
    imputd = parse()
    if not imputd:
        return
    while True:
        try:
            print(str(next(imputd)).readlines())
        except StopIteration as stop:
            print(stop)
            break
main()
我收到错误:
Traceback (most recent call last):
File "H:\Document\Python\j10.py", line 57, in <module>
main()
File "H:\Document\Python\j10.py", line 36, in main
print(str(next(imputd)).readlines())
File "H:\Document\Python\j10.py", line 21, in parse
obj, idx = decoder.raw_decode(jfile)
File "C:\Python34\lib\json\decoder.py", line 360, in raw_decode
obj, end = self.scan_once(s, idx)
TypeError: first argument must be a string, not _io.TextIOWrapper
我根据 Martijn 的回答编辑了代码:
import json
import io
file=open('data.json.txt')
def readin():
return file.read(2000)
def parse():
    """Yield decoded JSON values from the text chunks that ``readin()`` returns.

    Each chunk is appended to a pending buffer. Every value that decodes
    from the front of the buffer is yielded and then cut off the buffer;
    when decoding fails, the next chunk is fetched.
    """
    decoder = json.JSONDecoder()
    pending = ''
    for chunk in iter(readin, ''):
        pending = pending + chunk
        while pending:
            try:
                value, end = decoder.raw_decode(pending)
                yield value
                pending = pending[end:]
            except ValueError:
                # Not enough data to decode, read more
                break
def main():
    """Pull items from ``parse()`` one at a time and print each of them.

    When the generator is exhausted the ``StopIteration`` instance is
    printed and the loop ends.
    """
    imputd = parse()
    if not imputd:
        return
    while True:
        try:
            print(str(next(imputd)).readlines())
        except StopIteration as stop:
            print(stop)
            break
main()
我得到一个 UnicodeError:
Traceback (most recent call last):
File "H:\Document\Python\j11.py", line 35, in <module>
main()
File "H:\Document\Python\j11.py", line 30, in main
print(str(next(imputd)).readlines())
File "H:\Document\Python\j11.py", line 14, in parse
for chunk in iter(readin, ''):
File "H:\Document\Python\j11.py", line 8, in readin
return file.read(2000)
File "C:\Python34\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 4217: character maps to <undefined>
您传入的是文件对象,但 decoder.raw_decode() 只接受文本数据。您需要先自行读取文件内容:
# raw_decode() takes a str, so read the file's text before decoding.
obj, idx = decoder.raw_decode(jfile.read())
这样一来,您 yield 出来的是由 JSON 数据创建的 Python 对象,因此 main() 函数循环中的 .readlines() 调用也会失败。
但是,您没有正确使用 raw_decode()
。您自己负责向它提供文本块,它不会为您从文件中读取该文本。如果您想分块处理文件,并且 JSON 条目之间没有明确的分隔符,您将被迫分块读取文件:
decoder = json.JSONDecoder()
buffer = ''
for chunk in iter(partial(jfile.read, buffersize), ''):
buffer += chunk
while buffer:
try:
result, index = decoder.raw_decode(buffer)
yield result
buffer = buffer[index:]
except ValueError:
# Not enough data to decode, read more
break
这仍然会产生完全解码的对象;如果您的文件是 一个长 JSON 对象 (例如一个顶级列表或字典),那么这将不会逐个生成该对象的内容;它仍然会在 yield 之前读取整个对象。
我正在尝试读取 Python 中的大型 JSON 文件 (data.json)。因为JSON文件有多个JSON对象,而Python会创建多个字典(字典数量未知),所以我使用了decoder.raw_decode()和generator。 以下是代码:
import json
import pprint
import io
import pprint
def parse():
    """Generator intended to yield each JSON value stored in ``data.json``.

    The open file object itself is passed to ``raw_decode()``. A
    ``ValueError`` raised while decoding is printed and ends the generator.
    """
    with open('data.json', encoding='utf-8') as jfile:
        try:
            while True:
                decoder = json.JSONDecoder()
                obj, idx = decoder.raw_decode(jfile)
                yield obj
        except ValueError as err:
            print(err)
        else:
            # Runs only if the loop above finishes without raising.
            print("aha")
def main():
    """Pull items from ``parse()`` one at a time and print each of them.

    When the generator is exhausted the ``StopIteration`` instance is
    printed and the loop ends.
    """
    imputd = parse()
    if not imputd:
        return
    while True:
        try:
            print(str(next(imputd)).readlines())
        except StopIteration as stop:
            print(stop)
            break
main()
我收到错误:
Traceback (most recent call last):
File "H:\Document\Python\j10.py", line 57, in <module>
main()
File "H:\Document\Python\j10.py", line 36, in main
print(str(next(imputd)).readlines())
File "H:\Document\Python\j10.py", line 21, in parse
obj, idx = decoder.raw_decode(jfile)
File "C:\Python34\lib\json\decoder.py", line 360, in raw_decode
obj, end = self.scan_once(s, idx)
TypeError: first argument must be a string, not _io.TextIOWrapper
我根据 Martijn 的回答编辑了代码:
import json
import io
file=open('data.json.txt')
def readin():
return file.read(2000)
def parse():
    """Yield decoded JSON values from the text chunks that ``readin()`` returns.

    Each chunk is appended to a pending buffer. Every value that decodes
    from the front of the buffer is yielded and then cut off the buffer;
    when decoding fails, the next chunk is fetched.
    """
    decoder = json.JSONDecoder()
    pending = ''
    for chunk in iter(readin, ''):
        pending = pending + chunk
        while pending:
            try:
                value, end = decoder.raw_decode(pending)
                yield value
                pending = pending[end:]
            except ValueError:
                # Not enough data to decode, read more
                break
def main():
    """Pull items from ``parse()`` one at a time and print each of them.

    When the generator is exhausted the ``StopIteration`` instance is
    printed and the loop ends.
    """
    imputd = parse()
    if not imputd:
        return
    while True:
        try:
            print(str(next(imputd)).readlines())
        except StopIteration as stop:
            print(stop)
            break
main()
我得到一个 UnicodeError:
Traceback (most recent call last):
File "H:\Document\Python\j11.py", line 35, in <module>
main()
File "H:\Document\Python\j11.py", line 30, in main
print(str(next(imputd)).readlines())
File "H:\Document\Python\j11.py", line 14, in parse
for chunk in iter(readin, ''):
File "H:\Document\Python\j11.py", line 8, in readin
return file.read(2000)
File "C:\Python34\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 4217: character maps to <undefined>
您传入的是文件对象,但 decoder.raw_decode() 只接受文本数据。您需要先自行读取文件内容:
# raw_decode() takes a str, so read the file's text before decoding.
obj, idx = decoder.raw_decode(jfile.read())
这样一来,您 yield 出来的是由 JSON 数据创建的 Python 对象,因此 main() 函数循环中的 .readlines() 调用也会失败。
但是,您没有正确使用 raw_decode()
。您自己负责向它提供文本块,它不会为您从文件中读取该文本。如果您想分块处理文件,并且 JSON 条目之间没有明确的分隔符,您将被迫分块读取文件:
decoder = json.JSONDecoder()
buffer = ''
for chunk in iter(partial(jfile.read, buffersize), ''):
buffer += chunk
while buffer:
try:
result, index = decoder.raw_decode(buffer)
yield result
buffer = buffer[index:]
except ValueError:
# Not enough data to decode, read more
break
这仍然会产生完全解码的对象;如果您的文件是 一个长 JSON 对象 (例如一个顶级列表或字典),那么这将不会逐个生成该对象的内容;它仍然会在 yield 之前读取整个对象。