Python troubles saving several pool processed files
I need to process some files in parallel. I am using a Pool, but I cannot save the files processed by the pool.
Here is the code:
... All imports...

def extract(text_lines):
    line_tr01 = []
    line_tr02 = []
    line_tr03 = []
    line_tr03 = []
    for line in text_lines:
        treatment01 = treatment_a(line, args)
        line_tr01.append(treatment01)
        treatment02 = treatment_b(line, args)
        line_tr02.append(treatment02)
        treatment03 = treatment_c(line, args)
        line_tr03.append(treatment03)
        treatment04 = treatment_d(line, args)
        line_tr04.append(treatment04)

for file in folder:
    text_lines = read_file_into_list(file_path)
    chunk_size = len(text_lines) / 6
    divided = []
    divided.append(text_lines[0:chunk_size])
    divided.append(text_lines[chunk_size:2 * chunk_size])
    divided.append(text_lines[2 * chunk_size:3 * chunk_size])
    divided.append(text_lines[3 * chunk_size:4 * chunk_size])
    divided.append(text_lines[4 * chunk_size:5 * chunk_size])
    divided.append(text_lines[5 * chunk_size:6 * chunk_size])

    lines = []
    p = Pool(6)
    lines.extend(p.map(extract(text_lines), divided))
    p.close()
    p.join()
    p.terminate()

    line_tr01 = lines[0]
    with open(pkl_filename, 'wb') as f:
        pickle.dump(line_tr01, f)
    line_tr02 = lines[1]
    with open(pkl_filename, 'wb') as f:
        pickle.dump(line_tr02, f)
    line_tr03 = lines[2]
    with open(pkl_filename, 'wb') as f:
        pickle.dump(line_tr03, f)
    line_tr04 = lines[3]
    with open(pkl_filename, 'wb') as f:
        pickle.dump(line_tr04, f)
Any pointers on how to stop the file from being overwritten would be welcome. Any help is appreciated.
Thanks in advance.
So the problem is that when you farm things out to a pool, you no longer have the common global namespace you are currently (ab)using: each worker process gets its own copy of the globals, so appends made inside a worker never reach the parent, as the small demo below shows. Let's rewrite the code to pass things around properly.
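To make that concrete, here is a minimal sketch (not from the original post) showing that a global list mutated inside a pool worker stays empty in the parent:

from multiprocessing import Pool

results = []  # global in the parent process

def worker(x):
    # Each worker process gets its own copy of `results`;
    # this append never propagates back to the parent.
    results.append(x * 2)

if __name__ == '__main__':
    with Pool(2) as p:
        p.map(worker, [1, 2, 3])
    print(results)  # prints [] -- the parent's list is untouched

With that in mind, here is the rewrite: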
from multiprocessing import Pool

def extract(text_lines):
    # Build the results locally and return them, instead of appending
    # to globals that the worker processes cannot share with the parent.
    treatments = dict(tr01=[], tr02=[], tr03=[], tr04=[])
    for line in text_lines:
        treatments['tr01'].append(treatment_a(line, args))
        treatments['tr02'].append(treatment_b(line, args))
        treatments['tr03'].append(treatment_c(line, args))
        treatments['tr04'].append(treatment_d(line, args))
    return treatments

def line_gen(lines, chunk_size=1):
    # Yield successive chunk_size-sized slices of lines.
    for i in range(0, len(lines), chunk_size):
        yield lines[i:i + chunk_size]

for file in folder:
    text_lines = read_file_into_list(file_path)
    treatments = dict(tr01=[], tr02=[], tr03=[], tr04=[])
    p = Pool(6)
    # Integer division gives a usable slice size; max(1, ...) guards
    # against files with fewer than 6 lines.
    chunk = max(1, len(text_lines) // 6)
    for treat_data in p.imap(extract, line_gen(text_lines, chunk_size=chunk)):
        for tr, data in treat_data.items():
            treatments[tr].extend(data)
    p.close()
    p.join()
    # Do something with all your data in the treatments dict
This should pile all the data up into a single dict called treatments, since it returns the data from the subprocesses running extract, and you can then write the data out any way you like.
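For example, to fix the original overwriting problem, each treatment can go to its own pickle file. This is a minimal sketch, assuming it runs inside the same for file in folder loop; the naming scheme (one .pkl per treatment per input file) is illustrative, not from the original post:

import os
import pickle

# Still inside the `for file in folder:` loop, after the pool finishes.
base = os.path.splitext(os.path.basename(file))[0]
for tr, data in treatments.items():
    # e.g. myfile.tr01.pkl, myfile.tr02.pkl, ... -- a distinct file
    # per treatment, so nothing gets overwritten.
    pkl_filename = '{}.{}.pkl'.format(base, tr)
    with open(pkl_filename, 'wb') as f:
        pickle.dump(data, f)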