多处理问题
Problems with multiprocessing
我正在尝试实现一个 python 脚本,该脚本读取 pdf 文件的内容并将该文件移动到特定目录。
在我的 Debian 机器上,它可以正常工作。但是在我的 Xubuntu 系统上出现以下错误:
Traceback (most recent call last):
File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
self.run()
File "/usr/lib/python3.6/threading.py", line 864, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 463, in _handle_results
task = get()
File "/usr/lib/python3.6/multiprocessing/connection.py", line 251, in recv
return _ForkingPickler.loads(buf.getbuffer())
TypeError: __init__() takes 1 positional argument but 2 were given
此时,脚本暂停,直到我用 KeyboardInerrupt 取消它,这给了我其余的错误:
Process ForkPoolWorker-5:
Process ForkPoolWorker-6:
Process ForkPoolWorker-3:
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/usr/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
return self._semlock.__enter__()
File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
Process ForkPoolWorker-1:
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
File "/usr/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
return self._semlock.__enter__()
KeyboardInterrupt
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
File "/usr/lib/python3.6/multiprocessing/queues.py", line 335, in get
res = self._reader.recv_bytes()
File "/usr/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
KeyboardInterrupt
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
File "/usr/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
return self._semlock.__enter__()
KeyboardInterrupt
我不知道如何解决这个问题。希望大家指点。
到目前为止谢谢大家!
编辑脚本代码:
from datetime import date
from multiprocessing import Pool
from pdf2image import convert_from_path
from os import listdir, remove
from os.path import isfile, join, abspath, split, exists
import pytesseract
import sys
import os
import re
import tempfile
tmp_path = tempfile.gettempdir() # replace with given output directory
def run(path):
PDF_file = abspath(path) # use absolute path of pdf file
pages = convert_from_path(PDF_file, 500)
page = pages[0]
imgFile = abspath(join(tmp_path, "document"+str(date.today())+".jpg"))
# save image to temp path
page.save(imgFile, 'JPEG')
# get text from image of page 1
text = str(((pytesseract.image_to_string(Image.open(imgFile)))))
if exists(imgFile):
os.remove(imgFile)
match = re.search(r"(Vertragsnummer\:\s)(\d+)\w+", text)
if match == None:
print("Could not find contract id")
exit(1)
else:
f = split(PDF_file)
d = join(tmp_path, match.group(2))
if not exists(d):
os.mkdir(d)
PDF_file_new = join(d, f[1])
print("New file: "+PDF_file_new)
os.rename(PDF_file, PDF_file_new)
def run_in_dir(directory):
files = [join(directory, f)
for f in listdir(directory) if isfile(join(directory, f))]
with Pool() as p:
p.map_async(run, files)
p.close()
p.join()
if __name__ == "__main__":
import argparse
import cProfile
parser = argparse.ArgumentParser(description="")
parser.add_argument("-p", "--path", help="Path to specific PDF file.")
parser.add_argument("-d", "--directory",
help="Path to folder containing PDF files.")
args = parser.parse_args()
# run(args.path)
print(cProfile.run("run_in_dir(args.directory)"))
尝试 运行 没有多处理的脚本。就我而言,我发现
pytesseract.pytesseract.TesseractNotFoundError: tesseract is not installed or it's not in your path
Here's如何安装。
我不知道为什么 multiprocessing 的错误消息如此不清楚。
此外,删除 exit(1)
,因为 it's intended 用于交互式 shell 而不是脚本。
我正在尝试实现一个 python 脚本,该脚本读取 pdf 文件的内容并将该文件移动到特定目录。 在我的 Debian 机器上,它可以正常工作。但是在我的 Xubuntu 系统上出现以下错误:
Traceback (most recent call last):
File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
self.run()
File "/usr/lib/python3.6/threading.py", line 864, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 463, in _handle_results
task = get()
File "/usr/lib/python3.6/multiprocessing/connection.py", line 251, in recv
return _ForkingPickler.loads(buf.getbuffer())
TypeError: __init__() takes 1 positional argument but 2 were given
此时,脚本暂停,直到我用 KeyboardInerrupt 取消它,这给了我其余的错误:
Process ForkPoolWorker-5:
Process ForkPoolWorker-6:
Process ForkPoolWorker-3:
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/usr/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
return self._semlock.__enter__()
File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
Process ForkPoolWorker-1:
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
File "/usr/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
return self._semlock.__enter__()
KeyboardInterrupt
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
File "/usr/lib/python3.6/multiprocessing/queues.py", line 335, in get
res = self._reader.recv_bytes()
File "/usr/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
KeyboardInterrupt
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
File "/usr/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
return self._semlock.__enter__()
KeyboardInterrupt
我不知道如何解决这个问题。希望大家指点。 到目前为止谢谢大家!
编辑脚本代码:
from datetime import date
from multiprocessing import Pool
from pdf2image import convert_from_path
from os import listdir, remove
from os.path import isfile, join, abspath, split, exists
import pytesseract
import sys
import os
import re
import tempfile
tmp_path = tempfile.gettempdir() # replace with given output directory
def run(path):
PDF_file = abspath(path) # use absolute path of pdf file
pages = convert_from_path(PDF_file, 500)
page = pages[0]
imgFile = abspath(join(tmp_path, "document"+str(date.today())+".jpg"))
# save image to temp path
page.save(imgFile, 'JPEG')
# get text from image of page 1
text = str(((pytesseract.image_to_string(Image.open(imgFile)))))
if exists(imgFile):
os.remove(imgFile)
match = re.search(r"(Vertragsnummer\:\s)(\d+)\w+", text)
if match == None:
print("Could not find contract id")
exit(1)
else:
f = split(PDF_file)
d = join(tmp_path, match.group(2))
if not exists(d):
os.mkdir(d)
PDF_file_new = join(d, f[1])
print("New file: "+PDF_file_new)
os.rename(PDF_file, PDF_file_new)
def run_in_dir(directory):
files = [join(directory, f)
for f in listdir(directory) if isfile(join(directory, f))]
with Pool() as p:
p.map_async(run, files)
p.close()
p.join()
if __name__ == "__main__":
import argparse
import cProfile
parser = argparse.ArgumentParser(description="")
parser.add_argument("-p", "--path", help="Path to specific PDF file.")
parser.add_argument("-d", "--directory",
help="Path to folder containing PDF files.")
args = parser.parse_args()
# run(args.path)
print(cProfile.run("run_in_dir(args.directory)"))
尝试 运行 没有多处理的脚本。就我而言,我发现
pytesseract.pytesseract.TesseractNotFoundError: tesseract is not installed or it's not in your path
Here's如何安装。 我不知道为什么 multiprocessing 的错误消息如此不清楚。
此外,删除 exit(1)
,因为 it's intended 用于交互式 shell 而不是脚本。