Python 中的多个线程写入同一个 CSV
Multiple threads writing to the same CSV in Python
我是 Python 中多线程的新手,目前正在编写附加到 csv 文件的脚本。如果我要将多个线程提交给一个 concurrent.futures.ThreadPoolExecutor
,它将行附加到一个 csv 文件。如果追加是这些线程执行的唯一与文件相关的操作,我能做些什么来保证线程安全?
我的代码的简化版本:
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
for count,ad_id in enumerate(advertisers):
downloadFutures.append(executor.submit(downloadThread, arguments.....))
time.sleep(random.randint(1,3))
我的线程 class 是:
def downloadThread(arguments......):
#Some code.....
writer.writerow(re.split(',', line.decode()))
我应该设置一个单独的单线程执行器来处理写入,还是我只是追加是否值得担心?
编辑:我应该补充说明:各次写入发生的时间差异可能很大,两次向文件追加之间可能相隔数分钟。我只是担心这种竞争情况在测试脚本时恰好没有出现,所以宁愿提前做好防范。
我不确定 csvwriter 是否线程安全。文档中没有说明,所以为了安全起见,如果多个线程会使用同一个 writer 对象,你应该用 threading.Lock 来保护对它的访问:
# create the lock
import threading
csv_writer_lock = threading.Lock()
def downloadThread(arguments......):
# pass csv_writer_lock somehow
# Note: use csv_writer_lock on *any* access
# Some code.....
with csv_writer_lock:
writer.writerow(re.split(',', line.decode()))
话虽这么说,让 downloadThread 把写入任务提交给一个(单线程的)执行器,可能确实比像这样显式地使用锁更优雅。
这是一些代码,它还处理了令人头疼的 unicode 问题:
def ensure_bytes(s):
    """Return *s* encoded as UTF-8 bytes; bytes input passes through unchanged.

    The original tested ``isinstance(s, unicode)`` — ``unicode`` is a
    Python 2 name and raises NameError on Python 3, where ``str`` is the
    text type.
    """
    return s.encode('utf-8') if isinstance(s, str) else s
class ThreadSafeWriter(object):
    '''
    A csv.writer wrapper that serializes row writes with a lock so that
    multiple threads can safely share one writer instance.

    The original was Python 2 code: its doctest imported the py2
    ``StringIO`` module, and it encoded every cell to UTF-8 bytes.
    Python 3's csv module writes text, and bytes cells would be emitted
    as their ``b'...'`` repr — so cells are normalized to str instead.

    >>> from io import StringIO
    >>> f = StringIO()
    >>> wtr = ThreadSafeWriter(f)
    >>> wtr.writerow(['a', 'b'])
    >>> f.getvalue() == "a,b\r\n"
    True
    '''
    def __init__(self, *args, **kwargs):
        # All positional/keyword arguments are forwarded to csv.writer.
        self._writer = csv.writer(*args, **kwargs)
        self._lock = threading.Lock()

    def _decode(self, row):
        # Normalize bytes cells to str; py3's csv.writer would otherwise
        # stringify them as "b'...'".
        return [cell.decode('utf-8') if isinstance(cell, bytes) else cell
                for cell in row]

    def writerow(self, row):
        row = self._decode(row)
        with self._lock:
            return self._writer.writerow(row)

    def writerows(self, rows):
        # Materialize (and decode) the rows up front so no per-row
        # Python work happens while the lock is held.
        rows = [self._decode(row) for row in rows]
        with self._lock:
            return self._writer.writerows(rows)
# example: open with newline='' as the Python 3 csv module requires
# (avoids doubled line endings on Windows), and call writerow() —
# the original called a nonexistent write() method.
with open('some.csv', 'w', newline='', encoding='utf-8') as f:
    writer = ThreadSafeWriter(f)
    writer.writerow([u'中文', 'bar'])
更详细的解决方案见这里(原文中的链接)。
迟来的补充:你也可以用另一种不依赖锁的方式来处理这个问题——让单独一个 writer 线程从共享队列中取行写入,由负责处理数据的各线程把行推入队列。
from threading import Thread
from queue import Queue
from concurrent.futures import ThreadPoolExecutor

# CSV writer setup goes here
queue = Queue()
_SENTINEL = object()  # unique stop marker; unlike the original's magic
                      # value 4999, it cannot collide with real data and
                      # is only enqueued after every producer has finished

def consume():
    """Drain rows from the shared queue until the sentinel arrives."""
    while True:
        # Blocking get() instead of the original busy-wait
        # (`while True: if not queue.empty(): ...`), which spins the CPU.
        i = queue.get()
        if i is _SENTINEL:
            return
        # Row comes out of queue; CSV writing goes here
        print(i)

# daemon= keyword replaces the deprecated setDaemon() call.
consumer = Thread(target=consume, daemon=True)
consumer.start()

def produce(i):
    # Data processing goes here; row goes into queue
    queue.put(i)

with ThreadPoolExecutor(max_workers=10) as executor:
    for i in range(5000):
        executor.submit(produce, i)

# The with-block waits for all submitted tasks, so every row is already
# enqueued; now tell the consumer to stop. (The original returned on
# seeing 4999, which may be consumed before other rows, dropping them.)
queue.put(_SENTINEL)
consumer.join()
我是 Python 中多线程的新手,目前正在编写附加到 csv 文件的脚本。如果我要将多个线程提交给一个 concurrent.futures.ThreadPoolExecutor
,它将行附加到一个 csv 文件。如果追加是这些线程执行的唯一与文件相关的操作,我能做些什么来保证线程安全?
我的代码的简化版本:
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
for count,ad_id in enumerate(advertisers):
downloadFutures.append(executor.submit(downloadThread, arguments.....))
time.sleep(random.randint(1,3))
我的线程 class 是:
def downloadThread(arguments......):
#Some code.....
writer.writerow(re.split(',', line.decode()))
我应该设置一个单独的单线程执行器来处理写入,还是我只是追加是否值得担心?
编辑:我应该补充说明:各次写入发生的时间差异可能很大,两次向文件追加之间可能相隔数分钟。我只是担心这种竞争情况在测试脚本时恰好没有出现,所以宁愿提前做好防范。
我不确定 csvwriter 是否线程安全。文档中没有说明,所以为了安全起见,如果多个线程会使用同一个 writer 对象,你应该用 threading.Lock 来保护对它的访问:
# create the lock
import threading
csv_writer_lock = threading.Lock()
def downloadThread(arguments......):
# pass csv_writer_lock somehow
# Note: use csv_writer_lock on *any* access
# Some code.....
with csv_writer_lock:
writer.writerow(re.split(',', line.decode()))
话虽这么说,让 downloadThread 把写入任务提交给一个(单线程的)执行器,可能确实比像这样显式地使用锁更优雅。
这是一些代码,它还处理了令人头疼的 unicode 问题:
def ensure_bytes(s):
    """Return *s* encoded as UTF-8 bytes; bytes input passes through unchanged.

    The original tested ``isinstance(s, unicode)`` — ``unicode`` is a
    Python 2 name and raises NameError on Python 3, where ``str`` is the
    text type.
    """
    return s.encode('utf-8') if isinstance(s, str) else s
class ThreadSafeWriter(object):
    '''
    A csv.writer wrapper that serializes row writes with a lock so that
    multiple threads can safely share one writer instance.

    The original was Python 2 code: its doctest imported the py2
    ``StringIO`` module, and it encoded every cell to UTF-8 bytes.
    Python 3's csv module writes text, and bytes cells would be emitted
    as their ``b'...'`` repr — so cells are normalized to str instead.

    >>> from io import StringIO
    >>> f = StringIO()
    >>> wtr = ThreadSafeWriter(f)
    >>> wtr.writerow(['a', 'b'])
    >>> f.getvalue() == "a,b\r\n"
    True
    '''
    def __init__(self, *args, **kwargs):
        # All positional/keyword arguments are forwarded to csv.writer.
        self._writer = csv.writer(*args, **kwargs)
        self._lock = threading.Lock()

    def _decode(self, row):
        # Normalize bytes cells to str; py3's csv.writer would otherwise
        # stringify them as "b'...'".
        return [cell.decode('utf-8') if isinstance(cell, bytes) else cell
                for cell in row]

    def writerow(self, row):
        row = self._decode(row)
        with self._lock:
            return self._writer.writerow(row)

    def writerows(self, rows):
        # Materialize (and decode) the rows up front so no per-row
        # Python work happens while the lock is held.
        rows = [self._decode(row) for row in rows]
        with self._lock:
            return self._writer.writerows(rows)
# example: open with newline='' as the Python 3 csv module requires
# (avoids doubled line endings on Windows), and call writerow() —
# the original called a nonexistent write() method.
with open('some.csv', 'w', newline='', encoding='utf-8') as f:
    writer = ThreadSafeWriter(f)
    writer.writerow([u'中文', 'bar'])
更详细的解决方案见这里(原文中的链接)。
迟来的补充:你也可以用另一种不依赖锁的方式来处理这个问题——让单独一个 writer 线程从共享队列中取行写入,由负责处理数据的各线程把行推入队列。
from threading import Thread
from queue import Queue
from concurrent.futures import ThreadPoolExecutor

# CSV writer setup goes here
queue = Queue()
_SENTINEL = object()  # unique stop marker; unlike the original's magic
                      # value 4999, it cannot collide with real data and
                      # is only enqueued after every producer has finished

def consume():
    """Drain rows from the shared queue until the sentinel arrives."""
    while True:
        # Blocking get() instead of the original busy-wait
        # (`while True: if not queue.empty(): ...`), which spins the CPU.
        i = queue.get()
        if i is _SENTINEL:
            return
        # Row comes out of queue; CSV writing goes here
        print(i)

# daemon= keyword replaces the deprecated setDaemon() call.
consumer = Thread(target=consume, daemon=True)
consumer.start()

def produce(i):
    # Data processing goes here; row goes into queue
    queue.put(i)

with ThreadPoolExecutor(max_workers=10) as executor:
    for i in range(5000):
        executor.submit(produce, i)

# The with-block waits for all submitted tasks, so every row is already
# enqueued; now tell the consumer to stop. (The original returned on
# seeing 4999, which may be consumed before other rows, dropping them.)
queue.put(_SENTINEL)
consumer.join()