Python 多进程(multiprocessing):读取文件并更新字典
Python multiprocessing: Reading a file and updating a dictionary
假设我有一个只有 2 行的文本文件,如下所示:
File.txt:
100022441 @DavidBartonWB Guarding Constitution
100022441 RT @frankgaffney 2nd Amendment Guy.
第一列是用户 ID,第二列是用户推文。我想阅读上面的文本文件并更新以下字典:
d={'100022441':{'@frankgaffney': 0, '@DavidBartonWB': 0}}.
这是我的代码:
def f(line):
    """Score one tab-separated ``uid<TAB>tweet`` line against the global ``d``.

    Returns a ``(uid, gn, flag)`` tuple for the first key ``gn`` of
    ``d[uid]``, where ``flag`` is 1 if ``gn`` occurs in the tweet and 0
    otherwise. Returns ``None`` when ``uid`` is not in ``d`` or when
    ``d[uid]`` has no keys.
    """
    data = line.split('\t')
    uid, tweet = data[0], data[1]
    if uid not in d:
        return None
    for gn in d[uid]:
        # NOTE: this returns on the very first key, so the remaining keys
        # of d[uid] are never examined.
        return uid, gn, (1 if gn in tweet else 0)
    return None
# Fan the lines of the file out to four worker processes, then fold the
# (uid, gn, r) tuples they return into the nested counters in d.
p = Pool(4)
with open('~/File.txt') as source_file:
    results = p.map(f, source_file)
    for uid, gn, r in results:
        d[uid][gn] += r
所以基本上,我需要读取文件的每一行,判断该用户是否在我的字典中;如果在,再判断推文是否包含该用户在字典中的键(例如 '@frankgaffney' 和 '@DavidBartonWB')。因此,根据上面给出的两行,结果应该是:
d = {'100022441': {'@frankgaffney': 1, '@DavidBartonWB': 1}}
但它给出:
d = {'100022441': {'@frankgaffney': 1, '@DavidBartonWB': 0}}
出于某种原因,对于所有用户,代码总是漏掉其中一个键。有人知道我的代码哪里有问题吗?
第二列是 data[1],不是 data[2]。
data[2] 能正常工作,说明您是按单词而不是按列进行拆分的。
如果您想把用户的键作为一个独立的单词(而不是子字符串)来搜索,您需要 tweet=data[1:]。
如果你想搜索一个子字符串,你需要把它分成两部分:uid,tweet=line.split(None,1)
您的文件以制表符分隔,而您总是在第三列中查找提及;它对第一个提及能正常工作,是因为您把整个文件(而不是每一行)传给了函数。实际上您相当于在做这样的事:
>>> s = '100022441\t@DavidBartonWB Guarding Constitution\n100022441\tRT@frankgaffney 2nd Amendment Guy.'
>>> s.split('\t')
['100022441', '@DavidBartonWB Guarding Constitution\n100022441', 'RT@frankgaffney 2nd Amendment Guy.']
我有两点建议:
- 将您的函数映射到文件中的每一行。
- 使用正则表达式进行更强大的搜索。
试试这个版本:
import re
# Per-user mention counters: user ID -> {mention handle -> count}.
d = {'100022441': {'@frankgaffney': 0, '@DavidBartonWB': 0}}
# Matches a Twitter-style handle: "@" followed by word characters.
e = r'(@\w+)'


def parser(line):
    """Count the tracked mentions found in one ``uid<TAB>tweet`` line.

    The line is split on the first tab only, so a tweet that itself
    contains tabs is still parsed. Lines without a tab, and lines whose
    user ID is not a key of ``d`` (or maps to an empty dict), are ignored.

    Args:
        line: One line of the input file.

    Returns:
        None. ``d`` is updated in place.
    """
    key, sep, tweet = line.partition('\t')
    if not sep:
        # No tab on this line (e.g. a blank line): nothing to parse.
        return
    data = d.get(key)
    if not data:
        return
    for mention in re.findall(e, tweet):
        if mention in data:
            data[mention] += 1
import os

# open() does not expand "~" itself, so resolve the home directory
# explicitly before opening the file.
with open(os.path.expanduser('~/File.txt')) as f:
    for line in f:
        parser(line)
print(d)
确认其正常工作后,您可以将其改为多进程版本:
import itertools, re
from multiprocessing import Process, Manager
def parse(queue, d, m):
    """Worker loop: consume lines from ``queue`` and tally mentions into ``m``.

    Args:
        queue: Source of ``uid<TAB>tweet`` lines; a ``None`` item tells the
            worker to stop.
        d: Mapping of user ID to the collection of handles tracked for
            that user.
        m: Mutable mapping of handle -> count, updated in place.

    Returns:
        None, once the ``None`` sentinel has been received.
    """
    e = r'(@\w+)'  # loop-invariant, so defined once instead of per line
    while True:
        line = queue.get()
        if line is None:
            return  # sentinel received: this worker process is done
        # Split on the first tab only so tabs inside the tweet are kept,
        # and skip lines with no tab instead of crashing the worker.
        key, sep, tweet = line.partition('\t')
        if not sep:
            continue
        data = d.get(key)
        if not data:
            continue
        for mention in re.findall(e, tweet):
            if mention in data:
                # NOTE(review): when m is a Manager dict shared by several
                # workers this read-then-write is not atomic, so concurrent
                # increments can be lost; guard it with a lock if exact
                # counts matter.
                m[mention] = m.get(mention, 0) + 1
if __name__ == '__main__':
    workers = 2
    manager = Manager()
    # Only the counters are written by the workers, so only they need to
    # live in the manager; the lookup table is read-only and is passed to
    # each worker as a plain dict.
    d2 = manager.dict()
    d = {'100022441': ['@frankgaffney', '@DavidBartonWB']}
    # Bounded queue: put() blocks once `workers` lines are pending.
    queue = manager.Queue(workers)
    worker_pool = []
    for i in range(workers):
        p = Process(target=parse, args=(queue, d, d2))
        p.start()
        worker_pool.append(p)
    # Fill the queue with data for the workers, followed by one None
    # sentinel per worker so that every worker terminates.
    with open(r'tweets2.txt') as f:
        iters = itertools.chain(f, (None,) * workers)
        for line in iters:
            queue.put(line)
    for p in worker_pool:
        p.join()
    # dict.items() works on both Python 2 and 3 (iteritems() is 2-only).
    for i, data in d.items():
        print('For ID: {}'.format(i))
        for key in data:
            # A handle that was never mentioned has no entry in d2.
            print(' {} - {}'.format(key, d2.get(key, 0)))
假设我有一个只有 2 行的文本文件,如下所示:
File.txt:
100022441 @DavidBartonWB Guarding Constitution
100022441 RT @frankgaffney 2nd Amendment Guy.
第一列是用户 ID,第二列是用户推文。我想阅读上面的文本文件并更新以下字典:
d={'100022441':{'@frankgaffney': 0, '@DavidBartonWB': 0}}.
这是我的代码:
def f(line):
    """Score one tab-separated ``uid<TAB>tweet`` line against the global ``d``.

    Returns a ``(uid, gn, flag)`` tuple for the first key ``gn`` of
    ``d[uid]``, where ``flag`` is 1 if ``gn`` occurs in the tweet and 0
    otherwise. Returns ``None`` when ``uid`` is not in ``d`` or when
    ``d[uid]`` has no keys.
    """
    data = line.split('\t')
    uid, tweet = data[0], data[1]
    if uid not in d:
        return None
    for gn in d[uid]:
        # NOTE: this returns on the very first key, so the remaining keys
        # of d[uid] are never examined.
        return uid, gn, (1 if gn in tweet else 0)
    return None
# Fan the lines of the file out to four worker processes, then fold the
# (uid, gn, r) tuples they return into the nested counters in d.
p = Pool(4)
with open('~/File.txt') as source_file:
    results = p.map(f, source_file)
    for uid, gn, r in results:
        d[uid][gn] += r
所以基本上,我需要读取文件的每一行,判断该用户是否在我的字典中;如果在,再判断推文是否包含该用户在字典中的键(例如 '@frankgaffney' 和 '@DavidBartonWB')。因此,根据上面给出的两行,结果应该是:
d = {'100022441': {'@frankgaffney': 1, '@DavidBartonWB': 1}}
但它给出:
d = {'100022441': {'@frankgaffney': 1, '@DavidBartonWB': 0}}
出于某种原因,对于所有用户,代码总是漏掉其中一个键。有人知道我的代码哪里有问题吗?
第二列是 data[1],不是 data[2]。
data[2] 能正常工作,说明您是按单词而不是按列进行拆分的。
如果您想把用户的键作为一个独立的单词(而不是子字符串)来搜索,您需要 tweet=data[1:]。
如果你想搜索一个子字符串,你需要把它分成两部分:uid,tweet=line.split(None,1)
您的文件以制表符分隔,而您总是在第三列中查找提及;它对第一个提及能正常工作,是因为您把整个文件(而不是每一行)传给了函数。实际上您相当于在做这样的事:
>>> s = '100022441\t@DavidBartonWB Guarding Constitution\n100022441\tRT@frankgaffney 2nd Amendment Guy.'
>>> s.split('\t')
['100022441', '@DavidBartonWB Guarding Constitution\n100022441', 'RT@frankgaffney 2nd Amendment Guy.']
我有两点建议:
- 将您的函数映射到文件中的每一行。
- 使用正则表达式进行更强大的搜索。
试试这个版本:
import re
# Per-user mention counters: user ID -> {mention handle -> count}.
d = {'100022441': {'@frankgaffney': 0, '@DavidBartonWB': 0}}
# Matches a Twitter-style handle: "@" followed by word characters.
e = r'(@\w+)'


def parser(line):
    """Count the tracked mentions found in one ``uid<TAB>tweet`` line.

    The line is split on the first tab only, so a tweet that itself
    contains tabs is still parsed. Lines without a tab, and lines whose
    user ID is not a key of ``d`` (or maps to an empty dict), are ignored.

    Args:
        line: One line of the input file.

    Returns:
        None. ``d`` is updated in place.
    """
    key, sep, tweet = line.partition('\t')
    if not sep:
        # No tab on this line (e.g. a blank line): nothing to parse.
        return
    data = d.get(key)
    if not data:
        return
    for mention in re.findall(e, tweet):
        if mention in data:
            data[mention] += 1
import os

# open() does not expand "~" itself, so resolve the home directory
# explicitly before opening the file.
with open(os.path.expanduser('~/File.txt')) as f:
    for line in f:
        parser(line)
print(d)
确认其正常工作后,您可以将其改为多进程版本:
import itertools, re
from multiprocessing import Process, Manager
def parse(queue, d, m):
    """Worker loop: consume lines from ``queue`` and tally mentions into ``m``.

    Args:
        queue: Source of ``uid<TAB>tweet`` lines; a ``None`` item tells the
            worker to stop.
        d: Mapping of user ID to the collection of handles tracked for
            that user.
        m: Mutable mapping of handle -> count, updated in place.

    Returns:
        None, once the ``None`` sentinel has been received.
    """
    e = r'(@\w+)'  # loop-invariant, so defined once instead of per line
    while True:
        line = queue.get()
        if line is None:
            return  # sentinel received: this worker process is done
        # Split on the first tab only so tabs inside the tweet are kept,
        # and skip lines with no tab instead of crashing the worker.
        key, sep, tweet = line.partition('\t')
        if not sep:
            continue
        data = d.get(key)
        if not data:
            continue
        for mention in re.findall(e, tweet):
            if mention in data:
                # NOTE(review): when m is a Manager dict shared by several
                # workers this read-then-write is not atomic, so concurrent
                # increments can be lost; guard it with a lock if exact
                # counts matter.
                m[mention] = m.get(mention, 0) + 1
if __name__ == '__main__':
    workers = 2
    manager = Manager()
    # Only the counters are written by the workers, so only they need to
    # live in the manager; the lookup table is read-only and is passed to
    # each worker as a plain dict.
    d2 = manager.dict()
    d = {'100022441': ['@frankgaffney', '@DavidBartonWB']}
    # Bounded queue: put() blocks once `workers` lines are pending.
    queue = manager.Queue(workers)
    worker_pool = []
    for i in range(workers):
        p = Process(target=parse, args=(queue, d, d2))
        p.start()
        worker_pool.append(p)
    # Fill the queue with data for the workers, followed by one None
    # sentinel per worker so that every worker terminates.
    with open(r'tweets2.txt') as f:
        iters = itertools.chain(f, (None,) * workers)
        for line in iters:
            queue.put(line)
    for p in worker_pool:
        p.join()
    # dict.items() works on both Python 2 and 3 (iteritems() is 2-only).
    for i, data in d.items():
        print('For ID: {}'.format(i))
        for key in data:
            # A handle that was never mentioned has no entry in d2.
            print(' {} - {}'.format(key, d2.get(key, 0)))