使用 python 在大文件中搜索字符串的更快方法

Question

我有两个文本文件需要处理。这是我的情况：

这两个文件非常大，一个 1.21GB，一个 1.1GB。它们各自包含大约 3000 多万行中文字符串。
每个文件中的每个字符串都是唯一的。
我不需要修改这些文件，一旦加载，它们就不会改变。

问题是，其中一个文件已损坏。我们称它为 N5。 N5 的每一行字符串应该如下所示：'a5 b5 c5 d5 e5\tf5'

相反，它是：'a5b5 c5 d5 e5\tf5'

我正在尝试从另一个文件中恢复它，我们称它为 N4，它看起来像这样：'a4 b4 c4 d4\tf4'

我想做的是用N4把N5中的a5b5分开，可能会有三种结果：

'a4 b4 c4 d4' 等于 'a5 b5 c5 d5'
'a4 b4 c4 d4' 等于 'b5 c5 d5 e5'
N5 中没有 N4 的匹配项。

在情况 1 和 2 中，我可以得到答案。而在情况 3 中，在 N4 中完成一次查找需要 140 秒左右。

我现在用list来存储N4和N5，下面是我比较它们的代码。

# Repair N5 lines whose first two words were fused together ("a5b5 c5 d5 e5\tf5")
# by looking each one up in N4 ("a4 b4 c4 d4\tf4").

# test data
N4 = ['a1 b1 c1 e1\t3', 'a2 b2 c2 e2\t2', 'c3 e3 f3 g3\t3']
N5 = ['a1b1 c1 e1 f1\t2', 'a2b c2 e2 f2\t1', 'b3c3 e3 f3 g3\t3']

# result storage
list_result = []          # one entry per N5 line; unmatched lines get a 'none' prefix
list_result_no_none = []  # repaired lines only

counter_none = 0

for n5_line in N5:
    # Splitting on spaces leaves the last word glued to the tab-separated
    # trailing field, so that last chunk is split once more on the tab.
    n5_words = n5_line.split(' ')
    n5_tail = n5_words[3].split('\t')
    n5_fused = n5_words[0]

    # The inner loop uses its own variable name so it no longer shadows the
    # outer loop variable.
    for n4_line in N4:
        n4_words = n4_line.split(' ')
        n4_tail = n4_words[3].split('\t')

        # NOTE(review): the trailing fields are compared as strings, i.e.
        # lexicographically ('10' >= '9' is False). Convert with int() if the
        # field is known to be numeric.

        # Case 1: the N4 line equals the first four words of the N5 line,
        # so the fused word is exactly N4's first two words.
        if (n4_words[0] + n4_words[1] == n5_fused and
                n4_words[2] == n5_words[1] and
                n4_tail[0] == n5_words[2] and
                n4_tail[1] >= n5_tail[1]):
            repaired = ' '.join(
                [n4_words[0], n4_words[1], n4_words[2], n4_tail[0], n5_words[3]])
            list_result.append(repaired)
            list_result_no_none.append(repaired)
            break

        # Case 2: the N4 line equals the last four words of the N5 line,
        # so N4's first word marks where the fused word has to be cut.
        if (n4_words[0] in n5_fused and
                n4_words[1] == n5_words[1] and
                n4_words[2] == n5_words[2] and
                n4_tail[0] == n5_tail[0] and
                n4_tail[1] >= n5_tail[1]):
            cut = n5_fused.find(n4_words[0])
            repaired = ' '.join(
                [n5_fused[:cut], n5_fused[cut:], n4_words[1], n4_words[2], n5_words[3]])
            list_result.append(repaired)
            list_result_no_none.append(repaired)
            break
    else:
        # Case 3: the loop ran to completion without a break, so no N4 line
        # matched (this also covers an empty N4).
        list_result.append(' '.join(['none'] + n5_words))
        counter_none += 1


print(list_result)
print(list_result_no_none)
print("Percentage of not found: %.2f" % ((100*(counter_none/len(N5)))) + '%')

它在小规模数据上可以正常工作，但在实际文件上运行需要非常长的时间。

我是 python 的新手，对其他编程语言几乎没有经验。所以如果我的问题对你来说看起来很愚蠢，我很抱歉。另外，我不是母语人士，所以为我糟糕的英语道歉。

Answer 1

您可以将一些列表转换为生成器，这会大大减少内存消耗。只有 N4 列表必须保留在内存中，因为它会被多次遍历：

def iter_file(filename):
    """Lazily yield every line of *filename* as a list of fields.

    Each line is split on single spaces; the final space-separated chunk is
    then split on tabs, so its tab-separated parts become separate trailing
    fields. Line endings are not stripped, so the last field of each record
    keeps its newline.
    """
    with open(filename) as handle:
        for raw in handle:
            # str.split always returns at least one item, so the unpacking
            # below cannot fail.
            *leading, last = raw.split(' ')
            yield leading + last.split('\t')

def do_correction(n4, n5):
    """Yield one repaired word list for every N5 record.

    Both arguments are iterables of field lists as produced by ``iter_file``:
    four words plus a trailing field for N4, and the fused first word, three
    more words and a trailing field for N5.

    For each N5 record the generator yields a flat list of fields:

    * the N4 words followed by the last N5 word and trailing field, when the
      N4 record matches the first four words of the N5 line;
    * the fused word cut in two followed by the remaining N5 fields, when the
      N4 record matches the last four words of the N5 line;
    * ``['none'] + words_n5`` when no N4 record matches.
    """
    # N4 is scanned once per N5 record, so it has to be materialised.
    n4 = list(n4)

    for words_n5 in n5:
        fused = words_n5[0]
        for words_n4 in n4:

            # NOTE(review): the trailing fields are compared as strings
            # (lexicographically); convert with int() if they are numeric.

            # if n4 y == n5
            # The trailing N5 field is index 4; index 3 is the last word.
            if (words_n4[0] + words_n4[1] == fused and
                    words_n4[2] == words_n5[1] and
                    words_n4[3] == words_n5[2] and
                    words_n4[4] >= words_n5[4]):
                yield words_n4[:-1] + words_n5[3:]
                break

            # if x n4 == n5
            if (words_n4[0] in fused and
                    words_n4[1] == words_n5[1] and
                    words_n4[2] == words_n5[2] and
                    words_n4[3] == words_n5[3] and
                    words_n4[4] >= words_n5[4]):
                # Cut the fused *string* where the N4 word starts and return
                # a flat list, like the other two branches.
                idx = fused.find(words_n4[0])
                yield [fused[:idx], fused[idx:]] + words_n5[1:]
                break
        else:  # not found
            yield ['none'] + words_n5

# Stream the repaired records into the 'corrected' file: words joined by
# spaces, then a tab, then the trailing field. No newline is added here;
# iter_file leaves the line ending on the last field.
with open('corrected', 'w') as output:
    repaired_records = do_correction(iter_file('N4'), iter_file('N5'))
    for words in repaired_records:
        joined_words = ' '.join(words[:-1])
        trailing_field = words[-1]
        output.write('%s\t%s' % (joined_words, trailing_field))

接下来，您可以将 N4 的部分内容转换成字典，这样查找速度会更快：

from collections import defaultdict

def iter_file(filename):
    """Read *filename* line by line and yield each line as a list of fields.

    A line is first split on single spaces. The last chunk is then split on
    tabs and its parts are appended as separate fields. Line endings are kept,
    so the final field of every record still carries its newline.
    """
    with open(filename) as source:
        for text in source:
            fields = text.split(' ')
            # Replace the last chunk with its tab-separated parts.
            tail = fields.pop()
            fields.extend(tail.split('\t'))
            yield fields

def do_correction(n4, n5):
    """Yield one repaired word list for every N5 record, using a dict index.

    Both arguments are iterables of field lists as produced by ``iter_file``:
    four words plus a trailing field for N4, and the fused first word, three
    more words and a trailing field for N5.

    N4 is indexed by its third and fourth words, so each N5 record only needs
    two dictionary lookups instead of a scan over all of N4.

    For each N5 record the generator yields a flat list of fields:

    * the N4 words followed by the last N5 word and trailing field, when an
      N4 record matches the first four words of the N5 line;
    * the fused word cut in two followed by the remaining N5 fields, when an
      N4 record matches the last four words of the N5 line;
    * ``['none'] + words_n5`` when no N4 record matches.
    """
    n4_dict = defaultdict(list)
    for words_n4 in n4:
        # Index into the dict, not into the n4 iterable itself.
        n4_dict[words_n4[2], words_n4[3]].append(words_n4)

    # Probe with .get() from here on: indexing a defaultdict would insert an
    # empty list for every key that misses, growing the index with N5.
    lookup = n4_dict.get

    for words_n5 in n5:
        fused = words_n5[0]

        # NOTE(review): the trailing fields are compared as strings
        # (lexicographically); convert with int() if they are numeric.

        # if n4 y == n5: N4 words 3-4 line up with N5 words 2-3.
        # The trailing N5 field is index 4; index 3 is the last word.
        words_n4 = next(
            (cand for cand in lookup((words_n5[1], words_n5[2]), ())
             if (cand[0] + cand[1] == fused and
                 cand[4] >= words_n5[4])),
            None)
        if words_n4 is not None:
            yield words_n4[:-1] + words_n5[3:]
            continue

        # if x n4 == n5: N4 words 3-4 line up with N5 words 3-4.
        words_n4 = next(
            (cand for cand in lookup((words_n5[2], words_n5[3]), ())
             if (cand[0] in fused and
                 cand[1] == words_n5[1] and
                 cand[4] >= words_n5[4])),
            None)
        if words_n4 is not None:
            # Cut the fused *string* where the N4 word starts and return a
            # flat list, like the other two branches.
            idx = fused.find(words_n4[0])
            yield [fused[:idx], fused[idx:]] + words_n5[1:]
        else:  # not found
            yield ['none'] + words_n5

# Write each repaired record to the 'corrected' file as space-joined words,
# a tab, then the trailing field. The line ending comes from the trailing
# field itself, which iter_file does not strip.
with open('corrected', 'w') as output:
    records = do_correction(iter_file('N4'), iter_file('N5'))
    for words in records:
        head, tail = words[:-1], words[-1]
        output.write('%s\t%s' % (' '.join(head), tail))

使用 python 在大文件中搜索字符串的更快方法

Faster way to search strings in big file with python

python

nlp