使用 Python 查找和替换大型文本文件中特定行的最快方法
Fastest way to find and replace specific line in a large text file with Python
我有一个 numbers.txt 文件,它有几十万行,每行由两个用 : 符号分隔的唯一数字组成:
407597693:1604722326.2426915
510905857:1604722326.2696202
76792361:1604722331.120079
112854912:1604722333.4496727
470822611:1604722335.283259
我的目标是找到左侧数字为 407597693 的那一行,然后把该行右侧的数字加上 3600。之后,我必须把所有更改写回 numbers.txt 文件。我需要尽可能快地对同一个 txt 文件反复执行同样的操作(只是数字不同)。
我通过 with open: 文件操作加上逐行的 for 循环实现了这一点:搜索所需的数字,修改该行,然后重写整个文件。但是,我注意到反复执行这样的操作会让我的程序每次耗费大约 0.2-0.5 秒,累积起来就大大拖慢了整体速度。
这是我使用的代码:
# The asker's original (slow) approach, with its block structure restored and
# the imports it relies on added. The algorithm itself is intentionally left
# as posted, because it is the subject of the question.
import os
import time

number = 407597693

with open("numbers.txt", "r+") as library:
    file = library.read()
    # Cheap pre-check so the file is only rewritten when the key is present.
    if (str(number) + ":") in file:
        lines = file.splitlines()
        # NOTE(review): "a+" appends to any stale numbers_temp.txt left over
        # from a crashed run; "w+" would be safer.
        with open("numbers_temp.txt", "a+") as library_temp:
            for line in lines:
                # NOTE(review): substring test, so a longer key that ends with
                # these digits (e.g. "1407597693:") is matched as well.
                if (str(number) + ":") in line:
                    library_temp.write(
                        "\n" + str(number) + ":" + str(time.time() + 3600)
                    )
                else:
                    # NOTE(review): writing "\n" *before* each line leaves an
                    # extra blank first line in the output on every run.
                    library_temp.write("\n" + line)
            # Rewind and read back everything that was just written.
            library_temp.seek(0)
            new_file = library_temp.read()
            with open("numbers.txt", "w+") as library_2:
                library_2.write(new_file)
        os.remove("numbers_temp.txt")
非常感谢任何有关如何加快此过程的意见,在此先感谢!
我假设你的内存可以存储整个文件。使用正则表达式应该会更快:
import re
import time

number = 407597693

# Read the whole file into memory (assumes it fits).
with open("numbers.txt", "r") as f:
    data = f.read()

# Alternative that adds 3600 to the stored value instead of using the clock:
# data = re.sub(f'({number}):(.*)', lambda x:f"{x.group(1)}:{float(x.group(2))+3600}", data)

# "^...:" anchors on the full key, so a longer key sharing the same prefix is
# not touched. "$" (instead of a literal "\n") also matches when the record is
# the last line and the file has no trailing newline. Keys are unique, so
# count=1 lets the scan stop at the first hit.
data = re.sub(
    rf"^{number}:.*$",
    f"{number}:{int(time.time()) + 3600}",
    data,
    count=1,
    flags=re.MULTILINE,
)

with open("numbers.txt", "w") as f:
    f.write(data)
不必运行多个循环,我们可以在单个循环中完成此操作,如下所示:
number = 407597693
# Include the separator so only the exact key matches, not a longer key that
# merely starts with the same digits.
prefix = str(number) + ':'

updated_lines = []
found = False

with open('numbers.txt', 'r') as inputfile:
    # Single pass: copy every line, rewriting only the matching record.
    for line in inputfile:
        if not found and line.startswith(prefix):
            # Keep the line ending exactly as it was (the last line of the
            # file may have no trailing newline).
            newline = '\n' if line.endswith('\n') else ''
            key, _, value = line.rstrip('\n').partition(':')
            line = key + ':' + str(float(value) + float(3600)) + newline
            found = True
        updated_lines.append(line)

# Only rewrite the file when something changed; writing unconditionally would
# wipe the file whenever the key is absent.
if found:
    with open('numbers.txt', 'w') as updatedFile:
        updatedFile.writelines(updated_lines)
希望这会有所帮助..
您可以打开一个内存映射文件,使用正则表达式找到您想要的行,运气好的话只需修改文件中的一个内存页。我使用 decimal 模块,这样就不会遇到十进制与二进制 float 相互转换带来的精度问题。通常新数字和旧数字的宽度相同,文件内容不需要移动。我展示的是 Linux 示例;Windows 上的 mmap.mmap 略有不同,但应该同样容易使用。
import mmap
import re
from decimal import Decimal
def increment_record(filename, findval, increment):
    """Add *increment* to the value stored for key *findval* in *filename*.

    The file holds one ``key:value`` record per line. The file is memory
    mapped and only the value of the first matching record is rewritten in
    place; the remainder of the file is shifted only when the textual width
    of the value changes.

    Args:
        filename: Path of the record file to update in place.
        findval: Key (left-hand number) to look for; converted with ``str()``.
        increment: Amount to add; an int, float, Decimal or numeric string.

    Returns:
        None. The file is left untouched when the key is not found or the
        file is empty.
    """
    # Anchor at the start of a line and escape the key so that e.g. "123"
    # cannot match inside "9123:..." and the key is never read as a regex.
    pattern = rf"^{re.escape(str(findval))}:([\d.]+)".encode("ascii")

    with open(filename, "rb+") as fp:
        # mmap refuses to map an empty file; seek(0, 2) returns the file size.
        if fp.seek(0, 2) == 0:
            return
        with mmap.mmap(fp.fileno(), 0) as fmap:
            search = re.search(pattern, fmap, re.MULTILINE)
            if not search:
                return

            # Capture everything needed from the match before the map is
            # resized or its contents are moved.
            start, end = search.span(1)
            # Use Decimal for base 10 precision; str() lets a float increment
            # be combined with it as well.
            total = Decimal(search.group(1).decode("ascii")) + Decimal(str(increment))
            # The "f" format avoids scientific notation such as "1E-7".
            newval = format(total, "f").encode("ascii")

            delta = len(newval) - (end - start)
            fsize = fmap.size()
            tail_len = fsize - end
            if delta > 0:
                # Value got wider: grow the file first, then shift the tail right.
                fmap.resize(fsize + delta)
                if tail_len:
                    fmap.move(end + delta, end, tail_len)
            elif delta < 0:
                # Value got narrower: shift the tail left first, then shrink,
                # otherwise the truncation would drop the end of the tail.
                if tail_len:
                    fmap.move(end + delta, end, tail_len)
                fmap.resize(fsize + delta)

            # change just the number
            fmap[start:start + len(newval)] = newval
# test parameters
filename = "test.txt"
findme = "76792361"
increment = 3600
testdata = """407597693:1604722326.2426915
510905857:1604722326.2696202
76792361:1604722331.120079
112854912:1604722333.4496727
470822611:1604722335.283259"""

# Write the fixture and close it, so the data is on disk before it is mapped.
with open(filename, "w") as fp:
    fp.write(testdata)

increment_record(filename, findme, increment)

# Show every line that differs from the original test data.
print("changes:")
with open(filename) as fp:
    for old, new in zip(testdata.split("\n"), fp):
        new = new.strip()
        if old != new:
            print((old, new))
print("done")
我有一个 numbers.txt 文件,它有几十万行,每行由两个用 : 符号分隔的唯一数字组成:
407597693:1604722326.2426915
510905857:1604722326.2696202
76792361:1604722331.120079
112854912:1604722333.4496727
470822611:1604722335.283259
我的目标是找到左侧数字为 407597693 的那一行,然后把该行右侧的数字加上 3600。之后,我必须把所有更改写回 numbers.txt 文件。我需要尽可能快地对同一个 txt 文件反复执行同样的操作(只是数字不同)。
我通过 with open: 文件操作加上逐行的 for 循环实现了这一点:搜索所需的数字,修改该行,然后重写整个文件。但是,我注意到反复执行这样的操作会让我的程序每次耗费大约 0.2-0.5 秒,累积起来就大大拖慢了整体速度。
这是我使用的代码:
# The asker's original (slow) approach, with its block structure restored and
# the imports it relies on added. The algorithm itself is intentionally left
# as posted, because it is the subject of the question.
import os
import time

number = 407597693

with open("numbers.txt", "r+") as library:
    file = library.read()
    # Cheap pre-check so the file is only rewritten when the key is present.
    if (str(number) + ":") in file:
        lines = file.splitlines()
        # NOTE(review): "a+" appends to any stale numbers_temp.txt left over
        # from a crashed run; "w+" would be safer.
        with open("numbers_temp.txt", "a+") as library_temp:
            for line in lines:
                # NOTE(review): substring test, so a longer key that ends with
                # these digits (e.g. "1407597693:") is matched as well.
                if (str(number) + ":") in line:
                    library_temp.write(
                        "\n" + str(number) + ":" + str(time.time() + 3600)
                    )
                else:
                    # NOTE(review): writing "\n" *before* each line leaves an
                    # extra blank first line in the output on every run.
                    library_temp.write("\n" + line)
            # Rewind and read back everything that was just written.
            library_temp.seek(0)
            new_file = library_temp.read()
            with open("numbers.txt", "w+") as library_2:
                library_2.write(new_file)
        os.remove("numbers_temp.txt")
非常感谢任何有关如何加快此过程的意见,在此先感谢!
我假设你的内存可以存储整个文件。使用正则表达式应该会更快:
import re
import time

number = 407597693

# Read the whole file into memory (assumes it fits).
with open("numbers.txt", "r") as f:
    data = f.read()

# Alternative that adds 3600 to the stored value instead of using the clock:
# data = re.sub(f'({number}):(.*)', lambda x:f"{x.group(1)}:{float(x.group(2))+3600}", data)

# "^...:" anchors on the full key, so a longer key sharing the same prefix is
# not touched. "$" (instead of a literal "\n") also matches when the record is
# the last line and the file has no trailing newline. Keys are unique, so
# count=1 lets the scan stop at the first hit.
data = re.sub(
    rf"^{number}:.*$",
    f"{number}:{int(time.time()) + 3600}",
    data,
    count=1,
    flags=re.MULTILINE,
)

with open("numbers.txt", "w") as f:
    f.write(data)
不必运行多个循环,我们可以在单个循环中完成此操作,如下所示:
number = 407597693
# Include the separator so only the exact key matches, not a longer key that
# merely starts with the same digits.
prefix = str(number) + ':'

updated_lines = []
found = False

with open('numbers.txt', 'r') as inputfile:
    # Single pass: copy every line, rewriting only the matching record.
    for line in inputfile:
        if not found and line.startswith(prefix):
            # Keep the line ending exactly as it was (the last line of the
            # file may have no trailing newline).
            newline = '\n' if line.endswith('\n') else ''
            key, _, value = line.rstrip('\n').partition(':')
            line = key + ':' + str(float(value) + float(3600)) + newline
            found = True
        updated_lines.append(line)

# Only rewrite the file when something changed; writing unconditionally would
# wipe the file whenever the key is absent.
if found:
    with open('numbers.txt', 'w') as updatedFile:
        updatedFile.writelines(updated_lines)
希望这会有所帮助..
您可以打开一个内存映射文件,使用正则表达式找到您想要的行,运气好的话只需修改文件中的一个内存页。我使用 decimal 模块,这样就不会遇到十进制与二进制 float 相互转换带来的精度问题。通常新数字和旧数字的宽度相同,文件内容不需要移动。我展示的是 Linux 示例;Windows 上的 mmap.mmap 略有不同,但应该同样容易使用。
import mmap
import re
from decimal import Decimal
def increment_record(filename, findval, increment):
    """Add *increment* to the value stored for key *findval* in *filename*.

    The file holds one ``key:value`` record per line. The file is memory
    mapped and only the value of the first matching record is rewritten in
    place; the remainder of the file is shifted only when the textual width
    of the value changes.

    Args:
        filename: Path of the record file to update in place.
        findval: Key (left-hand number) to look for; converted with ``str()``.
        increment: Amount to add; an int, float, Decimal or numeric string.

    Returns:
        None. The file is left untouched when the key is not found or the
        file is empty.
    """
    # Anchor at the start of a line and escape the key so that e.g. "123"
    # cannot match inside "9123:..." and the key is never read as a regex.
    pattern = rf"^{re.escape(str(findval))}:([\d.]+)".encode("ascii")

    with open(filename, "rb+") as fp:
        # mmap refuses to map an empty file; seek(0, 2) returns the file size.
        if fp.seek(0, 2) == 0:
            return
        with mmap.mmap(fp.fileno(), 0) as fmap:
            search = re.search(pattern, fmap, re.MULTILINE)
            if not search:
                return

            # Capture everything needed from the match before the map is
            # resized or its contents are moved.
            start, end = search.span(1)
            # Use Decimal for base 10 precision; str() lets a float increment
            # be combined with it as well.
            total = Decimal(search.group(1).decode("ascii")) + Decimal(str(increment))
            # The "f" format avoids scientific notation such as "1E-7".
            newval = format(total, "f").encode("ascii")

            delta = len(newval) - (end - start)
            fsize = fmap.size()
            tail_len = fsize - end
            if delta > 0:
                # Value got wider: grow the file first, then shift the tail right.
                fmap.resize(fsize + delta)
                if tail_len:
                    fmap.move(end + delta, end, tail_len)
            elif delta < 0:
                # Value got narrower: shift the tail left first, then shrink,
                # otherwise the truncation would drop the end of the tail.
                if tail_len:
                    fmap.move(end + delta, end, tail_len)
                fmap.resize(fsize + delta)

            # change just the number
            fmap[start:start + len(newval)] = newval
# test parameters
filename = "test.txt"
findme = "76792361"
increment = 3600
testdata = """407597693:1604722326.2426915
510905857:1604722326.2696202
76792361:1604722331.120079
112854912:1604722333.4496727
470822611:1604722335.283259"""

# Write the fixture and close it, so the data is on disk before it is mapped.
with open(filename, "w") as fp:
    fp.write(testdata)

increment_record(filename, findme, increment)

# Show every line that differs from the original test data.
print("changes:")
with open(filename) as fp:
    for old, new in zip(testdata.split("\n"), fp):
        new = new.strip()
        if old != new:
            print((old, new))
print("done")