Python - 查找单词频率和字符串频率以及可能的拼写错误并保存为 txt 文件或 CSV
Python - Finding word frequencies and string frequency with possible misspellings and saving as txt file or CSV
我想从非常混乱的文本文件中提取文本,查找特定的单词——这些单词有时会拼写错误,或者夹杂着不该出现的字符。目前我已经能够在一个目录下的多个文件中统计拼写完全一致的单个单词,这已经接近目标,但还不完全是我想要的。最后,我希望把这份包含单词和短语计数的列表保存到一个文本文件中,而不是像现在的代码那样只把它作为摘要打印出来。
如果无法实现近似匹配也没关系,但能做到的话是最理想的。
感谢您的帮助。
import os
from collections import Counter
import glob
def word_frequency(fileobj, words):
    """Build a Counter of the specified words and phrases found in fileobj.

    Matching is exact and case-sensitive on whitespace-separated tokens.
    Entries of ``words`` that contain whitespace (e.g. "policy payment") are
    matched as consecutive tokens within a single line; overlapping matches
    are each counted, and a token that is part of a phrase match is still
    counted on its own when it is also listed in ``words``.

    Args:
        fileobj: Iterable of text lines, such as an open text file.
        words: Collection of words/phrases to count.

    Returns:
        A Counter with one key per entry of ``words``; entries that never
        occur are present with a count of 0.
    """
    # Start every requested entry at 0 so entries that never occur are
    # still reported instead of being silently absent from the result.
    ct = Counter(dict((w, 0) for w in words))

    # A multi-word entry can never equal a single split() token, so those
    # entries are compared against runs of consecutive tokens instead.
    phrases = []
    for entry in words:
        parts = entry.split()
        if len(parts) > 1:
            phrases.append((entry, parts))

    for line in fileobj:
        tokens = line.split()
        for token in tokens:
            if token in words:
                ct[token] += 1
        for entry, parts in phrases:
            size = len(parts)
            for start in range(len(tokens) - size + 1):
                if tokens[start:start + size] == parts:
                    ct[entry] += 1
    return ct
def count_words_in_dir(dirpath, words, action=None):
    """For each .txt file directly inside dirpath, count the specified words.

    Args:
        dirpath: Directory whose ``*.txt`` files are scanned (not recursive).
        words: Collection of words passed through to ``word_frequency``.
        action: Optional callable invoked as ``action(filepath, ct)`` once
            per file, where ``ct`` is the value returned by
            ``word_frequency`` for that file.
    """
    # Glob inside the directory the caller asked for; the earlier code used
    # an undefined name `path` here and failed with NameError.
    for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
        with open(filepath) as f:
            ct = word_frequency(f, words)
        if action:
            action(filepath, ct)
def print_summary(filename, ct):
    """Print a three-line summary of the counts for one file.

    The output is the file name, then the counted words sorted and joined
    with ", ", then the matching counts in the same order, followed by
    blank lines.

    Args:
        filename: Name/path of the file the counts belong to.
        ct: Mapping of word -> count.
    """
    words = sorted(ct.keys())
    counts = [str(ct[k]) for k in words]
    # Print the `filename` parameter; the earlier code printed `filepath`,
    # which is not defined inside this function.
    print('{0}\n{1}\n{2}\n\n'.format(
        filename,
        ', '.join(words),
        ', '.join(counts)))
# Exact terms to look for in every file.
words = {
    'JUSTICE',
    "policy payment",
    "payment",
    "annuity",
    "CYNTHEA",
}
# Scan the current directory and print one summary per .txt file.
count_words_in_dir('./', words, action=print_summary)
import sys
import os
from collections import Counter
import glob
# def count_words_in_dir(dirpath, words, action=None):
# """For each .txt file in a dir, count the specified words"""
# for filepath in glob.iglob(os.path.join(path, '*.txt')):
# with open(filepath) as f:
# data = f.read()
# for key,val in words.items():
# print("key is " + key + "\n")
# ct = data.count(key)
# words[key] = ct
# if action:
# action(filepath, ct)
# Keep a reference to the real stdout so it can be put back later.
stdoutOrigin=sys.stdout
# From here on, everything printed is written to log.txt (opened in "w"
# mode, so an existing log.txt is overwritten).
sys.stdout = open("log.txt", "w")
def count_words_in_dir(dirpath, words, action=None):
    """Count substring occurrences of each key of words in every .txt file.

    Args:
        dirpath: Directory whose ``*.txt`` files are scanned (not recursive).
        words: Dict keyed by the strings to search for. Its values are
            overwritten in place with the counts for the file currently
            being processed, so copy it inside ``action`` if per-file
            results must be kept.
        action: Optional callable invoked as ``action(filepath, words)``
            after each file has been counted.
    """
    # Glob inside the directory the caller asked for; the earlier code
    # joined the literal string "path", so dirpath was ignored.
    for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
        with open(filepath) as f:
            data = f.read()
        # str.count counts non-overlapping substring matches, so keys may
        # be multi-word phrases. Only existing keys are reassigned, which
        # is safe while iterating over the dict.
        for key in words:
            words[key] = data.count(key)
        if action:
            action(filepath, words)
def print_summary(filepath, words):
    """Print the file path, then one ``key:<TAB>count`` line per entry in sorted order."""
    print(filepath)
    for entry in sorted(words.items()):
        # entry is a (key, count) pair; unpack it straight into the template.
        print('{0}:\t{1}'.format(*entry))
# Directory to scan, taken from the first command-line argument.
filepath = sys.argv[1]
# Search terms; the list repeats "keyword", so the dict below ends up with
# a single entry.
keys = ["keyword", "keyword"]
words = {key: 0 for key in keys}
count_words_in_dir(filepath, words, action=print_summary)
# Close the log file, then hand stdout back to the original stream.
sys.stdout.close()
sys.stdout = stdoutOrigin
我想从非常混乱的文本文件中提取文本,查找特定的单词——这些单词有时会拼写错误,或者夹杂着不该出现的字符。目前我已经能够在一个目录下的多个文件中统计拼写完全一致的单个单词,这已经接近目标,但还不完全是我想要的。最后,我希望把这份包含单词和短语计数的列表保存到一个文本文件中,而不是像现在的代码那样只把它作为摘要打印出来。
如果无法实现近似匹配也没关系,但能做到的话是最理想的。
感谢您的帮助。
import os
from collections import Counter
import glob
def word_frequency(fileobj, words):
    """Build a Counter of the specified words and phrases found in fileobj.

    Matching is exact and case-sensitive on whitespace-separated tokens.
    Entries of ``words`` that contain whitespace (e.g. "policy payment") are
    matched as consecutive tokens within a single line; overlapping matches
    are each counted, and a token that is part of a phrase match is still
    counted on its own when it is also listed in ``words``.

    Args:
        fileobj: Iterable of text lines, such as an open text file.
        words: Collection of words/phrases to count.

    Returns:
        A Counter with one key per entry of ``words``; entries that never
        occur are present with a count of 0.
    """
    # Start every requested entry at 0 so entries that never occur are
    # still reported instead of being silently absent from the result.
    ct = Counter(dict((w, 0) for w in words))

    # A multi-word entry can never equal a single split() token, so those
    # entries are compared against runs of consecutive tokens instead.
    phrases = []
    for entry in words:
        parts = entry.split()
        if len(parts) > 1:
            phrases.append((entry, parts))

    for line in fileobj:
        tokens = line.split()
        for token in tokens:
            if token in words:
                ct[token] += 1
        for entry, parts in phrases:
            size = len(parts)
            for start in range(len(tokens) - size + 1):
                if tokens[start:start + size] == parts:
                    ct[entry] += 1
    return ct
def count_words_in_dir(dirpath, words, action=None):
    """For each .txt file directly inside dirpath, count the specified words.

    Args:
        dirpath: Directory whose ``*.txt`` files are scanned (not recursive).
        words: Collection of words passed through to ``word_frequency``.
        action: Optional callable invoked as ``action(filepath, ct)`` once
            per file, where ``ct`` is the value returned by
            ``word_frequency`` for that file.
    """
    # Glob inside the directory the caller asked for; the earlier code used
    # an undefined name `path` here and failed with NameError.
    for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
        with open(filepath) as f:
            ct = word_frequency(f, words)
        if action:
            action(filepath, ct)
def print_summary(filename, ct):
    """Print a three-line summary of the counts for one file.

    The output is the file name, then the counted words sorted and joined
    with ", ", then the matching counts in the same order, followed by
    blank lines.

    Args:
        filename: Name/path of the file the counts belong to.
        ct: Mapping of word -> count.
    """
    words = sorted(ct.keys())
    counts = [str(ct[k]) for k in words]
    # Print the `filename` parameter; the earlier code printed `filepath`,
    # which is not defined inside this function.
    print('{0}\n{1}\n{2}\n\n'.format(
        filename,
        ', '.join(words),
        ', '.join(counts)))
# Exact terms to look for in every file.
words = {
    'JUSTICE',
    "policy payment",
    "payment",
    "annuity",
    "CYNTHEA",
}
# Scan the current directory and print one summary per .txt file.
count_words_in_dir('./', words, action=print_summary)
import sys
import os
from collections import Counter
import glob
# def count_words_in_dir(dirpath, words, action=None):
# """For each .txt file in a dir, count the specified words"""
# for filepath in glob.iglob(os.path.join(path, '*.txt')):
# with open(filepath) as f:
# data = f.read()
# for key,val in words.items():
# print("key is " + key + "\n")
# ct = data.count(key)
# words[key] = ct
# if action:
# action(filepath, ct)
# Keep a reference to the real stdout so it can be put back later.
stdoutOrigin=sys.stdout
# From here on, everything printed is written to log.txt (opened in "w"
# mode, so an existing log.txt is overwritten).
sys.stdout = open("log.txt", "w")
def count_words_in_dir(dirpath, words, action=None):
    """Count substring occurrences of each key of words in every .txt file.

    Args:
        dirpath: Directory whose ``*.txt`` files are scanned (not recursive).
        words: Dict keyed by the strings to search for. Its values are
            overwritten in place with the counts for the file currently
            being processed, so copy it inside ``action`` if per-file
            results must be kept.
        action: Optional callable invoked as ``action(filepath, words)``
            after each file has been counted.
    """
    # Glob inside the directory the caller asked for; the earlier code
    # joined the literal string "path", so dirpath was ignored.
    for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
        with open(filepath) as f:
            data = f.read()
        # str.count counts non-overlapping substring matches, so keys may
        # be multi-word phrases. Only existing keys are reassigned, which
        # is safe while iterating over the dict.
        for key in words:
            words[key] = data.count(key)
        if action:
            action(filepath, words)
def print_summary(filepath, words):
    """Print the file path, then one ``key:<TAB>count`` line per entry in sorted order."""
    print(filepath)
    for entry in sorted(words.items()):
        # entry is a (key, count) pair; unpack it straight into the template.
        print('{0}:\t{1}'.format(*entry))
# Directory to scan, taken from the first command-line argument.
filepath = sys.argv[1]
# Search terms; the list repeats "keyword", so the dict below ends up with
# a single entry.
keys = ["keyword", "keyword"]
words = {key: 0 for key in keys}
count_words_in_dir(filepath, words, action=print_summary)
# Close the log file, then hand stdout back to the original stream.
sys.stdout.close()
sys.stdout = stdoutOrigin