如何分隔单词列表中的字符以查找二元组频率
How to separate characters in a word list to find bigram frequency
我想在一个约 10,000 个单词的列表中统计每个音素序列的二元组(bigram)频率。到目前为止,我已经能得到二元组频率,但它统计的是列表中相邻两个单词构成的二元组,而不是单词内部相邻音素构成的二元组。有什么方法可以指定我要统计的单位吗?
这是我的 python 代码:
from collections import Counter
import pandas as pd

# Load the CSV file and keep its "Transcription" column.
CMU_data = pd.read_csv("CMU.csv")
transcript = CMU_data["Transcription"]


def converter(x):
    """Return a pandas Series as a tuple of its values; return anything else unchanged."""
    if isinstance(x, pd.Series):
        return tuple(x.values)
    return x


# Apply the converter element-wise and keep the distinct values.
transcript2 = transcript.apply(converter).unique()
print(transcript2)

# Finding bigrams.
# NOTE: each element of `data` is paired with the element that follows it, so
# this counts pairs of adjacent entries of the array (presumably whole-word
# transcriptions -- confirm against the CSV), not adjacent sounds inside one
# entry.
data = transcript2
bigrams = Counter(x + y for x, y in zip(data, data[1:]))
for bigram, count in bigrams.most_common():
    print(bigram, '=', count)
这是当前输出的示例(井号 # 表示单词边界):
# P OY1 N T # # S L AE1 SH # = 1
# S L AE1 SH # # TH R IY1 D IY2 # = 1
# TH R IY1 D IY2 # # K OW1 L AH0 N # = 1
# K OW1 L AH0 N # # S EH1 M IY0 K OW1 L AH0 N # = 1
# S EH1 M IY0 K OW1 L AH0 N # # S EH1 M IH0 K OW2 L AH0 N # = 1
# S EH1 M IH0 K OW2 L AH0 N # # K W EH1 S CH AH0 N M AA1 R K # = 1
# K W EH1 S CH AH0 N M AA1 R K # # AH0 # = 1
# AH0 # # EY1 # = 1
# EY1 # # EY1 Z # = 1
# EY1 Z # # EY1 F AO1 R T UW1 W AH1 N T UW1 EY1 T # = 1
(...)
这是我输入的示例(在它被转换为数组时):
['# P OY1 N T # ' '# S L AE1 SH # ' '# TH R IY1 D IY2 # ' ...
'# L EH1 F T B R EY1 S # ' '# OW1 P EH0 N B R EY1 S # '
'# K L OW1 Z B R EY1 S # ']
我想要得到如下所示的输出:
TH R = 70
IY1 D = 100
IY2 # = 100
# K = 500
OW1 L = 100
AH0 N # = 200
N # = 500
这是一种方法:
from nltk.util import ngrams
from collections import Counter
import pandas as pd
# Sample transcriptions: space-separated tokens wrapped in '#' markers.
# The trailing space inside each literal is part of the data.
inp = [
    '# P OY1 N T # ',
    '# S L AE1 SH # ',
    '# TH R IY1 D IY2 # ',
    '# L EH1 F T B R EY1 S # ',
    '# OW1 P EH0 N B R EY1 S # ',
    '# K L OW1 Z B R EY1 S # ',
]
def tokenise(s):
    """Split a transcription string into tokens, gluing the edge markers on.

    The string is split on whitespace. The first token is joined (with a
    single space) to the token that follows it, and the last token is joined
    to the token that precedes it, so '# P OY1 N T # ' becomes
    ['# P', 'OY1', 'N', 'T #'].

    Args:
        s: Whitespace-separated transcription string.

    Returns:
        A list of token strings. Inputs with three tokens or fewer collapse
        into a single joined token (e.g. '# AH0 # ' -> ['# AH0 #']); an empty
        or all-whitespace string yields an empty list.
    """
    # split() without an argument ignores leading/trailing whitespace and
    # treats runs of spaces as one separator, so no empty tokens are produced.
    toks = s.split()
    if len(toks) <= 3:
        # Too short to have distinct first and last pairs: keep everything
        # together instead of indexing past the ends of the list.
        return [' '.join(toks)] if toks else []
    # Join starting marker with the second element, and the penultimate
    # element with the ending marker; everything in between is kept as is.
    return [' '.join(toks[:2]), *toks[2:-2], ' '.join(toks[-2:])]
def count_ngrams(tups,n):
    """Count how often each n-gram occurs and return the tally as a DataFrame.

    Args:
        tups: Iterable of hashable n-grams (e.g. tuples of tokens).
        n: Size of the n-grams. Currently unused; the first column is named
            'bigram' whatever the value of n.

    Returns:
        A DataFrame with columns 'bigram' and 'count', sorted by 'count' in
        descending order and re-indexed from 0.
    """
    # Materialise the (ngram, count) pairs as a list: a plain list of tuples
    # is accepted by the DataFrame constructor regardless of how a given
    # pandas version treats dict views.
    rows = list(Counter(tups).items())
    df = (
        pd.DataFrame(rows, columns=['bigram', 'count'])
        .sort_values(by='count', ascending=False)
        .reset_index(drop=True)
    )
    return df
def counts(inp,n,unit='sound'):
    """Count n-grams over sounds within each string, or over whole strings.

    Args:
        inp: Sequence of transcription strings.
        n: Size of the n-grams to build.
        unit: 'sound' builds n-grams from the tokens of each string
            separately (tokens come from tokenise(), so n-grams never span
            two strings); 'word' builds n-grams over the strings of inp
            themselves.

    Returns:
        The DataFrame produced by count_ngrams() for the collected n-grams.

    Raises:
        ValueError: If unit is neither 'sound' nor 'word'.
    """
    if unit == 'sound':
        # Build the n-grams of every tokenised string and flatten them into
        # one list in a single pass.
        tups = [gram for s in inp for gram in ngrams(tokenise(s), n)]
    elif unit == 'word':
        tups = list(ngrams(inp, n))
    else:
        # Without this branch an unknown unit would fail later with an
        # UnboundLocalError on tups.
        raise ValueError("unit must be 'sound' or 'word', got %r" % (unit,))
    return count_ngrams(tups, n)
发音二元组计数
# Bigram counts over the sound tokens inside each string of inp.
counts(inp,2,unit='sound')
# bigram count
# 0 (EY1, S #) 3
# 1 (R, EY1) 3
# 2 (B, R) 3
# 3 (# P, OY1) 1
# 4 (T, B) 1
# 5 (OW1, Z) 1
# 6 (L, OW1) 1
# 7 (# K, L) 1
# 8 (N, B) 1
# 9 (EH0, N) 1
# 10 (P, EH0) 1
# 11 (# OW1, P) 1
# 12 (F, T) 1
# 13 (OY1, N) 1
# 14 (EH1, F) 1
# 15 (# L, EH1) 1
# 16 (D, IY2 #) 1
# 17 (IY1, D) 1
# 18 (R, IY1) 1
# 19 (# TH, R) 1
# 20 (AE1, SH #) 1
# 21 (L, AE1) 1
# 22 (# S, L) 1
# 23 (N, T #) 1
# 24 (Z, B) 1
单词二元组计数
# Bigram counts over whole strings of inp (pairs of adjacent list entries).
counts(inp,2,unit='word')
# bigram count
# 0 (# P OY1 N T # , # S L AE1 SH # ) 1
# 1 (# S L AE1 SH # , # TH R IY1 D IY2 # ) 1
# 2 (# TH R IY1 D IY2 # , # L EH1 F T B R EY1 S # ) 1
# 3 (# L EH1 F T B R EY1 S # , # OW1 P EH0 N B R E... 1
# 4 (# OW1 P EH0 N B R EY1 S # , # K L OW1 Z B R E... 1
我想在一个约 10,000 个单词的列表中统计每个音素序列的二元组(bigram)频率。到目前为止,我已经能得到二元组频率,但它统计的是列表中相邻两个单词构成的二元组,而不是单词内部相邻音素构成的二元组。有什么方法可以指定我要统计的单位吗?
这是我的 python 代码:
from collections import Counter
import pandas as pd

# Load the CSV file and keep its "Transcription" column.
CMU_data = pd.read_csv("CMU.csv")
transcript = CMU_data["Transcription"]


def converter(x):
    """Return a pandas Series as a tuple of its values; return anything else unchanged."""
    if isinstance(x, pd.Series):
        return tuple(x.values)
    return x


# Apply the converter element-wise and keep the distinct values.
transcript2 = transcript.apply(converter).unique()
print(transcript2)

# Finding bigrams.
# NOTE: each element of `data` is paired with the element that follows it, so
# this counts pairs of adjacent entries of the array (presumably whole-word
# transcriptions -- confirm against the CSV), not adjacent sounds inside one
# entry.
data = transcript2
bigrams = Counter(x + y for x, y in zip(data, data[1:]))
for bigram, count in bigrams.most_common():
    print(bigram, '=', count)
这是当前输出的示例(井号 # 表示单词边界):
# P OY1 N T # # S L AE1 SH # = 1
# S L AE1 SH # # TH R IY1 D IY2 # = 1
# TH R IY1 D IY2 # # K OW1 L AH0 N # = 1
# K OW1 L AH0 N # # S EH1 M IY0 K OW1 L AH0 N # = 1
# S EH1 M IY0 K OW1 L AH0 N # # S EH1 M IH0 K OW2 L AH0 N # = 1
# S EH1 M IH0 K OW2 L AH0 N # # K W EH1 S CH AH0 N M AA1 R K # = 1
# K W EH1 S CH AH0 N M AA1 R K # # AH0 # = 1
# AH0 # # EY1 # = 1
# EY1 # # EY1 Z # = 1
# EY1 Z # # EY1 F AO1 R T UW1 W AH1 N T UW1 EY1 T # = 1
(...)
这是我输入的示例(在它被转换为数组时):
['# P OY1 N T # ' '# S L AE1 SH # ' '# TH R IY1 D IY2 # ' ...
'# L EH1 F T B R EY1 S # ' '# OW1 P EH0 N B R EY1 S # '
'# K L OW1 Z B R EY1 S # ']
我想要得到如下所示的输出:
TH R = 70
IY1 D = 100
IY2 # = 100
# K = 500
OW1 L = 100
AH0 N # = 200
N # = 500
这是一种方法:
from nltk.util import ngrams
from collections import Counter
import pandas as pd
# Sample transcriptions: space-separated tokens wrapped in '#' markers.
# The trailing space inside each literal is part of the data.
inp = [
    '# P OY1 N T # ',
    '# S L AE1 SH # ',
    '# TH R IY1 D IY2 # ',
    '# L EH1 F T B R EY1 S # ',
    '# OW1 P EH0 N B R EY1 S # ',
    '# K L OW1 Z B R EY1 S # ',
]
def tokenise(s):
    """Split a transcription string into tokens, gluing the edge markers on.

    The string is split on whitespace. The first token is joined (with a
    single space) to the token that follows it, and the last token is joined
    to the token that precedes it, so '# P OY1 N T # ' becomes
    ['# P', 'OY1', 'N', 'T #'].

    Args:
        s: Whitespace-separated transcription string.

    Returns:
        A list of token strings. Inputs with three tokens or fewer collapse
        into a single joined token (e.g. '# AH0 # ' -> ['# AH0 #']); an empty
        or all-whitespace string yields an empty list.
    """
    # split() without an argument ignores leading/trailing whitespace and
    # treats runs of spaces as one separator, so no empty tokens are produced.
    toks = s.split()
    if len(toks) <= 3:
        # Too short to have distinct first and last pairs: keep everything
        # together instead of indexing past the ends of the list.
        return [' '.join(toks)] if toks else []
    # Join starting marker with the second element, and the penultimate
    # element with the ending marker; everything in between is kept as is.
    return [' '.join(toks[:2]), *toks[2:-2], ' '.join(toks[-2:])]
def count_ngrams(tups,n):
    """Count how often each n-gram occurs and return the tally as a DataFrame.

    Args:
        tups: Iterable of hashable n-grams (e.g. tuples of tokens).
        n: Size of the n-grams. Currently unused; the first column is named
            'bigram' whatever the value of n.

    Returns:
        A DataFrame with columns 'bigram' and 'count', sorted by 'count' in
        descending order and re-indexed from 0.
    """
    # Materialise the (ngram, count) pairs as a list: a plain list of tuples
    # is accepted by the DataFrame constructor regardless of how a given
    # pandas version treats dict views.
    rows = list(Counter(tups).items())
    df = (
        pd.DataFrame(rows, columns=['bigram', 'count'])
        .sort_values(by='count', ascending=False)
        .reset_index(drop=True)
    )
    return df
def counts(inp,n,unit='sound'):
    """Count n-grams over sounds within each string, or over whole strings.

    Args:
        inp: Sequence of transcription strings.
        n: Size of the n-grams to build.
        unit: 'sound' builds n-grams from the tokens of each string
            separately (tokens come from tokenise(), so n-grams never span
            two strings); 'word' builds n-grams over the strings of inp
            themselves.

    Returns:
        The DataFrame produced by count_ngrams() for the collected n-grams.

    Raises:
        ValueError: If unit is neither 'sound' nor 'word'.
    """
    if unit == 'sound':
        # Build the n-grams of every tokenised string and flatten them into
        # one list in a single pass.
        tups = [gram for s in inp for gram in ngrams(tokenise(s), n)]
    elif unit == 'word':
        tups = list(ngrams(inp, n))
    else:
        # Without this branch an unknown unit would fail later with an
        # UnboundLocalError on tups.
        raise ValueError("unit must be 'sound' or 'word', got %r" % (unit,))
    return count_ngrams(tups, n)
发音二元组计数
# Bigram counts over the sound tokens inside each string of inp.
counts(inp,2,unit='sound')
# bigram count
# 0 (EY1, S #) 3
# 1 (R, EY1) 3
# 2 (B, R) 3
# 3 (# P, OY1) 1
# 4 (T, B) 1
# 5 (OW1, Z) 1
# 6 (L, OW1) 1
# 7 (# K, L) 1
# 8 (N, B) 1
# 9 (EH0, N) 1
# 10 (P, EH0) 1
# 11 (# OW1, P) 1
# 12 (F, T) 1
# 13 (OY1, N) 1
# 14 (EH1, F) 1
# 15 (# L, EH1) 1
# 16 (D, IY2 #) 1
# 17 (IY1, D) 1
# 18 (R, IY1) 1
# 19 (# TH, R) 1
# 20 (AE1, SH #) 1
# 21 (L, AE1) 1
# 22 (# S, L) 1
# 23 (N, T #) 1
# 24 (Z, B) 1
单词二元组计数
# Bigram counts over whole strings of inp (pairs of adjacent list entries).
counts(inp,2,unit='word')
# bigram count
# 0 (# P OY1 N T # , # S L AE1 SH # ) 1
# 1 (# S L AE1 SH # , # TH R IY1 D IY2 # ) 1
# 2 (# TH R IY1 D IY2 # , # L EH1 F T B R EY1 S # ) 1
# 3 (# L EH1 F T B R EY1 S # , # OW1 P EH0 N B R E... 1
# 4 (# OW1 P EH0 N B R EY1 S # , # K L OW1 Z B R E... 1