如何分隔单词列表中的字符以查找二元组频率

Question

我试图在大约 10,000 个单词的列表中统计每个发音序列的二元组频率。到目前为止，我已经能够得到二元组频率，但它统计的是列表中相邻两个单词组成的二元组，而不是每个单词内部相邻的发音。有什么方法可以指定我要统计的单位吗？

这是我的 python 代码：

from collections import Counter
import pandas as pd  # "import pandas from pd" is not valid Python syntax

# Read the CSV file and keep its "Transcription" column for later processing.
CMU_data = pd.read_csv("CMU.csv")
transcript = CMU_data["Transcription"]


def converter(x):
    """Return a ``pd.Series`` as a tuple of its values; pass anything else through unchanged."""
    return tuple(x.values) if isinstance(x, pd.Series) else x

# Run every entry through converter and keep each distinct value once.
transcript2 = transcript.apply(converter).unique()
print(transcript2)


# Finding bigrams.
# NOTE: zip pairs each element of data with the element that follows it, so
# the units being counted are neighbouring entries of the sequence, not the
# space-separated symbols inside a single entry.
data = transcript2
bigrams = Counter(left + right for left, right in zip(data, data[1:]))
for bigram, count in bigrams.most_common():
    print(bigram, '=', count)

这是当前输出的示例（井号 # 表示单词边界）：

# P OY1 N T # # S L AE1 SH #  = 1
# S L AE1 SH # # TH R IY1 D IY2 #  = 1
# TH R IY1 D IY2 # # K OW1 L AH0 N #  = 1
# K OW1 L AH0 N # # S EH1 M IY0 K OW1 L AH0 N #  = 1
# S EH1 M IY0 K OW1 L AH0 N # # S EH1 M IH0 K OW2 L AH0 N #  = 1
# S EH1 M IH0 K OW2 L AH0 N # # K W EH1 S CH AH0 N M AA1 R K #  = 1
# K W EH1 S CH AH0 N M AA1 R K # # AH0 #  = 1
# AH0 # # EY1 #  = 1
# EY1 # # EY1 Z #  = 1
# EY1 Z # # EY1 F AO1 R T UW1 W AH1 N T UW1 EY1 T #  = 1
(...)

这是我输入的示例（在它被转换为数组时）：

['# P OY1 N T # ' '# S L AE1 SH # ' '# TH R IY1 D IY2 # ' ...
 '# L EH1 F T B R EY1 S # ' '# OW1 P EH0 N B R EY1 S # '
 '# K L OW1 Z B R EY1 S # ']

我想要得到如下所示的输出：

TH R = 70
IY1 D = 100
IY2 # = 100 
# K  = 500
OW1 L = 100
AH0 N #  = 200
N # = 500

Answer 1

这是一种方法：

from nltk.util import ngrams 
from collections import Counter
import pandas as pd


# Sample transcriptions: space-separated symbols enclosed by '#' markers,
# each string ending with a trailing space.
inp = ['# P OY1 N T # ', '# S L AE1 SH # ', '# TH R IY1 D IY2 # ',
       '# L EH1 F T B R EY1 S # ', '# OW1 P EH0 N B R EY1 S # ',
       '# K L OW1 Z B R EY1 S # ']

def tokenise(s):
    """Split a transcription into tokens, gluing the edge markers to their neighbours.

    The string is split on whitespace. The first two tokens are joined into
    one token (e.g. ``'# P'``) and, afterwards, the last two remaining tokens
    are joined into one (e.g. ``'T #'``).

    Args:
        s: Whitespace-separated transcription such as ``'# P OY1 N T # '``.

    Returns:
        A list of token strings. Inputs with fewer than three tokens no longer
        raise ``IndexError``: they collapse to a single joined token, and an
        empty or all-whitespace string yields an empty list.
    """
    # split() without an argument trims the ends and ignores repeated
    # whitespace, so no empty-string tokens are produced.
    toks = s.split()
    # Join starting # with second element.
    if len(toks) >= 2:
        toks[:2] = [' '.join(toks[:2])]
    # Join penultimate element with end # (skipped when only one token is left).
    if len(toks) >= 2:
        toks[-2:] = [' '.join(toks[-2:])]
    return toks

def count_ngrams(tups,n):
    """Tabulate how often each n-gram occurs.

    Args:
        tups: Iterable of hashable n-grams (tuples).
        n: Size of the n-grams; accepted but not used by this function.

    Returns:
        A DataFrame with columns ``'bigram'`` and ``'count'``, sorted by
        ``'count'`` in descending order and re-indexed from 0.
    """
    tally = Counter(tups)
    table = pd.DataFrame(list(tally.items()), columns=['bigram', 'count'])
    ranked = table.sort_values(by='count', ascending=False)
    return ranked.reset_index(drop=True)

def counts(inp,n,unit='sound'):
    """Count n-grams over a list of transcriptions.

    Args:
        inp: Sequence of transcription strings.
        n: Size of the n-grams to build.
        unit: ``'sound'`` builds n-grams from the tokens inside each
            transcription (via ``tokenise``) and never across two
            transcriptions; ``'word'`` treats each transcription as a single
            token and builds n-grams over the sequence itself.

    Returns:
        Whatever ``count_ngrams`` returns for the collected n-gram tuples.

    Raises:
        ValueError: If ``unit`` is neither ``'sound'`` nor ``'word'``.
    """
    # Validate up front: an unknown unit used to leave `tups` unassigned and
    # surface as an UnboundLocalError at the return statement.
    if unit not in ('sound', 'word'):
        raise ValueError(
            "unit must be 'sound' or 'word', got {!r}".format(unit))

    if unit == 'sound':
        tokenised = [tokenise(s) for s in inp]
        # Build the n-grams per transcription and flatten them into one list.
        tups = [gram for toks in tokenised for gram in ngrams(toks, n)]
    else:
        tups = list(ngrams(inp, n))

    return count_ngrams(tups, n)

发音二元组计数

# Bigrams over the sound tokens inside each transcription.
counts(inp,2,unit='sound')

#          bigram  count
# 0    (EY1, S #)      3
# 1      (R, EY1)      3
# 2        (B, R)      3
# 3    (# P, OY1)      1
# 4        (T, B)      1
# 5      (OW1, Z)      1
# 6      (L, OW1)      1
# 7      (# K, L)      1
# 8        (N, B)      1
# 9      (EH0, N)      1
# 10     (P, EH0)      1
# 11   (# OW1, P)      1
# 12       (F, T)      1
# 13     (OY1, N)      1
# 14     (EH1, F)      1
# 15   (# L, EH1)      1
# 16   (D, IY2 #)      1
# 17     (IY1, D)      1
# 18     (R, IY1)      1
# 19    (# TH, R)      1
# 20  (AE1, SH #)      1
# 21     (L, AE1)      1
# 22     (# S, L)      1
# 23     (N, T #)      1
# 24       (Z, B)      1

单词二元组计数

# Bigrams over whole transcriptions: each element of inp is treated as one token.
counts(inp,2,unit='word')

#                                               bigram  count
# 0                  (# P OY1 N T # , # S L AE1 SH # )      1
# 1             (# S L AE1 SH # , # TH R IY1 D IY2 # )      1
# 2    (# TH R IY1 D IY2 # , # L EH1 F T B R EY1 S # )      1
# 3  (# L EH1 F T B R EY1 S # , # OW1 P EH0 N B R E...      1
# 4  (# OW1 P EH0 N B R EY1 S # , # K L OW1 Z B R E...      1

如何分隔单词列表中的字符以查找二元组频率

How to separate characters in a word list to find bigram frequency

python

frequency

dataframe

pandas