使用 Browns Corpus NLTK 的条件频率分布 Python
Conditional Frequency Distribution using Browns Corpus NLTK Python
我正在尝试统计以 'ing' 或 'ed' 结尾的单词。需要计算条件频率分布,其中条件为 ['government', 'hobbies'],事件为 'ing' 或 'ed',并将该条件频率分布存储在变量 inged_cfd 中。
下面是我的代码:
from nltk.corpus import brown
import nltk
# (genre, lower-cased word) for every Brown word whose ORIGINAL spelling ends
# in 'ing' or 'ed'. The suffix test is case-sensitive and stop words are kept.
genre_word = [
    (genre, token.lower())
    for genre in ['government', 'hobbies']
    for token in brown.words(categories=genre)
    if token.endswith(('ing', 'ed'))
]

# Collapse each word to its suffix. Every word kept above ends in exactly one
# of the two suffixes, so anything that is not 'ing' must be 'ed'.
genre_word_list = [
    [genre, 'ing' if word.endswith('ing') else 'ed']
    for genre, word in genre_word
]

# Conditions are the genres; samples are the two suffixes.
inged_cfd = nltk.ConditionalFreqDist(genre_word_list)
inged_cfd.tabulate(conditions=['government', 'hobbies'], samples=['ed', 'ing'])
我想以表格格式输出,使用上面的代码我得到的输出为:-
ed ing
government 2507 1605
hobbies 2561 2262
而预期(正确)的输出应该是:
ed ing
government 2507 1474
hobbies 2561 2169
请解决我的问题,并帮助我获得准确的输出。
需要排除停用词。此外,在检查单词结尾(后缀)之前,应先将单词转换为小写。可运行的代码如下:
from nltk.corpus import brown
from nltk.corpus import stopwords
import nltk  # needed for nltk.ConditionalFreqDist; the imports above only bind corpus readers

# Genres (conditions) to analyse, as stated in the question.
cfdconditions = ['government', 'hobbies']

# A set gives constant-time membership tests for the per-token stop-word check.
stop_words = set(stopwords.words('english'))

# Build one (genre, suffix) pair per non-stop-word ending in 'ing' or 'ed'.
# Words are lower-cased BEFORE both the stop-word and the suffix test so that
# capitalised tokens are treated like their lower-case forms.
genre_suffix_pairs = []
for genre in cfdconditions:
    for word in brown.words(categories=genre):
        lowered = word.lower()
        if lowered in stop_words:
            # Stop words such as 'being' or 'during' must not be counted.
            continue
        if lowered.endswith('ing'):
            genre_suffix_pairs.append((genre, 'ing'))
        elif lowered.endswith('ed'):
            genre_suffix_pairs.append((genre, 'ed'))

inged_cfd = nltk.ConditionalFreqDist(genre_suffix_pairs)
inged_cfd.tabulate(conditions=cfdconditions, samples=['ed', 'ing'])
在这两个地方使用同一个 cfdconditions 变量会产生问题。实际上,在 Python 中一切都是以对象引用的方式传递的,因此当您第一次把 cfdconditions 传给 cdev_cfd.tabulate 时,它可能会被修改;下次再传递时,传入的就是被修改后的列表。更好的做法是再初始化一个新列表,并把这个新列表传给第二次调用。
这是我修改后的代码:
from nltk.corpus import brown
from nltk.corpus import stopwords
def calculateCFD(cfdconditions, cfdevents):
    """Print two conditional frequency tables for the Brown corpus.

    The first table counts, per genre, the lower-cased alphabetic words that
    are not English stop words and shows the columns named in ``cfdevents``.
    The second table counts, per genre, the non-stop-words ending in 'ing'
    or 'ed' and shows the columns 'ed' and 'ing'.

    Args:
        cfdconditions: iterable of Brown corpus category names (genres).
        cfdevents: list of words to show as columns in the first table.

    Returns:
        None. Both tables are printed by ``tabulate``.
    """
    import nltk  # local import: the file-level imports do not bind the name `nltk`

    # A set makes the per-token stop-word test constant-time.
    stop_words = set(stopwords.words('english'))
    # Materialise once so the conditions can be iterated and copied safely.
    conditions = list(cfdconditions)

    word_pairs = []
    suffix_pairs = []
    # Single pass over the corpus feeds both distributions.
    for genre in conditions:
        for word in brown.words(categories=genre):
            lowered = word.lower()
            # Compare the lower-cased form so capitalised stop words
            # ("The", "And", ...) are excluded as well.
            if lowered in stop_words:
                continue
            if word.isalpha():
                word_pairs.append((genre, lowered))
            if lowered.endswith('ing'):
                suffix_pairs.append((genre, 'ing'))
            elif lowered.endswith('ed'):
                suffix_pairs.append((genre, 'ed'))

    cdv_cfd = nltk.ConditionalFreqDist(word_pairs)
    # Each tabulate call receives its own list copy, so a change made to the
    # list during one call cannot affect the other.
    cdv_cfd.tabulate(conditions=list(conditions), samples=cfdevents)

    inged_cfd = nltk.ConditionalFreqDist(suffix_pairs)
    inged_cfd.tabulate(conditions=list(conditions), samples=['ed', 'ing'])
希望对您有所帮助!
预期输出为 -
many years
fiction 29 44
adventure 24 32
science_fiction 11 16
ed ing
fiction 2943 1767
adventure 3281 1844
science_fiction 574 293
和
good bad better
adventure 39 9 30
fiction 60 17 27
science_fiction 14 1 4
mystery 45 13 29
ed ing
adventure 3281 1844
fiction 2943 1767
science_fiction 574 293
mystery 2382 1374
我用的是下面这种方法,代码行数更少,速度也更快:
from nltk.corpus import brown
from nltk.corpus import stopwords
import nltk  # needed for nltk.ConditionalFreqDist; the imports above only bind corpus readers

# NOTE: `cfdconditions` (genres) and `cfdevents` (words to show as columns)
# are not defined in this snippet; the surrounding code must supply them,
# e.g. as the parameters of an enclosing calculateCFD(cfdconditions, cfdevents).
stop_words = set(stopwords.words('english'))

# Per-genre counts of every lower-cased word that is not a stop word.
cdev_cfd = nltk.ConditionalFreqDist(
    (genre, word.lower())
    for genre in cfdconditions
    for word in brown.words(categories=genre)
    if word.lower() not in stop_words
)

# Per-genre counts of the suffix ('ing' or 'ed') of every non-stop-word that
# ends in one of them; the word is lower-cased before the test.
inged_cfd = nltk.ConditionalFreqDist(
    (genre, 'ing' if word.lower().endswith('ing') else 'ed')
    for genre in cfdconditions
    for word in brown.words(categories=genre)
    if word.lower() not in stop_words and word.lower().endswith(('ing', 'ed'))
)

# The same name is used for the conditions everywhere; the original mixed
# `cfdconditions` and an undefined `conditions`.
cdev_cfd.tabulate(conditions=cfdconditions, samples=cfdevents)
inged_cfd.tabulate(conditions=cfdconditions, samples=['ed', 'ing'])
from nltk.corpus import stopwords,brown
def calculateCFD(cfdconditions, cfdevents):
    """Print word and suffix frequency tables for the given Brown genres.

    Args:
        cfdconditions: list of Brown corpus category names (genres).
        cfdevents: list of words to show as columns in the first table.

    Returns:
        A 2-tuple holding the results of the two ``tabulate`` calls.
        ``tabulate`` prints its table, so both items are expected to be
        ``None``; the tuple is kept so existing callers keep working.
    """
    import nltk  # local import: the file-level import only binds stopwords and brown

    # Fixed: the original line had a pasted "enter code here" fragment inside
    # `stopwords.words`, which made it a syntax error.
    stop_words = set(stopwords.words("english"))

    # First distribution: every lower-cased non-stop-word, per genre.
    word_pairs = [
        (genre, word.lower())
        for genre in cfdconditions
        for word in brown.words(categories=genre)
        if word.lower() not in stop_words
    ]
    cfd1 = nltk.ConditionalFreqDist(word_pairs)
    cfd1_tabulate = cfd1.tabulate(conditions=cfdconditions, samples=cfdevents)

    # Second distribution: the suffix of every non-stop-word that ends in
    # 'ed' or 'ing'. Only such words pass the filter, so a word that does not
    # end in 'ed' must end in 'ing'.
    suffix_pairs = [
        (genre, "ed" if word.lower().endswith("ed") else "ing")
        for genre in cfdconditions
        for word in brown.words(categories=genre)
        if word.lower() not in stop_words
        and word.lower().endswith(("ed", "ing"))
    ]
    cfd2 = nltk.ConditionalFreqDist(suffix_pairs)
    cfd2_tabulate = cfd2.tabulate(conditions=cfdconditions, samples=["ed", "ing"])

    return cfd1_tabulate, cfd2_tabulate
我正在尝试统计以 'ing' 或 'ed' 结尾的单词。需要计算条件频率分布,其中条件为 ['government', 'hobbies'],事件为 'ing' 或 'ed',并将该条件频率分布存储在变量 inged_cfd 中。
下面是我的代码:
from nltk.corpus import brown
import nltk
# (genre, lower-cased word) for every Brown word whose ORIGINAL spelling ends
# in 'ing' or 'ed'. The suffix test is case-sensitive and stop words are kept.
genre_word = [
    (genre, token.lower())
    for genre in ['government', 'hobbies']
    for token in brown.words(categories=genre)
    if token.endswith(('ing', 'ed'))
]

# Collapse each word to its suffix. Every word kept above ends in exactly one
# of the two suffixes, so anything that is not 'ing' must be 'ed'.
genre_word_list = [
    [genre, 'ing' if word.endswith('ing') else 'ed']
    for genre, word in genre_word
]

# Conditions are the genres; samples are the two suffixes.
inged_cfd = nltk.ConditionalFreqDist(genre_word_list)
inged_cfd.tabulate(conditions=['government', 'hobbies'], samples=['ed', 'ing'])
我想以表格格式输出,使用上面的代码我得到的输出为:-
ed ing
government 2507 1605
hobbies 2561 2262
而预期(正确)的输出应该是:
ed ing
government 2507 1474
hobbies 2561 2169
请解决我的问题,并帮助我获得准确的输出。
需要排除停用词。此外,在检查单词结尾(后缀)之前,应先将单词转换为小写。可运行的代码如下:
from nltk.corpus import brown
from nltk.corpus import stopwords
import nltk  # needed for nltk.ConditionalFreqDist; the imports above only bind corpus readers

# Genres (conditions) to analyse, as stated in the question.
cfdconditions = ['government', 'hobbies']

# A set gives constant-time membership tests for the per-token stop-word check.
stop_words = set(stopwords.words('english'))

# Build one (genre, suffix) pair per non-stop-word ending in 'ing' or 'ed'.
# Words are lower-cased BEFORE both the stop-word and the suffix test so that
# capitalised tokens are treated like their lower-case forms.
genre_suffix_pairs = []
for genre in cfdconditions:
    for word in brown.words(categories=genre):
        lowered = word.lower()
        if lowered in stop_words:
            # Stop words such as 'being' or 'during' must not be counted.
            continue
        if lowered.endswith('ing'):
            genre_suffix_pairs.append((genre, 'ing'))
        elif lowered.endswith('ed'):
            genre_suffix_pairs.append((genre, 'ed'))

inged_cfd = nltk.ConditionalFreqDist(genre_suffix_pairs)
inged_cfd.tabulate(conditions=cfdconditions, samples=['ed', 'ing'])
在这两个地方使用同一个 cfdconditions 变量会产生问题。实际上,在 Python 中一切都是以对象引用的方式传递的,因此当您第一次把 cfdconditions 传给 cdev_cfd.tabulate 时,它可能会被修改;下次再传递时,传入的就是被修改后的列表。更好的做法是再初始化一个新列表,并把这个新列表传给第二次调用。
这是我修改后的代码:
from nltk.corpus import brown
from nltk.corpus import stopwords
def calculateCFD(cfdconditions, cfdevents):
    """Print two conditional frequency tables for the Brown corpus.

    The first table counts, per genre, the lower-cased alphabetic words that
    are not English stop words and shows the columns named in ``cfdevents``.
    The second table counts, per genre, the non-stop-words ending in 'ing'
    or 'ed' and shows the columns 'ed' and 'ing'.

    Args:
        cfdconditions: iterable of Brown corpus category names (genres).
        cfdevents: list of words to show as columns in the first table.

    Returns:
        None. Both tables are printed by ``tabulate``.
    """
    import nltk  # local import: the file-level imports do not bind the name `nltk`

    # A set makes the per-token stop-word test constant-time.
    stop_words = set(stopwords.words('english'))
    # Materialise once so the conditions can be iterated and copied safely.
    conditions = list(cfdconditions)

    word_pairs = []
    suffix_pairs = []
    # Single pass over the corpus feeds both distributions.
    for genre in conditions:
        for word in brown.words(categories=genre):
            lowered = word.lower()
            # Compare the lower-cased form so capitalised stop words
            # ("The", "And", ...) are excluded as well.
            if lowered in stop_words:
                continue
            if word.isalpha():
                word_pairs.append((genre, lowered))
            if lowered.endswith('ing'):
                suffix_pairs.append((genre, 'ing'))
            elif lowered.endswith('ed'):
                suffix_pairs.append((genre, 'ed'))

    cdv_cfd = nltk.ConditionalFreqDist(word_pairs)
    # Each tabulate call receives its own list copy, so a change made to the
    # list during one call cannot affect the other.
    cdv_cfd.tabulate(conditions=list(conditions), samples=cfdevents)

    inged_cfd = nltk.ConditionalFreqDist(suffix_pairs)
    inged_cfd.tabulate(conditions=list(conditions), samples=['ed', 'ing'])
希望对您有所帮助!
预期输出为 -
many years
fiction 29 44
adventure 24 32
science_fiction 11 16
ed ing
fiction 2943 1767
adventure 3281 1844
science_fiction 574 293
和
good bad better
adventure 39 9 30
fiction 60 17 27
science_fiction 14 1 4
mystery 45 13 29
ed ing
adventure 3281 1844
fiction 2943 1767
science_fiction 574 293
mystery 2382 1374
我用的是下面这种方法,代码行数更少,速度也更快:
from nltk.corpus import brown
from nltk.corpus import stopwords
import nltk  # needed for nltk.ConditionalFreqDist; the imports above only bind corpus readers

# NOTE: `cfdconditions` (genres) and `cfdevents` (words to show as columns)
# are not defined in this snippet; the surrounding code must supply them,
# e.g. as the parameters of an enclosing calculateCFD(cfdconditions, cfdevents).
stop_words = set(stopwords.words('english'))

# Per-genre counts of every lower-cased word that is not a stop word.
cdev_cfd = nltk.ConditionalFreqDist(
    (genre, word.lower())
    for genre in cfdconditions
    for word in brown.words(categories=genre)
    if word.lower() not in stop_words
)

# Per-genre counts of the suffix ('ing' or 'ed') of every non-stop-word that
# ends in one of them; the word is lower-cased before the test.
inged_cfd = nltk.ConditionalFreqDist(
    (genre, 'ing' if word.lower().endswith('ing') else 'ed')
    for genre in cfdconditions
    for word in brown.words(categories=genre)
    if word.lower() not in stop_words and word.lower().endswith(('ing', 'ed'))
)

# The same name is used for the conditions everywhere; the original mixed
# `cfdconditions` and an undefined `conditions`.
cdev_cfd.tabulate(conditions=cfdconditions, samples=cfdevents)
inged_cfd.tabulate(conditions=cfdconditions, samples=['ed', 'ing'])
from nltk.corpus import stopwords,brown
def calculateCFD(cfdconditions, cfdevents):
    """Print word and suffix frequency tables for the given Brown genres.

    Args:
        cfdconditions: list of Brown corpus category names (genres).
        cfdevents: list of words to show as columns in the first table.

    Returns:
        A 2-tuple holding the results of the two ``tabulate`` calls.
        ``tabulate`` prints its table, so both items are expected to be
        ``None``; the tuple is kept so existing callers keep working.
    """
    import nltk  # local import: the file-level import only binds stopwords and brown

    # Fixed: the original line had a pasted "enter code here" fragment inside
    # `stopwords.words`, which made it a syntax error.
    stop_words = set(stopwords.words("english"))

    # First distribution: every lower-cased non-stop-word, per genre.
    word_pairs = [
        (genre, word.lower())
        for genre in cfdconditions
        for word in brown.words(categories=genre)
        if word.lower() not in stop_words
    ]
    cfd1 = nltk.ConditionalFreqDist(word_pairs)
    cfd1_tabulate = cfd1.tabulate(conditions=cfdconditions, samples=cfdevents)

    # Second distribution: the suffix of every non-stop-word that ends in
    # 'ed' or 'ing'. Only such words pass the filter, so a word that does not
    # end in 'ed' must end in 'ing'.
    suffix_pairs = [
        (genre, "ed" if word.lower().endswith("ed") else "ing")
        for genre in cfdconditions
        for word in brown.words(categories=genre)
        if word.lower() not in stop_words
        and word.lower().endswith(("ed", "ing"))
    ]
    cfd2 = nltk.ConditionalFreqDist(suffix_pairs)
    cfd2_tabulate = cfd2.tabulate(conditions=cfdconditions, samples=["ed", "ing"])

    return cfd1_tabulate, cfd2_tabulate