Python 中使用 NLTK 的短语的索引
concordance for a phrase using NLTK in Python
是否可以在 NLTK 中获取短语的索引?
import nltk
from nltk.corpus import PlaintextCorpusReader
# Directory holding the plain-text corpus files.
corpus_loc = "c://temp//text//"
# Regex selecting the corpus files. A raw string keeps "\." as a regex
# escape for a literal dot instead of an invalid string escape sequence.
files = r".*\.txt"
read_corpus = PlaintextCorpusReader(corpus_loc, files)
corpus = nltk.Text(read_corpus.words())
# NOTE(review): `test` is not used in this snippet, and TextCollection is
# given the directory path string rather than the corpus reader - confirm
# the intent before relying on it.
test = nltk.TextCollection(corpus_loc)
# Show the occurrences of the single word "claim" in context.
corpus.concordance("claim")
例如,上面的代码会返回:
on okay okay okay i can give you the claim number and my information and
decide on the shop okay okay so the claim number is xxxx - xx - xxxx got
现在如果我尝试 corpus.concordance("claim number"),它不起作用……我确实有代码可以通过 .partition() 方法再加上一些额外的处理来做到这一点……但我想知道是否可以用 concordance 来做同样的事情。
根据这个 issue,目前还不能用 concordance() 函数搜索多个单词。
如果您阅读了 @b3000 找到的那个 issue 下的讨论,会发现一件奇怪的事:实际上可以进行多词索引——但只能在图形化的索引工具中使用。可以这样启动该工具:
>>> from nltk.app import concordance
>>> concordance()
我拼凑出了下面这个解决方案……
def n_concordance_tokenised(text, phrase, left_margin=5, right_margin=5):
    """Return concordance lines for a multi-word phrase in a tokenised text.

    Each word of the phrase is looked up in an ``nltk.ConcordanceIndex``
    built over ``text.tokens`` (keyed by the lower-cased token). A phrase
    match is a position where every word of the phrase occurs at
    consecutive token offsets.

    Args:
        text: Object exposing a ``tokens`` sequence of strings
            (for example an ``nltk.Text``).
        phrase: Words to search for, separated by single spaces.
        left_margin: Number of context tokens to keep before the phrase.
        right_margin: Number of context tokens to keep after the phrase.

    Returns:
        A list with one string per match, ordered by position in the text.
        Each string is the matched phrase with its context tokens, every
        token followed by a single space.
    """
    # Concordance replication via
    # https://simplypython.wordpress.com/2014/03/14/saving-output-of-nltk-text-concordance/
    phrase_words = phrase.split(' ')
    index = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())

    # Rebase the offsets of the i-th phrase word to the start of the phrase
    # (offset - i). A position that survives in every rebased set is the
    # start of a full occurrence of the phrase.
    rebased = [
        {offset - i for offset in index.offsets(word)}
        for i, word in enumerate(phrase_words)
    ]
    # Sort so results follow text order; set iteration order is unspecified.
    starts = sorted(set.intersection(*rebased))

    outputs = []
    for start in starts:
        # Clamp the left edge so a match near the beginning of the text
        # does not produce a negative (wrap-around) slice index.
        begin = max(start - left_margin, 0)
        end = start + len(phrase_words) + right_margin
        outputs.append(''.join(token + ' ' for token in text.tokens[begin:end]))
    return outputs
def n_concordance(txt, phrase, left_margin=5, right_margin=5):
    """Tokenise raw text and return concordance lines for a phrase.

    Args:
        txt: Raw text string; it is split into tokens with
            ``nltk.word_tokenize``.
        phrase: Words to search for, separated by single spaces.
        left_margin: Number of context tokens to keep before the phrase.
        right_margin: Number of context tokens to keep after the phrase.

    Returns:
        Whatever ``n_concordance_tokenised`` returns for the tokenised text.
    """
    tokens = nltk.word_tokenize(txt)
    text = nltk.Text(tokens)
    # The call must be part of the return statement itself; a bare
    # ``return`` on its own line would hand back None and skip the search.
    return n_concordance_tokenised(
        text, phrase, left_margin=left_margin, right_margin=right_margin
    )
# Example call. NOTE(review): `text1` is not defined in this snippet -
# presumably a pre-loaded nltk.Text (e.g. from nltk.book); confirm.
n_concordance_tokenised(text1,'monstrous size')
>> [u'one was of a most monstrous size . ... This came towards ',
u'; for Whales of a monstrous size are oftentimes cast up dead ']
是否可以在 NLTK 中获取短语的索引?
import nltk
from nltk.corpus import PlaintextCorpusReader
# Directory holding the plain-text corpus files.
corpus_loc = "c://temp//text//"
# Regex selecting the corpus files. A raw string keeps "\." as a regex
# escape for a literal dot instead of an invalid string escape sequence.
files = r".*\.txt"
read_corpus = PlaintextCorpusReader(corpus_loc, files)
corpus = nltk.Text(read_corpus.words())
# NOTE(review): `test` is not used in this snippet, and TextCollection is
# given the directory path string rather than the corpus reader - confirm
# the intent before relying on it.
test = nltk.TextCollection(corpus_loc)
# Show the occurrences of the single word "claim" in context.
corpus.concordance("claim")
例如,上面的代码会返回:
on okay okay okay i can give you the claim number and my information and
decide on the shop okay okay so the claim number is xxxx - xx - xxxx got
现在如果我尝试 corpus.concordance("claim number"),它不起作用……我确实有代码可以通过 .partition() 方法再加上一些额外的处理来做到这一点……但我想知道是否可以用 concordance 来做同样的事情。
根据这个 issue,目前还不能用 concordance() 函数搜索多个单词。
如果您阅读了 @b3000 找到的那个 issue 下的讨论,会发现一件奇怪的事:实际上可以进行多词索引——但只能在图形化的索引工具中使用。可以这样启动该工具:
>>> from nltk.app import concordance
>>> concordance()
我拼凑出了下面这个解决方案……
def n_concordance_tokenised(text, phrase, left_margin=5, right_margin=5):
    """Return concordance lines for a multi-word phrase in a tokenised text.

    Each word of the phrase is looked up in an ``nltk.ConcordanceIndex``
    built over ``text.tokens`` (keyed by the lower-cased token). A phrase
    match is a position where every word of the phrase occurs at
    consecutive token offsets.

    Args:
        text: Object exposing a ``tokens`` sequence of strings
            (for example an ``nltk.Text``).
        phrase: Words to search for, separated by single spaces.
        left_margin: Number of context tokens to keep before the phrase.
        right_margin: Number of context tokens to keep after the phrase.

    Returns:
        A list with one string per match, ordered by position in the text.
        Each string is the matched phrase with its context tokens, every
        token followed by a single space.
    """
    # Concordance replication via
    # https://simplypython.wordpress.com/2014/03/14/saving-output-of-nltk-text-concordance/
    phrase_words = phrase.split(' ')
    index = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())

    # Rebase the offsets of the i-th phrase word to the start of the phrase
    # (offset - i). A position that survives in every rebased set is the
    # start of a full occurrence of the phrase.
    rebased = [
        {offset - i for offset in index.offsets(word)}
        for i, word in enumerate(phrase_words)
    ]
    # Sort so results follow text order; set iteration order is unspecified.
    starts = sorted(set.intersection(*rebased))

    outputs = []
    for start in starts:
        # Clamp the left edge so a match near the beginning of the text
        # does not produce a negative (wrap-around) slice index.
        begin = max(start - left_margin, 0)
        end = start + len(phrase_words) + right_margin
        outputs.append(''.join(token + ' ' for token in text.tokens[begin:end]))
    return outputs
def n_concordance(txt, phrase, left_margin=5, right_margin=5):
    """Tokenise raw text and return concordance lines for a phrase.

    Args:
        txt: Raw text string; it is split into tokens with
            ``nltk.word_tokenize``.
        phrase: Words to search for, separated by single spaces.
        left_margin: Number of context tokens to keep before the phrase.
        right_margin: Number of context tokens to keep after the phrase.

    Returns:
        Whatever ``n_concordance_tokenised`` returns for the tokenised text.
    """
    tokens = nltk.word_tokenize(txt)
    text = nltk.Text(tokens)
    # The call must be part of the return statement itself; a bare
    # ``return`` on its own line would hand back None and skip the search.
    return n_concordance_tokenised(
        text, phrase, left_margin=left_margin, right_margin=right_margin
    )
# Example call. NOTE(review): `text1` is not defined in this snippet -
# presumably a pre-loaded nltk.Text (e.g. from nltk.book); confirm.
n_concordance_tokenised(text1,'monstrous size')
>> [u'one was of a most monstrous size . ... This came towards ',
u'; for Whales of a monstrous size are oftentimes cast up dead ']