筛选至少包含一个数字的 n-gram
Select n-grams which contain at least one number
我有一个 n-gram 列表
('allo', 'stesso', 'modo', 'dell’italia,', 'che')
('stesso', 'modo', 'dell’italia,', 'che', 'sta')
('modo', 'dell’italia,', 'che', 'sta', 'già')
('dell’italia,', 'che', 'sta', 'già', 'pensando')
('che', 'sta', 'già', 'pensando', 'alla')
('sta', 'già', 'pensando', 'alla', 'riapertura')
('soli', '2.900,', 'contando', 'un', 'crollo')
('2.900,', 'contando', 'un', 'crollo', 'del')
('contando', 'un', 'crollo', 'del', '99.9%')
('un', 'crollo', 'del', '99.9%', 'rispetto')
('che', 'prevede', '12,5', 'miliardi', 'di')
('prevede', '12,5', 'miliardi', 'di', 'dollari')
('12,5', 'miliardi', 'di', 'dollari', 'per')
...
它是用下面的代码创建的:
from nltk import ngrams
# Number of consecutive tokens in each n-gram.
n = 5
# Collects every n-gram produced from every string in `my_list`.
list_ngrams = []
# NOTE(review): `my_list` is not defined in this snippet; presumably an
# iterable of strings (sentences) -- confirm against the caller.
for i in my_list:
    # Tokenize on whitespace, then let nltk build the n-token windows.
    grams = ngrams(i.split(), n)
    for gram in grams:
        print(gram)
        list_ngrams.append(gram)
我只想筛选出至少包含一个数字的 n-gram,例如:
('soli', '2.900,', 'contando', 'un', 'crollo')
('2.900,', 'contando', 'un', 'crollo', 'del')
('contando', 'un', 'crollo', 'del', '99.9%')
('un', 'crollo', 'del', '99.9%', 'rispetto')
('che', 'prevede', '12,5', 'miliardi', 'di')
('prevede', '12,5', 'miliardi', 'di', 'dollari')
('12,5', 'miliardi', 'di', 'dollari', 'per')
你能帮我筛选吗?
你可以这样做:
# Sample 5-grams: the first six contain no digits, the last seven do.
l = [
    ('allo', 'stesso', 'modo', 'dell’italia,', 'che'),
    ('stesso', 'modo', 'dell’italia,', 'che', 'sta'),
    ('modo', 'dell’italia,', 'che', 'sta', 'già'),
    ('dell’italia,', 'che', 'sta', 'già', 'pensando'),
    ('che', 'sta', 'già', 'pensando', 'alla'),
    ('sta', 'già', 'pensando', 'alla', 'riapertura'),
    ('soli', '2.900,', 'contando', 'un', 'crollo'),
    ('2.900,', 'contando', 'un', 'crollo', 'del'),
    ('contando', 'un', 'crollo', 'del', '99.9%'),
    ('un', 'crollo', 'del', '99.9%', 'rispetto'),
    ('che', 'prevede', '12,5', 'miliardi', 'di'),
    ('prevede', '12,5', 'miliardi', 'di', 'dollari'),
    ('12,5', 'miliardi', 'di', 'dollari', 'per'),
]

# Keep a tuple when any character of any of its tokens is a digit.
# The check is per character because tokens such as '2.900,' or '99.9%'
# also carry punctuation, so str.isdigit() on the whole token is False.
l2 = [
    gram
    for gram in l
    if any(char.isdigit() for token in gram for char in token)
]
print(l2)
输出:
[('soli', '2.900,', 'contando', 'un', 'crollo'), ('2.900,', 'contando', 'un', 'crollo', 'del'), ('contando', 'un', 'crollo', 'del', '99.9%'), ('un', 'crollo', 'del', '99.9%', 'rispetto'), ('che', 'prevede', '12,5', 'miliardi', 'di'), ('prevede', '12,5', 'miliardi', 'di', 'dollari'), ('12,5', 'miliardi', 'di', 'dollari', 'per')]
这会选出所有至少有一个元素包含 0–9 范围内字符的元组。
如果需要更严格的匹配,请修改正则表达式:
import re
def filter_ngrams(ngrams):
    """Return the n-grams that contain at least one digit character.

    Args:
        ngrams: Iterable of n-grams, each an iterable of string tokens.

    Returns:
        A list, in input order, of the n-grams in which at least one
        token has a digit anywhere in it.
    """
    # re.search scans the whole token; re.match only tests its start and
    # would miss tokens such as '€12' that merely contain a digit.
    return [ngram for ngram in ngrams
            if any(re.search(r'\d', word) for word in ngram)]
# NOTE(review): the only visible binding of `ngrams` in this snippet is the
# `from nltk import ngrams` import; presumably the argument should be the
# list of n-gram tuples (e.g. `list_ngrams`) -- confirm.
print(filter_ngrams(ngrams))
输出:
[('soli', '2.900,', 'contando', 'un', 'crollo'), ('2.900,', 'contando', 'un', 'crollo', 'del'), ('contando', 'un', 'crollo', 'del', '99.9%'), ('un', 'crollo', 'del', '99.9%', 'rispetto'), ('che', 'prevede', '12,5', 'miliardi', 'di'), ('prevede', '12,5', 'miliardi', 'di', 'dollari'), ('12,5', 'miliardi', 'di', 'dollari', 'per')]
我有一个 n-gram 列表
('allo', 'stesso', 'modo', 'dell’italia,', 'che')
('stesso', 'modo', 'dell’italia,', 'che', 'sta')
('modo', 'dell’italia,', 'che', 'sta', 'già')
('dell’italia,', 'che', 'sta', 'già', 'pensando')
('che', 'sta', 'già', 'pensando', 'alla')
('sta', 'già', 'pensando', 'alla', 'riapertura')
('soli', '2.900,', 'contando', 'un', 'crollo')
('2.900,', 'contando', 'un', 'crollo', 'del')
('contando', 'un', 'crollo', 'del', '99.9%')
('un', 'crollo', 'del', '99.9%', 'rispetto')
('che', 'prevede', '12,5', 'miliardi', 'di')
('prevede', '12,5', 'miliardi', 'di', 'dollari')
('12,5', 'miliardi', 'di', 'dollari', 'per')
...
它是用下面的代码创建的:
from nltk import ngrams
# Number of consecutive tokens in each n-gram.
n = 5
# Collects every n-gram produced from every string in `my_list`.
list_ngrams = []
# NOTE(review): `my_list` is not defined in this snippet; presumably an
# iterable of strings (sentences) -- confirm against the caller.
for i in my_list:
    # Tokenize on whitespace, then let nltk build the n-token windows.
    grams = ngrams(i.split(), n)
    for gram in grams:
        print(gram)
        list_ngrams.append(gram)
我只想筛选出至少包含一个数字的 n-gram,例如:
('soli', '2.900,', 'contando', 'un', 'crollo')
('2.900,', 'contando', 'un', 'crollo', 'del')
('contando', 'un', 'crollo', 'del', '99.9%')
('un', 'crollo', 'del', '99.9%', 'rispetto')
('che', 'prevede', '12,5', 'miliardi', 'di')
('prevede', '12,5', 'miliardi', 'di', 'dollari')
('12,5', 'miliardi', 'di', 'dollari', 'per')
你能帮我筛选吗?
你可以这样做:
# Sample 5-grams: the first six contain no digits, the last seven do.
l = [
    ('allo', 'stesso', 'modo', 'dell’italia,', 'che'),
    ('stesso', 'modo', 'dell’italia,', 'che', 'sta'),
    ('modo', 'dell’italia,', 'che', 'sta', 'già'),
    ('dell’italia,', 'che', 'sta', 'già', 'pensando'),
    ('che', 'sta', 'già', 'pensando', 'alla'),
    ('sta', 'già', 'pensando', 'alla', 'riapertura'),
    ('soli', '2.900,', 'contando', 'un', 'crollo'),
    ('2.900,', 'contando', 'un', 'crollo', 'del'),
    ('contando', 'un', 'crollo', 'del', '99.9%'),
    ('un', 'crollo', 'del', '99.9%', 'rispetto'),
    ('che', 'prevede', '12,5', 'miliardi', 'di'),
    ('prevede', '12,5', 'miliardi', 'di', 'dollari'),
    ('12,5', 'miliardi', 'di', 'dollari', 'per'),
]

# Keep a tuple when any character of any of its tokens is a digit.
# The check is per character because tokens such as '2.900,' or '99.9%'
# also carry punctuation, so str.isdigit() on the whole token is False.
l2 = [
    gram
    for gram in l
    if any(char.isdigit() for token in gram for char in token)
]
print(l2)
输出:
[('soli', '2.900,', 'contando', 'un', 'crollo'), ('2.900,', 'contando', 'un', 'crollo', 'del'), ('contando', 'un', 'crollo', 'del', '99.9%'), ('un', 'crollo', 'del', '99.9%', 'rispetto'), ('che', 'prevede', '12,5', 'miliardi', 'di'), ('prevede', '12,5', 'miliardi', 'di', 'dollari'), ('12,5', 'miliardi', 'di', 'dollari', 'per')]
这会选出所有至少有一个元素包含 0–9 范围内字符的元组。
如果需要更严格的匹配,请修改正则表达式:
import re
def filter_ngrams(ngrams):
    """Return the n-grams that contain at least one digit character.

    Args:
        ngrams: Iterable of n-grams, each an iterable of string tokens.

    Returns:
        A list, in input order, of the n-grams in which at least one
        token has a digit anywhere in it.
    """
    # re.search scans the whole token; re.match only tests its start and
    # would miss tokens such as '€12' that merely contain a digit.
    return [ngram for ngram in ngrams
            if any(re.search(r'\d', word) for word in ngram)]
# NOTE(review): the only visible binding of `ngrams` in this snippet is the
# `from nltk import ngrams` import; presumably the argument should be the
# list of n-gram tuples (e.g. `list_ngrams`) -- confirm.
print(filter_ngrams(ngrams))
输出:
[('soli', '2.900,', 'contando', 'un', 'crollo'), ('2.900,', 'contando', 'un', 'crollo', 'del'), ('contando', 'un', 'crollo', 'del', '99.9%'), ('un', 'crollo', 'del', '99.9%', 'rispetto'), ('che', 'prevede', '12,5', 'miliardi', 'di'), ('prevede', '12,5', 'miliardi', 'di', 'dollari'), ('12,5', 'miliardi', 'di', 'dollari', 'per')]