Stopwords not dropping words in NLTK - same as original text
I tokenized the sentences after removing special characters etc. The stopword removal returns the text without dropping any of the filler words.
import nltk
import re
import string
from nltk.corpus import stopwords

""" Function to remove special characters etc."""
def remove_characters_before_tokenization(sentence, keep_apostrophes=False):
    sentence = sentence.strip()
    if keep_apostrophes:
        PATTERN = r'[?|$|&|*|%|@|(|)|~]'
        filtered_sentence = re.sub(PATTERN, r'', sentence)
    else:
        PATTERN = r'[^a-zA-Z0-9 ]'
        filtered_sentence = re.sub(PATTERN, r'', sentence)
    return filtered_sentence

""" Generic function to word tokenize"""
def tokenize_text(text):
    sentences = nltk.sent_tokenize(text)
    word_tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
    return word_tokens

Sample = open("Sample.txt", "r")  # open a text file
cleaned_text = remove_characters_before_tokenization(Sample.read())
words = tokenize_text(cleaned_text)  # tokenised words without special characters

""" Function to remove stopwords"""
def remove_stopwords(tokens):
    stopword_list = nltk.corpus.stopwords.words('english')
    for token in tokens:
        if not token in stopword_list:
            filtered_tokens = token
    return filtered_tokens

stop_removed = remove_stopwords(words)
print(stop_removed)
The output stop_removed is the same as words. I think I made a mistake in the for token in tokens loop, but I am not sure how to correct it.
The line

filtered_tokens = token

only stores a single token. You need a data structure that stores a collection of items, e.g. a nested list (one list of tokens per sentence, matching the shape that tokenize_text returns):
stop = set(stopwords.words('english'))

def remove_stopwords(text):
    filtered_text = [[tok for tok in sent if tok not in stop] for sent in text]
    return filtered_text
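For completeness, here is a minimal end-to-end sketch of the fixed pipeline (assuming the NLTK punkt and stopwords data have been downloaded, e.g. via nltk.download('punkt') and nltk.download('stopwords')); the sample sentence is hypothetical:

import nltk
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))

def remove_stopwords(text):
    # drop stopwords sentence by sentence, keeping the nested-list shape
    return [[tok for tok in sent if tok not in stop] for sent in text]

# hypothetical sample text, tokenised the same way as in the question
words = [nltk.word_tokenize(sent)
         for sent in nltk.sent_tokenize("this is a sample sentence and it has some stopwords")]
print(words)
# [['this', 'is', 'a', 'sample', 'sentence', 'and', 'it', 'has', 'some', 'stopwords']]
print(remove_stopwords(words))
# [['sample', 'sentence', 'stopwords']]

Note that the NLTK stopword list is lowercase, so tokens are usually lowercased before the membership test (otherwise a token like "The" would pass through).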