预处理推文,删除 @ 和 # ,删除停用词并从 python 中的列表列表中删除用户
preprocessing tweets, remove @ and # , eliminate stop words and remove user from list of list in python
我写了下面的代码,现在想重新做预处理:我已经把文本转换成小写,也写了一些代码来去掉停用词,但是没有生效。我还想删除 @ 和 # 并去掉用户名,你能帮帮我吗?
# Jupyter-notebook cell: download a corpus of English tweets and tokenize them.
! pip install wget
import wget
url = 'https://raw.githubusercontent.com/dirkhovy/NLPclass/master/data/tweets_en.txt'
wget.download(url, 'tweets_en.txt')
# One tweet per line; strip the trailing newline from each.
tweets = [line.strip() for line in open('tweets_en.txt', encoding='utf8')]
import spacy
from collections import Counter
# your code here
import itertools
# NOTE(review): the 'en' shortcut is deprecated in spaCy v3 — use 'en_core_web_sm' there.
nlp = spacy.load('en')
# Creates a list of lists of tokens (only the first 200 tweets are processed).
tokens = [[token.text for token in nlp(sentence)] for sentence in tweets[:200]]
print(tokens)
# Lower-case every token.
token_l=[[w.lower() for w in line] for line in tokens]
token_l[:1]
# Still to do: remove '#', stop words, user mentions, and '@'.
#remove #
#remove stop word
#remove user
#remove @
from nltk.corpus import stopwords
# BUG(review): this commented-out attempt scopes the filter wrong — `if w not in ...`
# is attached to the OUTER comprehension, where `w` is undefined; the condition
# belongs on the inner comprehension (`[w for w in line if w not in ...]`).
# filtered_words = [[w for w in line] for line in tokens if w not in # stopwords.words('english')]
始终尝试将代码组织成函数:函数是可重用、可读并且便于在循环中调用的。
from nltk.corpus import stopwords
import spacy, re

# NOTE(review): the 'en' shortcut is deprecated in spaCy v3 — install and load
# 'en_core_web_sm' there; kept as-is here to match the rest of this snippet.
nlp = spacy.load('en')

# Lower-cased stop words from every NLTK stop-word language.  A set gives O(1)
# membership tests in sanitize() instead of the O(n) list scan per token.
stop_words = {w.lower() for w in stopwords.words()}
def sanitize(input_string):
    """Sanitize one tweet.

    Lower-cases the text, drops a leading @user mention, strips the '@' and
    '#' characters, removes 't.co/...' links, and filters out stop words.
    Returns the cleaned, space-joined string ('' for empty input).
    """
    # normalize to lowercase
    string = input_string.lower()
    # spaCy tokenizer (uses the module-level `nlp` pipeline)
    string_split = [token.text for token in nlp(string)]
    # in case the string is empty
    if not string_split:
        return ''
    # remove the user mention,
    # assuming the user is the first token and contains an '@'
    if '@' in string_split[0]:
        del string_split[0]
    # join back to a single string
    string = ' '.join(string_split)
    # strip the '@' and '#' characters themselves
    for punc in '@#':
        string = string.replace(punc, '')
    # remove 't.co/' links.  The dot must be escaped: the original pattern
    # r't.co\/...' let '.' match ANY character, so unrelated text such as
    # 'taco/xyz' would also have been deleted.
    string = re.sub(r't\.co/\S+', '', string, flags=re.MULTILINE)
    # removing stop words
    string = ' '.join([w for w in string.split() if w not in stop_words])
    return string
# Demo input; renamed from `list`, which shadowed the builtin `list` type.
sample_tweets = ['@Jeff_Atwood Thank you for #Whosebug', 'All hail @Joel_Spolsky t.co/Gsb7V1oVLU #Whosebug' ]
list_sanitized = [sanitize(string) for string in sample_tweets]
输出:
['thank Whosebug', 'hail joel_spolsky Whosebug']
我写了下面的代码,现在想重新做预处理:我已经把文本转换成小写,也写了一些代码来去掉停用词,但是没有生效。我还想删除 @ 和 # 并去掉用户名,你能帮帮我吗?
# Jupyter-notebook cell: download a corpus of English tweets and tokenize them.
! pip install wget
import wget
url = 'https://raw.githubusercontent.com/dirkhovy/NLPclass/master/data/tweets_en.txt'
wget.download(url, 'tweets_en.txt')
# One tweet per line; strip the trailing newline from each.
tweets = [line.strip() for line in open('tweets_en.txt', encoding='utf8')]
import spacy
from collections import Counter
# your code here
import itertools
# NOTE(review): the 'en' shortcut is deprecated in spaCy v3 — use 'en_core_web_sm' there.
nlp = spacy.load('en')
# Creates a list of lists of tokens (only the first 200 tweets are processed).
tokens = [[token.text for token in nlp(sentence)] for sentence in tweets[:200]]
print(tokens)
# Lower-case every token.
token_l=[[w.lower() for w in line] for line in tokens]
token_l[:1]
# Still to do: remove '#', stop words, user mentions, and '@'.
#remove #
#remove stop word
#remove user
#remove @
from nltk.corpus import stopwords
# BUG(review): this commented-out attempt scopes the filter wrong — `if w not in ...`
# is attached to the OUTER comprehension, where `w` is undefined; the condition
# belongs on the inner comprehension (`[w for w in line if w not in ...]`).
# filtered_words = [[w for w in line] for line in tokens if w not in # stopwords.words('english')]
始终尝试将代码组织成函数:函数是可重用、可读并且便于在循环中调用的。
from nltk.corpus import stopwords
import spacy, re

# NOTE(review): the 'en' shortcut is deprecated in spaCy v3 — install and load
# 'en_core_web_sm' there; kept as-is here to match the rest of this snippet.
nlp = spacy.load('en')

# Lower-cased stop words from every NLTK stop-word language.  A set gives O(1)
# membership tests in sanitize() instead of the O(n) list scan per token.
stop_words = {w.lower() for w in stopwords.words()}
def sanitize(input_string):
    """Sanitize one tweet.

    Lower-cases the text, drops a leading @user mention, strips the '@' and
    '#' characters, removes 't.co/...' links, and filters out stop words.
    Returns the cleaned, space-joined string ('' for empty input).
    """
    # normalize to lowercase
    string = input_string.lower()
    # spaCy tokenizer (uses the module-level `nlp` pipeline)
    string_split = [token.text for token in nlp(string)]
    # in case the string is empty
    if not string_split:
        return ''
    # remove the user mention,
    # assuming the user is the first token and contains an '@'
    if '@' in string_split[0]:
        del string_split[0]
    # join back to a single string
    string = ' '.join(string_split)
    # strip the '@' and '#' characters themselves
    for punc in '@#':
        string = string.replace(punc, '')
    # remove 't.co/' links.  The dot must be escaped: the original pattern
    # r't.co\/...' let '.' match ANY character, so unrelated text such as
    # 'taco/xyz' would also have been deleted.
    string = re.sub(r't\.co/\S+', '', string, flags=re.MULTILINE)
    # removing stop words
    string = ' '.join([w for w in string.split() if w not in stop_words])
    return string
# Demo input; renamed from `list`, which shadowed the builtin `list` type.
sample_tweets = ['@Jeff_Atwood Thank you for #Whosebug', 'All hail @Joel_Spolsky t.co/Gsb7V1oVLU #Whosebug' ]
list_sanitized = [sanitize(string) for string in sample_tweets]
输出:
['thank Whosebug', 'hail joel_spolsky Whosebug']