Sentiment analysis code (word2vec) not working properly in my Python version (vocabulary not built)
I picked up some code online to do sentiment analysis on a Twitter dataset. When I first tried to run it I got print errors, and I found out that newer versions of Python have changed how print works. Now I get an error showing that my data is not being populated into the arrays. If anyone with a keen eye for Python can see where I went wrong, please help.
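(Side note on the print change mentioned above: Python 2's print statement became a built-in function in Python 3, so statement-style calls have to be rewritten. A minimal illustration with a made-up string; the full script follows.)

# Python 2 accepted the statement form:
#     print "hello"
# Python 3 only accepts the function-call form:
print("hello")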
import numpy as np
from copy import deepcopy
from string import punctuation
from random import shuffle
import chardet
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
import gensim
from gensim.models.word2vec import Word2Vec
LabeledSentence = gensim.models.doc2vec.LabeledSentence
import pandas as pd
pd.options.mode.chained_assignment = None
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Sequential and Dense are used in the model section below but were missing from the listing
from keras.models import Sequential
from keras.layers import Dense
def ingest(filename):
    with open(filename, 'rb') as f:
        result = chardet.detect(f.read())
    data = pd.read_csv(filename, encoding=result['encoding'])
    data.drop(['ItemID', 'Date', 'Blank', 'SentimentSource'], axis=1, inplace=True)
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map({4:1, 0:0})
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print('dataset loaded with shape {}', format(data.shape))
    return data

def tokenize(tweet):
    try:
        tweet = unicode(tweet.decode('utf-8').lower())
        tokens = tokenizer.tokenize(tweet)
        tokens = filter(lambda t: not t.startswith('@'), tokens)
        tokens = filter(lambda t: not t.startswith('#'), tokens)
        tokens = filter(lambda t: not t.startswith('http'), tokens)
        return tokens
    except:
        return 'NC'

def postprocess(data, n=100):
    data = data.head(n)
    data['tokens'] = data['SentimentText'].progress_map(tokenize)
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

def labelizeTweets(tweets, label_type):
    labelized = []
    for i, v in enumerate(tweets):
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    print(":::::::::::::::::::::::::")
    return labelized

def labelizeTweets(tweets, label_type):
    labelized = []
    for i, v in tqdm(enumerate(tweets)):
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

if __name__ == '__main__':
    filename = './training.csv'
    #n = 1000000
    n = 100
    n_dim = 200
    data = ingest(filename)
    #data = data.head(5)
    data = postprocess(data, n)
    x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens), np.array(data.head(n).Sentiment), test_size=0.2)
    print("training length X", len(x_train))
    print("training length Y", len(y_train))
    x_train = labelizeTweets(x_train, 'TRAIN')
    x_test = labelizeTweets(x_test, 'TEST')
    print("jljkjkjlkjlj", len(x_train))

    tweet_w2v = Word2Vec(size=n_dim, min_count=10)
    #tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
    tweet_w2v.build_vocab([x.words for x in x_train])
    #tweet_w2v.train([x.words for x in tqdm(x_train)], total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
    tweet_w2v.train([x.words for x in x_train], total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
    print(tweet_w2v.most_similar('good'))

    if True:
        print('building tf-idf matrix ...')
        vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
        matrix = vectorizer.fit_transform([x.words for x in x_train])
        tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
        print('vocab size :', len(tfidf))

    train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
    train_vecs_w2v = scale(train_vecs_w2v)
    test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
    test_vecs_w2v = scale(test_vecs_w2v)

    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=200))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(train_vecs_w2v, y_train, epochs=20, batch_size=32, verbose=2)
    score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
    print(score[1])

    output_notebook()
    plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
                           tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                           x_axis_type=None, y_axis_type=None, min_border=1)
    word_vectors = [tweet_w2v[w] for w in tweet_w2v.wv.vocab.keys()[:5000]]
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
    tsne_w2v = tsne_model.fit_transform(word_vectors)
    tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
    tsne_df['words'] = tweet_w2v.wv.vocab.keys()[:5000]
    plot_tfidf.scatter(x='x', y='y', source=tsne_df)
    hover = plot_tfidf.select(dict(type=HoverTool))
    hover.tooltips = {"word": "@words"}
    show(plot_tfidf)
Here is the error I am getting:
C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
dataset loaded with shape {} (505, 2)
progress-bar: 100%|##########################################################################| 505/505 [00:00<?, ?it/s]
training length X 0
training length Y 0
0it [00:00, ?it/s]
0it [00:00, ?it/s]
jljkjkjlkjlj 0
Traceback (most recent call last):
File "Sentiment_Analysis.py", line 127, in <module>
tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
File "C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\models\word2vec.py", line 951, in train
raise RuntimeError("you must first build vocabulary before training the model")
RuntimeError: you must first build vocabulary before training the model
I ran into the same problem with the same code. There is nothing wrong with the code on the website itself, but no matter what I tried it kept returning an empty vocabulary.

My workaround was that if you run the exact same code under Python 2.7 instead of 3.x, it runs smoothly. However, if you manage to port it to Python 3.x you get faster data/memory access, which is very desirable.

Edit: found the problem, and it now works under Python 3 as well. Two things go wrong in tokenize under Python 3: unicode()/decode() no longer apply to str (the bare except then turns every tweet into 'NC', which is why the training arrays end up empty), and filter() returns a lazy iterator instead of a list. Change the function as shown below and the vocabulary should build without any issues.
def tokenize(tweet):
    try:
        # In Python 3 the tweet is already a unicode str, so the old
        # unicode(tweet.decode('utf-8')) call is unnecessary (and raises an error)
        tweet = tweet.lower()
        tokens = tokenizer.tokenize(tweet)
        # filter() returns a lazy, single-use iterator in Python 3,
        # so wrap each call in list() to keep a real list of tokens
        tokens = list(filter(lambda t: not t.startswith('@'), tokens))
        tokens = list(filter(lambda t: not t.startswith('#'), tokens))
        tokens = list(filter(lambda t: not t.startswith('http'), tokens))
        return tokens
    except:
        return 'NC'
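To see why the list() wrapper is needed, here is a minimal sketch (using a made-up token list) of how a bare filter object behaves in Python 3:

# A bare filter object is exhausted after the first pass
tokens = filter(lambda t: not t.startswith('@'), ['@user', 'nice', 'day'])
print(list(tokens))   # ['nice', 'day']
print(list(tokens))   # [] -- already consumed

# list(filter(...)) gives a reusable sequence, which is what the
# vocabulary-building, training and tf-idf steps all need
tokens = list(filter(lambda t: not t.startswith('@'), ['@user', 'nice', 'day']))
print(tokens)         # ['nice', 'day']
print(tokens)         # ['nice', 'day']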