Sentiment Analysis Code (word2vec) not properly working in my python version (vocabulary not built)

I got some code online for doing sentiment analysis on a Twitter dataset. When I tried to run it, it first gave me print errors, which I found were due to newer Python versions having changed the print syntax. Now I get an error showing that my data is not being populated into the arrays. If anyone experienced with Python has a sharp eye for where I'm going wrong, please help.

    import numpy as np 
    from copy import deepcopy
    from string import punctuation
    from random import shuffle
    import chardet
    from sklearn.manifold import TSNE
    from sklearn.preprocessing import scale


    import bokeh.plotting as bp
    from bokeh.models import HoverTool, BoxSelectTool
    from bokeh.plotting import figure, show, output_notebook

    import gensim
    from gensim.models.word2vec import Word2Vec 
    LabeledSentence = gensim.models.doc2vec.LabeledSentence 

    import pandas as pd 
    pd.options.mode.chained_assignment = None

    from tqdm import tqdm
    tqdm.pandas(desc="progress-bar")

    from nltk.tokenize import TweetTokenizer 
    tokenizer = TweetTokenizer()

    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer

    from keras.models import Sequential
    from keras.layers import Dense

    def ingest(filename):
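        # Detect the file encoding with chardet, load the CSV, drop unused columns,
        # keep only labelled rows and map the 4/0 sentiment labels to 1/0.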
        with open(filename, 'rb') as f:
            result = chardet.detect(f.read())
        data = pd.read_csv(filename, encoding=result['encoding'])
        data.drop(['ItemID', 'Date', 'Blank', 'SentimentSource'], axis=1, inplace=True)
        data = data[data.Sentiment.isnull() == False]
        data['Sentiment'] = data['Sentiment'].map({4:1, 0:0})
        data = data[data['SentimentText'].isnull() == False]
        data.reset_index(inplace=True)
        data.drop('index', axis=1, inplace=True)
        print('dataset loaded with shape {}', format(data.shape)) 

        return data

    def tokenize(tweet):
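        # Lower-case the tweet, tokenize it and drop mentions, hashtags and URLs;
        # any exception is swallowed and the sentinel string 'NC' is returned instead.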
        try:
            tweet = unicode(tweet.decode('utf-8').lower())
            tokens = tokenizer.tokenize(tweet)
            tokens = filter(lambda t: not t.startswith('@'), tokens)
            tokens = filter(lambda t: not t.startswith('#'), tokens)
            tokens = filter(lambda t: not t.startswith('http'), tokens)
            return tokens
        except:
            return 'NC'

    def postprocess(data, n=100):
        data = data.head(n)
        data['tokens'] = data['SentimentText'].progress_map(tokenize)  
        data = data[data.tokens != 'NC']
        data.reset_index(inplace=True)
        data.drop('index', inplace=True, axis=1)
        return data


    def labelizeTweets(tweets, label_type):
        labelized = []
        for i,v in  enumerate(tweets):
            label = '%s_%s'%(label_type,i)
            labelized.append(LabeledSentence(v, [label]))
            print(":::::::::::::::::::::::::")
        return labelized


    def labelizeTweets(tweets, label_type):
        labelized = []
        for i,v in tqdm(enumerate(tweets)):
            label = '%s_%s'%(label_type,i)
            labelized.append(LabeledSentence(v, [label]))
        return labelized


    def buildWordVector(tokens, size):
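        # Average the tf-idf-weighted word2vec vectors of a tweet's tokens;
        # words missing from either vocabulary are simply skipped.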
        vec = np.zeros(size).reshape((1, size))
        count = 0.
        for word in tokens:
            try:
                vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
                count += 1.
            except KeyError:
                continue
        if count != 0:
            vec /= count
        return vec



    if __name__ == '__main__':

        filename = './training.csv'

        #n = 1000000
        n = 100
        n_dim = 200

        data = ingest(filename)
        #data = data.head(5)
        data = postprocess(data, n)

        x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens), np.array(data.head(n).Sentiment), test_size=0.2)


        print("training length X", len(x_train))

        print("training length Y", len(y_train))


        x_train = labelizeTweets(x_train, 'TRAIN')
        x_test = labelizeTweets(x_test, 'TEST')

        print("jljkjkjlkjlj", len(x_train))

        tweet_w2v = Word2Vec(size=n_dim, min_count=10)
        #tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
        tweet_w2v.build_vocab([x.words for x in x_train])

        #tweet_w2v.train([x.words for x in tqdm(x_train)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
        tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)




        print(tweet_w2v.most_similar('good'))

        if True:
            print('building tf-idf matrix ...')
            vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
            matrix = vectorizer.fit_transform([x.words for x in x_train])
            tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
            print('vocab size :', len(tfidf))

            train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
            train_vecs_w2v = scale(train_vecs_w2v)

            test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
            test_vecs_w2v = scale(test_vecs_w2v)

            model = Sequential()
            model.add(Dense(32, activation='relu', input_dim=200))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(optimizer='rmsprop',
                                        loss='binary_crossentropy',
                                        metrics=['accuracy'])

            model.fit(train_vecs_w2v, y_train, epochs=20, batch_size=32, verbose=2)

            score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
            print (score[1])

    output_notebook()
    plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
        x_axis_type=None, y_axis_type=None, min_border=1)

    word_vectors = [tweet_w2v[w] for w in tweet_w2v.wv.vocab.keys()[:5000]]

    tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
    tsne_w2v = tsne_model.fit_transform(word_vectors)

    tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
    tsne_df['words'] = tweet_w2v.wv.vocab.keys()[:5000]

    plot_tfidf.scatter(x='x', y='y', source=tsne_df)
    hover = plot_tfidf.select(dict(type=HoverTool))
    hover.tooltips={"word": "@words"}
    show(plot_tfidf)

This is the error I get:

    C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
      warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
    dataset loaded with shape {} (505, 2)
    progress-bar: 100%|##########################################################################| 505/505 [00:00<?, ?it/s]
    training length X 0
    training length Y 0
    0it [00:00, ?it/s]
    0it [00:00, ?it/s]
    jljkjkjlkjlj 0
    Traceback (most recent call last):
      File "Sentiment_Analysis.py", line 127, in <module>
        tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
      File "C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\models\word2vec.py", line 951, in train
        raise RuntimeError("you must first build vocabulary before training the model")
    RuntimeError: you must first build vocabulary before training the model

I ran into the same problem with the same code. There is nothing wrong with the code from the website as such, but no matter what I tried, it kept returning an empty vocabulary.

My workaround was to run the exact same code under Python 2.7 instead of 3.x, where it runs smoothly. However, if you manage to port it to Python 3.x, you get faster data/memory access, which is very desirable.

EDIT: Found the problem, and it now works on Python 3 as well. In Python 3, `str` objects have no `.decode()` method and there is no `unicode()` builtin, so `tokenize` always fell into the `except` branch and returned `'NC'`; `postprocess` then dropped every row, leaving `build_vocab` with nothing. On top of that, `filter()` returns a lazy iterator in Python 3 rather than a list. Edit the corresponding snippet as below and the vocabulary builds without any issues.

    def tokenize(tweet):
        try:
            # Python 3 strings are already unicode, so there is no .decode()
            # (and no unicode() builtin) -- just lower-case the text.
            tweet = tweet.lower()
            tokens = tokenizer.tokenize(tweet)
            # filter() is lazy in Python 3; wrap it in list() so the tokens
            # survive as a real list for build_vocab() and the tf-idf step.
            tokens = list(filter(lambda t: not t.startswith('@'), tokens))
            tokens = list(filter(lambda t: not t.startswith('#'), tokens))
            tokens = list(filter(lambda t: not t.startswith('http'), tokens))
            return tokens
        except:
            return 'NC'
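
To see why the `list(...)` wrappers matter, here is a small standalone sketch (plain Python 3, with a made-up token list purely for illustration): in Python 3, `filter()` returns a lazy, single-use iterator rather than a list, so without the conversion the `tokens` column would hold filter objects that look non-empty but can be exhausted before `Word2Vec.train()` ever sees them.

    tokens = ['@user', '#topic', 'http://t.co/xyz', 'great', 'movie']

    # Python 2: filter() returns a list. Python 3: it returns a lazy iterator.
    lazy = filter(lambda t: not t.startswith('@'), tokens)
    print(lazy)              # <filter object at 0x...>
    print(list(lazy))        # ['#topic', 'http://t.co/xyz', 'great', 'movie']
    print(list(lazy))        # [] -- the iterator is already exhausted

    # Wrapping each filter() call in list() keeps real token lists around,
    # so both build_vocab() and the later train() pass see the same data.
    cleaned = list(filter(lambda t: not t.startswith('@'), tokens))
    print(cleaned)           # ['#topic', 'http://t.co/xyz', 'great', 'movie']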