Topic modelling error: too many values to unpack
I am trying to perform LDA topic modelling, with t-SNE and pyLDAvis for visualisation. The LDA model trains fine, but the step that extracts the dominant topics fails with "too many values to unpack". The code and the error are below. Any help is much appreciated.
LdaMulticore topic modelling code:
import sys
# !{sys.executable} -m spacy download en
import re, numpy as np, pandas as pd
from pprint import pprint
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])
def Make_String(text):
    return str(text)
#Reviews.columns=['Reviews']
#print(Reviews.head(10))
df['text']=df['text'].apply(lambda x: Make_String(x))
%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
# Import Dataset
#df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
#df = df.loc[df.target_names.isin(['soc.religion.christian', 'rec.sport.hockey', 'talk.politics.mideast', 'rec.motorcycles']) , :]
df=pd.read_csv("/content/drive/My Drive/Negative_data.csv", encoding="ISO-8859-1")
print(df.shape) #> (2361, 3)
df.head()
# Create Dictionary
id2word = corpora.Dictionary(data_ready)
from gensim.models import LdaMulticore
# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]
lda_model = LdaMulticore(corpus, num_topics=10, id2word=id2word, passes=2, workers=2)
pprint(lda_model.print_topics())
#> [(0,
#> '0.017*"write" + 0.015*"people" + 0.014*"organization" + 0.014*"article" + '
#> '0.013*"time" + 0.008*"give" + 0.008*"first" + 0.007*"tell" + 0.007*"new" + '
#> '0.007*"question"'),
#> (1,
#> '0.008*"christian" + 0.008*"believe" + 0.007*"god" + 0.007*"law" + '
#> '0.006*"state" + 0.006*"israel" + 0.006*"israeli" + 0.005*"exist" + '
#> '0.005*"way" + 0.004*"bible"'),
#> (2,
#> '0.024*"armenian" + 0.012*"bike" + 0.006*"kill" + 0.006*"work" + '
#> '0.005*"well" + 0.005*"year" + 0.005*"sumgait" + 0.005*"soldier" + '
#> '0.004*"way" + 0.004*"ride"'),
#> (3,
#> '0.019*"team" + 0.019*"game" + 0.013*"hockey" + 0.010*"player" + '
#> '0.009*"play" + 0.009*"win" + 0.009*"nhl" + 0.009*"year" + 0.009*"hawk" + '
#> '0.009*"season"')]
Output:
[(0,
'0.340*"seriously" + 0.017*"time" + 0.015*"samsung" + 0.014*"day" + '
'0.013*"phone" + 0.012*"order" + 0.012*"wait" + 0.011*"week" + 0.011*"damn" '
'+ 0.011*"next"'),
(1,
'0.081*"puma" + 0.068*"shoe" + 0.046*"adida" + 0.017*"site" + 0.017*"como" + '
'0.014*"wear" + 0.014*"ugly" + 0.011*"shirt" + 0.010*"era" + 0.009*"pumas"'),
(2,
'0.033*"watch" + 0.021*"hate" + 0.021*"wear" + 0.020*"shit" + 0.020*"buy" + '
'0.016*"game" + 0.014*"man" + 0.014*"stop" + 0.014*"time" + 0.013*"still"'),
(3,
'0.037*"bad" + 0.014*"year" + 0.013*"pay" + 0.013*"feel" + 0.011*"thing" + '
'0.011*"really" + 0.011*"last" + 0.011*"ever" + 0.009*"never" + '
'0.009*"people"'),
(4,
'0.332*"com" + 0.173*"twitter" + 0.078*"pic" + 0.036*"status" + '
'0.036*"https" + 0.029*"nintendo" + 0.015*"apple" + 0.008*"pue" + '
'0.006*"photo" + 0.004*"iphone"'),
(5,
'0.162*"http" + 0.028*"pace" + 0.027*"low" + 0.019*"new" + 0.019*"price" + '
'0.017*"crushed_km" + 0.017*"size" + 0.014*"video" + 0.012*"sale" + '
'0.012*"dlvr"'),
(6,
'0.062*"nike" + 0.019*"phone" + 0.019*"drop" + 0.018*"work" + 0.013*"tell" + '
'0.013*"hard" + 0.012*"call" + 0.011*"crazy" + 0.011*"lol" + 0.010*"ass"'),
(7,
'0.036*"sin" + 0.036*"die" + 0.024*"kill" + 0.018*"pero" + 0.012*"android" + '
'0.012*"pro" + 0.009*"death" + 0.008*"igual" + 0.008*"final" + '
'0.008*"problem"'),
(8,
'0.039*"black" + 0.036*"http" + 0.034*"netflix" + 0.020*"fire" + '
'0.018*"dead" + 0.014*"son" + 0.013*"lose" + 0.011*"tv" + 0.011*"tinyurl" + '
'0.010*"steal"'),
(9,
'0.299*"live" + 0.295*"alone" + 0.038*"seriously" + 0.013*"switch" + '
'0.008*"mad" + 0.006*"screen" + 0.006*"wrong" + 0.006*"season" + '
'0.005*"hour" + 0.005*"people"')]
Dominant topic code:
# Sentence Coloring of N Sentences
def topics_per_document(model, corpus, start=0, end=1):
    corpus_sel = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_sel):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return (dominant_topics, topic_percentages)
dominant_topics, topic_percentages = topics_per_document(model=lda_model, corpus=corpus, end=-1)
# Distribution of Dominant Topics in Each Document
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()
# Total Topic Distribution by actual weight
topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
df_topic_weightage_by_doc = topic_weightage_by_doc.sum().to_frame(name='count').reset_index()
# Top 3 Keywords for each Topic
topic_top3words = [(i, topic) for i, topics in lda_model.show_topics(formatted=False)
                   for j, (topic, wt) in enumerate(topics) if j < 3]
df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
df_top3words = df_top3words_stacked.groupby('topic_id').agg(', \n'.join)
df_top3words.reset_index(level=0,inplace=True)
Error:
<ipython-input-13-5ea2ada44643> in topics_per_document(model, corpus, start, end)
5 topic_percentages = []
6 for i, corp in enumerate(corpus_sel):
----> 7 topic_percs, wordid_topics, wordid_phivalues = model[corp]
8 dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
9 dominant_topics.append((i, dominant_topic))
ValueError: too many values to unpack (expected 3)
Thanks very much in advance.
model[corp] does not return the tuple (topic_percs, wordid_topics, wordid_phivalues) that your code expects. Instead, it returns the membership vector for corp, i.e. for each topic in the model, the probability that corp was generated by that topic. Here corp is a single document from corpus; since you are iterating over enumerate(corpus_sel), you are requesting the membership vector for each document in corpus.
This can be seen from the example given in the documentation (it is for LdaModel, the parent class of LdaMulticore, but the returned object is the same):
>>> from gensim.test.utils import common_corpus
>>> from gensim.models.ldamodel import LdaModel
>>> lda = LdaModel(common_corpus, num_topics=10, iterations=1)
>>> doc_bow = [(1, 0.3), (2, 0.1), (0, 0.09)]
>>> doc_lda = lda[doc_bow]
>>> doc_lda
[(0, 0.08579318),
(1, 0.0858944),
(2, 0.079572774),
(3, 0.09752562),
(4, 0.08426655),
(5, 0.1231114),
(6, 0.17063272),
(7, 0.08766636),
(8, 0.083353266),
(9, 0.102183744)]
It seems you want to call model.get_document_topics(corp) for each document bag-of-words (which you call corp) in the corpus. Called with per_word_topics=True, this returns a 3-tuple of the topic distribution for the whole document, the most likely topics for each word, and the phi relevance values (multiplied by the feature length) for each word-topic combination.
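For example, a minimal sketch of what those three values look like for a single document, using gensim's bundled common_corpus test data rather than your own corpus (the variable names here simply mirror the ones in your loop):
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.ldamodel import LdaModel

lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=10, iterations=1)

doc_bow = common_corpus[0]
# per_word_topics=True is what makes get_document_topics return the 3-tuple
topic_percs, wordid_topics, wordid_phivalues = lda.get_document_topics(doc_bow, per_word_topics=True)

print(topic_percs)       # [(topic_id, probability), ...] for the whole document
print(wordid_topics)     # [(word_id, [most likely topic ids]), ...]
print(wordid_phivalues)  # [(word_id, [(topic_id, phi_value)]), ...]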
Otherwise, you can change
topic_percs, wordid_topics, wordid_phivalues = model[corp]
to
topic_percs = model[corp]
or, more clearly still,
topic_percs = model.get_document_topics(corp)
If wordid_topics was meant to be the probability of each wordid within each topic, you can instead call model.get_topic_terms(topicid) to return the probability pairs for the most relevant words generated by that topic, or model.get_topics() to get the term-topic matrix.
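A quick sketch of those two calls, again on the toy common_corpus data rather than your own model:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.ldamodel import LdaModel

lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=10, iterations=1)

# (word_id, probability) pairs for the most relevant words of topic 0
print(lda.get_topic_terms(0, topn=5))

# the same information with the words themselves instead of their ids
print(lda.show_topic(0, topn=5))

# full term-topic matrix, shape (num_topics, vocabulary_size)
print(lda.get_topics().shape)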
You should change topic_percs, wordid_topics, wordid_phivalues = model[corp]
to topic_percs, wordid_topics, wordid_phivalues = model.get_document_topics(corp, per_word_topics=True)
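Putting that change into your function, a sketch of how topics_per_document could look (the rest of your logic unchanged; untested against your data):
def topics_per_document(model, corpus, start=0, end=1):
    corpus_sel = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_sel):
        # per_word_topics=True returns the 3-tuple the unpacking expects
        topic_percs, wordid_topics, wordid_phivalues = model.get_document_topics(corp, per_word_topics=True)
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return dominant_topics, topic_percentages

dominant_topics, topic_percentages = topics_per_document(model=lda_model, corpus=corpus, end=-1)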