Topic modelling error: too many values to unpack
I am trying to perform LDA topic modelling, with t-SNE and pyLDAvis for visualisation. The LDA model trains fine, but the step that extracts the dominant topics fails with "too many values to unpack". The code and the error are below. Any help is much appreciated.
LdaMulticore topic modelling code:
import sys
# !{sys.executable} -m spacy download en
import re, numpy as np, pandas as pd
from pprint import pprint
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])
def Make_String(text):
    return str(text)
#Reviews.columns=['Reviews']
#print(Reviews.head(10))
df['text']=df['text'].apply(lambda x: Make_String(x))
%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
# Import Dataset
#df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
#df = df.loc[df.target_names.isin(['soc.religion.christian', 'rec.sport.hockey', 'talk.politics.mideast', 'rec.motorcycles']) , :]
df=pd.read_csv("/content/drive/My Drive/Negative_data.csv", encoding="ISO-8859-1")
print(df.shape) #> (2361, 3)
df.head()
# Create Dictionary
id2word = corpora.Dictionary(data_ready)
from gensim.models import LdaMulticore
# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]
lda_model = LdaMulticore(corpus, num_topics=10, id2word=id2word, passes=2, workers=2)
pprint(lda_model.print_topics())
#> [(0,
#> '0.017*"write" + 0.015*"people" + 0.014*"organization" + 0.014*"article" + '
#> '0.013*"time" + 0.008*"give" + 0.008*"first" + 0.007*"tell" + 0.007*"new" + '
#> '0.007*"question"'),
#> (1,
#> '0.008*"christian" + 0.008*"believe" + 0.007*"god" + 0.007*"law" + '
#> '0.006*"state" + 0.006*"israel" + 0.006*"israeli" + 0.005*"exist" + '
#> '0.005*"way" + 0.004*"bible"'),
#> (2,
#> '0.024*"armenian" + 0.012*"bike" + 0.006*"kill" + 0.006*"work" + '
#> '0.005*"well" + 0.005*"year" + 0.005*"sumgait" + 0.005*"soldier" + '
#> '0.004*"way" + 0.004*"ride"'),
#> (3,
#> '0.019*"team" + 0.019*"game" + 0.013*"hockey" + 0.010*"player" + '
#> '0.009*"play" + 0.009*"win" + 0.009*"nhl" + 0.009*"year" + 0.009*"hawk" + '
#> '0.009*"season"')]
Output:
[(0,
'0.340*"seriously" + 0.017*"time" + 0.015*"samsung" + 0.014*"day" + '
'0.013*"phone" + 0.012*"order" + 0.012*"wait" + 0.011*"week" + 0.011*"damn" '
'+ 0.011*"next"'),
(1,
'0.081*"puma" + 0.068*"shoe" + 0.046*"adida" + 0.017*"site" + 0.017*"como" + '
'0.014*"wear" + 0.014*"ugly" + 0.011*"shirt" + 0.010*"era" + 0.009*"pumas"'),
(2,
'0.033*"watch" + 0.021*"hate" + 0.021*"wear" + 0.020*"shit" + 0.020*"buy" + '
'0.016*"game" + 0.014*"man" + 0.014*"stop" + 0.014*"time" + 0.013*"still"'),
(3,
'0.037*"bad" + 0.014*"year" + 0.013*"pay" + 0.013*"feel" + 0.011*"thing" + '
'0.011*"really" + 0.011*"last" + 0.011*"ever" + 0.009*"never" + '
'0.009*"people"'),
(4,
'0.332*"com" + 0.173*"twitter" + 0.078*"pic" + 0.036*"status" + '
'0.036*"https" + 0.029*"nintendo" + 0.015*"apple" + 0.008*"pue" + '
'0.006*"photo" + 0.004*"iphone"'),
(5,
'0.162*"http" + 0.028*"pace" + 0.027*"low" + 0.019*"new" + 0.019*"price" + '
'0.017*"crushed_km" + 0.017*"size" + 0.014*"video" + 0.012*"sale" + '
'0.012*"dlvr"'),
(6,
'0.062*"nike" + 0.019*"phone" + 0.019*"drop" + 0.018*"work" + 0.013*"tell" + '
'0.013*"hard" + 0.012*"call" + 0.011*"crazy" + 0.011*"lol" + 0.010*"ass"'),
(7,
'0.036*"sin" + 0.036*"die" + 0.024*"kill" + 0.018*"pero" + 0.012*"android" + '
'0.012*"pro" + 0.009*"death" + 0.008*"igual" + 0.008*"final" + '
'0.008*"problem"'),
(8,
'0.039*"black" + 0.036*"http" + 0.034*"netflix" + 0.020*"fire" + '
'0.018*"dead" + 0.014*"son" + 0.013*"lose" + 0.011*"tv" + 0.011*"tinyurl" + '
'0.010*"steal"'),
(9,
'0.299*"live" + 0.295*"alone" + 0.038*"seriously" + 0.013*"switch" + '
'0.008*"mad" + 0.006*"screen" + 0.006*"wrong" + 0.006*"season" + '
'0.005*"hour" + 0.005*"people"')]
Dominant topic code:
# Sentence Coloring of N Sentences
def topics_per_document(model, corpus, start=0, end=1):
    corpus_sel = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_sel):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return (dominant_topics, topic_percentages)
dominant_topics, topic_percentages = topics_per_document(model=lda_model, corpus=corpus, end=-1)
# Distribution of Dominant Topics in Each Document
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()
# Total Topic Distribution by actual weight
topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
df_topic_weightage_by_doc = topic_weightage_by_doc.sum().to_frame(name='count').reset_index()
# Top 3 Keywords for each Topic
topic_top3words = [(i, topic) for i, topics in lda_model.show_topics(formatted=False)
                   for j, (topic, wt) in enumerate(topics) if j < 3]
df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
df_top3words = df_top3words_stacked.groupby('topic_id').agg(', \n'.join)
df_top3words.reset_index(level=0,inplace=True)
Error:
<ipython-input-13-5ea2ada44643> in topics_per_document(model, corpus, start, end)
5 topic_percentages = []
6 for i, corp in enumerate(corpus_sel):
----> 7 topic_percs, wordid_topics, wordid_phivalues = model[corp]
8 dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
9 dominant_topics.append((i, dominant_topic))
ValueError: too many values to unpack (expected 3)
Thanks very much in advance.
model[corp] does not return the tuple (topic_percs, wordid_topics, wordid_phivalues) that your code expects. Instead, it returns the membership vector for corp, i.e. for each topic in the model, the probability that corp was generated by that topic. Here corp is a single document from corpus; since you are iterating over enumerate(corpus_sel), you are requesting the membership vector for each document in corpus.
This can be seen from the example given in the documentation (it is for LdaModel, the parent class of LdaMulticore, but the returned object is the same):
>>> from gensim.test.utils import common_corpus
>>> from gensim.models.ldamodel import LdaModel
>>> lda = LdaModel(common_corpus, num_topics=10, iterations=1)
>>> doc_bow = [(1, 0.3), (2, 0.1), (0, 0.09)]
>>> doc_lda = lda[doc_bow]
>>> doc_lda
[(0, 0.08579318),
(1, 0.0858944),
(2, 0.079572774),
(3, 0.09752562),
(4, 0.08426655),
(5, 0.1231114),
(6, 0.17063272),
(7, 0.08766636),
(8, 0.083353266),
(9, 0.102183744)]
It seems you want to call model.get_document_topics(corp) for each document bag-of-words (which you call corp) in the corpus. Called with per_word_topics=True, this returns a 3-tuple of the topic distribution for the whole document, the most likely topics for each word, and the phi relevance values (multiplied by the feature length) for each word-topic combination.
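For example, a minimal sketch of what those three values look like for a single document, using gensim's bundled common_corpus test data rather than your own corpus (the variable names here simply mirror the ones in your loop):
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.ldamodel import LdaModel

lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=10, iterations=1)

doc_bow = common_corpus[0]
# per_word_topics=True is what makes get_document_topics return the 3-tuple
topic_percs, wordid_topics, wordid_phivalues = lda.get_document_topics(doc_bow, per_word_topics=True)

print(topic_percs)       # [(topic_id, probability), ...] for the whole document
print(wordid_topics)     # [(word_id, [most likely topic ids]), ...]
print(wordid_phivalues)  # [(word_id, [(topic_id, phi_value)]), ...]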
Otherwise, you can change
topic_percs, wordid_topics, wordid_phivalues = model[corp]
to
topic_percs = model[corp]
or, more clearly still,
topic_percs = model.get_document_topics(corp)
If wordid_topics was meant to be the probability of each wordid within each topic, you can instead call model.get_topic_terms(topicid) to return the probability pairs for the most relevant words generated by that topic, or model.get_topics() to get the term-topic matrix.
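A quick sketch of those two calls, again on the toy common_corpus data rather than your own model:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.ldamodel import LdaModel

lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=10, iterations=1)

# (word_id, probability) pairs for the most relevant words of topic 0
print(lda.get_topic_terms(0, topn=5))

# the same information with the words themselves instead of their ids
print(lda.show_topic(0, topn=5))

# full term-topic matrix, shape (num_topics, vocabulary_size)
print(lda.get_topics().shape)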
You should change topic_percs, wordid_topics, wordid_phivalues = model[corp]
to topic_percs, wordid_topics, wordid_phivalues = model.get_document_topics(corp, per_word_topics=True)
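Putting that change into your function, a sketch of how topics_per_document could look (the rest of your logic unchanged; untested against your data):
def topics_per_document(model, corpus, start=0, end=1):
    corpus_sel = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_sel):
        # per_word_topics=True returns the 3-tuple the unpacking expects
        topic_percs, wordid_topics, wordid_phivalues = model.get_document_topics(corp, per_word_topics=True)
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return dominant_topics, topic_percentages

dominant_topics, topic_percentages = topics_per_document(model=lda_model, corpus=corpus, end=-1)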