python - Improper yield usage
I'm pretty sure I'm not using yield correctly:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
from gensim import corpora, models, similarities
from collections import defaultdict
from pprint import pprint # pretty-printer
from six import iteritems
import openpyxl
import string
from operator import itemgetter
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#Creating a stoplist from file
with open('stop-word-list.txt') as f:
    stoplist = [x.strip('\n') for x in f.readlines()]

corpusFileName = 'content_sample_en.xlsx'
corpusSheetName = 'content_sample_en'

class MyCorpus(object):
    def __iter__(self):
        wb = openpyxl.load_workbook(corpusFileName)
        sheet = wb.get_sheet_by_name(corpusSheetName)
        for i in range(1, (sheet.max_row+1)/2):
            title = str(sheet.cell(row = i, column = 4).value.encode('utf-8'))
            summary = str(sheet.cell(row = i, column = 5).value.encode('utf-8'))
            content = str(sheet.cell(row = i, column = 10).value.encode('utf-8'))
            yield reBuildDoc("{} {} {}".format(title, summary, content))

def removeUnwantedPunctuations(doc):
    "change all (/, \, <, >) into ' ' "
    newDoc = ""
    for l in doc:
        if l == "<" or l == ">" or l == "/" or l == "\\":
            newDoc += " "
        else:
            newDoc += l
    return newDoc

def reBuildDoc(doc):
    """
    :param doc:
    :return: document after being dissected to our needs.
    """
    doc = removeUnwantedPunctuations(doc).lower().translate(None, string.punctuation)
    newDoc = [word for word in doc.split() if word not in stoplist]
    return newDoc
corpus = MyCorpus()
tfidf = models.TfidfModel(corpus, normalize=True)
In the example above you can see my attempt to create a corpus from an xlsx file. For each document I read 3 fields (title, summary and content) from the xlsx file and append them into one big string. My reBuildDoc() and removeUnwantedPunctuations() functions then adjust the text to my needs and finally return a big list of words (for example: [hello, piano, computer, etc... ]). In the end I get the following error:
Traceback (most recent call last):
  File "C:/Users/Eran/PycharmProjects/tfidf/docproc.py", line 101, in <module>
    tfidf = models.TfidfModel(corpus, normalize=True)
  File "C:\Anaconda2\lib\site-packages\gensim-0.13.1-py2.7-win-amd64.egg\gensim\models\tfidfmodel.py", line 96, in __init__
    self.initialize(corpus)
  File "C:\Anaconda2\lib\site-packages\gensim-0.13.1-py2.7-win-amd64.egg\gensim\models\tfidfmodel.py", line 119, in initialize
    for termid, _ in bow:
ValueError: too many values to unpack
I know the error comes from the yield line, because I previously used a different yield line. It looked like this:
yield [word for word in dictionary.doc2bow("{} {} {}".format(title, summary, content).lower().translate(None, string.punctuation).split()) if word not in stoplist]
It was a bit messy and hard to break into functions, so I changed it, as you can see in the first example.
The problem is not with yield itself but with what is being yielded. The error comes from the line for termid, _ in bow, which means bow is expected to contain tuples or any other objects of exactly 2 elements, like (1,2), [1,2], "12", ..., but instead it gets the items produced by MyCorpus, which are plain word strings that clearly have more than 2 characters, hence the error. To fix this, either do for termid in bow, or in MyCorpus do yield reBuildDoc("{} {} {}".format(title, summary, content)), None so that you yield a tuple of 2 objects.
To illustrate this, check this example:
>>> def fun(obj):
        for _ in range(2):
            yield obj

>>> for a,b in fun("xyz"):
        print(a,b)

Traceback (most recent call last):
  File "<pyshell#11>", line 1, in <module>
    for a,b in fun("xyz"):
ValueError: too many values to unpack (expected 2)
>>> for a,b in fun("xy"):
        print(a,b)

x y
x y
>>> for a,b in fun(("xy",None)):
        print(a,b)

xy None
xy None
>>>
It looks like your problem is that TfidfModel expects corpus to be a list of doc2bow outputs (which are themselves lists of 2-tuples). Your original working code used doc2bow to convert correctly from your plain strings into the corpus format; your new code is passing raw strings rather than the "vectors" TfidfModel expects.
Go back to using doc2bow, and read the tutorial on converting strings to vectors, which makes it clear that raw strings make no sense as input.
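To make the suggestion concrete, here is a minimal sketch of that approach. It reuses corpusFileName, corpusSheetName, stoplist and reBuildDoc from the question as they are; the iter_token_lists helper name is made up for illustration and is not part of the asker's code. The idea is to build a Dictionary in a first pass and then have MyCorpus yield doc2bow vectors, i.e. lists of (termid, count) 2-tuples, which is what TfidfModel can unpack:

# Minimal sketch, assuming the question's workbook layout and helpers stay unchanged.
from gensim import corpora, models
import openpyxl

def iter_token_lists():
    "Yield each document as a plain list of tokens (what the question currently yields)."
    wb = openpyxl.load_workbook(corpusFileName)
    sheet = wb.get_sheet_by_name(corpusSheetName)
    for i in range(1, (sheet.max_row+1)/2):
        title = str(sheet.cell(row = i, column = 4).value.encode('utf-8'))
        summary = str(sheet.cell(row = i, column = 5).value.encode('utf-8'))
        content = str(sheet.cell(row = i, column = 10).value.encode('utf-8'))
        yield reBuildDoc("{} {} {}".format(title, summary, content))

# Build the Dictionary once, in a first pass over the documents...
dictionary = corpora.Dictionary(iter_token_lists())

class MyCorpus(object):
    def __iter__(self):
        # ...then yield bag-of-words vectors: lists of (termid, count) 2-tuples,
        # which is exactly what "for termid, _ in bow" can unpack.
        for tokens in iter_token_lists():
            yield dictionary.doc2bow(tokens)

corpus = MyCorpus()
tfidf = models.TfidfModel(corpus, normalize=True)

Because iter_token_lists re-opens the workbook each time it is called, the documents can be streamed twice: once to build the Dictionary and once when TfidfModel iterates over corpus.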