Python gensim word2vec gives TypeError: object of type 'generator' has no len() on custom dataclass
I'm trying to get word2vec working in Python 3, but since my dataset is too large to fit in memory comfortably, I'm loading it through an iterator (from zip files). However, when I run it I get the error:
Traceback (most recent call last):
  File "WordModel.py", line 85, in <module>
    main()
  File "WordModel.py", line 15, in main
    word2vec = gensim.models.Word2Vec(data,workers=cpu_count())
  File "/home/thijser/.local/lib/python3.7/site-packages/gensim/models/word2vec.py", line 783, in __init__
    fast_version=FAST_VERSION)
  File "/home/thijser/.local/lib/python3.7/site-packages/gensim/models/base_any2vec.py", line 759, in __init__
    self.build_vocab(sentences=sentences, corpus_file=corpus_file, trim_rule=trim_rule)
  File "/home/thijser/.local/lib/python3.7/site-packages/gensim/models/base_any2vec.py", line 936, in build_vocab
    sentences=sentences, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule)
  File "/home/thijser/.local/lib/python3.7/site-packages/gensim/models/word2vec.py", line 1591, in scan_vocab
    total_words, corpus_count = self._scan_vocab(sentences, progress_per, trim_rule)
  File "/home/thijser/.local/lib/python3.7/site-packages/gensim/models/word2vec.py", line 1576, in _scan_vocab
    total_words += len(sentence)
TypeError: object of type 'generator' has no len()
Here is the code:
import zipfile
import os
from ast import literal_eval
from lxml import etree
import io
import gensim
from multiprocessing import cpu_count


def main():
    data = TrainingData("/media/thijser/Data/DataSets/uit2")
    print(len(data))
    word2vec = gensim.models.Word2Vec(data, workers=cpu_count())
    word2vec.save('word2vec.save')


class TrainingData:
    size = -1

    def __init__(self, dirname):
        self.data_location = dirname

    def __len__(self):
        if self.size < 0:
            for zipfile in self.get_zips_in_folder(self.data_location):
                for text_file in self.get_files_names_from_zip(zipfile):
                    self.size = self.size + 1
        return self.size

    def __iter__(self):  # might not fit in memory otherwise
        yield self.get_data()

    def get_data(self):
        for zipfile in self.get_zips_in_folder(self.data_location):
            for text_file in self.get_files_names_from_zip(zipfile):
                yield self.preproccess_text(text_file)

    def stripXMLtags(self, text):
        tree = etree.parse(text)
        notags = etree.tostring(tree, encoding='utf8', method='text')
        return notags.decode("utf-8")

    def remove_newline(self, text):
        text.replace("\n", " ")
        return text

    def preproccess_text(self, text):
        text = self.stripXMLtags(text)
        text = self.remove_newline(text)
        return text

    def get_files_names_from_zip(self, zip_location):
        files = []
        archive = zipfile.ZipFile(zip_location, 'r')
        for info in archive.infolist():
            files.append(archive.open(info.filename))
        return files

    def get_zips_in_folder(self, location):
        zip_files = []
        for root, dirs, files in os.walk(location):
            for name in files:
                if name.endswith((".zip")):
                    filepath = root + "/" + name
                    zip_files.append(filepath)
        return zip_files

main()
Running

for d in data:
    for dd in d:
        print(type(dd))

does tell me that dd is of type string and contains the correctly preprocessed strings (each between 50 and 5000 words long).
Update after discussion:
Your TrainingData class's __iter__() function doesn't provide a generator that yields each text in turn, but rather a generator that yields a single other generator. (There's one level of yield too many.) That's not what Word2Vec expects.

Changing the body of your __iter__() method to simply...

return self.get_data()

...so that __iter__() becomes a synonym for your get_data() and returns the same text generator, should help.
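To make the change concrete, the method before and after the fix:

def __iter__(self):        # before: yields one item, which is itself a generator
    yield self.get_data()

def __iter__(self):        # after: returns the per-text generator directly
    return self.get_data()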
Original answer:
You didn't show the TrainingData.preproccess_text() (sic) method that's referenced inside get_data(), which is what actually creates the data Word2Vec is processing, and it's that data that's triggering the error.

Word2Vec requires its sentences corpus to be an iterable sequence (for which a generator is suitable) where each individual item is a list of string tokens.

From that error, it looks like the individual items in your TrainingData sequence may themselves be generators, rather than lists with a readable len().
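One way to satisfy that requirement, sketched against the question's get_data() and assuming plain whitespace tokenization is good enough for this corpus, is to split each preprocessed text before yielding it:

def get_data(self):
    for zip_path in self.get_zips_in_folder(self.data_location):
        for text_file in self.get_files_names_from_zip(zip_path):
            text = self.preproccess_text(text_file)
            yield text.split()  # a list of string tokens, which has a len()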
(Separately, if you chose to use generators there because individual texts can be very, very long, be aware that gensim's Word2Vec and related classes only train on individual texts of up to 10000 word tokens; any words past the 10000th are ignored. If that's a concern, your source texts should be broken up in advance into separate texts of 10000 tokens or fewer.)
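If that limit matters, a minimal sketch of pre-splitting a long token list into trainable pieces (the helper name chunk_tokens is hypothetical, not part of gensim):

def chunk_tokens(tokens, max_len=10000):
    # yield consecutive slices of at most max_len tokens so nothing is silently dropped
    for start in range(0, len(tokens), max_len):
        yield tokens[start:start + max_len]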