I want to solve this error AttributeError: module 'tokenization' has no attribute 'FullTokenizer'
I am working with TensorFlow 2, but I get the error below when I import tokenization.
I have already tried pip3 install with several TensorFlow versions (1.0 through 2.0) and with the tokenizer and tokenization packages, but it still does not work. Can you suggest how to fix this error?
Error message:
Traceback (most recent call last):
File "_count_tokenization.py", line 26, in <module>
my_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path)
AttributeError: module 'tokenization' has no attribute 'FullTokenizer'
import tokenization
import codecs
import numpy as np
vocab_path = "./model_ch/vocab.txt"
max_seq_length = 128
file0 = "./task/message.tsv"
f0 = codecs.open(file0, "r", "utf-8")
lines = f0.readlines()
f0.close()
len_file = len(lines)
count = np.zeros([len_file])
count0 = np.zeros([len_file])
my_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path)
#file1 = "./task_data_ch/%s_count.tsv" % filename
file1 = "./task/message_count.tsv"
f1 = codecs.open(file1, "w", "utf-8")
f1.write("%s\t%s\t%s\r\n" % ("label","count","count_truncated"))
for i in range(1, len_file):              # skip the header row
    a = lines[i].split("\t")
    text = a[1]                           # the text column
    token = my_tokenizer.tokenize(text)
    print(token)
    count[i] = len(token) + 2             # +2 for [CLS] and [SEP]
    if count[i] > max_seq_length:
        count0[i] = max_seq_length
    else:
        count0[i] = count[i]
    f1.write("%s\t%s\t%s\n" % (i - 1, int(count[i]), int(count0[i])))
sum0 = int(np.sum(count0))
sum1 = int(np.sum(count))
print(sum0, sum1)
print(int(len_file-1))
f1.write("Total: %s, %s" % (sum1,sum0))
f1.close()
The output of pip3 list:
tensorboard 2.2.1
tensorboard-plugin-wit 1.6.0.post3
tensorflow-estimator 2.2.0
tokenization 1.0.7
tokenizer 2.0.5
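For reference, a quick way to check which module Python actually imports under the name tokenization, and whether it defines FullTokenizer, is to print its file path and public attributes (just a diagnostic sketch, not a fix):

import tokenization
# Show which installed file the import resolved to
print(tokenization.__file__)
# List the public names the module exposes; FullTokenizer should appear here if it exists
print([name for name in dir(tokenization) if not name.startswith("_")])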
The snippet below will enable TF 2.0 for you.
# Colab has two versions of TensorFlow installed: a 1.x version and a 2.x version.
# Colab currently uses TF 1.x by default.
# To enable TF 2.x, execute the following code:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)
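As a quick optional sanity check, eager execution is on by default in TF 2.x, so the following should print True:

print(tf.executing_eagerly())  # True under TF 2.x, False in TF 1.x graph mode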
Then import nltk together with the specific resources you need, such as stopwords, tokenization, etc.
import nltk
nltk.download("popular")
from nltk.tokenize import sent_tokenize, word_tokenize
The line below will split your text into sentences; if you want to split a sentence into individual words instead, you can use word_tokenize as shown right after it:
tokens = sent_tokenize("Your paragraphs or multiple sentences")
text = "I love NLP and I will learn NLP in 2 months"
words = nltk.word_tokenize(text)
words
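If the goal is just to reproduce the token-counting loop from the question without the BERT tokenizer, a minimal sketch using nltk's word_tokenize as a stand-in is shown below. It assumes the same ./task/message.tsv layout with the text in the second column; note that plain word counts will differ from BERT's WordPiece token counts.

import codecs
import nltk
from nltk.tokenize import word_tokenize

nltk.download("punkt")                      # tokenizer models used by word_tokenize

max_seq_length = 128
with codecs.open("./task/message.tsv", "r", "utf-8") as f0:
    lines = f0.readlines()

total, total_truncated = 0, 0
for line in lines[1:]:                      # skip the header row
    text = line.split("\t")[1]
    tokens = word_tokenize(text)
    n = len(tokens) + 2                     # +2 to mimic [CLS] and [SEP]
    total += n
    total_truncated += min(n, max_seq_length)

print(total_truncated, total)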