Converting Sentences to a Matrix for NLP
I am trying to take sentences, build a dictionary of the possible terms, and then convert the sentences into a matrix where the rows represent sentence 1, sentence 2, etc., and the columns represent the words those sentences may contain, with a 1 or 0 indicating whether that particular word appears in the corresponding row (sentence). Below is what I have so far. The first half works as intended (I believe), but the logic in the section titled "Find word tokens in sentences and mark for future analysis" is clearly wrong, and I can't quite see how to fix it. I imagine there may be a better way to do this in NLTK, but I haven't found it yet.
import string
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#'punkt' is NLTK data, not an importable module; run nltk.download('punkt'),
#nltk.download('wordnet') and nltk.download('stopwords') once beforehand
text_array = ["I run faster than you","I own four computers"]
# =============================================================================
# Create an array of possible words that has punctuation removed and is lemmatized
# =============================================================================
wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
all_words = []
for sentence in text_array:
    #Take every word in each sentence and tokenize it
    tokenize_word = word_tokenize(sentence)
    for word in tokenize_word:
        #Find the lemma of every word previously tokenized, for example "running" should become something like "run"
        word_mod = wordnet_lemmatizer.lemmatize(word)
        #Remove punctuation from individual words
        word_mod = word_mod.translate(str.maketrans('','',string.punctuation))
        #Check if word tokens are empty (because some tokenized words were just punctuation) or are a stop word, don't append in either case
        if word_mod != '' and word_mod not in stop_words:
            all_words.append(word_mod)
unique_words = set(all_words)
unique_words_list = list(unique_words)
# =============================================================================
# Find word tokens in sentences and mark for future analysis
# =============================================================================
text_array_ex = text_array
results_matrix = np.zeros(shape=(len(text_array_ex),len(unique_words_list)),dtype='int')
for i in range(0,len(text_array_ex)):
    sentence = text_array_ex[i]
    #Take every word in each sentence and tokenize it
    tokenize_word = word_tokenize(sentence)
    for word in tokenize_word:
        #Find the lemma of every word previously tokenized, for example "running" should become something like "run"
        word_mod = wordnet_lemmatizer.lemmatize(word)
        #Remove punctuation from individual words
        word_mod = word_mod.translate(str.maketrans('','',string.punctuation))
        for j in range(0,len(unique_words_list)):
            if unique_words_list[j] == word_mod:
                results_matrix[i,j] = 1
            else: results_matrix[i,j] = 0
Well, I was able to figure out my mistake, and of course it was a silly one: the else branch was resetting a column to 0 for every non-matching word, which wiped out matches recorded by earlier words in the same sentence. Dropping the else and incrementing the count fixed it.
#Re-create the matrix before filling it, so the counts don't pile onto the earlier (buggy) run
results_matrix = np.zeros(shape=(len(text_array_ex),len(unique_words_list)),dtype='int')
for i in range(0,len(text_array_ex)):
    sentence = text_array_ex[i]
    #Take every word in each sentence and tokenize it
    tokenize_word = word_tokenize(sentence)
    for word in tokenize_word:
        #Find the lemma of every word previously tokenized, for example "running" should become something like "run"
        word_mod = wordnet_lemmatizer.lemmatize(word)
        #Remove punctuation from individual words
        word_mod = word_mod.translate(str.maketrans('','',string.punctuation))
        for j in range(0,len(unique_words_list)):
            if unique_words_list[j] == word_mod:
                results_matrix[i,j] += 1
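As a side note, a dictionary mapping each vocabulary word to its column index removes the inner loop over unique_words_list entirely, and with it the chance of this kind of overwrite bug. The following is just a sketch that reuses the variables already built above (text_array_ex, unique_words_list, wordnet_lemmatizer); word_to_index is a name I introduce here.

#Sketch: same counting loop, but with a dict lookup instead of the inner j loop
#Assumes text_array_ex, unique_words_list and wordnet_lemmatizer from the code above
word_to_index = {word: j for j, word in enumerate(unique_words_list)}
results_matrix = np.zeros((len(text_array_ex), len(unique_words_list)), dtype='int')
for i, sentence in enumerate(text_array_ex):
    for word in word_tokenize(sentence):
        #Same preprocessing as the vocabulary-building step: lemmatize, then strip punctuation
        word_mod = wordnet_lemmatizer.lemmatize(word)
        word_mod = word_mod.translate(str.maketrans('', '', string.punctuation))
        if word_mod in word_to_index:
            results_matrix[i, word_to_index[word_mod]] += 1

For what it's worth, if adding a dependency is an option, scikit-learn's CountVectorizer builds the same kind of document-term count matrix directly from a list of sentences and accepts a custom tokenizer and stop-word list.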