Converting Sentences to a Matrix for NLP

I am trying to take sentences, build a dictionary of the possible terms, and then convert the sentences into a matrix where the rows represent sentence 1, sentence 2, and so on, and the columns represent the words those sentences might contain, with a 1 or 0 indicating whether that particular word appears in the corresponding row (sentence). Below is what I have so far. The first half works as expected (I believe), but the logic in the section titled "Find word tokens in sentences and mark for future analysis" is clearly wrong and I can't quite work out how to fix it. I suspect there may be a better way to do this in NLTK, but I haven't found it yet.
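
To make the target concrete, for the two sample sentences used below the matrix I'm after looks roughly like this (the real column order will vary, since the vocabulary comes from a set):

#                            run  faster  own  four  computer
# "I run faster than you"     1     1      0    0       0
# "I own four computers"      0     0      1    1       1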

from nltk import word_tokenize
import nltk
import numpy as np
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# NLTK data used below must be available, e.g. via:
# nltk.download('punkt'); nltk.download('wordnet'); nltk.download('stopwords')

text_array = ["I run faster than you","I own four computers"]

# =============================================================================
# Create an array of possible words that has punctuation removed and is lemmatized
# =============================================================================

wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

all_words = []

for sentence in text_array:
    #Take every word in each sentence and tokenize it
    tokenize_word = word_tokenize(sentence)
    for word in tokenize_word:
        #Find the lemma of every word previously tokenized, for example "running" should become something like "run"
        word_mod = wordnet_lemmatizer.lemmatize(word)
        #Remove punctuation from individual words
        word_mod = word_mod.translate(str.maketrans('','',string.punctuation))
        #Check if word tokens are empty (because some tokenized words were just punctuation) or are a stop word, don't append in either case
        if word_mod != '' and word_mod not in stop_words:
            all_words.append(word_mod)

unique_words = set(all_words)
unique_words_list = list(unique_words)

# =============================================================================
# Find word tokens in sentences and mark for future analysis
# =============================================================================


text_array_ex = text_array
results_matrix = np.zeros(shape=(len(text_array_ex),len(unique_words_list)),dtype='int') 

for i in range(0,len(text_array_ex)):
    sentence = text_array_ex[i]
    #Take every word in each sentence and tokenize it
    tokenize_word = word_tokenize(sentence)
    for word in tokenize_word:
        #Find the lemma of every word previously tokenized, for example "running" should become something like "run"
        word_mod = wordnet_lemmatizer.lemmatize(word)
        #Remove punctuation from individual words
        word_mod = word_mod.translate(str.maketrans('','',string.punctuation))
        for j in range(0,len(unique_words_list)):
            if unique_words_list[j] == word_mod:
                results_matrix[i,j] = 1
            else: results_matrix[i,j] = 0

Well, I was able to figure out my mistake, and of course it was a silly one: the else branch was resetting columns that earlier words in the sentence had already marked, so each row only ever reflected the last token. Dropping the else and incrementing instead of assigning fixes it:

for i in range(0,len(text_array_ex)):
    sentence = text_array_ex[i]
    #Take every word in each sentence and tokenize it
    tokenize_word = word_tokenize(sentence)
    for word in tokenize_word:
        #Find the lemma of every word previously tokenized, for example "running" should become something like "run"
        word_mod = wordnet_lemmatizer.lemmatize(word)
        #Remove punctuation from individual words
        word_mod = word_mod.translate(str.maketrans('','',string.punctuation))
        for j in range(0,len(unique_words_list)):
            if unique_words_list[j] == word_mod:
                results_matrix[i,j] += 1