Visualization and Clustering in Python
I want to classify comments using an NLP approach (tf-idf).
I managed to group them into clusters, but now I would like to visualize those clusters graphically (histogram, scatter plot...).
import collections
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import string

data = pd.read_excel(r'C:\Users\cra\One\intern\Book2.xlsx')

def word_tokenizer(text):
    # Tokenizes and stems the text, dropping English stop words
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(t) for t in tokens
              if t not in stopwords.words('english')]
    return tokens

def cluster_sentences(sentences, nb_of_clusters=5):
    # tf-idf converts the text data to vectors, then KMeans groups them
    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                       stop_words=stopwords.words('english'),  # remove stop words
                                       max_df=0.95, min_df=0.05,
                                       lowercase=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    kmeans = KMeans(n_clusters=nb_of_clusters)
    kmeans.fit(tfidf_matrix)
    # Map each cluster label to the indices of the comments it contains
    clusters = collections.defaultdict(list)
    for i, label in enumerate(kmeans.labels_):
        clusters[label].append(i)
    return dict(clusters)

if __name__ == "__main__":
    sentences = data.Comment
    nclusters = 20
    # Dictionary mapping cluster -> indices of the comments in the dataframe
    clusters = cluster_sentences(sentences, nclusters)
    for cluster in range(nclusters):
        print("cluster ", cluster, ":")
        for i, sentence in enumerate(clusters[cluster]):
            print("\tsentence ", i, ": ", sentences[sentence])
For example, here is the kind of result I get:
cluster 6:
    sentence 0: 26 RIH DP std
    sentence 1: 32 RIH DP std
    sentence 2: 68 RIH Liner with DP std in hole
    sentence 3: 105 RIH DP std
    sentence 4: 118 RIH std no of DP in hole
    sentence 5: 154 RIH DP std
Can you help me? Thank you!
You will need to use t-SNE to visualize the clusters - this article on visualizing and clustering US Laws using tf-idf can help you get started.
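For example, here is a minimal sketch of what that could look like (not taken from the linked article; plot_clusters and plot_cluster_sizes are hypothetical helper names, and it assumes you change cluster_sentences to also return tfidf_matrix and kmeans.labels_):

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd

def plot_clusters(tfidf_matrix, labels):
    # Reduce the high-dimensional tf-idf vectors to 2-D with t-SNE
    tsne = TSNE(n_components=2, random_state=0)
    coords = tsne.fit_transform(tfidf_matrix.toarray())  # TSNE needs a dense array
    # Scatter plot: one point per comment, coloured by its KMeans cluster label
    plt.scatter(coords[:, 0], coords[:, 1], c=labels, cmap='tab20', s=10)
    plt.title('t-SNE projection of tf-idf vectors, coloured by cluster')
    plt.show()

def plot_cluster_sizes(labels):
    # Bar chart of how many comments fall into each cluster
    pd.Series(labels).value_counts().sort_index().plot(kind='bar')
    plt.xlabel('cluster')
    plt.ylabel('number of comments')
    plt.show()

Called as plot_clusters(tfidf_matrix, kmeans.labels_) after fitting, the first function gives you the scatter plot you describe; plot_cluster_sizes covers the histogram-style view of how many comments each cluster contains.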