使用 Python 进行文本分类
Text Classification with Python
嗨,我是 Python 编程语言的新手。参考了各种资料之后,我使用逻辑回归构建了一个文本分类模型,代码如下。
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import string
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# The training rows and the rows to classify are read from the same workbook file.
workbook_path = "/Desktop/ML Based Text classification/test.xlsx"

# No sheet_name is passed here, so pandas reads its default sheet.
Train = pd.read_excel(workbook_path)
# The rows to classify are read from the sheet named 'Test'.
real = pd.read_excel(workbook_path, sheet_name='Test')

# Shared text-normalisation resources used by remove_stopwords() and stemmer().
porter = PorterStemmer()
stop = stopwords.words('english')

# Only the 'description' column is used as model input.
Train_data, Test_data = Train['description'], real['description']
def remove_stopwords(text):
    """Lower-case *text* and drop every word listed in the module-level ``stop``.

    Args:
        text: A string; it is split on whitespace.

    Returns:
        The surviving lower-cased words joined by single spaces.
    """
    lowered_words = (word.lower() for word in text.split())
    return " ".join(word for word in lowered_words if word not in stop)
def stemmer(stem_text):
    """Stem every whitespace-separated word of *stem_text* with ``porter``.

    Args:
        stem_text: A string; it is split on whitespace.

    Returns:
        The stemmed words joined by single spaces.
    """
    return " ".join(map(porter.stem, stem_text.split()))
def clean_data(data):
    """Normalise a pandas Series of raw text before vectorisation.

    The steps run in this order: remove every character that is neither a
    word character nor whitespace, remove runs of digits, lower-case the
    text and drop stop words (``remove_stopwords``), then stem each word
    (``stemmer``).

    Args:
        data: A pandas Series of strings.

    Returns:
        A Series of ``str`` values holding the cleaned text.
    """
    # regex=True must be explicit: from pandas 2.0 on, Series.str.replace
    # treats the pattern as a literal string by default, which would leave
    # punctuation and digits untouched. Raw strings keep ``\w``, ``\s`` and
    # ``\d`` from being read as (invalid) Python string escapes.
    without_punctuation = data.str.replace(r'[^\w\s]', '', regex=True)
    without_digits = without_punctuation.str.replace(r'\d+', '', regex=True)
    text_clean = (without_digits
                  .apply(remove_stopwords)
                  .apply(stemmer)
                  .astype(str))
    return text_clean
# Normalise the training descriptions with the same routine that is applied
# to the unseen descriptions further down.
Train_data = clean_data(Train_data)

# Encode each tag as an integer id, most frequent tag first. Only the 50 most
# common tags receive an id.
# Example: {'Outlook Related Issue': 0, 'Password Reset': 1, 'VPN Issue': 2}
counter = Counter(Train['tags'].tolist())
top_10_varieties = {tag: idx for idx, (tag, _count) in enumerate(counter.most_common(50))}
Train['Mapping'] = Train['tags'].map(top_10_varieties)

# Not used by the pipeline below; kept so that any other code referring to
# these names keeps working.
tfidf_converter = TfidfVectorizer()
model_log = LogisticRegression()

# Tags outside the 50 most common map to NaN, and the classifier cannot be
# fitted on NaN labels, so only rows that received an id are used. When every
# tag has an id this selects all rows.
has_label = Train['Mapping'].notna()
X = Train_data[has_label]
Y = Train.loc[has_label, 'Mapping'].astype('int64')

# NOTE(review): test_size=0.95 holds out 95% of the rows and fits the model on
# the remaining 5% - confirm this split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.95, random_state=0)

# TF-IDF features feeding a logistic-regression classifier.
svc = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])
svc.fit(X_train, y_train)

ytest = np.array(y_test)
# NOTE(review): these held-out predictions are overwritten two lines below
# without being compared against ytest.
y_pred = svc.predict(X_test)

# Predict integer tag ids for the unseen descriptions.
Test_data = clean_data(Test_data)
y_pred = svc.predict(Test_data)
现在这段代码可以正常运行,没有报错。当我打印 "y_pred" 时,得到的输出是:
array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 2, 1, 2, 0, 2, 2, 2, 1, 0, 1,
1, 2, 1, 2, 0, 0, 2, 2, 1, 0, 0, 2, 0, 0, 0], dtype=int64)
我不确定如何把这些数字转换回它们所对应的标签字符串,并据此为我的原始数据打上标签。我想要这样的输出:
请尝试:
reverse_top_10_varieties = {idx:i[0] for idx, i in enumerate(counter.most_common(50))}
[reverse_top_10_varieties[id] for id in y_pred]
看看这是否能解决您的问题
嗨,我是 Python 编程语言的新手。参考了各种资料之后,我使用逻辑回归构建了一个文本分类模型,代码如下。
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import string
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# The training rows and the rows to classify are read from the same workbook file.
workbook_path = "/Desktop/ML Based Text classification/test.xlsx"

# No sheet_name is passed here, so pandas reads its default sheet.
Train = pd.read_excel(workbook_path)
# The rows to classify are read from the sheet named 'Test'.
real = pd.read_excel(workbook_path, sheet_name='Test')

# Shared text-normalisation resources used by remove_stopwords() and stemmer().
porter = PorterStemmer()
stop = stopwords.words('english')

# Only the 'description' column is used as model input.
Train_data, Test_data = Train['description'], real['description']
def remove_stopwords(text):
    """Lower-case *text* and drop every word listed in the module-level ``stop``.

    Args:
        text: A string; it is split on whitespace.

    Returns:
        The surviving lower-cased words joined by single spaces.
    """
    lowered_words = (word.lower() for word in text.split())
    return " ".join(word for word in lowered_words if word not in stop)
def stemmer(stem_text):
    """Stem every whitespace-separated word of *stem_text* with ``porter``.

    Args:
        stem_text: A string; it is split on whitespace.

    Returns:
        The stemmed words joined by single spaces.
    """
    return " ".join(map(porter.stem, stem_text.split()))
def clean_data(data):
    """Normalise a pandas Series of raw text before vectorisation.

    The steps run in this order: remove every character that is neither a
    word character nor whitespace, remove runs of digits, lower-case the
    text and drop stop words (``remove_stopwords``), then stem each word
    (``stemmer``).

    Args:
        data: A pandas Series of strings.

    Returns:
        A Series of ``str`` values holding the cleaned text.
    """
    # regex=True must be explicit: from pandas 2.0 on, Series.str.replace
    # treats the pattern as a literal string by default, which would leave
    # punctuation and digits untouched. Raw strings keep ``\w``, ``\s`` and
    # ``\d`` from being read as (invalid) Python string escapes.
    without_punctuation = data.str.replace(r'[^\w\s]', '', regex=True)
    without_digits = without_punctuation.str.replace(r'\d+', '', regex=True)
    text_clean = (without_digits
                  .apply(remove_stopwords)
                  .apply(stemmer)
                  .astype(str))
    return text_clean
# Normalise the training descriptions with the same routine that is applied
# to the unseen descriptions further down.
Train_data = clean_data(Train_data)

# Encode each tag as an integer id, most frequent tag first. Only the 50 most
# common tags receive an id.
# Example: {'Outlook Related Issue': 0, 'Password Reset': 1, 'VPN Issue': 2}
counter = Counter(Train['tags'].tolist())
top_10_varieties = {tag: idx for idx, (tag, _count) in enumerate(counter.most_common(50))}
Train['Mapping'] = Train['tags'].map(top_10_varieties)

# Not used by the pipeline below; kept so that any other code referring to
# these names keeps working.
tfidf_converter = TfidfVectorizer()
model_log = LogisticRegression()

# Tags outside the 50 most common map to NaN, and the classifier cannot be
# fitted on NaN labels, so only rows that received an id are used. When every
# tag has an id this selects all rows.
has_label = Train['Mapping'].notna()
X = Train_data[has_label]
Y = Train.loc[has_label, 'Mapping'].astype('int64')

# NOTE(review): test_size=0.95 holds out 95% of the rows and fits the model on
# the remaining 5% - confirm this split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.95, random_state=0)

# TF-IDF features feeding a logistic-regression classifier.
svc = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])
svc.fit(X_train, y_train)

ytest = np.array(y_test)
# NOTE(review): these held-out predictions are overwritten two lines below
# without being compared against ytest.
y_pred = svc.predict(X_test)

# Predict integer tag ids for the unseen descriptions.
Test_data = clean_data(Test_data)
y_pred = svc.predict(Test_data)
现在这段代码可以正常运行,没有报错。当我打印 "y_pred" 时,得到的输出是:
array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 2, 1, 2, 0, 2, 2, 2, 1, 0, 1,
1, 2, 1, 2, 0, 0, 2, 2, 1, 0, 0, 2, 0, 0, 0], dtype=int64)
我不确定如何把这些数字转换回它们所对应的标签字符串,并据此为我的原始数据打上标签。我想要这样的输出:
请尝试:
reverse_top_10_varieties = {idx:i[0] for idx, i in enumerate(counter.most_common(50))}
[reverse_top_10_varieties[id] for id in y_pred]
看看这是否能解决您的问题