ValueError: Found input variables with inconsistent numbers of samples on binary SVM
ValueError: Found input variables with inconsistent numbers of samples on binary SVM
尝试在 20_newsgroups 数据集上 运行 二进制 SVM。似乎出现了 ValueError:发现样本数量不一致的输入变量:[783、1177]。谁能建议为什么会这样?
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
# Restrict the task to a binary problem: two newsgroup topics.
categories = ["comp.graphics", 'sci.space']
# Fetch the matching train/test splits of the 20 newsgroups corpus.
data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42)
def is_letter_only(word):
    """Return True when *word* consists solely of alphabetic characters."""
    return word.isalpha()
# Set of personal names (NLTK names corpus) to filter out of documents.
all_names = set (names.words())
# WordNet-based lemmatizer reused for every token.
lemmatizer = WordNetLemmatizer()
def clean_text(docs):
    """Lower-case, filter and lemmatize each document.

    Keeps only purely alphabetic tokens that are not personal names,
    lemmatizes the survivors, and rejoins them into one string per doc.

    NOTE(review): relies on the module-level ``lemmatizer`` and
    ``all_names`` objects built from NLTK resources.
    """
    cleaned = []
    for raw in docs:
        tokens = raw.lower().split()
        kept = [lemmatizer.lemmatize(tok)
                for tok in tokens
                if is_letter_only(tok) and tok not in all_names]
        cleaned.append(' '.join(kept))
    return cleaned
# Clean both splits.  The test documents must come from data_test:
# cleaning data_train.data here (the original bug) makes the test
# feature matrix have 1177 rows while label_test has 783 entries,
# which is exactly the reported ValueError about inconsistent sample
# counts [783, 1177].
cleaned_train = clean_text(data_train.data)
label_train = data_train.target
cleaned_test = clean_text(data_test.data)  # FIX: was data_train.data
label_test = data_test.target
# Sanity check: the two label vectors' sizes (no-op expression,
# meaningful only as a notebook cell output).
len(label_train),label_test and len(label_test)
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn the cleaned text into TF-IDF features: fit the vocabulary on the
# training split only, then reuse it to transform the test split so both
# matrices share the same feature space.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=None)
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)
from sklearn.svm import SVC
# Linear-kernel SVM for the binary topic-classification task.
svm = SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(term_docs_train, label_train)
# score() predicts on term_docs_test and compares with label_test;
# the two must contain the same number of samples or sklearn raises
# the "inconsistent numbers of samples" ValueError.
accuracy = svm.score(term_docs_test, label_test)
print(accuracy)
该错误表明:您用来预测的样本数量与对应标签的数量不一致。出现这种情况是因为您把训练集数据同时当成了训练集和测试集来清洗(cleaned_test 实际上来自 data_train.data),但随后却用大小不同的真正测试集标签 label_test 去评分。
只需修正这一行:
cleaned_test = clean_text(data_test.data)
您脚本的结果是:
0.966794380587484
尝试在 20_newsgroups 数据集上 运行 二进制 SVM。似乎出现了 ValueError:发现样本数量不一致的输入变量:[783、1177]。谁能建议为什么会这样?
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
# Restrict the task to a binary problem: two newsgroup topics.
categories = ["comp.graphics", 'sci.space']
# Fetch the matching train/test splits of the 20 newsgroups corpus.
data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42)
def is_letter_only(word):
    """True iff *word* is made up of alphabetic characters only."""
    return word.isalpha()
# Set of personal names (NLTK names corpus) to filter out of documents.
all_names = set (names.words())
# WordNet-based lemmatizer reused for every token.
lemmatizer = WordNetLemmatizer()
def clean_text(docs):
    """Lower-case, filter and lemmatize each document.

    Keeps only purely alphabetic tokens absent from the personal-name
    set, lemmatizes them, and rejoins them into one string per doc.

    NOTE(review): relies on the module-level ``lemmatizer`` and
    ``all_names`` objects built from NLTK resources.
    """
    result = []
    for text in docs:
        words = text.lower().split()
        survivors = [lemmatizer.lemmatize(w)
                     for w in words
                     if is_letter_only(w) and w not in all_names]
        result.append(' '.join(survivors))
    return result
# Clean both splits.  The test documents must come from data_test:
# cleaning data_train.data here (the original bug) makes the test
# feature matrix have 1177 rows while label_test has 783 entries,
# which is exactly the reported ValueError about inconsistent sample
# counts [783, 1177].
cleaned_train = clean_text(data_train.data)
label_train = data_train.target
cleaned_test = clean_text(data_test.data)  # FIX: was data_train.data
label_test = data_test.target
# Sanity check: the two label vectors' sizes (no-op expression,
# meaningful only as a notebook cell output).
len(label_train),len(label_test)
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn the cleaned text into TF-IDF features: fit the vocabulary on the
# training split only, then reuse it to transform the test split so both
# matrices share the same feature space.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=None)
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)
from sklearn.svm import SVC
# Linear-kernel SVM for the binary topic-classification task.
svm = SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(term_docs_train, label_train)
# score() predicts on term_docs_test and compares with label_test;
# the two must contain the same number of samples or sklearn raises
# the "inconsistent numbers of samples" ValueError.
accuracy = svm.score(term_docs_test, label_test)
print(accuracy)
该错误表明:您用来预测的样本数量与对应标签的数量不一致。出现这种情况是因为您把训练集数据同时当成了训练集和测试集来清洗(cleaned_test 实际上来自 data_train.data),但随后却用大小不同的真正测试集标签 label_test 去评分。
只需修正这一行:
cleaned_test = clean_text(data_test.data)
您脚本的结果是:
0.966794380587484