Determining the most contributing features for SVM classifier in sklearn
I have a dataset on which I want to train my model. After training, I need to know which features are the major contributors in the classification made by the SVM classifier.
There is something called feature importance for forest algorithms. Is there anything similar?
Yes, there is the attribute coef_ for the SVM classifier, but it only works for SVMs with a linear kernel. For other kernels it is not possible, because the data are transformed by the kernel method into another space which is not related to the input space (check this explanation).
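As a quick sanity check (a minimal sketch on synthetic data; make_classification is just stand-in data, not part of the original answer), sklearn itself raises an AttributeError when you ask for coef_ on a non-linear kernel:

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, n_features=4, random_state=0)

try:
    SVC(kernel='rbf').fit(X, y).coef_  # not defined for non-linear kernels
except AttributeError as err:
    print(err)  # explains that coef_ requires a linear kernel

With a linear kernel, the importances can be read straight off coef_: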
from matplotlib import pyplot as plt
from sklearn import svm

def f_importances(coef, names):
    # Sort weights and feature names together, then plot as a bar chart
    imp = coef
    imp, names = zip(*sorted(zip(imp, names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()

features_names = ['input1', 'input2']
svm = svm.SVC(kernel='linear')
svm.fit(X, Y)  # X, Y: your training data
f_importances(svm.coef_[0], features_names)  # coef_ is 2D, so take the first row
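Note that coef_ is two-dimensional: for a binary problem its shape is (1, n_features), while a multiclass SVC (trained one-vs-one) yields n_classes * (n_classes - 1) / 2 rows, so pick the row you need (as done above with coef_[0]) or aggregate across rows. A quick shape check on hypothetical toy data:

import numpy as np
from sklearn.svm import SVC

# Hypothetical two-feature toy data, matching features_names above
X = np.array([[0.0, 1.0], [1.0, 1.0], [2.0, 0.0], [3.0, 2.0]])
Y = np.array([0, 0, 1, 1])

clf = SVC(kernel='linear').fit(X, Y)
print(clf.coef_.shape)  # (1, 2): one weight per feature for a binary problem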
The output of the function looks like this:
[Plot: horizontal bar chart of the feature importances]
With only a single line of code:
Fit an SVM model:
from sklearn import svm
svm = svm.SVC(gamma=0.001, C=100., kernel='linear')
And create the plot as follows:
import pandas as pd

# features is the DataFrame the model was fit on
pd.Series(abs(svm.coef_[0]), index=features.columns).nlargest(10).plot(kind='barh')
The result is:
[Plot: the most contributing features of the SVM model in absolute values]
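For a runnable end-to-end version of the above (a sketch; the breast cancer dataset is stand-in data, not part of the original answer):

import pandas as pd
from sklearn import svm
from sklearn.datasets import load_breast_cancer

# Any DataFrame of named feature columns works the same way
data = load_breast_cancer()
features = pd.DataFrame(data.data, columns=data.feature_names)

clf = svm.SVC(gamma=0.001, C=100., kernel='linear')
clf.fit(features, data.target)

# Ten largest coefficients by absolute value, as a horizontal bar chart
pd.Series(abs(clf.coef_[0]), index=features.columns).nlargest(10).plot(kind='barh')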
I created a solution that also works for Python 3 and is based on Jakub Macina's code snippet.
from matplotlib import pyplot as plt
from sklearn import svm

def f_importances(coef, names, top=-1):
    imp = coef
    imp, names = zip(*sorted(list(zip(imp, names))))

    # Show all features
    if top == -1:
        top = len(names)

    # Reverse the ascending sort so the largest weights come first
    plt.barh(range(top), imp[::-1][0:top], align='center')
    plt.yticks(range(top), names[::-1][0:top])
    plt.show()

# whatever your features are called
features_names = ['input1', 'input2', ...]

svm = svm.SVC(kernel='linear')
svm.fit(X_train, y_train)

# Specify the top n features you want to visualize.
# You can also drop the abs() call if you are interested
# in the negative contributions of features.
f_importances(abs(svm.coef_[0]), features_names, top=10)
If you are using an rbf (radial basis function) kernel, you can use sklearn.inspection.permutation_importance as follows to get feature importances. [doc]
from sklearn.inspection import permutation_importance
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

svc = SVC(kernel='rbf', C=2)
svc.fit(X_train, y_train)

perm_importance = permutation_importance(svc, X_test, y_test)

feature_names = ['feature1', 'feature2', 'feature3', ...... ]
features = np.array(feature_names)

# Sort features by mean importance and plot
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(features[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
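By default permutation_importance shuffles each feature 5 times; for more stable estimates you can raise n_repeats and fix random_state (the values below are only illustrative):

perm_importance = permutation_importance(
    svc, X_test, y_test, n_repeats=30, random_state=0
)

Since this approach only needs predictions on held-out data, it works for any fitted estimator, not just SVMs with an rbf kernel.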