如何在管道内使用 t-SNE
How to use t-SNE inside the pipeline
如何在管道中使用 t-SNE
?
我已经设法在没有流水线的情况下成功地 运行 t-SNE
并在其上进行了分类算法。
我是否需要编写一个可以在 returns 数据帧的管道中调用的自定义方法,或者它是如何工作的?
# How I used t-SNE
%%time
from sklearn.manifold import TSNE
X_std = StandardScaler().fit_transform(dfListingsFeature_classification)
ts = TSNE()
X_tsne = ts.fit_transform(X_std)
print(X_tsne.shape)
feature_list = []
for i in range(1,X_tsne.shape[1]+1):
feature_list .append("TSNE" + str(i))
df_new = pd.DataFrame(X_tsne, columns= feature_list )
df_new['label'] = y
#df_new.head()
X = df_new.drop(columns=['label'])
y = df_new['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
rfc= RandomForestClassifier()
# Train Decision Tree Classifer
rfc= rfc.fit(X_train,y_train)
#Predict the response for test dataset
y_pred = rfc.predict(X_test)
我想用它
# How could I use TSNE() inside the the pipeline?
%%time
steps = [('standardscaler', StandardScaler()),
('tsne', TSNE()),
('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30)
parameteres = {'rfc__max_depth':[1,2,3,4,5,6,7,8,9,10,11,12],
'rfc__criterion':['gini', 'entropy']}
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
grid.fit(X_train, y_train)
print("score = %3.2f" %(grid.score(X_test,y_test)))
print('Training set score: ' + str(grid.score(X_train,y_train)))
print('Test set score: ' + str(grid.score(X_test,y_test)))
print(grid.best_params_)
y_pred = grid.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precison:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
[OUT] TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'TSNE()' (type <class 'sklearn.manifold._t_sne.TSNE'>) doesn't
我应该构建自定义方法还是如何构建?如果是这样,它应该是什么样子?
class TestTSNE(BaseEstimator, TransformerMixin):
def __init__(self):
# don't know
def fit(self, X, y = None):
X_std = StandardScaler().fit_transform(dfListingsFeature_classification)
ts = TSNE()
X_tsne = ts.fit_transform(X_std)
return self
def transform(self, X, y = None):
feature_list = []
for i in range(1,shelf.X_tsne.shape[1]+1):
feature_list .append("TSNE" + str(i))
df_new = pd.DataFrame(X_tsne, columns= feature_list )
df_new['label'] = y
#df_new.head()
X = df_new.drop(columns=['label'])
y = df_new['label']
return X, y
...
steps = [('standardscaler', StandardScaler()),
('testTSNE', TestTSNE()),
('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
我认为你误解了管道的使用。来自 help page:
Pipeline of transforms with a final estimator.
Sequentially apply a list of transforms and a final estimator.
Intermediate steps of the pipeline must be ‘transforms’, that is, they
must implement fit and transform methods. The final estimator only
needs to implement fit
所以这意味着如果您的管道是:
steps = [('standardscaler', StandardScaler()),
('tsne', TSNE()),
('rfc', RandomForestClassifier())]
您将首先将标准缩放器应用于您的特征,然后使用 tsne 转换结果,然后再将其传递给分类器。我认为在 tsne 输出上训练没有多大意义。
如果你真的想锁定管道,那么你需要将 tsne 的结果存储为一个属性,然后 return 特征,按原样训练,以便分类器可以工作它。
类似
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification
class TestTSNE(BaseEstimator, TransformerMixin):
def __init__(self,n_components,random_state=None,method='exact'):
self.n_components = n_components
self.method = method
self.random_state = random_state
def fit(self, X, y = None):
ts = TSNE(n_components = self.n_components,
method = self.method, random_state = self.random_state)
self.X_tsne = ts.fit_transform(X)
return self
def transform(self, X, y = None):
return X
然后:
steps = [('standardscaler', StandardScaler()),
('testTSNE', TestTSNE(2)),
('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
X,y = make_classification()
pipeline.fit(X,y)
您可以像这样取回您的 tsne:
pd.DataFrame(pipeline.steps[1][1].X_tsne)
0 1
0 -38.756626 -4.693253
1 46.516308 53.633842
2 49.107910 16.482645
3 18.306377 9.432504
4 33.551056 -27.441383
.. ... ...
95 -31.337574 -16.913471
96 -57.918224 -39.959976
97 55.282658 37.582535
98 66.425125 19.717241
99 -50.692646 11.545088
如何在管道中使用 t-SNE
?
我已经设法在没有流水线的情况下成功地 运行 t-SNE
并在其上进行了分类算法。
我是否需要编写一个可以在 returns 数据帧的管道中调用的自定义方法,或者它是如何工作的?
# How I used t-SNE
%%time
from sklearn.manifold import TSNE
X_std = StandardScaler().fit_transform(dfListingsFeature_classification)
ts = TSNE()
X_tsne = ts.fit_transform(X_std)
print(X_tsne.shape)
feature_list = []
for i in range(1,X_tsne.shape[1]+1):
feature_list .append("TSNE" + str(i))
df_new = pd.DataFrame(X_tsne, columns= feature_list )
df_new['label'] = y
#df_new.head()
X = df_new.drop(columns=['label'])
y = df_new['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
rfc= RandomForestClassifier()
# Train Decision Tree Classifer
rfc= rfc.fit(X_train,y_train)
#Predict the response for test dataset
y_pred = rfc.predict(X_test)
我想用它
# How could I use TSNE() inside the the pipeline?
%%time
steps = [('standardscaler', StandardScaler()),
('tsne', TSNE()),
('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30)
parameteres = {'rfc__max_depth':[1,2,3,4,5,6,7,8,9,10,11,12],
'rfc__criterion':['gini', 'entropy']}
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
grid.fit(X_train, y_train)
print("score = %3.2f" %(grid.score(X_test,y_test)))
print('Training set score: ' + str(grid.score(X_train,y_train)))
print('Test set score: ' + str(grid.score(X_test,y_test)))
print(grid.best_params_)
y_pred = grid.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precison:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
[OUT] TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'TSNE()' (type <class 'sklearn.manifold._t_sne.TSNE'>) doesn't
我应该构建自定义方法还是如何构建?如果是这样,它应该是什么样子?
class TestTSNE(BaseEstimator, TransformerMixin):
def __init__(self):
# don't know
def fit(self, X, y = None):
X_std = StandardScaler().fit_transform(dfListingsFeature_classification)
ts = TSNE()
X_tsne = ts.fit_transform(X_std)
return self
def transform(self, X, y = None):
feature_list = []
for i in range(1,shelf.X_tsne.shape[1]+1):
feature_list .append("TSNE" + str(i))
df_new = pd.DataFrame(X_tsne, columns= feature_list )
df_new['label'] = y
#df_new.head()
X = df_new.drop(columns=['label'])
y = df_new['label']
return X, y
...
steps = [('standardscaler', StandardScaler()),
('testTSNE', TestTSNE()),
('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
我认为你误解了管道的使用。来自 help page:
Pipeline of transforms with a final estimator.
Sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods. The final estimator only needs to implement fit
所以这意味着如果您的管道是:
steps = [('standardscaler', StandardScaler()),
('tsne', TSNE()),
('rfc', RandomForestClassifier())]
您将首先将标准缩放器应用于您的特征,然后使用 tsne 转换结果,然后再将其传递给分类器。我认为在 tsne 输出上训练没有多大意义。
如果你真的想锁定管道,那么你需要将 tsne 的结果存储为一个属性,然后 return 特征,按原样训练,以便分类器可以工作它。
类似
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification
class TestTSNE(BaseEstimator, TransformerMixin):
def __init__(self,n_components,random_state=None,method='exact'):
self.n_components = n_components
self.method = method
self.random_state = random_state
def fit(self, X, y = None):
ts = TSNE(n_components = self.n_components,
method = self.method, random_state = self.random_state)
self.X_tsne = ts.fit_transform(X)
return self
def transform(self, X, y = None):
return X
然后:
steps = [('standardscaler', StandardScaler()),
('testTSNE', TestTSNE(2)),
('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
X,y = make_classification()
pipeline.fit(X,y)
您可以像这样取回您的 tsne:
pd.DataFrame(pipeline.steps[1][1].X_tsne)
0 1
0 -38.756626 -4.693253
1 46.516308 53.633842
2 49.107910 16.482645
3 18.306377 9.432504
4 33.551056 -27.441383
.. ... ...
95 -31.337574 -16.913471
96 -57.918224 -39.959976
97 55.282658 37.582535
98 66.425125 19.717241
99 -50.692646 11.545088