在自定义转换器中使用 .loc 会产生带有切片错误的副本

Question

编辑：问题保持不变，但代码已更改。

我正在研究 Kaggle 上的家庭信用数据集，特别是 instalment_payment.csv。以下是我的自定义转换器

class Xfrmer_replace1(BaseEstimator, TransformerMixin):
    """
        this transformer does the global repplace within the dataframe
        replace 365243 spcific to this case study with 0
        replace +/-inf , nan with zero
    """
    # constructor
    def __init__(self):
        #we are not going to use this 
        self._features = None
        
    #Return self 
    def fit(self, X,y=None  ):
        return self
    
    def transform(self,X,y=None):        
        #replace high values with zero
        for col in X.columns:
            X=X.replace([365243,365243.0],0)
            print('replaced values')
        #X=X.replace([np.inf,-np.inf],np.nan)
        #X=X.replace(np.nan,0)    
    
        return X

class Xfrmer_signchng1(BaseEstimator, TransformerMixin):
    """
        this transformer does the change for positive to negative 
    """
    # constructor
    def __init__(self):
        #we are not going to use this         
        self.signchng_columns = None
        
    #Return self 
    def fit(self,X,y=None  ):
        return self
    
    def transform(self,X,y=None):        
        #change the sign of the columns
        for col in X.columns:
            print('sign change')
            X[col]= [0  if val >= 0 else (val *-1) for val in X[col] ]
    
        return X  

class Xfrmer_dif_calc1(BaseEstimator, TransformerMixin):
    """
        this transformer does the difference bewteen the two columns
        the i/p is a list of tuples
        the second item in the tuple is divided from the first item
        the third item in the tuple is the name of this new column    
    """
    # constructor
    def __init__(self):
        #we are not going to use this         
        self.dif_columns = None
        
    #Return self 
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        print('diff caclulator')
        print('X columns', X.columns)
        print(X[X.columns[0]])
        print(X[X.columns[1]])
        #print(X['AMT_PAYMENT'])
        #print(X['AMT_INSTALMENT'])
        #print(X[X.columns[0]] - X[X.columns[1]])
        #iter1.X.loc[:,'AMT_PMT_DIF'] = X[X.columns[0]] - X[X.columns[1]]
        X['AMT_PMT_DIF'] = X[X.columns[0]] - X[X.columns[1]]
        print(X['AMT_PMT_DIF'])
        return X   

    
class Xfrmer_rto_calc1(BaseEstimator, TransformerMixin):
    """
        this transformer calculates the ratio between two columns
        the i/p is a list of tuples
        the first item in the tuple is divided from the second item
        the third item in the tuple is the name of this new column
    """
    # constructor
    def __init__(self):
        #we are not going to use this 
        self.rto_columns = None
        
    #Return self 
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):        
        print('ratio caclulator')
        #iter1.X.loc[:,'AMT_PMT_RTO'] = (X[X.columns [0]] / X[X.columns [1]]).clip(lower=0)
        X['AMT_PMT_RTO'] = (X[X.columns [0]] / X[X.columns [1]]).clip(lower=0)
    
        return X

这就是我使用管道的方式

lst_all_cols = dtprcs.X_train.columns.values.tolist()
lst_signchng_cols = ["DAYS_INSTALMENT","DAYS_ENTRY_PAYMENT"]            
lst_imptr_cols=['DAYS_ENTRY_PAYMENT','AMT_PAYMENT']
lst_diff_cols = ['AMT_PAYMENT',"AMT_INSTALMENT"]            
lst_rto_cols = ['AMT_PAYMENT',"AMT_INSTALMENT"] 
print('Starting pipeline processing')        
#"""

instpmt_preprcs_pipln = ColumnTransformer( transformers = [
                                        ( 'instpmt_repl_pipln', Xfrmer_replace1(),lst_all_cols ),
                                        ( 'instpmt_sgnchng_pipln', Xfrmer_signchng1(),lst_signchng_cols ),
                                        ( 'instpmt_imptr_piplin',SimpleImputer(strategy = 'median'),lst_imptr_cols ),
                                        ('instpmt_dif_pipln',Xfrmer_dif_calc1(), lst_diff_cols),
                                        ('instpmt_rto_pipln',Xfrmer_rto_calc1(),lst_rto_cols)],
                                        remainder='passthrough')
print('Pipeline fitting start...')
instpmt_preprcs_pipln.fit( dtprcs.X_train, dtprcs.y_train )
print('Pipeline fitting over...')
#print(dtprcs.X_train.shape,dtprcs.x_test.shape)
#print(dtprcs.X_train.columns,dtprcs.x_test.columns)
#Can predict with it like any other pipeline
print('Pipeline transforming x_test...')

y_pred = instpmt_partial_piplin.transform( dtprcs.x_test ) 
print('Pipeline transforming x_test over...')
print(type(dtprcs.X_train),type(dtprcs.x_test),type(dtprcs.y_train))
print(dtprcs.X_train.columns,dtprcs.x_test.columns)
print('Pipeline preprocessing pver. Seting up other classes...')

我的问题

如何在列转换器中向数据框添加新列？我尝试使用 .loc 和不使用 .loc。从下面的跟踪中我们发现该值实际上正在计算但没有更新到数据框中
调试值在 fit() 期间打印，但在测试数据集转换期间不打印。

最新堆栈跟踪

Finished reading apln train/test files...
installments_payments.csv
primary name train installments_payments_train.csv
primary name test installments_payments_test.csv
Train test files ready...
finished writing train/test files.
Exiting function(0).
(16915, 8)
(4574, 8)
Processing installments_payments.csv...
Starting pipeline processing
Pipeline fitting start...
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
sign change
sign change
diff caclulator
X columns Index(['AMT_PAYMENT', 'AMT_INSTALMENT'], dtype='object')
0         6948.360
2         6948.360
3         1716.525
4         1716.525
5         3375.000
           ...    
42390    12303.000
42391    10299.960
42392    10869.435
42402      124.155
42409     4198.950
Name: AMT_PAYMENT, Length: 16915, dtype: float64
0         6948.360
2         6948.360
3         1716.525
4         1716.525
5         3375.000
           ...    
42390    12303.000
42391    10299.960
42392    14958.135
42402      124.155
42409     4198.950
Name: AMT_INSTALMENT, Length: 16915, dtype: float64
0           0.0
2           0.0
3           0.0
4           0.0
5           0.0
          ...  
42390       0.0
42391       0.0
42392   -4088.7
42402       0.0
42409       0.0
Name: AMT_PMT_DIF, Length: 16915, dtype: float64
ratio caclulator
Pipeline fitting over...
Pipeline transforming x_test...
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
sign change
sign change
diff caclulator
ratio caclulator

**Pipeline transforming x_test over...**
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
       'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
       'AMT_INSTALMENT', 'AMT_PAYMENT'],
      dtype='object') Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
       'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
       'AMT_INSTALMENT', 'AMT_PAYMENT'],
      dtype='object')
Pipeline preprocessing pver. Seting up other classes...
Exiting main function...
E:\anaconda\envs\appliedaicourse\lib\site-packages\ipykernel_launcher.py:187: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
E:\anaconda\envs\appliedaicourse\lib\site-packages\pandas\core\indexing.py:362: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
E:\anaconda\envs\appliedaicourse\lib\site-packages\pandas\core\indexing.py:562: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value

Answer 1

就像我在评论中所说的，我首先使用以下方法提取我需要学习的特征（.fit）：

from sklearn.base import TransformerMixin

class FeatureExtractor(TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
        print(self.cols)
    
    def fit(self, X, y=None):
        # stateless transformer
        return self
    
    def transform(self, X):
        # assumes X is Pandas Dataframe
        X_cols = X.loc[:, self.cols]
        return X_cols

然后使用此 class 从数据中的一列中学习：

class SynopsisNumWords(TransformerMixin):
    def __init__(self):
        return None
        # self.text_array = text_array
    
    def fit(self,  X, y=None, **fit_params):
        return self
    
    def transform(self, X, y=None, **fit_params):
        X = X.copy()
        # # rename the series to not have the same column name as input
        return X.loc[:,'Synopsis'].apply(lambda x: len(str(x).split())).rename('Synopsis_num_words').to_frame()

然后合并所有特征以使用此方法制作单个数据框：

class DFFeatureUnion(TransformerMixin):
    # FeatureUnion but for pandas DataFrames

    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def fit(self, X, y=None):
        for (name, t) in self.transformer_list:
            t.fit(X)
        return self

    def transform(self, X):
        # X must be a DataFrame
        Xts = [t.transform(X) for _, t in self.transformer_list]
        Xunion = reduce(lambda X1, X2: pd.merge(X1, X2, left_index=True, right_index=True), Xts)
        return Xunion

然后将所有这些合并起来并制作如下所示的管道。该管道采用 9 列的数据帧，从一列中学习，从中生成另一列，然后将所有这些合并 return 具有 10 列的数据帧。

from sklearn.pipeline import Pipeline
synopsis_feat_gen_pipeline = Pipeline(steps=[('engineer_data',
                                        DFFeatureUnion([
                                                     ('extract_all_columns',
                                                      Pipeline(steps=[
                                                                      ('extract_all_features',
                                                                       FeatureExtractor(['Synopsis', 'Title', 'Author', 'Edition',
                                                                                         'Reviews', 'Ratings', 'Genre', 'BookCategory', 'Price'])
                                                                       )
                                                                      ], verbose=True
                                                               )
                                                     ),
                                                     ('generate_num_words_column',
                                                      Pipeline(steps=[
                                                                      ('extract_Synopsis_feature', FeatureExtractor(['Synopsis'])),
                                                                      ('generate_num_words', SynopsisNumWords())
                                                                      ], verbose=True
                                                               )
                                                      ),
                                                     ]))
                                     ],
                              verbose=True)

在自定义转换器中使用 .loc 会产生带有切片错误的副本

Using .loc inside custom transformer produces copy with slice error

python

pipeline

scikit-learn