代码之家  ›  专栏  ›  技术社区  ›  KubiK888

在Python中尝试用ColumnTransformer拟合测试集特征数据时出现AttributeError

  •  0
  • KubiK888  · 技术社区  · 6 年前

    我参照这篇 tutorial 中类似的编码思路,在自己的项目里使用ColumnTransformer一步完成分类变量和数值变量的转换。但我卡在了 X_test = colT.fit(X_test) 这一步,不知道预期的输出应该是什么。

    这是我的代码,问题出在 def standardize_values 函数中。

    import pandas as pd
    import numpy as np
    import ctypes
    import re
    import pickle
    from scipy import stats
    from sklearn.model_selection import train_test_split
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import Normalizer, OneHotEncoder
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn import metrics
    import helper_functions.helper_functions as hf
    import data_preparation as data_prep
    
    # Main class
    ######################################################################
    class Machine_Learning_ProjectX(data_prep.DataPreparation_ProjectX):
        def __init__(self):
            """Create empty data holders and the column-name lists used by the pipeline.

            No data is loaded here; the ``import_*`` methods fill the pickle and
            DataFrame attributes later. The parent initialiser is not called from
            this method.
            """
            # Filled by the import_pickle_descriptive_stats_* methods
            self.pickle_descriptive_stats_demographic = None
            self.pickle_descriptive_stats_clinical = None
            self.pickle_descriptive_stats_rx = None
            self.pickle_descriptive_stats_csu = None
            # Filled by the import_df_* methods and merge_dfs
            self.df_demographic = None
            self.df_clinical = None
            self.df_rx = None
            self.df_csu = None
            self.df_master = None
            # Categorical columns (one-hot encoded in standardize_values)
            self.varname_cat_all = ['INDEX_RURAL_CAT', 'INDEX_SEX', 'AIDS_TAG', 'CHF_TAG', 'CKD_TAG', 'CLD_MILD_TAG', 'CLD_SEVERE_TAG',
                                'COPD_TAG', 'CTD_TAG', 'CVA_TAG', 'DM_MILD_TAG', 'DM_SEVERE_TAG', 'METS_TAG', 'MI_TAG', 'PUD_TAG',
                                'PVD_TAG', 'DEMENTIA_TAG', 'HEMIPLEGIA_TAG', 'TUMOR_TAG', 'INDEX_DIN_CAT']
            # Every numeric column known to the project
            self.varname_num_all = ['INDEX_AGE', 'CCI_SCORE', 'PREINDEX1YR_N_DRUGX_FG_MPR', 'PREINDEX1YR_N_DRUGX_SG_MPR', 'PREINDEX1YR_N_DRUGY_TYPICAL_MPR',
                                'PREINDEX1YR_N_DRUGY_ATYPICAL_MPR', 'POSTINDEX1YR_N_DRUGX_FG_MPR', 'POSTINDEX1YR_N_DRUGX_SG_MPR',
                                'POSTINDEX1YR_N_DRUGY_TYPICAL_MPR', 'POSTINDEX1YR_N_DRUGY_ATYPICAL_MPR',
                                'SUMMED_ALLCAUSE_NUM_PRE2YR', 'SUMMED_ALLCAUSE_NUM_POST2YR', 'SUMMED_ALLCAUSE_COST_PRE2YR',
                                'SUMMED_ALLCAUSE_COST_POST2YR', 'SUMMED_DXTARGET_NUM_PRE2YR', 'SUMMED_DXTARGET_NUM_POST2YR',
                                'SUMMED_DXTARGET_COST_PRE2YR', 'SUMMED_DXTARGET_COST_POST2YR', 'DAD_ALLCAUSE_NUM_PRE2YR',
                                'DAD_ALLCAUSE_NUM_POST2YR', 'DAD_ALLCAUSE_COST_PRE2YR', 'DAD_ALLCAUSE_COST_POST2YR',
                                'DAD_DXTARGET_NUM_PRE2YR', 'DAD_DXTARGET_NUM_POST2YR', 'DAD_DXTARGET_COST_PRE2YR',
                                'DAD_DXTARGET_COST_POST2YR', 'PC_ALLCAUSE_NUM_PRE2YR', 'PC_ALLCAUSE_NUM_POST2YR',
                                'PC_ALLCAUSE_COST_PRE2YR', 'PC_ALLCAUSE_COST_POST2YR', 'PC_DXTARGET_NUM_PRE2YR',
                                'PC_DXTARGET_NUM_POST2YR', 'PC_DXTARGET_COST_PRE2YR', 'PC_DXTARGET_COST_POST2YR',
                                'NACRS_ALLCAUSE_NUM_PRE2YR', 'NACRS_ALLCAUSE_NUM_POST2YR', 'NACRS_ALLCAUSE_COST_PRE2YR',
                                'NACRS_ALLCAUSE_COST_POST2YR', 'NACRS_DXTARGET_NUM_PRE2YR', 'NACRS_DXTARGET_NUM_POST2YR',
                                'NACRS_DXTARGET_COST_PRE2YR', 'NACRS_DXTARGET_COST_POST2YR']
            # Numeric columns excluded from the feature set
            self.varname_num_unused = ['POSTINDEX1YR_N_DRUGX_FG_MPR', 'POSTINDEX1YR_N_DRUGX_SG_MPR', 'POSTINDEX1YR_N_DRUGY_TYPICAL_MPR',
                                'POSTINDEX1YR_N_DRUGY_ATYPICAL_MPR', 'SUMMED_ALLCAUSE_NUM_POST2YR', 'SUMMED_ALLCAUSE_COST_POST2YR',
                                'SUMMED_DXTARGET_NUM_POST2YR', 'SUMMED_DXTARGET_COST_POST2YR', 'DAD_ALLCAUSE_NUM_POST2YR',
                                'DAD_ALLCAUSE_COST_POST2YR', 'DAD_DXTARGET_NUM_POST2YR', 'DAD_DXTARGET_COST_POST2YR', 'PC_ALLCAUSE_NUM_POST2YR',
                                'PC_ALLCAUSE_COST_POST2YR', 'PC_DXTARGET_NUM_POST2YR',  'PC_DXTARGET_COST_POST2YR', 'NACRS_ALLCAUSE_NUM_POST2YR',
                                'NACRS_ALLCAUSE_COST_POST2YR', 'NACRS_DXTARGET_NUM_POST2YR', 'NACRS_DXTARGET_COST_POST2YR']
            # Identifier columns (dropped from the features after the split)
            self.varname_id = ['PHN_ENC', 'INDEX_DATE']
            varname_label = ['SUMMED_ALLCAUSE_NUM_POST2YR', 'SUMMED_DXTARGET_NUM_POST2YR', 'SUMMED_ALLCAUSE_COST_POST2YR',
                                'SUMMED_DXTARGET_COST_POST2YR', ]
            # Only the first candidate label is used as the regression target
            self.y_label = varname_label[0]
            # Keep declaration order instead of round-tripping through a set:
            # set iteration order depends on string hashing, so the column order
            # of df_master would otherwise change from run to run.
            excluded_or_seen = set(self.varname_num_unused)
            ordered_names = []
            for name in self.varname_id + self.varname_cat_all + self.varname_num_all:
                if name not in excluded_or_seen:
                    excluded_or_seen.add(name)
                    ordered_names.append(name)
            self.varname_import = ordered_names + [self.y_label]
            self.result_dict_ml = {}
    
        def ml_steps(self):
            """Run the full workflow: references, inputs, merge, split, cleaning, scaling, model fit."""
            self.import_references()

            # (loader, file name) pairs; every input is read from self.result_dir.
            input_files = (
                (self.import_pickle_descriptive_stats_demographic, 'JAHIP_V2_SubjectGroup_DescriptiveStats_Demographic.pickle'),
                (self.import_pickle_descriptive_stats_clinical, 'JAHIP_V2_SubjectGroup_DescriptiveStats_Clinical.pickle'),
                (self.import_pickle_descriptive_stats_rx, 'JAHIP_V2_SubjectGroup_DescriptiveStats_Rx.pickle'),
                (self.import_pickle_descriptive_stats_csu, 'JAHIP_V2_SubjectGroup_DescriptiveStats_CSU.pickle'),
                (self.import_df_demographic, 'JAHIP_V2_SubjectGroup_DF_Demographic_SubjectLevel.csv'),
                (self.import_df_clinical, 'JAHIP_V2_SubjectGroup_DF_Clinical_SubjectLevel.csv'),
                (self.import_df_rx, 'JAHIP_V2_SubjectGroup_DF_Rx_SubjectLevel.csv'),
                (self.import_df_csu, 'JAHIP_V2_SubjectGroup_DF_CSU_SubjectLevel.csv'),
            )
            for loader, filename in input_files:
                loader(on_switch=True, import_dir=self.result_dir, import_filename=filename)

            # Processing stages in execution order; a False flag makes the
            # on_or_off wrapper skip that stage.
            stages = (
                (self.merge_dfs, True),
                (self.split_into_training_and_test_sets, True),
                (self.generate_new_features, False),
                (self.handle_missing_values, True),
                (self.standardize_values, True),
                (self.ml_pipeline, True),
            )
            for stage, enabled in stages:
                stage(on_switch=enabled)
    
        def import_references(self):
            """Call the parent class's initialiser and then its setup methods, in a fixed order."""
            parent = super()
            parent.__init__()
            parent._pandas_output_setting()
            parent.dir_name()
            parent.file_name()
            parent.constant_var()
            parent.import_ref_data()
    
        # Decorators
        def on_or_off(func):
            """Decorate a pipeline step so it only runs when ``on_switch`` is truthy.

            The returned wrapper takes ``on_switch`` as a keyword-only argument
            (default ``False``) and forwards it, together with all other
            arguments, to ``func``.

            Returns:
                A wrapper that returns ``func``'s result when the step runs and
                ``None`` when it is switched off.
            """
            import functools

            # wraps() keeps the step's name and docstring visible on the wrapper
            @functools.wraps(func)
            def wrapper(self, *args, on_switch=False, **kwargs):
                if not on_switch:
                    return None
                # Propagate the result instead of silently discarding it
                return func(self, *args, on_switch=on_switch, **kwargs)
            return wrapper
    
        # Core class functions
        @on_or_off
        def import_pickle_descriptive_stats_demographic(self, on_switch, import_dir=None, import_filename=None):
            """Unpickle ``import_dir + import_filename`` into ``self.pickle_descriptive_stats_demographic``."""
            pickle_path = import_dir + import_filename
            with open(pickle_path, 'rb') as stream:
                self.pickle_descriptive_stats_demographic = pickle.load(stream)
        @on_or_off
        def import_pickle_descriptive_stats_clinical(self, on_switch, import_dir=None, import_filename=None):
            """Unpickle ``import_dir + import_filename`` into ``self.pickle_descriptive_stats_clinical``."""
            pickle_path = import_dir + import_filename
            with open(pickle_path, 'rb') as stream:
                self.pickle_descriptive_stats_clinical = pickle.load(stream)
        @on_or_off
        def import_pickle_descriptive_stats_rx(self, on_switch, import_dir=None, import_filename=None):
            """Unpickle ``import_dir + import_filename`` into ``self.pickle_descriptive_stats_rx``."""
            pickle_path = import_dir + import_filename
            with open(pickle_path, 'rb') as stream:
                self.pickle_descriptive_stats_rx = pickle.load(stream)
        @on_or_off
        def import_pickle_descriptive_stats_csu(self, on_switch, import_dir=None, import_filename=None):
            """Unpickle ``import_dir + import_filename`` into ``self.pickle_descriptive_stats_csu``."""
            pickle_path = import_dir + import_filename
            with open(pickle_path, 'rb') as stream:
                self.pickle_descriptive_stats_csu = pickle.load(stream)
    
        @on_or_off
        def import_df_demographic(self, on_switch, import_dir=None, import_filename=None):
            """Read the CSV at ``import_dir + import_filename`` into ``self.df_demographic``, with PHN_ENC kept as text."""
            csv_path = import_dir + import_filename
            column_types = {'PHN_ENC': 'str'}
            self.df_demographic = pd.read_csv(csv_path, dtype=column_types)
        @on_or_off
        def import_df_clinical(self, on_switch, import_dir=None, import_filename=None):
            """Read the CSV at ``import_dir + import_filename`` into ``self.df_clinical``, with PHN_ENC kept as text."""
            csv_path = import_dir + import_filename
            column_types = {'PHN_ENC': 'str'}
            self.df_clinical = pd.read_csv(csv_path, dtype=column_types)
        @on_or_off
        def import_df_rx(self, on_switch, import_dir=None, import_filename=None):
            """Read the CSV at ``import_dir + import_filename`` into ``self.df_rx``, with PHN_ENC kept as text."""
            csv_path = import_dir + import_filename
            column_types = {'PHN_ENC': 'str'}
            self.df_rx = pd.read_csv(csv_path, dtype=column_types)
        @on_or_off
        def import_df_csu(self, on_switch, import_dir=None, import_filename=None):
            """Read the CSV at ``import_dir + import_filename`` into ``self.df_csu``, with PHN_ENC kept as text."""
            csv_path = import_dir + import_filename
            column_types = {'PHN_ENC': 'str'}
            self.df_csu = pd.read_csv(csv_path, dtype=column_types)
    
        @on_or_off
        def merge_dfs(self, on_switch):
            """Outer-join the four input frames on PHN_ENC and reduce the result to ``self.varname_import``.

            Raises:
                AssertionError: if the merged frame has more rows than distinct PHN_ENC values.
            """
            # Start from a copy of the demographic frame and fold the others in
            self.df_master = self.df_demographic.copy()
            for other in (self.df_clinical, self.df_rx, self.df_csu):
                self.df_master = self.df_master.merge(other, on='PHN_ENC', how='outer')
            n_rows = len(self.df_master)
            n_subjects = self.df_master['PHN_ENC'].nunique()
            assert (n_rows==n_subjects), 'Error: Same subject appears on multiple rows.'
            # Drop every column whose name contains '_y', strip '_x' from the
            # remaining names, then keep the first of any repeated name
            has_y = self.df_master.columns.str.contains('_y', case=True)
            self.df_master = self.df_master.loc[:, ~has_y]
            self.df_master.columns = self.df_master.columns.str.replace('_x', '')
            repeated = self.df_master.columns.duplicated()
            self.df_master = self.df_master.loc[:, ~repeated]
            # Discard 'Unnamed...' columns and the 'temp' column
            unnamed = self.df_master.columns.str.contains('^Unnamed')
            self.df_master = self.df_master.loc[:, ~unnamed]
            self.df_master = self.df_master.drop(['temp'], axis=1)
            # Final column selection
            self.df_master = self.df_master[self.varname_import]
    
        @on_or_off
        def split_into_training_and_test_sets(self, on_switch):
            """Split ``self.df_master`` 70/30 into train/test features and labels, then drop the ID columns.

            The split is seeded (random_state=888) so it is repeatable.
            """
            target = self.df_master[self.y_label]
            parts = train_test_split(self.df_master, target, test_size=0.3, random_state=888)
            self.X_train, self.X_test, self.y_train, self.y_test = parts
            # NOTE(review): only the identifiers are removed, so the feature
            # frames still contain the y_label column at this point.
            id_columns = ['PHN_ENC', 'INDEX_DATE']
            self.X_train = self.X_train.drop(id_columns, axis=1)
            self.X_test = self.X_test.drop(id_columns, axis=1)
    
        @on_or_off
        def generate_new_features(self, on_switch):
            """Placeholder for feature engineering; it currently does nothing."""
            return None
    
        @on_or_off
        def handle_missing_values(self, on_switch):
            """Fill missing feature values with each column's most frequent value; fill missing labels with 0.

            The fill value of every column is taken from the training features
            only and then applied to both the training and the test features, so
            both sets are imputed identically and nothing is learned from the
            test set. A column with no observed value in the training set has no
            mode and is left unchanged (indexing the empty ``value_counts()``
            result would otherwise raise IndexError).
            """
            fill_values = {}
            for column in self.X_train.columns:
                counts = self.X_train[column].value_counts()
                # value_counts() is sorted by frequency, so the first index entry is the mode
                if not counts.empty:
                    fill_values[column] = counts.index[0]
            self.X_train = self.X_train.fillna(value=fill_values)
            self.X_test = self.X_test.fillna(value=fill_values)
            self.y_train = self.y_train.fillna(0)
            self.y_test = self.y_test.fillna(0)
    
        @on_or_off
        def standardize_values(self, on_switch):
            """Encode the categorical columns and normalise the numeric columns of the feature sets.

            Builds a ColumnTransformer with two parts:
              * 'DUMMY_COL': a OneHotEncoder with an explicit category list for
                each column of ``self.varname_cat_all`` (one list per column, in
                the same order as that attribute).
              * 'NORM_COL': ``Normalizer(norm='l1')`` applied to the numeric
                columns that are not in ``self.varname_num_unused``.

            ``self.X_train`` is replaced by the transformed training data.
            ``self.X_test`` is replaced by the return value of ``colT.fit`` (see
            the note in the body).
            """
            # The numeric column list is built from a set difference, so its order is not fixed.
            colT = ColumnTransformer(
                [   ('DUMMY_COL', OneHotEncoder(categories=[['URBAN', 'RURAL'],
                                                            ['M', 'F'],
                                                            ['AIDS', 'NON-AIDS'],
                                                            ['CHF', 'NON-CHF'],
                                                            ['CKD', 'NON-CKD'],
                                                            ['CLD_MILD', 'NON-CLD_MILD'],
                                                            ['CLD_SEVERE', 'NON-CLD_SEVERE'],
                                                            ['COPD', 'NON-COPD'],
                                                            ['CTD', 'NON-CTD'],
                                                            ['CVA', 'NON-CVA'],
                                                            ['DM_MILD', 'NON-DM_MILD'],
                                                            ['DM_SEVERE', 'NON-DM_SEVERE'],
                                                            ['METS', "NON-METS"],
                                                            ['MI', 'NON-MI'],
                                                            ['PUD', 'NON-PUD'],
                                                            ['PVD', 'NON-PVD'],
                                                            ['DEMENTIA', 'NON-DEMENTIA'],
                                                            ['HEMIPLEGIA', 'NON-HEMIPLEGIA'],
                                                            ['TUMOR', 'NON-TUMOR'],
                                                            ['XX', 'YY', 'ZZ'],
                                                            ]),
                        self.varname_cat_all),
                    ('NORM_COL', Normalizer(norm='l1'),
                        list(set(self.varname_num_all)-set(self.varname_num_unused)))
                ])
    
            print(self.X_train.shape) # (920, 43)
            print(self.X_test.shape) # (395, 43)
    
            # fit_transform learns the transformation from the training data and returns the transformed data.
            self.X_train = colT.fit_transform(self.X_train)
            # NOTE(review): fit() returns the fitted ColumnTransformer itself, not
            # transformed data. That is why the prints below show the estimator's
            # repr and then fail on .shape. Calling colT.transform(self.X_test)
            # would return the encoded test matrix using what was learned from the
            # training data.
            self.X_test = colT.fit(self.X_test)
    
            print(self.X_train.shape) # (920, 63)
    
            print(self.X_test) # Printing some weird output "ColumnTransformer..."
            print(self.X_test.shape) # AttributeError: 'ColumnTransformer' object has no attribute 'shape'
    
        @on_or_off
        def ml_pipeline(self, on_switch):
            """Fit a LinearRegression on ``self.X_train`` / ``self.y_train``; the fitted model is not stored."""
            # Train the model on the transformed training features
            LinearRegression().fit(self.X_train, self.y_train)
            # Prediction on self.X_test is left out: the author marked that call as not working.
    
    # Main function
    ######################################################################
    def main():
        """Build the project pipeline object and run all of its steps."""
        Machine_Learning_ProjectX().ml_steps()
    
    if __name__ == '__main__':
        main()
    
    # Output below
    (920, 43)
    (395, 43)
    (920, 63)
    ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
             transformer_weights=None,
             transformers=[('DUMMY_COL', OneHotEncoder(categorical_features=None,
           categories=[['URBAN', 'RURAL'], ['M', 'F'], ['AIDS', 'NON-AIDS'], ['CHF', 'NON-CHF'], ['CKD', 'NON-CKD'], ['CLD_MILD', 'NON-CLD_MILD'], ['CLD_SEVERE', 'NON-CLD_SEVERE'], ['COPD', 'NON-COPD'], ['CTD', 'NON-CTD'], ['CVA', 'NON..._DXTARGET_NUM_PRE2YR', 'PREINDEX1YR_N_DRUGY_TYPICAL_MPR', 'INDEX_AGE', 'NACRS_ALLCAUSE_NUM_PRE2YR'])])Traceback (most recent call last):
    ... line 212, in standardize_values
        print(self.X_test.shape)
    AttributeError: 'ColumnTransformer' object has no attribute 'shape'
    
    0 回复  |  直到 6 年前
        1
  •  1
  •   seralouk    6 年前

    教程的作者犯了一个错误。


    self.X_train = colT.fit_transform(self.X_train)
    self.X_test = colT.fit(self.X_test)
    

    在这里, self.X_train 是 .fit_transform 的输出,所以它是一个 numpy 对象。另一方面, self.X_test 是 .fit 方法的输出,也就是拟合后的 ColumnTransformer 对象本身,它没有 .shape 属性!

    您需要:

    self.X_train = colT.fit_transform(self.X_train)
    self.X_test = colT.transform(self.X_test)
    

    P.S.:可以到那篇教程的文末,看看其他人在评论区里说了什么。

    推荐文章