# pylint: disable=C0321,C0103,E1221,C0301,E1305,E1121,C0302,C0330
# -*- coding: utf-8 -*-
"""
https://github.com/mljar/mljar-supervised

    python test_automl.py train   > zlog/log_titanic_train.txt 2>&1
    python test_automl.py predict > zlog/log_titanic_predict.txt 2>&1


Missing packages reported by pip for mljar-supervised 0.8.8:
mljar-supervised 0.8.8 requires dtreeviz==1.0, which is not installed.
mljar-supervised 0.8.8 requires fastparquet==0.4.1, which is not installed.
mljar-supervised 0.8.8 requires wordcloud==1.7.0, which is not installed.

Version conflicts reported by pip:
mljar-supervised 0.8.8 requires catboost==0.24.1, but you'll have catboost 0.22 which is incompatible.
mljar-supervised 0.8.8 requires category-encoders==2.2.2, but you'll have category-encoders 2.1.0 which is incompatible.
mljar-supervised 0.8.8 requires lightgbm==3.0.0, but you'll have lightgbm 2.3.0 which is incompatible.
mljar-supervised 0.8.8 requires numpy>=1.18.5, but you'll have numpy 1.18.1 which is incompatible.
mljar-supervised 0.8.8 requires pandas==1.1.2, but you'll have pandas 0.25.3 which is incompatible.
mljar-supervised 0.8.8 requires pyarrow==0.17.0, but you'll have pyarrow 2.0.0 which is incompatible.
mljar-supervised 0.8.8 requires scipy==1.4.1, but you'll have scipy 1.3.1 which is incompatible.
mljar-supervised 0.8.8 requires seaborn==0.10.1, but you'll have seaborn 0.10.0 which is incompatible.
mljar-supervised 0.8.8 requires shap==0.36.0, but you'll have shap 0.35.0 which is incompatible.
mljar-supervised 0.8.8 requires tabulate==0.8.7, but you'll have tabulate 0.8.6 which is incompatible.
mljar-supervised 0.8.8 requires xgboost==1.2.0, but you'll have xgboost 1.3.3 which is incompatible.

conda install -c conda-forge fastparquet
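
### One possible way to also satisfy the missing pins above (versions taken
### from the pip report; adjust to your environment):
pip install dtreeviz==1.0 fastparquet==0.4.1 wordcloud==1.7.0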

"""
import warnings, copy, os, sys
warnings.filterwarnings('ignore')

####################################################################################
###### Path ########################################################################
root_repo     = os.path.abspath(os.getcwd()).replace("\\", "/") + "/" ; print(root_repo)
THIS_FILEPATH = os.path.abspath(__file__)

sys.path.append(root_repo)
from source.util_feature import save, os_get_function_name


def global_pars_update(model_dict, data_name, config_name):
    print("config_name", config_name)
    dir_data = root_repo + "/data/" ; print("dir_data", dir_data)

    m = {}
    m['config_path'] = THIS_FILEPATH
    m['config_name'] = config_name

    #### preprocess input path
    m['path_data_preprocess'] = dir_data + f'/input/{data_name}/train/'

    #### train input path
    dir_data_url         = "https://github.com/arita37/dsa2_data/tree/main/"   #### Remote data directory
    m['path_data_train'] = dir_data_url + f'/input/{data_name}/train/'
    m['path_data_test']  = dir_data_url + f'/input/{data_name}/test/'
    # m['path_data_val'] = dir_data + f'/input/{data_name}/test/'

    #### train output path
    m['path_train_output']   = dir_data + f'/output/{data_name}/{config_name}/'
    m['path_train_model']    = dir_data + f'/output/{data_name}/{config_name}/model/'
    m['path_features_store'] = dir_data + f'/output/{data_name}/{config_name}/features_store/'
    m['path_pipeline']       = dir_data + f'/output/{data_name}/{config_name}/pipeline/'

    #### predict input path
    m['path_pred_data']     = dir_data + f'/input/{data_name}/test/'
    m['path_pred_pipeline'] = dir_data + f'/output/{data_name}/{config_name}/pipeline/'
    m['path_pred_model']    = dir_data + f'/output/{data_name}/{config_name}/model/'

    #### predict output path
    m['path_pred_output'] = dir_data + f'/output/{data_name}/pred_{config_name}/'

    ##### Generic
    m['n_sample'] = model_dict['data_pars'].get('n_sample', 5000)

    model_dict['global_pars'] = m
    return model_dict
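
# Illustration of the keys produced by global_pars_update (a sketch, not executed;
# actual values depend on root_repo at runtime, here assuming data_name='titanic',
# config_name='config1'):
"""
mdict = global_pars_update(model_dict, data_name='titanic', config_name='config1')
mdict['global_pars']['path_train_model']    # -> <root_repo>/data/output/titanic/config1/model/
mdict['global_pars']['path_pred_output']    # -> <root_repo>/data/output/titanic/pred_config1/
"""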



####################################################################################
##### Params #######################################################################
config_default = 'config1'   ### name of the function below that contains the data configuration


# data_name = "titanic"      ### in data/input/
cols_input_type_1 = {
     "coly"     : "Survived"
    ,"colid"    : "PassengerId"
    ,"colcat"   : ["Sex", "Embarked"]
    ,"colnum"   : ["Pclass", "Age", "SibSp", "Parch", "Fare"]
    ,"coltext"  : []
    ,"coldate"  : []
    ,"colcross" : ["Name", "Sex", "Ticket", "Embarked", "Pclass", "Age", "SibSp"]
}
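
# How these families feed the pipeline below (informal summary):
#   coly   -> target column, processed by pd_coly
#   colnum -> numerical inputs, binned by pd_colnum_bin
#   colcat -> categorical inputs, integer-encoded by pd_colcat_bin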


####################################################################################
def config1():
    """
       One single dict containing all the information needed
       for the Titanic classification task.
    """
    data_name   = "titanic"    ### in data/input/
    model_class = 'AutoML'     ### ACTUAL class name in model_sklearn.py
    n_sample    = 1000

    def post_process_fun(y):   ### After prediction is done
        return int(y)

    def pre_process_fun(y):    ### Before the prediction is done
        return int(y)


    model_dict = {'model_pars': {
        ### AutoML model ##############################################
        'model_class': model_class
        ,'model_pars': {
            'total_time_limit' : 20,
            'algorithms'       : 'auto',
            'results_path'     : root_repo + f'/data/output/{data_name}/{os_get_function_name()}/automl_1',
            'eval_metric'      : 'auto'

            # mode='Explain',
            # ml_task='auto', model_time_limit=None, algorithms='auto', train_ensemble=True,
            # stack_models='auto', eval_metric='auto', validation_strategy='auto', explain_level='auto',
            # golden_features='auto', features_selection='auto', start_random_models='auto',
            # hill_climbing_steps='auto', top_models_to_improve='auto', verbose=1, random_state=1234)
        }

        , 'post_process_fun' : post_process_fun   ### After prediction #################################
        , 'pre_process_pars' : {'y_norm_fun' : pre_process_fun,   ### Before training ##################

            ### Pipeline for data processing ##############################
            'pipe_list': [
                #### coly target processing
                {'uri': 'source/prepro.py::pd_coly',                'pars': {}, 'cols_family': 'coly',       'cols_out': 'coly',          'type': 'coly' },

                {'uri': 'source/prepro.py::pd_colnum_bin',          'pars': {}, 'cols_family': 'colnum',     'cols_out': 'colnum_bin',    'type': ''     },
                {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': ''     },

                #### colcat into integer, colcat into one-hot
                {'uri': 'source/prepro.py::pd_colcat_bin',          'pars': {}, 'cols_family': 'colcat',     'cols_out': 'colcat_bin',    'type': ''     },
                # {'uri': 'source/prepro.py::pd_colcat_to_onehot',  'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': ''     },

                ### Cross features: feat1 X feat2
                # {'uri': 'source/prepro.py::pd_colcross',          'pars': {}, 'cols_family': 'colcross',   'cols_out': 'colcross_pair', 'type': 'cross'},

                #### Example of a custom processor
                # {'uri': THIS_FILEPATH + '::pd_col_myfun',         'pars': {}, 'cols_family': 'colnum',     'cols_out': 'col_myfun',     'type': ''     },

            ],
        }
    },

    'compute_pars': { 'metric_list': ['accuracy_score', 'average_precision_score']

                     ,'mlflow_pars': None   # {}  ### Not empty --> use mlflow
    },

    'data_pars': { 'n_sample': n_sample,

        'download_pars'  : None,

        'cols_input_type': cols_input_type_1,
        ### Families of columns for the MODEL #####################################################
        #  "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",    #### colnum columns
        #  "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",   #### colcat columns
        #  'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair',  #### colcross columns, 'coldate', 'coltext'
        'cols_model_group': [ 'colnum_bin',
                              'colcat_bin',
                              # 'coltext',
                              # 'coldate',
                              # 'colcross_pair',

                              ### example of custom
                              # 'col_myfun'
                            ]

        ### Filter data rows ######################################################################
        ,'filter_pars': { 'ymax': 2, 'ymin': -1 }

        }
    }

    ##### Fill global parameters ################################################################
    model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name())
    return model_dict
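
# For reference, a minimal sketch of the mljar-supervised AutoML call that the
# 'model_pars' above ultimately configure (assumes X, y are the prepared
# features/target; illustrative only, not executed here):
"""
from supervised.automl import AutoML

automl = AutoML(total_time_limit = 20,       # same budget as 'total_time_limit' above
                algorithms       = 'auto',
                eval_metric      = 'auto',
                results_path     = 'data/output/titanic/config1/automl_1')
automl.fit(X, y)                             # search models within the time budget
ypred = automl.predict(X)                    # raw predictions; post_process_fun applies after
"""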



def pd_col_myfun(df=None, col=None, pars={}):
    """
       Example of a custom processor.
       Returns (df_new, col_pars), like the processors in source/prepro.py.
    """
    from source.util_feature import save, load
    prefix = 'col_myfun'
    if 'path_pipeline' in pars:   #### Inference time: LOAD previous pars
        prepro = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars   = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
        pars   = {} if pars is None else pars

    #### Do something #################################################################
    df_new         = df[col]   ### Do nothing: identity transform, for illustration only
    df_new.columns = [t + "_myfun" for t in df_new.columns]
    cols_new       = list(df_new.columns)

    prepro   = None
    pars_new = None

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save(prepro,   pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(cols_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix, 'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        'col_myfun': cols_new   ### list of new columns
    }
    return df_new, col_pars
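
# A minimal usage sketch of the custom-processor contract (hypothetical data,
# runnable standalone):
"""
import pandas as pd
df  = pd.DataFrame({'Age': [22, 38], 'Fare': [7.25, 71.28]})
df2, col_pars = pd_col_myfun(df, col=['Age', 'Fare'], pars={})
print(list(df2.columns))        # ['Age_myfun', 'Fare_myfun']
print(col_pars['cols_new'])     # {'col_myfun': ['Age_myfun', 'Fare_myfun']}
"""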




#####################################################################################
########## Profile data #############################################################
from core_run import data_profile
# def data_profile(path_data="", path_output="", n_sample=5000):
"""
def data_profile(path_data="", path_output="", n_sample=5000):
    from source.run_feature_profile import run_profile
    run_profile(path_data   = path_data,
                path_output = path_output + "/profile/",
                n_sample    = n_sample,
               )
"""



###################################################################################
########## Preprocess #############################################################
### def preprocess(config='', nsample=1000):
from core_run import preprocess

"""
def preprocess(config=None, nsample=None):
    config_name = config if config is not None else config_default
    mdict       = globals()[config_name]()
    m           = mdict['global_pars']
    print(mdict)

    from source import run_preprocess
    run_preprocess.run_preprocess(config_name = config_name,
                                  config_path = m['config_path'],
                                  n_sample    = nsample if nsample is not None else m['n_sample'],

                                  ### Optional
                                  mode        = 'run_preprocess')
"""



##################################################################################
########## Train #################################################################
from core_run import train
"""
def train(config=None, nsample=None):

    config_name = config if config is not None else config_default
    mdict       = globals()[config_name]()
    m           = mdict['global_pars']
    print(mdict)

    from source import run_train
    run_train.run_train(config_name = config_name,
                        config_path = m['config_path'],
                        n_sample    = nsample if nsample is not None else m['n_sample']
                       )
"""



###################################################################################
######### Check data ##############################################################
def check():
    pass
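
# A minimal sketch of what check() could verify (hypothetical: assumes the train
# data was saved as 'features.parquet' under path_data_preprocess, which is not
# guaranteed by this config):
"""
import pandas as pd
m  = config1()['global_pars']
df = pd.read_parquet(m['path_data_preprocess'] + '/features.parquet')
assert cols_input_type_1['coly'] in df.columns   # target column present
"""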




####################################################################################
####### Inference ##################################################################
# predict(config='', nsample=10000)
from core_run import predict

"""
def predict(config=None, nsample=None):
    config_name = config if config is not None else config_default
    mdict       = globals()[config_name]()
    m           = mdict['global_pars']

    from source import run_inference
    run_inference.run_predict(config_name = config_name,
                              config_path = m['config_path'],
                              n_sample    = nsample if nsample is not None else m['n_sample'],

                              #### Optional
                              path_data   = m['path_pred_data'],
                              path_output = m['path_pred_output'],
                              model_dict  = None
                             )
"""


###########################################################################################################
###########################################################################################################
"""
python test_automl.py data_profile
python test_automl.py preprocess  --nsample 100
python test_automl.py train       --nsample 200
python test_automl.py check
python test_automl.py predict


"""
if __name__ == "__main__":
    d = {'data_profile': data_profile, 'preprocess': preprocess, 'train': train,
         'check': check, 'predict': predict, 'config': config_default}
    import fire
    fire.Fire(d)