Commit 7154aca

committed
automl
1 parent b7b1bef commit 7154aca

File tree

3 files changed: +354 -0 lines changed

source/models/model_sklearn.py

Lines changed: 5 additions & 0 deletions
@@ -40,6 +40,11 @@
 from lightgbm import LGBMModel, LGBMRegressor, LGBMClassifier


+try:
+    from supervised.automl import *   #### mljar-supervised AutoML
+except ImportError:
+    print('cannot import automl')
+
 try :
     #### All are Un-supervised Model
     from pyod.models.abod import *
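
For reference, a minimal sketch of the mljar-supervised AutoML API that this import exposes (not part of the commit); X_train and y_train are assumed pandas objects and the results_path is illustrative only:

from supervised.automl import AutoML

automl = AutoML(total_time_limit=20,           # time budget in seconds, same as config1() below
                algorithms="auto",
                eval_metric="auto",
                results_path="ztmp/automl_1")  # leaderboard and trained models persisted here
automl.fit(X_train, y_train)                   # runs the AutoML search within the budget
preds = automl.predict(X_train)                # class labels; predict_proba() for probabilities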

test_automl.py

Lines changed: 348 additions & 0 deletions
@@ -0,0 +1,348 @@
# pylint: disable=C0321,C0103,E1221,C0301,E1305,E1121,C0302,C0330
# -*- coding: utf-8 -*-
"""
https://github.com/mljar/mljar-supervised

python test_automl.py train    > zlog/log_titanic_train.txt 2>&1
python test_automl.py predict  > zlog/log_titanic_predict.txt 2>&1


mljar-supervised 0.8.8 requires dtreeviz==1.0, which is not installed.
mljar-supervised 0.8.8 requires fastparquet==0.4.1, which is not installed.
mljar-supervised 0.8.8 requires wordcloud==1.7.0, which is not installed.

mljar-supervised 0.8.8 requires catboost==0.24.1, but you'll have catboost 0.22 which is incompatible.
mljar-supervised 0.8.8 requires category-encoders==2.2.2, but you'll have category-encoders 2.1.0 which is incompatible.
mljar-supervised 0.8.8 requires lightgbm==3.0.0, but you'll have lightgbm 2.3.0 which is incompatible.
mljar-supervised 0.8.8 requires numpy>=1.18.5, but you'll have numpy 1.18.1 which is incompatible.
mljar-supervised 0.8.8 requires pandas==1.1.2, but you'll have pandas 0.25.3 which is incompatible.
mljar-supervised 0.8.8 requires pyarrow==0.17.0, but you'll have pyarrow 2.0.0 which is incompatible.
mljar-supervised 0.8.8 requires scipy==1.4.1, but you'll have scipy 1.3.1 which is incompatible.
mljar-supervised 0.8.8 requires seaborn==0.10.1, but you'll have seaborn 0.10.0 which is incompatible.
mljar-supervised 0.8.8 requires shap==0.36.0, but you'll have shap 0.35.0 which is incompatible.
mljar-supervised 0.8.8 requires tabulate==0.8.7, but you'll have tabulate 0.8.6 which is incompatible.
mljar-supervised 0.8.8 requires xgboost==1.2.0, but you'll have xgboost 1.3.3 which is incompatible.

conda install -c conda-forge fastparquet
"""
import warnings, copy, os, sys
warnings.filterwarnings('ignore')

####################################################################################
###### Path ########################################################################
root_repo = os.path.abspath(os.getcwd()).replace("\\", "/") + "/" ; print(root_repo)
THIS_FILEPATH = os.path.abspath(__file__)

sys.path.append(root_repo)
from source.util_feature import save, os_get_function_name


def global_pars_update(model_dict, data_name, config_name):
    print("config_name", config_name)
    dir_data = root_repo + "/data/" ; print("dir_data", dir_data)

    m = {}
    m['config_path'] = THIS_FILEPATH
    m['config_name'] = config_name

    #### preprocess input path
    m['path_data_preprocess'] = dir_data + f'/input/{data_name}/train/'

    #### train input path
    dir_data_url         = "https://github.com/arita37/dsa2_data/tree/main/"   #### Remote Data directory
    m['path_data_train'] = dir_data_url + f'/input/{data_name}/train/'
    m['path_data_test']  = dir_data_url + f'/input/{data_name}/test/'
    # m['path_data_val'] = dir_data + f'/input/{data_name}/test/'

    #### train output path
    m['path_train_output']   = dir_data + f'/output/{data_name}/{config_name}/'
    m['path_train_model']    = dir_data + f'/output/{data_name}/{config_name}/model/'
    m['path_features_store'] = dir_data + f'/output/{data_name}/{config_name}/features_store/'
    m['path_pipeline']       = dir_data + f'/output/{data_name}/{config_name}/pipeline/'

    #### predict input path
    m['path_pred_data']     = dir_data + f'/input/{data_name}/test/'
    m['path_pred_pipeline'] = dir_data + f'/output/{data_name}/{config_name}/pipeline/'
    m['path_pred_model']    = dir_data + f'/output/{data_name}/{config_name}/model/'

    #### predict output path
    m['path_pred_output'] = dir_data + f'/output/{data_name}/pred_{config_name}/'

    ##### Generic
    m['n_sample'] = model_dict['data_pars'].get('n_sample', 5000)

    model_dict['global_pars'] = m
    return model_dict


####################################################################################
##### Params #######################################################################
config_default = 'config1'   ### name of function which contains data configuration


# data_name = "titanic"      ### in data/input/
cols_input_type_1 = {
     "coly"     : "Survived"
    ,"colid"    : "PassengerId"
    ,"colcat"   : ["Sex", "Embarked"]
    ,"colnum"   : ["Pclass", "Age", "SibSp", "Parch", "Fare"]
    ,"coltext"  : []
    ,"coldate"  : []
    ,"colcross" : ["Name", "Sex", "Ticket", "Embarked", "Pclass", "Age", "SibSp"]
}

####################################################################################
def config1():
    """
    ONE SINGLE DICT contains all needed information,
    used for the titanic classification task.
    """
    data_name   = "titanic"   ### in data/input/
    model_class = 'AutoML'    ### ACTUAL Class name for model_sklearn.py
    n_sample    = 1000

    def post_process_fun(y):   ### After prediction is done
        return int(y)

    def pre_process_fun(y):    ### Before the prediction is done
        return int(y)


    model_dict = {'model_pars': {
        ### mljar AutoML model #######################################
        'model_class': model_class
        ,'model_pars' : {
            'total_time_limit' : 20,
            'algorithms'       : 'auto',
            'results_path'     : root_repo + f'/data/output/{data_name}/{os_get_function_name()}/automl_1',
            'eval_metric'      : 'auto'

            # mode='Explain',
            # ml_task='auto', model_time_limit=None, algorithms='auto', train_ensemble=True,
            # stack_models='auto', eval_metric='auto', validation_strategy='auto', explain_level='auto',
            # golden_features='auto', features_selection='auto', start_random_models='auto',
            # hill_climbing_steps='auto', top_models_to_improve='auto', verbose=1, random_state=1234)
        }

        , 'post_process_fun' : post_process_fun   ### After prediction ###########################
        , 'pre_process_pars' : {'y_norm_fun' : pre_process_fun ,   ### Before training ###########

            ### Pipeline for data processing ##############################
            'pipe_list': [
                #### coly target processing
                {'uri': 'source/prepro.py::pd_coly',                'pars': {}, 'cols_family': 'coly',       'cols_out': 'coly',          'type': 'coly' },

                {'uri': 'source/prepro.py::pd_colnum_bin',          'pars': {}, 'cols_family': 'colnum',     'cols_out': 'colnum_bin',    'type': '' },
                {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': '' },

                #### colcat INTO integer, colcat into OneHot
                {'uri': 'source/prepro.py::pd_colcat_bin',          'pars': {}, 'cols_family': 'colcat',     'cols_out': 'colcat_bin',    'type': '' },
                # {'uri': 'source/prepro.py::pd_colcat_to_onehot',  'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': '' },

                ### Cross_feat = feat1 X feat2
                # {'uri': 'source/prepro.py::pd_colcross',          'pars': {}, 'cols_family': 'colcross',   'cols_out': 'colcross_pair', 'type': 'cross'},

                #### Example of Custom processor
                # {'uri': THIS_FILEPATH + '::pd_col_myfun',         'pars': {}, 'cols_family': 'colnum',     'cols_out': 'col_myfun',     'type': '' },
            ],
        }
    },

    'compute_pars': { 'metric_list': ['accuracy_score', 'average_precision_score']
                     ,'mlflow_pars' : None   # {} ### Not empty --> use mlflow
                    },

    'data_pars': { 'n_sample' : n_sample,

        'download_pars'   : None,

        'cols_input_type' : cols_input_type_1,
        ### family of columns for MODEL #########################################################
        #  "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",                  #### colnum columns
        #  "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",                 #### colcat columns
        #  'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair',  #### colcross columns, 'coldate', 'coltext'
        'cols_model_group': [ 'colnum_bin',
                              'colcat_bin',
                              # 'coltext',
                              # 'coldate',
                              # 'colcross_pair',

                              ### example of custom
                              # 'col_myfun'
                            ]

        ### Filter data rows ##################################################################
        ,'filter_pars': { 'ymax' : 2, 'ymin' : -1 }
    }
    }

    ##### Filling Global parameters ############################################################
    model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name())
    return model_dict


def pd_col_myfun(df=None, col=None, pars={}):
    """
    Example of a custom processor.
    """
    from source.util_feature import save, load
    prefix = 'col_myfun'
    if 'path_pipeline' in pars:   #### Inference time: LOAD previous pars
        prepro = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars   = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
        pars   = {} if pars is None else pars

    #### Do something #################################################################
    df_new         = df[col]   ### Do nothing, pass-through
    df_new.columns = [c + "_myfun" for c in df_new.columns]
    cols_new       = list(df_new.columns)

    prepro   = None
    pars_new = None


    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save(prepro,   pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(cols_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix, 'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        'col_myfun': cols_new   ### list
    }
    return df_new, col_pars


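A quick usage sketch for the custom processor above, with a made-up toy DataFrame (values are illustrative only):

import pandas as pd
df = pd.DataFrame({"Age": [22, 38], "Fare": [7.25, 71.28], "Sex": ["male", "female"]})
df_new, col_pars = pd_col_myfun(df=df, col=["Age", "Fare"], pars={})
print(df_new.columns.tolist())   # ['Age_myfun', 'Fare_myfun']
print(col_pars['cols_new'])      # {'col_myfun': ['Age_myfun', 'Fare_myfun']}
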
#####################################################################################
########## Profile data #############################################################
from core_run import data_profile
# def data_profile(path_data="", path_output="", n_sample= 5000):
"""
def data_profile(path_data="", path_output="", n_sample= 5000):
   from source.run_feature_profile import run_profile
   run_profile(path_data   = path_data,
               path_output = path_output + "/profile/",
               n_sample    = n_sample,
              )
"""


###################################################################################
########## Preprocess #############################################################
### def preprocess(config='', nsample=1000):
from core_run import preprocess

"""
def preprocess(config=None, nsample=None):
    config_name = config if config is not None else config_default
    mdict       = globals()[config_name]()
    m           = mdict['global_pars']
    print(mdict)

    from source import run_preprocess
    run_preprocess.run_preprocess(config_name = config_name,
                                  config_path = m['config_path'],
                                  n_sample    = nsample if nsample is not None else m['n_sample'],

                                  ### Optional
                                  mode        = 'run_preprocess')
"""


##################################################################################
########## Train #################################################################
from core_run import train
"""
def train(config=None, nsample=None):
    config_name = config if config is not None else config_default
    mdict       = globals()[config_name]()
    m           = mdict['global_pars']
    print(mdict)

    from source import run_train
    run_train.run_train(config_name = config_name,
                        config_path = m['config_path'],
                        n_sample    = nsample if nsample is not None else m['n_sample']
                       )
"""


###################################################################################
######### Check data ##############################################################
def check():
    pass



####################################################################################
####### Inference ##################################################################
# predict(config='', nsample=10000)
from core_run import predict

"""
def predict(config=None, nsample=None):
    config_name = config if config is not None else config_default
    mdict       = globals()[config_name]()
    m           = mdict['global_pars']


    from source import run_inference
    run_inference.run_predict(config_name = config_name,
                              config_path = m['config_path'],
                              n_sample    = nsample if nsample is not None else m['n_sample'],

                              #### Optional
                              path_data   = m['path_pred_data'],
                              path_output = m['path_pred_output'],
                              model_dict  = None
                             )
"""


###########################################################################################################
###########################################################################################################
"""
python test_automl.py data_profile
python test_automl.py preprocess  --nsample 100
python test_automl.py train       --nsample 200
python test_automl.py check
python test_automl.py predict
"""
if __name__ == "__main__":
    ### all CLI entry points listed in the usage docstring above
    d = { 'data_profile': data_profile, 'preprocess': preprocess, 'train': train,
          'check': check, 'predict': predict, 'config': config_default }
    import fire
    fire.Fire(d)
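
The dsa2 wrapper's plumbing is outside this commit; a hedged sketch of how the inner 'model_pars' dict above presumably reaches mljar, assuming the AutoML wrapper registered in model_sklearn.py forwards it as constructor keyword arguments:

from supervised.automl import AutoML
pars   = config1()['model_pars']['model_pars']   # total_time_limit, algorithms, results_path, eval_metric
automl = AutoML(**pars)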

zrequirements.txt

Lines changed: 1 addition & 0 deletions
@@ -24,3 +24,4 @@ numba==0.50.1
 #tensorflow_text>=2.0.0rc0
 #sdv==0.6.1
 mlflow==1.13.1
+mljar-supervised
