@@ -1,17 +1,14 @@
 from enum import Enum
 import numpy as np
 import pandas as pd
-import sys
-import os
+import sys, os, json
 from sklearn.preprocessing import LabelEncoder
 from sklearn.externals import joblib
 sys.path.append(os.path.join('..', 'src'))
 sys.path.append(os.path.join('src'))
-from sklearn import tree
-from sklearn import ensemble
+from sklearn import tree, ensemble, metrics
 import evaluation

-
 class Model(Enum):
     DECISION_TREE = 0
     RANDOM_FOREST = 1
@@ -61,22 +58,26 @@ def encode(train, validate):
     return train, validate


-def make_model(train, model=Model.DECISION_TREE, seed=None):
-    print("Creating decision tree model")
+def train_model(train, model=Model.DECISION_TREE, seed=None):
+    print("Training model using regressor: {}".format(model.name))
     train_dropped = train.drop('unit_sales', axis=1)
     target = train['unit_sales']

     if model == Model.RANDOM_FOREST:
-        clf = ensemble.RandomForestRegressor(random_state=seed)
+        params = {'n_estimators': 10}
+        clf = ensemble.RandomForestRegressor(random_state=seed, **params)
     elif model == Model.ADABOOST:
-        clf = ensemble.AdaBoostRegressor(random_state=seed)
+        params = {'n_estimators': 50, 'learning_rate': 1.0, 'loss': 'linear'}
+        clf = ensemble.AdaBoostRegressor(random_state=seed, **params)
     elif model == Model.GRADIENT_BOOST:
-        clf = ensemble.GradientBoostingRegressor(max_depth=4, n_estimators=200, random_state=seed)
+        params = {'n_estimators': 200, 'max_depth': 4}
+        clf = ensemble.GradientBoostingRegressor(random_state=seed, **params)
     else:
+        params = {'criterion': 'mse'}
         clf = tree.DecisionTreeRegressor(random_state=seed)

-    clf = clf.fit(train_dropped, target)
-    return clf
+    model = clf.fit(train_dropped, target)
+    return (model, params)


 def overwrite_unseen_prediction_with_zero(preds, train, validate):
@@ -90,46 +91,45 @@ def overwrite_unseen_prediction_with_zero(preds, train, validate):
     return preds


-def make_predictions(clf, validate):
+def make_predictions(model, validate):
     print("Making prediction on validation data")
     validate_dropped = validate.drop('unit_sales', axis=1).fillna(-1)
-    validate_preds = clf.predict(validate_dropped)
+    validate_preds = model.predict(validate_dropped)
     return validate_preds


-def write_predictions_and_score(validation_score, model, columns_used):
+def write_predictions_and_score(evaluation_metrics, model, columns_used):
     key = "decision_tree"
     if not os.path.exists('data/{}'.format(key)):
         os.makedirs('data/{}'.format(key))
     filename = 'data/{}/model.pkl'.format(key)
     print("Writing to {}".format(filename))
     joblib.dump(model, filename)

-    filename = 'results/score.txt'
+    filename = 'results/metrics.json'
     print("Writing to {}".format(filename))
     if not os.path.exists('results'):
         os.makedirs('results')
     with open(filename, 'w+') as score_file:
-        score_file.write(str(validation_score))
-        # score = pd.DataFrame({'estimate': [validation_score]})
-        # score.to_csv(filename, index=False)
-
-    print("Done deciding with trees")
+        json.dump(evaluation_metrics, score_file)


 def main(model=Model.DECISION_TREE, seed=None):
     original_train, original_validate = load_data()
     train, validate = encode(original_train, original_validate)
-    model = make_model(train, model, seed)
+    model, params = train_model(train, model, seed)
     validation_predictions = make_predictions(model, validate)

-    print("Calculating estimated error")
-    validation_score = evaluation.nwrmsle(validation_predictions, validate['unit_sales'].values, validate['perishable'].values)
+    print("Calculating metrics")
+    evaluation_metrics = {
+        'nwrmsle': evaluation.nwrmsle(validation_predictions, validate['unit_sales'].values, validate['perishable'].values),
+        'r2_score': metrics.r2_score(y_true=validate['unit_sales'].values, y_pred=validation_predictions)
+    }

-    write_predictions_and_score(validation_score, model, original_train.columns)
+    write_predictions_and_score(evaluation_metrics, model, original_train.columns)

-    print("Decision tree analysis done with a validation score (error rate) of {}.".format(validation_score))
+    print("Evaluation done with metrics {}.".format(json.dumps(evaluation_metrics)))


 if __name__ == "__main__":
-    main(seed=8675309)
+    main(model=Model.DECISION_TREE, seed=8675309)