import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer

from trainPredict import train_predict

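# train_predict comes from the project-local trainPredict module (not shown
# here). The sketch below is an assumption inferred from how the helper is
# called in this script, not the module's actual code: fit the learner on the
# first sample_size training rows, then return accuracy and F0.5 scores for
# both splits in a dict.
#
#   def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
#       learner.fit(X_train[:sample_size], y_train[:sample_size])
#       pred_train = learner.predict(X_train[:sample_size])
#       pred_test = learner.predict(X_test)
#       return {'acc_train': accuracy_score(y_train[:sample_size], pred_train),
#               'acc_test': accuracy_score(y_test, pred_test),
#               'f_train': fbeta_score(y_train[:sample_size], pred_train, beta=0.5),
#               'f_test': fbeta_score(y_test, pred_test, beta=0.5)}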
# Binary-encode the income label: 1 for '>50K', 0 otherwise.
def nEnc(x):
    return 1 if x == '>50K' else 0

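# Load the census data and separate the income target from the features.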
data = pd.read_csv("census.csv")
income_raw = data['income']

features_raw = data.drop('income', axis=1)
print "Full features results"
print "Number of raw features: {}".format(len(list(features_raw.columns)))

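# Preprocessing pipeline: log-transform the heavily skewed monetary columns,
# scale the numerical columns to [0, 1], and one-hot encode the categoricals.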
skewed = ['capital-gain', 'capital-loss']
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
features_log_transformed = pd.DataFrame(data=features_raw)
features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))
scaler = MinMaxScaler()  # default feature_range=(0, 1)
features_log_minmax_transform = pd.DataFrame(data=features_log_transformed)
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])
features_final = pd.get_dummies(features_log_minmax_transform)
income = income_raw.apply(nEnc)
X_train, X_test, y_train, y_test = train_test_split(features_final, income, test_size=0.2, random_state=0)

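# Baseline run: train one GradientBoostingClassifier on the full feature set
# and keep its scores so each reduced-feature run below can be compared to it.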
clf = GradientBoostingClassifier(n_estimators=10, random_state=30, max_depth=8)
results = train_predict(clf, len(y_train), X_train, y_train, X_test, y_test)

acc_trn0 = results['acc_train']
acc_tst0 = results['acc_test']
fsc_trn0 = results['f_train']
fsc_tst0 = results['f_test']

print "Training accuracy: {}".format(acc_trn0)
print "Testing accuracy: {}".format(acc_tst0)
print "Training f-score : {}".format(fsc_trn0)
print "Testing f-score : {}".format(fsc_tst0)

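# Leave-one-feature-out ablation: drop each raw feature in turn, rebuild the
# same preprocessing pipeline on the reduced frame, retrain an identical
# model, and report how the test F0.5-score moves against the baseline.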
features = data.drop('income', axis=1)
feat_names = list(features.columns)

for feat in feat_names:
    print ""
    print "Removing feature {}".format(feat)
    print ""
    features_raw = features.drop(feat, axis=1)
    #print "Number of raw features: {}".format(len(list(features_raw.columns)))

    skewed = ['capital-gain', 'capital-loss']
    if feat in skewed:
        skewed.remove(feat)

    numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    if feat in numerical:
        numerical.remove(feat)

    features_log_transformed = pd.DataFrame(data=features_raw)
    features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))
    scaler = MinMaxScaler()

    features_log_minmax_transform = pd.DataFrame(data=features_log_transformed)
    features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])
    features_final = pd.get_dummies(features_log_minmax_transform)
    income = income_raw.apply(nEnc)
    X_train, X_test, y_train, y_test = train_test_split(features_final, income, test_size=0.2, random_state=0)

    clf = GradientBoostingClassifier(n_estimators=10, random_state=30, max_depth=8)
    results = train_predict(clf, len(y_train), X_train, y_train, X_test, y_test)

    acc_trn = results['acc_train']
    acc_tst = results['acc_test']
    fsc_trn = results['f_train']
    fsc_tst = results['f_test']

    #print "Training accuracy: {} Change: {}".format(acc_trn, abs(acc_trn0 - acc_trn))
    #print "Testing accuracy: {} Change: {}".format(acc_tst, abs(acc_tst0 - acc_tst))
    #print "Training f-score : {} Change: {}".format(fsc_trn, abs(fsc_trn0 - fsc_trn))
    print "Testing f-score : {} Change: {}".format(fsc_tst, abs(fsc_tst0 - fsc_tst))

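# Optional hyperparameter search, currently disabled via `if False:`. When
# enabled, it grid-searches the classifier with an F0.5 scorer; the
# commented-out lines keep an alternative LogisticRegression setup and a
# wider GradientBoostingClassifier parameter grid.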
if False:

    clf = GradientBoostingClassifier(n_estimators=10, random_state=30, max_depth=8)
    #parameters = {'max_depth': [4, 6, 8, 10], 'n_estimators': [200, 300]}
    parameters = {}

    #clf = LogisticRegression(random_state=30, n_jobs=6)
    #parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

    scorer = make_scorer(fbeta_score, beta=0.5)
    grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
    grid_fit = grid_obj.fit(X_train, y_train)
    best_clf = grid_fit.best_estimator_
    predictions = (clf.fit(X_train, y_train)).predict(X_test)
    best_predictions = best_clf.predict(X_test)
    best_parameters = grid_fit.best_params_
    #print "Best grid search parameters: {}".format(best_parameters)
    print "Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))
    print "Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5))