1+
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

# train_test_split moved to sklearn.model_selection; the legacy
# sklearn.cross_validation module no longer exists in current releases,
# so it is only tried as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
13+
def nEnc(x):
    """Encode an income label as a binary target.

    Args:
        x: A raw label value, compared against the string '>50K'.

    Returns:
        1 if ``x`` equals '>50K', otherwise 0 (every other value,
        including unexpected ones, maps to 0).
    """
    return 1 if x == '>50K' else 0
17+
# ---------------------------------------------------------------------------
# Load the census data and separate the target column from the features.
# ---------------------------------------------------------------------------
data = pd.read_csv("census.csv")
income_raw = data['income']
features_raw = data.drop('income', axis=1)

# ---------------------------------------------------------------------------
# Pre-processing: log-transform the skewed columns, then min-max scale all
# numerical columns. Each stage works on its own copy so that the earlier
# frames (features_raw, features_log_transformed) keep their own values
# instead of aliasing the same underlying data.
# ---------------------------------------------------------------------------
skewed = ['capital-gain', 'capital-loss']
features_log_transformed = features_raw.copy()
# log(x + 1) rather than log(x) so that zero values stay finite.
features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))

scaler = MinMaxScaler()  # default=(0, 1)
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
features_log_minmax_transform = features_log_transformed.copy()
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

# One-hot encode the remaining non-numeric columns and binarise the target.
features_final = pd.get_dummies(features_log_minmax_transform)
income = income_raw.apply(nEnc)

# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    features_final, income, test_size=0.2, random_state=0
)

print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

# ---------------------------------------------------------------------------
# Grid search over tree depth and ensemble size, scored with F-beta
# (beta = 0.5).
# ---------------------------------------------------------------------------
clf = GradientBoostingClassifier(random_state=30)
parameters = {'max_depth': [4, 6, 8, 10], 'n_estimators': [200, 300]}
scorer = make_scorer(fbeta_score, beta=0.5)
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
grid_fit = grid_obj.fit(X_train, y_train)
best_clf = grid_fit.best_estimator_

# Baseline predictions from the classifier with its default hyper-parameters.
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)
# best_params_ is an attribute of the fitted search object, not of the
# estimator it selected; reading it from best_clf raises AttributeError.
best_parameters = grid_fit.best_params_

print("Best grid search parameters: {}".format(best_parameters))
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))
0 commit comments