Commit d94d4b9

update tests
1 parent 697f5d3 commit d94d4b9

File tree: 1 file changed, +92 -18 lines changed


feature_importance.py

Lines changed: 92 additions & 18 deletions
@@ -7,43 +7,117 @@
 from sklearn.cross_validation import train_test_split
 from sklearn.metrics import fbeta_score
 from sklearn.metrics import accuracy_score
+
 from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.linear_model import LogisticRegression
+
 from sklearn.model_selection import GridSearchCV
 from sklearn.metrics import make_scorer
 
+from trainPredict import train_predict
+
 def nEnc(x):
     if x =='>50K': return 1
     else: return 0
 
 data = pd.read_csv("census.csv")
 income_raw = data['income']
 
+income_raw = data['income']
+
+
 features_raw = data.drop('income', axis = 1)
+print "Full features results"
+print "Number of raw features: {}".format(len(list(features_raw.columns)))
 
 skewed = ['capital-gain', 'capital-loss']
+numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
 features_log_transformed = pd.DataFrame(data=features_raw)
 features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))
-scaler = MinMaxScaler() # default=(0, 1)
-numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
+scaler = MinMaxScaler()
 features_log_minmax_transform = pd.DataFrame(data = features_log_transformed)
 features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])
 features_final = pd.get_dummies(features_log_minmax_transform)
 income = income_raw.apply(nEnc)
 X_train, X_test, y_train, y_test = train_test_split(features_final,income,test_size = 0.2,random_state = 0)
+clf = GradientBoostingClassifier(n_estimators=10, random_state=30, max_depth=8)
+results = train_predict(clf, len(y_train), X_train, y_train, X_test, y_test)
+
+acc_trn0 = results['acc_train']
+acc_tst0 = results['acc_test']
+fsc_trn0 = results['f_train']
+fsc_tst0 = results['f_test']
+
+print "Training accuracy: {}".format(results['acc_train'])
+print "Testing accuracy: {}".format(results['acc_test'])
+print "Training f-score : {}".format(results['f_train'])
+print "Testing f-score : {}".format(results['f_test'])
+
+
+features = data.drop('income', axis = 1)
+feat_names = list(features.columns)
+
+for feat in feat_names:
+    print ""
+    print "Removing feature {}".format(feat)
+    print ""
+    features_raw = features.drop(feat, axis=1)
+    #print "Number of raw features: {}".format(len(list(features_raw.columns)))
+
+    skewed = ['capital-gain', 'capital-loss']
+    if feat in skewed:
+        skewed.remove(feat)
+
+    numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
+    if feat in numerical:
+        numerical.remove(feat)
+
+    features_log_transformed = pd.DataFrame(data=features_raw)
+    features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))
+    scaler = MinMaxScaler()
+
+    features_log_minmax_transform = pd.DataFrame(data = features_log_transformed)
+    features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])
+    features_final = pd.get_dummies(features_log_minmax_transform)
+    income = income_raw.apply(nEnc)
+    X_train, X_test, y_train, y_test = train_test_split(features_final,income,test_size = 0.2,random_state = 0)
+
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=30, max_depth=8)
+    results = train_predict(clf, len(y_train), X_train, y_train, X_test, y_test)
+
+    acc_trn = results['acc_train']
+    acc_tst = results['acc_test']
+    fsc_trn = results['f_train']
+    fsc_tst = results['f_test']
+
+    #print "Training accuracy: {} Change: {}".format(acc_trn, abs(acc_trn0-acc_trn))
+    #print "Testing accuracy: {} Change: {}".format(acc_tst, abs(acc_tst0-acc_tst))
+    #print "Training f-score : {} Change: {}".format(fsc_trn, abs(fsc_trn0-fsc_trn))
+    print "Testing f-score : {} Change: {}".format(fsc_tst, abs(fsc_tst0-fsc_tst))
+
+
+
+
+
+
+if False:
+
+    clf = GradientBoostingClassifier(n_estimators=10,random_state=30,max_depth=8)
+    #parameters = {'max_depth': [4,6, 8, 10], 'n_estimators': [200,300]}
+    parameters = {}
+
+    #clf = LogisticRegression(random_state=30, n_jobs=6)
+    #parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
+
+
+    scorer = make_scorer(fbeta_score,beta=0.5)
+    grid_obj = GridSearchCV(clf, parameters,scoring=scorer)
+    grid_fit = grid_obj.fit(X_train,y_train)
+    best_clf = grid_fit.best_estimator_
+    predictions = (clf.fit(X_train, y_train)).predict(X_test)
+    best_predictions = best_clf.predict(X_test)
+    best_parameters = grid_fit.best_params_
 
-print "Training set has {} samples.".format(X_train.shape[0])
-print "Testing set has {} samples.".format(X_test.shape[0])
-
-clf = GradientBoostingClassifier(random_state=30)
-parameters = parameters = {'max_depth': [4,6, 8, 10], 'n_estimators': [200,300]}
-scorer = make_scorer(fbeta_score,beta=0.5)
-grid_obj = GridSearchCV(clf, parameters,scoring=scorer)
-grid_fit = grid_obj.fit(X_train,y_train)
-best_clf = grid_fit.best_estimator_
-predictions = (clf.fit(X_train, y_train)).predict(X_test)
-best_predictions = best_clf.predict(X_test)
-best_parameters = best_clf.best_params_
-
-print "Best grid search parameters: {}".format(best_parameters)
-print "Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))
-print "Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5))
+    #print "Best grid search parameters: {}".format(best_parameters)
+    print "Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))
+    print "Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5))

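Note that the script still imports train_test_split from sklearn.cross_validation, a module deprecated in scikit-learn 0.18 and removed in 0.20, and uses Python 2 print statements. A sketch of the same leave-one-feature-out experiment under Python 3 and a current scikit-learn (file name, column lists, and hyperparameters taken from the diff; everything else is an untested modernization, not part of this commit):

# Python 3 / modern scikit-learn version of the ablation loop above
# (an illustrative sketch, not the code committed here).
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split  # replaces sklearn.cross_validation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import fbeta_score

data = pd.read_csv("census.csv")
income = (data['income'] == '>50K').astype(int)  # same encoding as nEnc
features = data.drop('income', axis=1)

def preprocess(raw, skewed, numerical):
    # Log-transform skewed columns, min-max scale numeric ones,
    # one-hot encode the categorical remainder.
    out = raw.copy()
    out[skewed] = out[skewed].apply(lambda x: np.log(x + 1))
    out[numerical] = MinMaxScaler().fit_transform(out[numerical])
    return pd.get_dummies(out)

for feat in features.columns:
    skewed = [c for c in ['capital-gain', 'capital-loss'] if c != feat]
    numerical = [c for c in ['age', 'education-num', 'capital-gain',
                             'capital-loss', 'hours-per-week'] if c != feat]
    X = preprocess(features.drop(feat, axis=1), skewed, numerical)
    X_train, X_test, y_train, y_test = train_test_split(
        X, income, test_size=0.2, random_state=0)

    clf = GradientBoostingClassifier(n_estimators=10, random_state=30, max_depth=8)
    pred = clf.fit(X_train, y_train).predict(X_test)
    print("Without {}: testing F0.5 = {:.4f}".format(
        feat, fbeta_score(y_test, pred, beta=0.5)))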