Skip to content

Commit 697f5d3

Browse files
committed
first commit
1 parent e278264 commit 697f5d3

File tree

1 file changed

+49
-0
lines changed

1 file changed

+49
-0
lines changed

feature_importance.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
2+
import numpy as np
3+
import pandas as pd
4+
import matplotlib.pyplot as plt
5+
6+
from sklearn.preprocessing import MinMaxScaler
7+
from sklearn.cross_validation import train_test_split
8+
from sklearn.metrics import fbeta_score
9+
from sklearn.metrics import accuracy_score
10+
from sklearn.ensemble import GradientBoostingClassifier
11+
from sklearn.model_selection import GridSearchCV
12+
from sklearn.metrics import make_scorer
13+
14+
def nEnc(x):
    """Encode an income label as a binary class.

    Args:
        x: A single income label (compared against the string '>50K').

    Returns:
        1 if ``x`` equals '>50K', otherwise 0.
    """
    return 1 if x == '>50K' else 0
17+
18+
# --- Load the data and separate the target column from the features ---------
data = pd.read_csv("census.csv")
income_raw = data['income']
features_raw = data.drop('income', axis=1)

# --- Log-transform the skewed features ---------------------------------------
# log(x + 1) is used so that zero values stay defined (log(0) is -inf).
skewed = ['capital-gain', 'capital-loss']
features_log_transformed = pd.DataFrame(data=features_raw)
features_log_transformed[skewed] = features_raw[skewed].apply(
    lambda x: np.log(x + 1))

# --- Scale the numerical features --------------------------------------------
scaler = MinMaxScaler()  # default=(0, 1)
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss',
             'hours-per-week']
features_log_minmax_transform = pd.DataFrame(data=features_log_transformed)
features_log_minmax_transform[numerical] = scaler.fit_transform(
    features_log_transformed[numerical])

# --- Encode categorical features and the target ------------------------------
features_final = pd.get_dummies(features_log_minmax_transform)
income = income_raw.apply(nEnc)

# --- Hold out 20% of the samples for testing ---------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    features_final, income, test_size=0.2, random_state=0)

print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

# --- Grid-search a gradient boosting classifier, scored by F-beta (0.5) ------
clf = GradientBoostingClassifier(random_state=30)
parameters = {'max_depth': [4, 6, 8, 10], 'n_estimators': [200, 300]}
scorer = make_scorer(fbeta_score, beta=0.5)
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
grid_fit = grid_obj.fit(X_train, y_train)
best_clf = grid_fit.best_estimator_

# Predictions of the classifier with its un-tuned parameters; kept as a
# module-level name but not reported below.
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# best_params_ lives on the fitted search object, not on the estimator it
# returns (best_estimator_ is a plain GradientBoostingClassifier).
best_parameters = grid_fit.best_params_

print("Best grid search parameters: {}".format(best_parameters))
print("Final accuracy score on the testing data: {:.4f}".format(
    accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(
    fbeta_score(y_test, best_predictions, beta=0.5)))

0 commit comments

Comments
 (0)