Commit c923bd2 ("New")
1 parent e7915b2 commit c923bd2

22 files changed: +14830 −5967 lines

models/aletom/28-12-2016.py

Lines changed: 197 additions & 77 deletions
@@ -10,34 +10,116 @@
 import matplotlib.pyplot as plt
 from collections import Counter
 from sklearn.metrics import accuracy_score
+from math import sqrt
+from sklearn import cross_validation
 
 # read the file into a dataframe
-X = pd.read_csv('data/trainremake.csv')
+train = pd.read_csv('data/trainremake.csv')
+
+test = pd.read_csv('data/testremake.csv')
+
+train['Type'] = 0 #Create a flag for Train and Test Data set
+test['Type'] = 1
+
+# Handle missing values for features where median/mean or most common value doesn't make sense
+
+# Alley : data description says NA means "no alley access"
+train.loc[:, "Alley"] = train.loc[:, "Alley"].fillna("None")
+# BedroomAbvGr : NA most likely means 0
+train.loc[:, "BedroomAbvGr"] = train.loc[:, "BedroomAbvGr"].fillna(0)
+# BsmtQual etc : data description says NA for basement features is "no basement"
+train.loc[:, "BsmtQual"] = train.loc[:, "BsmtQual"].fillna("No")
+train.loc[:, "BsmtCond"] = train.loc[:, "BsmtCond"].fillna("No")
+train.loc[:, "BsmtExposure"] = train.loc[:, "BsmtExposure"].fillna("No")
+train.loc[:, "BsmtFinType1"] = train.loc[:, "BsmtFinType1"].fillna("No")
+train.loc[:, "BsmtFinType2"] = train.loc[:, "BsmtFinType2"].fillna("No")
+train.loc[:, "BsmtFullBath"] = train.loc[:, "BsmtFullBath"].fillna(0)
+train.loc[:, "BsmtHalfBath"] = train.loc[:, "BsmtHalfBath"].fillna(0)
+train.loc[:, "BsmtUnfSF"] = train.loc[:, "BsmtUnfSF"].fillna(0)
+# CentralAir : NA most likely means No
+train.loc[:, "CentralAir"] = train.loc[:, "CentralAir"].fillna("N")
+# Condition : NA most likely means Normal
+train.loc[:, "Condition1"] = train.loc[:, "Condition1"].fillna("Norm")
+train.loc[:, "Condition2"] = train.loc[:, "Condition2"].fillna("Norm")
+# EnclosedPorch : NA most likely means no enclosed porch
+train.loc[:, "EnclosedPorch"] = train.loc[:, "EnclosedPorch"].fillna(0)
+# External stuff : NA most likely means average
+train.loc[:, "ExterCond"] = train.loc[:, "ExterCond"].fillna("TA")
+train.loc[:, "ExterQual"] = train.loc[:, "ExterQual"].fillna("TA")
+# Fence : data description says NA means "no fence"
+train.loc[:, "Fence"] = train.loc[:, "Fence"].fillna("No")
+# FireplaceQu : data description says NA means "no fireplace"
+train.loc[:, "FireplaceQu"] = train.loc[:, "FireplaceQu"].fillna("No")
+train.loc[:, "Fireplaces"] = train.loc[:, "Fireplaces"].fillna(0)
+# Functional : data description says NA means typical
+train.loc[:, "Functional"] = train.loc[:, "Functional"].fillna("Typ")
+# GarageType etc : data description says NA for garage features is "no garage"
+train.loc[:, "GarageType"] = train.loc[:, "GarageType"].fillna("No")
+train.loc[:, "GarageFinish"] = train.loc[:, "GarageFinish"].fillna("No")
+train.loc[:, "GarageQual"] = train.loc[:, "GarageQual"].fillna("No")
+train.loc[:, "GarageCond"] = train.loc[:, "GarageCond"].fillna("No")
+train.loc[:, "GarageArea"] = train.loc[:, "GarageArea"].fillna(0)
+train.loc[:, "GarageCars"] = train.loc[:, "GarageCars"].fillna(0)
+# HalfBath : NA most likely means no half baths above grade
+train.loc[:, "HalfBath"] = train.loc[:, "HalfBath"].fillna(0)
+# HeatingQC : NA most likely means typical
+train.loc[:, "HeatingQC"] = train.loc[:, "HeatingQC"].fillna("TA")
+# KitchenAbvGr : NA most likely means 0
+train.loc[:, "KitchenAbvGr"] = train.loc[:, "KitchenAbvGr"].fillna(0)
+# KitchenQual : NA most likely means typical
+train.loc[:, "KitchenQual"] = train.loc[:, "KitchenQual"].fillna("TA")
+# LotFrontage : NA most likely means no lot frontage
+train.loc[:, "LotFrontage"] = train.loc[:, "LotFrontage"].fillna(0)
+# LotShape : NA most likely means regular
+train.loc[:, "LotShape"] = train.loc[:, "LotShape"].fillna("Reg")
+# MasVnrType : NA most likely means no veneer
+train.loc[:, "MasVnrType"] = train.loc[:, "MasVnrType"].fillna("None")
+train.loc[:, "MasVnrArea"] = train.loc[:, "MasVnrArea"].fillna(0)
+# MiscFeature : data description says NA means "no misc feature"
+train.loc[:, "MiscFeature"] = train.loc[:, "MiscFeature"].fillna("No")
+train.loc[:, "MiscVal"] = train.loc[:, "MiscVal"].fillna(0)
+# OpenPorchSF : NA most likely means no open porch
+train.loc[:, "OpenPorchSF"] = train.loc[:, "OpenPorchSF"].fillna(0)
+# PavedDrive : NA most likely means not paved
+train.loc[:, "PavedDrive"] = train.loc[:, "PavedDrive"].fillna("N")
+# PoolQC : data description says NA means "no pool"
+train.loc[:, "PoolQC"] = train.loc[:, "PoolQC"].fillna("No")
+train.loc[:, "PoolArea"] = train.loc[:, "PoolArea"].fillna(0)
+# SaleCondition : NA most likely means normal sale
+train.loc[:, "SaleCondition"] = train.loc[:, "SaleCondition"].fillna("Normal")
+# ScreenPorch : NA most likely means no screen porch
+train.loc[:, "ScreenPorch"] = train.loc[:, "ScreenPorch"].fillna(0)
+# TotRmsAbvGrd : NA most likely means 0
+train.loc[:, "TotRmsAbvGrd"] = train.loc[:, "TotRmsAbvGrd"].fillna(0)
+# Utilities : NA most likely means all public utilities
+train.loc[:, "Utilities"] = train.loc[:, "Utilities"].fillna("AllPub")
+# WoodDeckSF : NA most likely means no wood deck
+train.loc[:, "WoodDeckSF"] = train.loc[:, "WoodDeckSF"].fillna(0)
 
-Y = pd.read_csv('data/testremake.csv')
 
-X['Type'] = 0 #Create a flag for Train and Test Data set
-Y['Type'] = 1
 
-fullData = pd.concat([X,Y],axis=0) #Combined both Train and Test Data set
 #######################
 # basic data cleaning #
 #######################
 
 #### Decide which categorical variables you want to use in model
 # Make an array of it
+fullData = pd.concat([train,test],axis=0) #Combined both Train and Test Data set
 var = []
 for col_name in fullData.columns:
     if fullData[col_name].dtypes == 'object':
         unique_cat = len(fullData[col_name].unique())
         var.append(col_name)
-        # print("Feature '{col_name}' has {unique_cat} unique categories".format(col_name=col_name, unique_cat=unique_cat))
+        print("Feature '{col_name}' has {unique_cat} unique categories".format(col_name=col_name, unique_cat=unique_cat))
 
-print var
+# print var
 # Create a list of features to dummy
-todummy_list = ['Alley', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Foundation', 'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'Heating', 'HeatingQC', 'HouseStyle', 'KitchenQual', 'LandContour', 'LandSlope', 'LotConfig', 'LotShape', 'MSZoning', 'MasVnrType', 'MiscFeature', 'Neighborhood', 'PavedDrive', 'PoolQC', 'RoofMatl', 'RoofStyle', 'SaleCondition', 'SaleType', 'Street', 'Utilities']
 
 
+todummy_list = ['Alley', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Foundation', 'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'Heating', 'HeatingQC', 'HouseStyle', 'KitchenQual', 'LandContour', 'LandSlope', 'LotConfig', 'LotShape', 'MSZoning', 'MasVnrType', 'MiscFeature', 'Neighborhood', 'PavedDrive', 'PoolQC', 'RoofMatl', 'RoofStyle', 'SaleCondition', 'SaleType', 'Street', 'Utilities','StreetName']
+
+# How much of our data is missing?
+print fullData.isnull().sum().sort_values(ascending=False).head(len(fullData))
 # Function to dummy all the categorical variables used for modeling
 def dummy_df(df, todummy_list):
     for x in todummy_list:
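
Note — the long run of per-column fillna calls added above only touches `train`, even though the same NA conventions presumably apply to `test` before the two frames are concatenated into `fullData`. A minimal sketch of an equivalent, dict-driven version that could be applied to both frames; the column/value pairs are copied from the diff (trimmed here for brevity), while the helper name and usage are illustrative, not part of the commit:

    import pandas as pd

    # NA-replacement conventions taken from the commit; a few representative
    # columns shown. pandas ignores dict keys that are absent from the frame.
    NA_FILLS = {
        "Alley": "None",    # data description: NA means "no alley access"
        "BsmtQual": "No",   # NA for basement features means "no basement"
        "CentralAir": "N",  # NA most likely means No
        "LotFrontage": 0,   # NA most likely means no lot frontage
        "WoodDeckSF": 0,    # NA most likely means no wood deck
    }

    def fill_known_nas(df, fills=NA_FILLS):
        """Return a copy of df with the documented NA conventions applied."""
        return df.fillna(fills)

    # Hypothetical usage, mirroring the script's train/test loading:
    # train = fill_known_nas(pd.read_csv('data/trainremake.csv'))
    # test = fill_known_nas(pd.read_csv('data/testremake.csv'))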
@@ -52,20 +134,58 @@ def dummy_df(df, todummy_list):
 
 #### Handling missing data
 
-# How much of our data is missing?
-# print X.isnull().sum().sort_values(ascending=False).head()
+from sklearn.base import TransformerMixin
+
+class DataFrameImputer(TransformerMixin):
+
+    def __init__(self):
+        """Impute missing values.
+
+        Columns of dtype object are imputed with the most frequent value
+        in column.
+
+        Columns of other types are imputed with mean of column.
 
-# print X.columns
+        """
+    def fit(self, X, y=None):
 
-#Imputer in sklearn.preprocessing
-from sklearn.preprocessing import Imputer
+        self.fill = pd.Series([X[c].value_counts().index[0]
+                               if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
+                              index=X.columns)
 
-imp = Imputer(missing_values='NaN', strategy='median', axis=0)
-imp.fit(fullData)
-fullData = pd.DataFrame(data=imp.transform(fullData) , columns=fullData.columns)
+        return self
 
+    def transform(self, X, y=None):
+        return X.fillna(self.fill)
+
+fullData = DataFrameImputer().fit_transform(fullData)
+# How much of our data is missing?
+print fullData.isnull().sum().sort_values(ascending=False).head(len(fullData))
+
+# Use PolynomialFeatures in sklearn.preprocessing to create two-way interactions for all features
+from itertools import combinations
+from sklearn.preprocessing import PolynomialFeatures
+
+def add_interactions(df):
+    # Get feature names
+    combos = list(combinations(list(df.columns), 2))
+    colnames = list(df.columns) + ['_'.join(x) for x in combos]
+
+    # Find interactions
+    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
+    df = poly.fit_transform(df)
+    df = pd.DataFrame(df)
+    df.columns = colnames
+
+    # Remove interaction terms with all 0 values
+    noint_indicies = [i for i, x in enumerate(list((df == 0).all())) if x]
+    df = df.drop(df.columns[noint_indicies], axis=1)
+
+    return df
+
+fullData = add_interactions(fullData)
 # Now check again to see if you still have missing data
-print fullData.isnull().sum().sort_values(ascending=False).head()
+# print fullData.isnull().sum().sort_values(ascending=False).head()
 
 #### Remove and detect outliers
 # Removed four data entries
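
Note — the `DataFrameImputer` added above is the familiar mixed-type imputer pattern (most frequent value for object columns, column mean otherwise), replacing the removed `sklearn.preprocessing.Imputer`, which cannot handle object columns. It relies on `np` and `pd` already being imported at the top of the script (the script is Python 2 throughout: bare `print` statements, `xrange`). A self-contained check of the same class on a toy frame, everything outside the class being illustrative:

    import numpy as np
    import pandas as pd
    from sklearn.base import TransformerMixin

    class DataFrameImputer(TransformerMixin):
        """Most-frequent value for object columns, mean for numeric ones."""
        def fit(self, X, y=None):
            self.fill = pd.Series(
                [X[c].value_counts().index[0] if X[c].dtype == np.dtype('O')
                 else X[c].mean() for c in X],
                index=X.columns)
            return self

        def transform(self, X, y=None):
            return X.fillna(self.fill)

    # Toy check: 'b' is the modal category, 2.0 the numeric column mean.
    df = pd.DataFrame({'cat': ['a', 'b', 'b', None],
                       'num': [1.0, 2.0, 3.0, None]})
    print(DataFrameImputer().fit_transform(df))

One caveat: `add_interactions` feeds the frame to `PolynomialFeatures`, which requires an all-numeric input, so it presumably runs only after the categorical columns have been dummied; applied to raw object columns it would raise.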
@@ -74,68 +194,77 @@ def dummy_df(df, todummy_list):
 ###############################
 # trying out different models #
 ###############################
-
-rs = 1
-ests = [ linear_model.LinearRegression(), linear_model.Ridge(),
+ests = [ linear_model.LinearRegression(fit_intercept=True), linear_model.Ridge(),
         linear_model.Lasso(), linear_model.ElasticNet(),
         linear_model.BayesianRidge(), linear_model.OrthogonalMatchingPursuit() ]
 ests_labels = np.array(['Linear', 'Ridge', 'Lasso', 'ElasticNet', 'BayesRidge', 'OMP'])
-errvals = np.array([])
+
 train=fullData[fullData['Type']==0]
 test=fullData[fullData['Type']==1]
-
-
 X_train, X_test, y_train, y_test = train_test_split(train.drop(['SalePrice'], axis=1),
-    train.SalePrice, test_size=0.2, random_state=20)
+    train.SalePrice, test_size=0.2, random_state=1)
 
 
+errvals = np.array([])
+r2vals = np.array([])
+accvals = np.array([])
 for e in ests:
-    e.fit(X_train, y_train)
-    this_err = metrics.median_absolute_error(y_test, e.predict(X_test))
-    #print "got error %0.2f" % this_err
-    errvals = np.append(errvals, this_err)
-
-print errvals
-
-pos = np.arange(errvals.shape[0])
-srt = np.argsort(errvals)
-plt.figure(figsize=(7,5))
-plt.bar(pos, errvals[srt], align='center')
-plt.xticks(pos, ests_labels[srt])
-plt.xlabel('Estimator')
-plt.ylabel('Median Absolute Error')
+    e.fit(X_train, y_train)
+    rms = metrics.median_absolute_error(y_test, e.predict(X_test))
+    r2 = metrics.r2_score(y_test, e.predict(X_test))
+    acc = e.score(X_test, y_test)
+    print("MAD: %.4f" % rms)
+    print("R2: %.4f" % r2)
+    print("Accuracy: %.4f" % acc)
+    accvals = np.append(accvals,acc)
+    errvals = np.append(errvals, rms)
+    r2vals = np.append(r2vals, r2)
+
+# print errvals
+
+# pos = np.arange(errvals.shape[0])
+# srt = np.argsort(errvals)
+# plt.figure(figsize=(7,5))
+# plt.bar(pos, errvals[srt], align='center')
+# plt.xticks(pos, ests_labels[srt])
+# plt.xlabel('Estimator')
+# plt.ylabel('Median Absolute Error')
 # plt.show()
 
 
+
 ##################
 # model ensemble #
 ##################
 
-n_est = 800
-
-tuned_parameters = {
-    "n_estimators": [ n_est ],
-    "max_depth" : [ 8 ],
-    "learning_rate": [ 0.001 ],
-    "min_samples_split" : [ 3 ],
-    "loss" : [ 'ls', 'lad' ]
+tuned_parameters = {'learning_rate': [0.1, 0.01, 0.001],
+                    'max_depth': [4, 6, 8],
+                    'min_samples_leaf': [3,5,8],
+                    'max_features' : [10, 15, 20],
+                    'loss' : ['ls']
 }
-
-
-gbr = ensemble.GradientBoostingRegressor()
-clf = GridSearchCV(gbr, cv=3, param_grid=tuned_parameters,
-                   scoring='neg_median_absolute_error')
-preds = clf.fit(X_train, y_train)
-best = clf.best_estimator_
-
+n_est = 1500
+
+gbr = ensemble.GradientBoostingRegressor(n_estimators=n_est)
+clf = GridSearchCV(gbr, tuned_parameters, scoring='neg_median_absolute_error', n_jobs=4).fit(X_train, y_train)
+print('Best hyperparameters: %r' % clf.best_params_)
+gbr.set_params(** clf.best_params_)
+gbr.fit(X_train, y_train)
+rms = metrics.median_absolute_error(y_test, gbr.predict(X_test))
+r2 = metrics.r2_score(y_test, gbr.predict(X_test))
+acc = gbr.score(X_test, y_test)
+print("RMS: %.4f" % rms)
+print("R2: %.4f" % r2)
+print("Accuracy: %.4f" % acc)
 
 # plot error for each round of boosting
 test_score = np.zeros(n_est, dtype=np.float64)
-
+best = clf.best_estimator_
 train_score = best.train_score_
 for i, y_pred in enumerate(best.staged_predict(X_test)):
     test_score[i] = best.loss_(y_test, y_pred)
 
+
 plt.figure(figsize=(12, 6))
 plt.subplot(1, 2, 1)
 plt.plot(np.arange(n_est), train_score, 'darkblue', label='Training Set Error')
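
Two small notes on the new grid-search block. First, `GridSearchCV` refits the best estimator on the full training data by default, so `clf.best_estimator_` can be used directly; the extra `gbr.set_params(**clf.best_params_)` plus `gbr.fit(...)` trains the same model a second time. Second, the printed labels are misleading: the `rms` variable actually holds a median absolute error, and `score` on a regressor returns R², not classification accuracy. A sketch of the same evaluation with honest labels, assuming `X_train`, `y_train`, `X_test`, `y_test`, and `tuned_parameters` as defined above (in sklearn versions before 0.18, `GridSearchCV` lives in `sklearn.grid_search` rather than `sklearn.model_selection`):

    from sklearn import ensemble, metrics
    from sklearn.model_selection import GridSearchCV

    clf = GridSearchCV(
        ensemble.GradientBoostingRegressor(n_estimators=1500),
        tuned_parameters,
        scoring='neg_median_absolute_error',
        n_jobs=4,
    ).fit(X_train, y_train)

    best = clf.best_estimator_  # already refit on all of X_train by GridSearchCV
    pred = best.predict(X_test)
    print("Median absolute error: %.4f" % metrics.median_absolute_error(y_test, pred))
    print("R^2: %.4f" % metrics.r2_score(y_test, pred))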
@@ -145,31 +274,22 @@ def dummy_df(df, todummy_list):
 plt.ylabel('Least Absolute Deviation')
 plt.show()
 
+# feature_importance = clf.best_estimator_.feature_importances_
+# feature_importance = 100.0 * (feature_importance / feature_importance.max())
+# sorted_idx = np.argsort(feature_importance)
 
-feature_importance = clf.best_estimator_.feature_importances_
-print "feature_importance"
-print feature_importance
-feature_importance = 100.0 * (feature_importance / feature_importance.max())
-sorted_idx = np.argsort(feature_importance)
+# pos = np.arange(sorted_idx.shape[0]) + 2
+# pvals = feature_importance[sorted_idx]
 
-print "sorted_idx"
-print sorted_idx
 
-pos = np.arange(sorted_idx.shape[0]) + 2
-pvals = feature_importance[sorted_idx]
-print "pvals"
-print pvals
+# pcols = X_train.columns[sorted_idx]
 
-pcols = X_train.columns[sorted_idx]
-print "pcols"
-print pcols
-
-plt.figure(figsize=(32,48))
-plt.barh(pos, pvals, align='center')
-plt.yticks(pos, pcols)
-plt.xlabel('Relative Importance')
-plt.title('Variable Importance')
-plt.show()
+# plt.figure(figsize=(32,48))
+# plt.barh(pos, pvals, align='center')
+# plt.yticks(pos, pcols)
+# plt.xlabel('Relative Importance')
+# plt.title('Variable Importance')
+# plt.show()
 
 X_train, X_test, y_train, y_test = train_test_split(test.drop(['SalePrice'], axis=1),
     test.SalePrice, test_size=0, random_state=0)
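
The variable-importance plot is commented out in this commit rather than deleted (along with its debug prints). If it is revived later, a compact version built from the fitted search might look like the sketch below; the figure size and the top-20 truncation are illustrative choices, not from the original:

    import numpy as np
    import matplotlib.pyplot as plt

    importance = clf.best_estimator_.feature_importances_
    importance = 100.0 * importance / importance.max()
    top = np.argsort(importance)[-20:]  # show only the 20 strongest features

    plt.figure(figsize=(8, 10))
    plt.barh(np.arange(len(top)), importance[top], align='center')
    plt.yticks(np.arange(len(top)), X_train.columns[top])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()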
@@ -179,7 +299,7 @@ def dummy_df(df, todummy_list):
 with open('output.csv', 'w') as outcsv:
     writer = csv.writer(outcsv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
     writer.writerow(['Id', 'SalePrice'])
-    dataPrediction = clf.predict(X_train)
+    dataPrediction = gbr.predict(X_train)
     for i in xrange(0, len(Y)):
         dataId = Y['Id'][i]
         writer.writerow([dataId, dataPrediction[i]])
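
Note — the output loop still iterates over `Y`, but this commit renames `Y` to `test`, so the unchanged context lines above will raise a NameError when run. There is also an alignment hazard: `train_test_split` shuffles rows by default, so pairing `gbr.predict(X_train)` positionally with ids from the test frame can scramble the submission. A minimal sketch of a fix under those assumptions (that `test` still carries an `Id` column at this point, and predicting directly on the unshuffled test rows; `range` replaces Python 2's `xrange`):

    import csv

    # Predict on the test rows in their original order so ids stay aligned.
    X_submit = test.drop(['SalePrice'], axis=1)
    dataPrediction = gbr.predict(X_submit)

    with open('output.csv', 'w') as outcsv:
        writer = csv.writer(outcsv, delimiter=',', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerow(['Id', 'SalePrice'])
        for i in range(len(test)):
            writer.writerow([test['Id'].iloc[i], dataPrediction[i]])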

models/aletom/backup/models.py

Lines changed: 2 additions & 2 deletions
@@ -12,8 +12,8 @@
 import matplotlib.pyplot as plt
 
 # Load all data sources
-trainData = graphlab.SFrame('../../data/train.csv')
-testData = graphlab.SFrame('../../data/test.csv')
+trainData = graphlab.SFrame('../data/trainremake.csv')
+testData = graphlab.SFrame('../data/testremake.csv')
 
 # trainData.print_rows(num_rows=10, num_columns=81)
 
