import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import accuracy_score
from math import sqrt
from sklearn.model_selection import train_test_split  # the old sklearn.cross_validation module is deprecated/removed

# read the train and test files into dataframes
train = pd.read_csv('data/trainremake.csv')
test = pd.read_csv('data/testremake.csv')

train['Type'] = 0  # flag train vs. test rows so the combined frame can be split apart later
test['Type'] = 1
# Handle missing values for features where median/mean or most common value doesn't make sense

# Alley : data description says NA means "no alley access"
train.loc[:, "Alley"] = train.loc[:, "Alley"].fillna("None")
# BedroomAbvGr : NA most likely means 0
train.loc[:, "BedroomAbvGr"] = train.loc[:, "BedroomAbvGr"].fillna(0)
# BsmtQual etc : data description says NA for basement features is "no basement"
train.loc[:, "BsmtQual"] = train.loc[:, "BsmtQual"].fillna("No")
train.loc[:, "BsmtCond"] = train.loc[:, "BsmtCond"].fillna("No")
train.loc[:, "BsmtExposure"] = train.loc[:, "BsmtExposure"].fillna("No")
train.loc[:, "BsmtFinType1"] = train.loc[:, "BsmtFinType1"].fillna("No")
train.loc[:, "BsmtFinType2"] = train.loc[:, "BsmtFinType2"].fillna("No")
train.loc[:, "BsmtFullBath"] = train.loc[:, "BsmtFullBath"].fillna(0)
train.loc[:, "BsmtHalfBath"] = train.loc[:, "BsmtHalfBath"].fillna(0)
train.loc[:, "BsmtUnfSF"] = train.loc[:, "BsmtUnfSF"].fillna(0)
# CentralAir : NA most likely means No
train.loc[:, "CentralAir"] = train.loc[:, "CentralAir"].fillna("N")
# Condition : NA most likely means Normal
train.loc[:, "Condition1"] = train.loc[:, "Condition1"].fillna("Norm")
train.loc[:, "Condition2"] = train.loc[:, "Condition2"].fillna("Norm")
# EnclosedPorch : NA most likely means no enclosed porch
train.loc[:, "EnclosedPorch"] = train.loc[:, "EnclosedPorch"].fillna(0)
# Exterior condition/quality : NA most likely means average ("TA")
train.loc[:, "ExterCond"] = train.loc[:, "ExterCond"].fillna("TA")
train.loc[:, "ExterQual"] = train.loc[:, "ExterQual"].fillna("TA")
# Fence : data description says NA means "no fence"
train.loc[:, "Fence"] = train.loc[:, "Fence"].fillna("No")
# FireplaceQu : data description says NA means "no fireplace"
train.loc[:, "FireplaceQu"] = train.loc[:, "FireplaceQu"].fillna("No")
train.loc[:, "Fireplaces"] = train.loc[:, "Fireplaces"].fillna(0)
# Functional : data description says NA means typical
train.loc[:, "Functional"] = train.loc[:, "Functional"].fillna("Typ")
# GarageType etc : data description says NA for garage features is "no garage"
train.loc[:, "GarageType"] = train.loc[:, "GarageType"].fillna("No")
train.loc[:, "GarageFinish"] = train.loc[:, "GarageFinish"].fillna("No")
train.loc[:, "GarageQual"] = train.loc[:, "GarageQual"].fillna("No")
train.loc[:, "GarageCond"] = train.loc[:, "GarageCond"].fillna("No")
train.loc[:, "GarageArea"] = train.loc[:, "GarageArea"].fillna(0)
train.loc[:, "GarageCars"] = train.loc[:, "GarageCars"].fillna(0)
# HalfBath : NA most likely means no half baths above grade
train.loc[:, "HalfBath"] = train.loc[:, "HalfBath"].fillna(0)
# HeatingQC : NA most likely means typical
train.loc[:, "HeatingQC"] = train.loc[:, "HeatingQC"].fillna("TA")
# KitchenAbvGr : NA most likely means 0
train.loc[:, "KitchenAbvGr"] = train.loc[:, "KitchenAbvGr"].fillna(0)
# KitchenQual : NA most likely means typical
train.loc[:, "KitchenQual"] = train.loc[:, "KitchenQual"].fillna("TA")
# LotFrontage : NA most likely means no lot frontage
train.loc[:, "LotFrontage"] = train.loc[:, "LotFrontage"].fillna(0)
# LotShape : NA most likely means regular
train.loc[:, "LotShape"] = train.loc[:, "LotShape"].fillna("Reg")
# MasVnrType : NA most likely means no veneer
train.loc[:, "MasVnrType"] = train.loc[:, "MasVnrType"].fillna("None")
train.loc[:, "MasVnrArea"] = train.loc[:, "MasVnrArea"].fillna(0)
# MiscFeature : data description says NA means "no misc feature"
train.loc[:, "MiscFeature"] = train.loc[:, "MiscFeature"].fillna("No")
train.loc[:, "MiscVal"] = train.loc[:, "MiscVal"].fillna(0)
# OpenPorchSF : NA most likely means no open porch
train.loc[:, "OpenPorchSF"] = train.loc[:, "OpenPorchSF"].fillna(0)
# PavedDrive : NA most likely means not paved
train.loc[:, "PavedDrive"] = train.loc[:, "PavedDrive"].fillna("N")
# PoolQC : data description says NA means "no pool"
train.loc[:, "PoolQC"] = train.loc[:, "PoolQC"].fillna("No")
train.loc[:, "PoolArea"] = train.loc[:, "PoolArea"].fillna(0)
# SaleCondition : NA most likely means normal sale
train.loc[:, "SaleCondition"] = train.loc[:, "SaleCondition"].fillna("Normal")
# ScreenPorch : NA most likely means no screen porch
train.loc[:, "ScreenPorch"] = train.loc[:, "ScreenPorch"].fillna(0)
# TotRmsAbvGrd : NA most likely means 0
train.loc[:, "TotRmsAbvGrd"] = train.loc[:, "TotRmsAbvGrd"].fillna(0)
# Utilities : NA most likely means all public utilities
train.loc[:, "Utilities"] = train.loc[:, "Utilities"].fillna("AllPub")
# WoodDeckSF : NA most likely means no wood deck
train.loc[:, "WoodDeckSF"] = train.loc[:, "WoodDeckSF"].fillna(0)
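# Sketch of a more compact, equivalent pattern (an assumption, not what this
# script runs): drive the fills from one mapping so the same rules could also
# be applied to the test frame before the two are concatenated.
# na_fills = {"Alley": "None", "BsmtQual": "No", "PoolQC": "No", "LotFrontage": 0}
# train = train.fillna(value=na_fills)
# test = test.fillna(value=na_fills)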
#######################
# basic data cleaning #
#######################

#### Decide which categorical variables you want to use in the model
# Collect them in a list
fullData = pd.concat([train, test], axis=0)  # combine train and test so encoding and imputation are consistent
var = []
for col_name in fullData.columns:
    if fullData[col_name].dtypes == 'object':
        unique_cat = len(fullData[col_name].unique())
        var.append(col_name)
        print("Feature '{col_name}' has {unique_cat} unique categories".format(col_name=col_name, unique_cat=unique_cat))

# print(var)
# Create a list of features to dummy
todummy_list = ['Alley', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Foundation', 'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'Heating', 'HeatingQC', 'HouseStyle', 'KitchenQual', 'LandContour', 'LandSlope', 'LotConfig', 'LotShape', 'MSZoning', 'MasVnrType', 'MiscFeature', 'Neighborhood', 'PavedDrive', 'PoolQC', 'RoofMatl', 'RoofStyle', 'SaleCondition', 'SaleType', 'Street', 'Utilities', 'StreetName']

# How much of our data is missing?
print(fullData.isnull().sum().sort_values(ascending=False))
# Function to dummy all the categorical variables used for modeling
def dummy_df(df, todummy_list):
    for x in todummy_list:
        # Reconstructed from context (these lines are elided in this view):
        # the standard get_dummies pattern -- an assumption, not verified
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False)
        df = df.drop(x, axis=1)
        df = pd.concat([df, dummies], axis=1)
    return df

fullData = dummy_df(fullData, todummy_list)
#### Handling missing data
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    """Impute missing values.

    Columns of dtype object are imputed with the most frequent value
    in the column.

    Columns of other types are imputed with the mean of the column.
    """

    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
                               if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

fullData = DataFrameImputer().fit_transform(fullData)
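# Illustrative only (not executed): on a toy frame such as
# DataFrameImputer().fit_transform(pd.DataFrame({'a': [1.0, np.nan, 3.0],
#                                                'b': ['x', None, 'x']}))
# the NaN in 'a' becomes the mean (2.0) and the None in 'b' becomes the mode ('x')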
# How much of our data is missing now?
print(fullData.isnull().sum().sort_values(ascending=False))
# Use PolynomialFeatures in sklearn.preprocessing to create two-way interactions for all features
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures

def add_interactions(df):
    # Get feature names: originals first, then pairwise combinations,
    # matching the column order PolynomialFeatures produces
    combos = list(combinations(list(df.columns), 2))
    colnames = list(df.columns) + ['_'.join(x) for x in combos]

    # Find interactions
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    df = poly.fit_transform(df)
    df = pd.DataFrame(df)
    df.columns = colnames

    # Remove interaction terms with all 0 values
    noint_indices = [i for i, x in enumerate(list((df == 0).all())) if x]
    df = df.drop(df.columns[noint_indices], axis=1)

    return df

# Keep the target out of the interaction step; otherwise SalePrice_* columns
# would leak the target straight into the feature matrix
salePrice = fullData['SalePrice'].values
fullData = add_interactions(fullData.drop(['SalePrice'], axis=1))
fullData['SalePrice'] = salePrice
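# Caution: with d input columns this produces d + d*(d-1)/2 features, which
# explodes quickly after dummying; restricting the interaction step to a
# shortlist of columns is a cheaper variant if memory becomes a problem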
# Now check again to see if you still have missing data
# print(fullData.isnull().sum().sort_values(ascending=False).head())
#### Detect and remove outliers
# Removed four data entries
# ...

###############################
# trying out different models #
###############################
ests = [linear_model.LinearRegression(fit_intercept=True), linear_model.Ridge(),
        linear_model.Lasso(), linear_model.ElasticNet(),
        linear_model.BayesianRidge(), linear_model.OrthogonalMatchingPursuit()]
ests_labels = np.array(['Linear', 'Ridge', 'Lasso', 'ElasticNet', 'BayesRidge', 'OMP'])
train = fullData[fullData['Type'] == 0]
test = fullData[fullData['Type'] == 1]
X_train, X_test, y_train, y_test = train_test_split(train.drop(['SalePrice'], axis=1),
                                                    train.SalePrice, test_size=0.2, random_state=1)
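# Hold out 20% of the labeled rows for evaluation; fixing random_state makes
# the model comparison below repeatable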
errvals = np.array([])
r2vals = np.array([])
accvals = np.array([])
for label, e in zip(ests_labels, ests):
    e.fit(X_train, y_train)
    mad = metrics.median_absolute_error(y_test, e.predict(X_test))
    r2 = metrics.r2_score(y_test, e.predict(X_test))
    acc = e.score(X_test, y_test)  # for regressors, .score() is also R^2
    print("%s -- MAD: %.4f, R2: %.4f, score: %.4f" % (label, mad, r2, acc))
    accvals = np.append(accvals, acc)
    errvals = np.append(errvals, mad)
    r2vals = np.append(r2vals, r2)
# print(errvals)

# pos = np.arange(errvals.shape[0])
# srt = np.argsort(errvals)
# plt.figure(figsize=(7,5))
# plt.bar(pos, errvals[srt], align='center')
# plt.xticks(pos, ests_labels[srt])
# plt.xlabel('Estimator')
# plt.ylabel('Median Absolute Error')
# plt.show()
##################
# model ensemble #
##################
tuned_parameters = {'learning_rate': [0.1, 0.01, 0.001],
                    'max_depth': [4, 6, 8],
                    'min_samples_leaf': [3, 5, 8],
                    'max_features': [10, 15, 20],
                    'loss': ['ls']
}
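# 81 candidate settings (3*3*3*3*1); each is refit once per CV fold, and every
# fit grows the full n_est trees, so this search is expensive -- n_jobs=4
# parallelizes it across four workers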
n_est = 1500

gbr = ensemble.GradientBoostingRegressor(n_estimators=n_est)
clf = GridSearchCV(gbr, tuned_parameters, scoring='neg_median_absolute_error', n_jobs=4).fit(X_train, y_train)
print('Best hyperparameters: %r' % clf.best_params_)
# GridSearchCV refits the winning setting on all of X_train by default,
# so the best estimator can be reused directly
gbr = clf.best_estimator_
mad = metrics.median_absolute_error(y_test, gbr.predict(X_test))
r2 = metrics.r2_score(y_test, gbr.predict(X_test))
acc = gbr.score(X_test, y_test)  # R^2 again, straight from the estimator
print("MAD: %.4f" % mad)
print("R2: %.4f" % r2)
print("score: %.4f" % acc)
# plot error for each round of boosting
test_score = np.zeros(n_est, dtype=np.float64)
best = clf.best_estimator_
train_score = best.train_score_
for i, y_pred in enumerate(best.staged_predict(X_test)):
    test_score[i] = best.loss_(y_test, y_pred)
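# staged_predict yields the model's prediction after each boosting round, so
# scoring every stage against the holdout traces how test error evolves as
# trees are added (best.loss_ is the fitted loss object, an older sklearn API)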
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(np.arange(n_est), train_score, 'darkblue', label='Training Set Error')
# ...
plt.ylabel('Least Absolute Deviation')
plt.show()
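# Reading the curves: training error falls monotonically, while the holdout
# curve flattens or turns upward once extra rounds start overfitting; that
# point suggests a cheaper choice of n_estimators for the final fit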
# feature_importance = clf.best_estimator_.feature_importances_
# feature_importance = 100.0 * (feature_importance / feature_importance.max())
# sorted_idx = np.argsort(feature_importance)

# pos = np.arange(sorted_idx.shape[0]) + 2
# pvals = feature_importance[sorted_idx]
# pcols = X_train.columns[sorted_idx]

# plt.figure(figsize=(32,48))
# plt.barh(pos, pvals, align='center')
# plt.yticks(pos, pcols)
# plt.xlabel('Relative Importance')
# plt.title('Variable Importance')
# plt.show()
# Predict on the unlabeled test rows directly; routing them through
# train_test_split would shuffle the rows and misalign the predictions
# with the Ids written below
X_sub = test.drop(['SalePrice'], axis=1)
# ...

with open('output.csv', 'w') as outcsv:
    writer = csv.writer(outcsv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
    writer.writerow(['Id', 'SalePrice'])
    dataPrediction = gbr.predict(X_sub)
    test_ids = test['Id'].astype(int).values  # assumes the raw 'Id' column survived the feature pipeline
    for i in range(len(test_ids)):
        writer.writerow([test_ids[i], dataPrediction[i]])
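# Equivalent pandas sketch (an alternative, not what this script runs):
# pd.DataFrame({'Id': test_ids, 'SalePrice': dataPrediction}).to_csv(
#     'output.csv', index=False)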