# coding: utf-8

# ## PART III: Machine Learning: Supervised - Logistic Regression

# The dataset includes records for 768 women, each with 8 predictor variables and a class label:
# Number of times pregnant, Plasma glucose concentration at 2 hours in an oral glucose tolerance test, Diastolic blood pressure (mm Hg), Triceps skin fold thickness (mm), 2-Hour serum insulin (mu U/ml), Body mass index (weight in kg/(height in m)^2), Diabetes pedigree function, Age (years), and Class (whether the person has diabetes or not).
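# The code below refers to these variables by the short column names commonly
# used with this CSV file (an assumption about the file's header row; verify
# against your copy): preg, plas, pres, skin, test, mass, pedi, age, class.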

# ### Import all needed libraries

# In[1]:


# Import Python libraries: NumPy and Pandas

import pandas as pd
import numpy as np

# Import Python libraries & modules for data visualization
from pandas.plotting import scatter_matrix
from matplotlib import pyplot

# Import scikit-learn module for the algorithm/model: Logistic Regression
from sklearn.linear_model import LogisticRegression

# Import scikit-learn module to split the dataset into train/test sub-datasets
from sklearn.model_selection import train_test_split


# Import scikit-learn modules for K-fold cross-validation - algorithm/model evaluation and validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Import the scikit-learn classification report, used later to show how the model classifies/labels each record

from sklearn.metrics import classification_report

# ### Load the dataset

# In[4]:


# Specify the location of the dataset
filename = 'D:/unt/5340/pima_diabetes.csv'

# Load the data into a Pandas DataFrame
diabetes = pd.read_csv(filename)


# In[5]:


diabetes.head()


# ### Perform exploratory data analysis (EDA) on the dataset

# In[6]:


# Get the dimensions of the dataset
print(diabetes.shape)


# In[7]:


# Get the data types of all the variables of the dataset
print(diabetes.dtypes)


# In[8]:


# Get the summary stats of the numeric variables of the dataset

print(diabetes.describe())


# In[9]:


# Class distribution - number of records in each class

diabetes.groupby('class').size()


# In[10]:


# Plot the histogram for each numeric variable
diabetes.hist(figsize=(12, 8))
pyplot.show()


# In[11]:


# Density plots

diabetes.plot(kind='density', subplots=True, layout=(3, 3), sharex=False, legend=True, fontsize=1, figsize=(12, 16))
pyplot.show()


# In[12]:


# Box plots

diabetes.plot(kind='box', subplots=True, layout=(3, 3), sharex=False, figsize=(12, 8))
pyplot.show()


# In[14]:


# Scatter plot matrix

scatter_matrix(diabetes, alpha=0.8, figsize=(15, 15))
pyplot.show()

# ### Missing or Null Data Points

# In[15]:


# isnull() and isna() are aliases; either one reports the missing-value count per column
print(diabetes.isnull().sum())


# ### Unexpected Outliers

# When analyzing the histograms we can identify outliers in some columns. We will further analyze those outliers and determine what to do about them.
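
# As a quick consolidated check (a sketch, assuming the short column names
# 'plas', 'pres', 'skin', 'test', 'mass'), the zero counts for all suspect
# columns can be computed in one pass:

zero_cols = ['plas', 'pres', 'skin', 'test', 'mass']
print((diabetes[zero_cols] == 0).sum())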

# Blood pressure: the data contains rows where blood pressure is 0. These readings must be wrong, because a living person cannot have a diastolic blood pressure of zero. There are 35 rows where the value is 0.

# In[16]:


print("Total : ", diabetes[diabetes.pres == 0].shape[0])


# In[17]:


print(diabetes[diabetes.pres == 0].groupby('class')['age'].count())


# Plasma glucose levels: even after fasting, the glucose level would not be as low as zero, so zero is an invalid reading. There are 5 rows where the value is 0.

# In[18]:


print("Total : ", diabetes[diabetes.plas == 0].shape[0])


# In[19]:


print(diabetes[diabetes.plas == 0].groupby('class')['age'].count())


# Skin fold thickness: for normal people, skin fold thickness cannot be less than 10 mm, let alone zero. There are 227 rows where the value is 0.

# In[20]:


print("Total : ", diabetes[diabetes.skin == 0].shape[0])


# In[21]:


print(diabetes[diabetes.skin == 0].groupby('class')['age'].count())


# BMI: should not be 0 or close to zero unless the person is severely underweight, which could be life-threatening.

# In[22]:


print("Total : ", diabetes[diabetes.mass == 0].shape[0])


# In[23]:


print(diabetes[diabetes.mass == 0].groupby('class')['age'].count())


# Insulin: in a rare situation a person can have zero insulin, but in this data there are 374 rows where the value is 0.

# In[24]:


print("Total : ", diabetes[diabetes.test == 0].shape[0])


# In[25]:


print(diabetes[diabetes.test == 0].groupby('class')['age'].count())


# Here are several ways to handle invalid data values (an imputation sketch follows this list):
#
# - Ignore/remove these rows: this is often not possible, because it would mean losing valuable information, and here the "skin thickness" and "insulin" columns have a lot of invalid values. But it might work for the "BMI", "glucose" and "blood pressure" columns.
# - Impute average/mean values: this might work for some datasets, but in our case putting a mean value into the blood pressure column would send a wrong signal to the model.
# - Avoid using the features: it is possible to leave features with many invalid values out of the model. This may work for "skin thickness", but it is hard to predict the effect.
#
# By the end of the data cleaning process we have come to the conclusion that this dataset is incomplete. Since this is a demonstration of machine learning, we will proceed with the given data after some minor adjustments.
#
# We will remove the rows in which "pres" (blood pressure), "mass" (BMI) or "plas" (glucose) is zero.
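
# For reference, here is a sketch of the imputation alternative described
# above (not applied in this analysis; it assumes the short column names used
# throughout): zeros are replaced with the median of the non-zero values,
# which distorts the distribution less than the mean would.

diabetes_imputed = diabetes.copy()
for col in ['plas', 'pres', 'skin', 'test', 'mass']:
    median_val = diabetes_imputed.loc[diabetes_imputed[col] != 0, col].median()
    diabetes_imputed.loc[diabetes_imputed[col] == 0, col] = median_val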

# In[26]:


diabetes_mod = diabetes[(diabetes.pres != 0) & (diabetes.mass != 0) & (diabetes.plas != 0)]
print(diabetes_mod.shape)


# ### Feature Engineering

# In[57]:


feature_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']
outcome = ['class']


# ### Separate the dataset into input (X) and output (Y) components
#

# In[48]:


# Slice the DataFrame into input and output components
# (scikit-learn accepts DataFrames directly, so no explicit NumPy conversion is needed)

X = diabetes_mod[feature_names]
Y = diabetes_mod[outcome]


# In[49]:


# The selection of records to include in each sub-dataset must be done randomly

test_size = 0.33

seed = 7

# Split the dataset into train/test sub-datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)


# ### Build and Train the Model
#

# In[62]:


# Build the model
model = LogisticRegression()

# Train the model using the training sub-dataset
model.fit(X_train, Y_train.values.ravel())

# Print the classification report
predicted = model.predict(X_test)

report = classification_report(Y_test.values.ravel(), predicted)

print(report)
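
# As an additional view of the same predictions (a sketch, not part of the
# original notebook), the confusion matrix shows how the errors split between
# false positives and false negatives:

from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y_test.values.ravel(), predicted))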


# ### Score the Accuracy of the Model
#

# In[63]:


# Score the accuracy level
result = model.score(X_test, Y_test.values.ravel())

# Print out the results
print("Accuracy: %.3f%%" % (result * 100.0))


# ### Predict the outcome (having diabetes or not) for two new records
#
# It is assumed that new data has been collected from two persons whose information is not yet included in the existing dataset.
# We make up two new records consisting of the predictors (all the variables except "class") to represent these two new persons, using existing records of the dataset as samples.

# In[60]:


model.predict([[5, 180, 64, 20, 94, 23.3, 0.674, 23], [1, 62, 65, 22, 90, 28.1, 0.167, 25]])


# Thus, per the prediction, person 1 has diabetes and person 2 does not, given these sets of values.
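
# An optional sketch (not part of the original notebook): predict_proba shows
# how confident the model is in each prediction; the columns follow the order
# of model.classes_.

print(model.classes_)
print(model.predict_proba([[5, 180, 64, 20, 94, 23.3, 0.674, 23],
                           [1, 62, 65, 22, 90, 28.1, 0.167, 25]]))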

# ### Evaluate the Algorithm/Model Using 10-Fold Cross-Validation
#

# In[64]:


# Evaluate the algorithm by specifying the number of splits, in this case 10 folds
num_splits = 10

# Fix the random seed - the same seed value must be used so that the same subsets are obtained on each run

seed = 7

# Split the whole dataset into folds
# (shuffle=True is required when a random_state is set in recent scikit-learn versions)
kfold = KFold(n_splits=num_splits, shuffle=True, random_state=seed)

# For logistic regression we can use the accuracy level to evaluate the model
scoring = 'accuracy'

# Train the model and run K-fold cross-validation to validate/evaluate the model

results = cross_val_score(model, X, Y.values.ravel(), cv=kfold, scoring=scoring)

# Print out the evaluation results - the average of all the results obtained from the k-fold cross-validation

print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))


# Using 10-fold cross-validation to evaluate the model/algorithm, the accuracy of this logistic regression model is about 76.7%.
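
# An optional refinement sketch (not part of the original analysis):
# standardizing the features inside a Pipeline is common practice for
# logistic regression and also avoids solver convergence warnings on
# unscaled data. Whether it improves the score here should be verified.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

scaled_model = Pipeline([('scaler', StandardScaler()),
                         ('logreg', LogisticRegression())])
scaled_results = cross_val_score(scaled_model, X, Y.values.ravel(), cv=kfold, scoring=scoring)
print("Scaled accuracy: %.3f (%.3f)" % (scaled_results.mean(), scaled_results.std()))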
#

# Note: values.ravel() was used on Y because of the following warning: "DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel()."
#