###########################################
# Suppress matplotlib user warnings
# Necessary for newer versions of matplotlib
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
#
# Display inline matplotlib plots with IPython
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
###########################################

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np

def pca_results(good_data, pca):
    '''
    Create a DataFrame of the PCA results
    Includes dimension feature weights and explained variance
    Visualizes the PCA results
    '''

    # Dimension indexing
    dimensions = ['Dimension {}'.format(i) for i in range(1, len(pca.components_) + 1)]

    # PCA components
    components = pd.DataFrame(np.round(pca.components_, 4), columns=good_data.keys())
    components.index = dimensions

    # PCA explained variance
    ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
    variance_ratios = pd.DataFrame(np.round(ratios, 4), columns=['Explained Variance'])
    variance_ratios.index = dimensions

    # Create a bar plot visualization
    fig, ax = plt.subplots(figsize=(14, 8))

    # Plot the feature weights as a function of the components
    components.plot(ax=ax, kind='bar')
    ax.set_ylabel("Feature Weights")
    ax.set_xticklabels(dimensions, rotation=0)

    # Display the explained variance ratios
    for i, ev in enumerate(pca.explained_variance_ratio_):
        ax.text(i - 0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f" % (ev))

    # Return a concatenated DataFrame
    return pd.concat([variance_ratios, components], axis=1)
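
# Illustrative usage of pca_results (a sketch, not executed on import).
# Assumes scikit-learn and a cleaned, log-transformed DataFrame `good_data`
# as prepared in the accompanying notebook:
#
#   from sklearn.decomposition import PCA
#   pca = PCA(n_components=6).fit(good_data)
#   pca_df = pca_results(good_data, pca)
#   print(pca_df['Explained Variance'].cumsum())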

def cluster_results(reduced_data, preds, centers, pca_samples):
    '''
    Visualizes the PCA-reduced cluster data in two dimensions
    Adds cues for cluster centers and student-selected sample data
    '''

    predictions = pd.DataFrame(preds, columns=['Cluster'])
    plot_data = pd.concat([predictions, reduced_data], axis=1)

    # Generate the cluster plot
    fig, ax = plt.subplots(figsize=(14, 8))

    # Color map
    cmap = cm.get_cmap('gist_rainbow')

    # Color the points based on assigned cluster
    for i, cluster in plot_data.groupby('Cluster'):
        cluster.plot(ax=ax, kind='scatter', x='Dimension 1', y='Dimension 2',
                     color=cmap(i * 1.0 / (len(centers) - 1)), label='Cluster %i' % (i), s=30)

    # Plot centers with indicators
    for i, c in enumerate(centers):
        ax.scatter(x=c[0], y=c[1], color='white', edgecolors='black',
                   alpha=1, linewidth=2, marker='o', s=200)
        ax.scatter(x=c[0], y=c[1], marker='$%d$' % (i), alpha=1, s=100)

    # Plot transformed sample points
    ax.scatter(x=pca_samples[:, 0], y=pca_samples[:, 1],
               s=150, linewidth=4, color='black', marker='x')

    # Set plot title
    ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number\nTransformed Sample Data Marked by Black Cross")
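
# Illustrative usage of cluster_results (a sketch, not executed on import).
# Assumes scikit-learn, a PCA-reduced DataFrame `reduced_data` with columns
# 'Dimension 1'/'Dimension 2', and PCA-transformed samples `pca_samples`:
#
#   from sklearn.cluster import KMeans
#   clusterer = KMeans(n_clusters=2, random_state=0).fit(reduced_data)
#   preds = clusterer.predict(reduced_data)
#   centers = clusterer.cluster_centers_
#   cluster_results(reduced_data, preds, centers, pca_samples)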

def biplot(good_data, reduced_data, pca):
    '''
    Produce a biplot that shows a scatterplot of the reduced
    data and the projections of the original features.

    good_data: original data, before transformation.
               Needs to be a pandas dataframe with valid column names
    reduced_data: the reduced data (the first two dimensions are plotted)
    pca: pca object that contains the components_ attribute

    return: a matplotlib AxesSubplot object (for any additional customization)

    This procedure is inspired by the script:
    https://github.com/teddyroland/python-biplot
    '''

    fig, ax = plt.subplots(figsize=(14, 8))

    # Scatterplot of the reduced data
    ax.scatter(x=reduced_data.loc[:, 'Dimension 1'], y=reduced_data.loc[:, 'Dimension 2'],
               facecolors='b', edgecolors='b', s=70, alpha=0.5)

    feature_vectors = pca.components_.T

    # We use scaling factors to make the arrows easier to see
    arrow_size, text_pos = 7.0, 8.0

    # Projections of the original features
    for i, v in enumerate(feature_vectors):
        ax.arrow(0, 0, arrow_size * v[0], arrow_size * v[1],
                 head_width=0.2, head_length=0.2, linewidth=2, color='red')
        ax.text(v[0] * text_pos, v[1] * text_pos, good_data.columns[i], color='black',
                ha='center', va='center', fontsize=18)

    ax.set_xlabel("Dimension 1", fontsize=14)
    ax.set_ylabel("Dimension 2", fontsize=14)
    ax.set_title("PC plane with original feature projections.", fontsize=16)
    return ax
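
# Illustrative usage of biplot (a sketch, not executed on import). Assumes
# `good_data` and a two-component PCA fit whose transformed output is put
# back into a DataFrame with the expected column names:
#
#   pca = PCA(n_components=2).fit(good_data)
#   reduced_data = pd.DataFrame(pca.transform(good_data),
#                               columns=['Dimension 1', 'Dimension 2'])
#   ax = biplot(good_data, reduced_data, pca)
#   ax.figure.savefig('biplot.png')  # further customization via the returned axes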

def channel_results(reduced_data, outliers, pca_samples):
    '''
    Visualizes the PCA-reduced cluster data in two dimensions using the full dataset
    Data is labeled by "Channel" and cues added for student-selected sample data
    '''

    # Check that the dataset is loadable
    try:
        full_data = pd.read_csv("customers.csv")
    except Exception:
        print("Dataset could not be loaded. Is the file missing?")
        return False

    # Create the Channel DataFrame
    channel = pd.DataFrame(full_data['Channel'], columns=['Channel'])
    channel = channel.drop(channel.index[outliers]).reset_index(drop=True)
    labeled = pd.concat([reduced_data, channel], axis=1)

    # Generate the cluster plot
    fig, ax = plt.subplots(figsize=(14, 8))

    # Color map
    cmap = cm.get_cmap('gist_rainbow')

    # Color the points based on assigned Channel
    labels = ['Hotel/Restaurant/Cafe', 'Retailer']
    grouped = labeled.groupby('Channel')
    for i, channel in grouped:
        channel.plot(ax=ax, kind='scatter', x='Dimension 1', y='Dimension 2',
                     color=cmap((i - 1) * 1.0 / 2), label=labels[i - 1], s=30)

    # Plot transformed sample points
    for i, sample in enumerate(pca_samples):
        ax.scatter(x=sample[0], y=sample[1],
                   s=200, linewidth=3, color='black', marker='o', facecolors='none')
        ax.scatter(x=sample[0] + 0.25, y=sample[1] + 0.3, marker='$%d$' % (i), alpha=1, s=125)

    # Set plot title
    ax.set_title("PCA-Reduced Data Labeled by 'Channel'\nTransformed Sample Data Circled")
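
# Illustrative usage of channel_results (a sketch, not executed on import).
# Assumes the notebook's `outliers` index list, the PCA-reduced DataFrame,
# the transformed samples, and customers.csv in the working directory:
#
#   channel_results(reduced_data, outliers, pca_samples)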