@@ -63,11 +63,11 @@ def plot_squared_clustering_errors(plt):
63
63
# using clustering to recolor an image
64
64
#
65
65
66
- def recolor_image (input_file , k ):
66
+ def recolor_image (input_file , k = 5 ):
67
67
68
68
img = mpimg .imread (path_to_png_file )
69
69
pixels = [pixel for row in img for pixel in row ]
70
- clusterer = KMeans (5 )
70
+ clusterer = KMeans (k )
71
71
clusterer .train (pixels ) # this might take a while
72
72
73
73
def recolor (pixel ):
@@ -85,91 +85,80 @@ def recolor(pixel):
85
85
# hierarchical clustering
86
86
#
87
87
88
def is_leaf(cluster):
    """report whether cluster is a leaf

    leaf clusters are 1-tuples wrapping a single value, whereas merged
    clusters are 2-tuples, so the length alone tells them apart"""
    size = len(cluster)
    return size == 1
91
+
92
def get_children(cluster):
    """hand back the child clusters stored inside a merged cluster

    a leaf cluster has nothing beneath it, so asking for its children
    raises a TypeError"""
    # guard clause: reject leaves up front
    if is_leaf(cluster):
        raise TypeError("a leaf cluster has no children")

    # a merged cluster keeps its children in the second slot
    children = cluster[1]
    return children
99
+
100
def get_values(cluster):
    """collect the values held by cluster

    a leaf cluster is handed back unchanged (it is already a 1-tuple
    around its value); for a merged cluster the values of every leaf
    below it are gathered, child by child, into a list"""
    if is_leaf(cluster):
        return cluster

    # depth-first walk, appending each child's values in order
    collected = []
    for child in get_children(cluster):
        collected.extend(get_values(child))
    return collected
109
+
88
110
def cluster_distance(cluster1, cluster2, distance_agg=min):
    """finds the aggregate distance between elements of cluster1
    and elements of cluster2

    cluster1, cluster2: the clusters (leaf or merged) to compare
    distance_agg: function that reduces the list of all pairwise
        distances to a single value; min by default

    returns whatever distance_agg returns for that list"""
    # flatten each cluster exactly once; leaving get_values(cluster2)
    # in the inner loop would redo the same recursive walk for every
    # value of cluster1
    values1 = get_values(cluster1)
    values2 = get_values(cluster2)
    return distance_agg([distance(input1, input2)
                         for input1 in values1
                         for input2 in values2])
116
+
117
def get_merge_order(cluster):
    """return the merge order of cluster

    a merged cluster carries its merge order as the first element of
    its 2-tuple; a leaf cluster was never merged and reports infinity
    instead"""
    if not is_leaf(cluster):
        return cluster[0]
    return float('inf')
122
+
123
def bottom_up_cluster(inputs, distance_agg=min):
    """build a cluster hierarchy by repeatedly merging the closest pair

    inputs: the values to cluster; each one starts as its own leaf
        cluster (a 1-tuple)
    distance_agg: passed through to cluster_distance to decide how the
        pairwise distances between two clusters are aggregated

    returns the single cluster left once everything has been merged;
    every merged cluster is a 2-tuple (merge_order, [child1, child2])

    raises IndexError if inputs is empty, as there is then no cluster
    to return"""
    # start with every input a leaf cluster / 1-tuple
    clusters = [(value,) for value in inputs]

    def pair_distance(pair):
        # plain indexing instead of a tuple-unpacking lambda, which
        # is a syntax error on Python 3
        return cluster_distance(pair[0], pair[1], distance_agg)

    # as long as we have more than one cluster left...
    while len(clusters) > 1:
        # find the two closest clusters, looking at each unordered
        # pair exactly once
        c1, c2 = min(((cluster1, cluster2)
                      for i, cluster1 in enumerate(clusters)
                      for cluster2 in clusters[:i]),
                     key=pair_distance)

        # remove exactly those two objects from the list; comparing by
        # identity rather than equality keeps other clusters that merely
        # hold equal values (duplicate inputs) from being dropped too
        clusters = [c for c in clusters if c is not c1 and c is not c2]

        # merge them, using merge_order = # of clusters left
        merged_cluster = (len(clusters), [c1, c2])

        # and add their merge
        clusters.append(merged_cluster)

    # when there's only one cluster left, return it
    return clusters[0]
172
146
147
def generate_clusters(base_cluster, num_clusters):
    """unmerge base_cluster until there are num_clusters clusters

    starting from a list holding just base_cluster, the cluster with
    the lowest merge order (i.e. the one merged last) is repeatedly
    replaced by its children until the list is long enough

    returns the resulting list of clusters; once only leaves remain,
    the next unmerge ends in the TypeError raised by get_children"""
    result = [base_cluster]

    while len(result) < num_clusters:
        # pick the most recently merged cluster we currently hold
        to_split = min(result, key=get_merge_order)
        # keep everything else ...
        remaining = [c for c in result if c != to_split]
        # ... and put its children in its place (i.e., unmerge it)
        remaining.extend(get_children(to_split))
        result = remaining

    return result
173
162
174
163
if __name__ == "__main__" :
175
164
@@ -198,11 +187,16 @@ def get_clusters(self, num_clusters):
198
187
199
188
print "bottom up hierarchical clustering"
200
189
201
- buc = BottomUpClusterer () # or BottomUpClusterer(max) if you like
202
- buc .train (inputs )
203
- print buc .clusters [0 ]
190
+ base_cluster = bottom_up_cluster (inputs )
191
+ print base_cluster
192
+
193
+ print
194
+ print "three clusters, min:"
195
+ for cluster in generate_clusters (base_cluster , 3 ):
196
+ print get_values (cluster )
204
197
205
198
print
206
- print "three clusters:"
207
- for cluster in buc .get_clusters (3 ):
208
- print cluster
199
+ print "three clusters, max:"
200
+ base_cluster = bottom_up_cluster (inputs , max )
201
+ for cluster in generate_clusters (base_cluster , 3 ):
202
+ print get_values (cluster )
0 commit comments