1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Aug 3 13:53:40 2018
4
+
5
+ @author: wzy
6
+ """
7
+ import matplotlib .pyplot as plt
8
+ import numpy as np
9
+
10
"""
Function description: read the numeric data stored in a text file into Python

Parameters:
    fileName - name of the file to read

Returns:
    dataMat - the data as a list of rows, each row a list of floats

Modify:
    2018-08-02
"""
22
def loadDataSet(fileName):
    """Read a tab-delimited text file of numbers into a list of rows.

    Parameters:
        fileName - path of the text file; every non-blank line holds
                   tab-separated numeric fields

    Returns:
        dataMat - list of rows, each row being a list of floats

    Raises:
        ValueError - if a field cannot be parsed by float()
    """
    dataMat = []
    # The context manager closes the handle even if parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            # Skip blank lines (e.g. a trailing empty line) instead of
            # failing on float('').
            if not stripped:
                continue
            # float() ignores surrounding whitespace, so splitting on the
            # bare tab also accepts fields separated by "tab + space".
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
30
+
31
+
32
"""
Function description: compute the Euclidean distance between two data vectors

Parameters:
    vecA - data vector A
    vecB - data vector B

Returns:
    the Euclidean distance between the two vectors

Modify:
    2018-08-02
"""
45
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two data vectors.

    Parameters:
        vecA - first data vector
        vecB - second data vector (must support ``vecA - vecB``)

    Returns:
        Square root of the sum of the squared element-wise differences.
    """
    delta = vecA - vecB
    # np.power is element-wise even for np.matrix operands (unlike ``**``).
    squared_sum = np.sum(np.power(delta, 2))
    return np.sqrt(squared_sum)
47
+
48
+
49
"""
Function description: randomly initialise k centroids (each centroid lies
within the bounds of the data)

Parameters:
    dataSet - the input data set
    k - number of centroids to create

Returns:
    centroids - the k initial centroid vectors

Modify:
    2018-08-02
"""
62
def randCent(dataSet, k):
    """Create k random centroids lying inside the bounding box of the data.

    Parameters:
        dataSet - 2-D data set (np.matrix or ndarray), one sample per row
        k - number of centroids to create

    Returns:
        centroids - (k, n) np.matrix; column j is drawn uniformly from
                    [min, max) of column j of dataSet (it equals min when
                    the column is constant)
    """
    # Dimensionality of the samples.
    n = np.shape(dataSet)[1]
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        minJ = np.min(column)
        rangeJ = float(np.max(column) - minJ)
        # One rand(k, 1) draw per column, in column order, so the result for
        # a given random seed is the same as before.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
79
+
80
+
81
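"""
Function description: k-means clustering algorithm

Parameters:
    dataSet - the data set to cluster
    k - number of centroids
    distMeas - distance function, defaults to the Euclidean distance distEclud()
    createCent - function that creates the k initial centroids, defaults to
                 the random initialisation randCent()

Returns:
    centroids - the k cluster centroids
    clusterAssment - per-sample result: assigned cluster index and squared
                     distance to that cluster's centroid

Modify:
    2018-08-02
"""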
97
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster dataSet into k groups with the standard k-means iteration.

    Parameters:
        dataSet - 2-D np.matrix, one sample per row
        k - number of clusters
        distMeas - callable(vecA, vecB) returning a distance,
                   defaults to distEclud
        createCent - callable(dataSet, k) returning the initial (k, n)
                     centroids, defaults to randCent

    Returns:
        centroids - (k, n) matrix of cluster centroids
        clusterAssment - (m, 2) np.matrix; column 0 is the index of the
                         cluster each sample is assigned to, column 1 the
                         squared distance to that cluster's centroid
    """
    m = np.shape(dataSet)[0]
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from the invalid label -1. With labels initialised to 0, samples
    # whose nearest centroid is number 0 would not count as "changed", so the
    # loop could stop after one pass with distances measured against
    # centroids that were moved afterwards.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    # Repeat until no sample changes its cluster.
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            # Find the centroid closest to sample i.
            minDist = float('inf')
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            # Record the assignment and the squared error of sample i.
            clusterAssment[i, :] = minIndex, minDist ** 2
        for cent in range(k):
            # Rows of dataSet currently assigned to cluster `cent`.
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            # An empty cluster keeps its previous centroid: the mean of zero
            # rows is NaN and would make the centroid unusable.
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment
139
+
140
+
141
"""
Function description: bisecting k-means clustering algorithm

Parameters:
    dataSet - the data set to cluster
    k - number of centroids
    distMeas - distance function, defaults to the Euclidean distance distEclud()

Returns:
    centList - list of the cluster centroids
    clusterAssment - per-sample result: assigned cluster index and squared
                     distance to that cluster's centroid

Modify:
    2018-08-03
"""
156
def biKmeans(dataSet, k, distMeas=distEclud):
    """Cluster dataSet with bisecting k-means.

    Starts with one cluster holding every sample and repeatedly splits, with
    2-means, the cluster whose split gives the lowest total squared error,
    until k clusters exist.

    Parameters:
        dataSet - 2-D np.matrix, one sample per row
        k - number of clusters wanted
        distMeas - callable(vecA, vecB) returning a distance,
                   defaults to distEclud

    Returns:
        centList - list of 1 x n centroid matrices, one per cluster (fewer
                   than k entries only if no further split was possible)
        clusterAssment - (m, 2) np.matrix; column 0 is the cluster index of
                         each sample, column 1 the squared distance to that
                         cluster's centroid
    """
    m = np.shape(dataSet)[0]
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Keep the initial centroid as a 1 x n matrix so every entry of centList
    # has the same type as the centroids added by later splits.
    centroid0 = np.asmatrix(np.mean(dataSet, axis=0))
    centList = [centroid0]
    # Squared error of every sample while all of them form one cluster.
    for j in range(m):
        clusterAssment[j, 1] = distMeas(centroid0, dataSet[j, :]) ** 2
    while len(centList) < k:
        lowerSSE = float('inf')
        # Reset per round so a round without any valid split cannot reuse the
        # previous round's choice (or hit an unbound name on the first round).
        bestCentToSplit = None
        bestNewCents = None
        bestClustAss = None
        for i in range(len(centList)):
            # Samples currently assigned to cluster i.
            ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == i)[0], :]
            # A cluster without samples cannot be split.
            if ptsInCurrCluster.shape[0] == 0:
                continue
            # Try splitting cluster i in two.
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            # Error of the two halves plus the error of all untouched clusters.
            sseSplit = np.sum(splitClustAss[:, 1])
            sseNotSplit = np.sum(clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print('sseSplit = %f, and notSplit = %f' % (sseSplit, sseNotSplit))
            if (sseSplit + sseNotSplit) < lowerSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowerSSE = sseSplit + sseNotSplit
        if bestCentToSplit is None:
            # Nothing could be split; stop instead of looping or crashing.
            break
        # Relabel the two halves: label 1 becomes a brand-new cluster index,
        # label 0 keeps the index of the cluster that was split. The order
        # matters: label 1 is rewritten first so that it cannot collide with
        # the rewrite of label 0.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is %d' % bestCentToSplit)
        print('the len of bestClustAss is %d' % len(bestClustAss))
        # Replace the split centroid and append the new one.
        centList[bestCentToSplit] = bestNewCents[0, :]
        centList.append(bestNewCents[1, :])
        # Write the new labels and errors back for the samples of the split cluster.
        clusterAssment[np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return centList, clusterAssment
213
+
214
+
215
"""
Function description: cluster the data set and plot the result

Parameters:
    filename - name of the data file
    k - number of centroids

Returns:
    None

Modify:
    2018-08-01
"""
228
def plotDataSet(filename, k):
    """Load a data file, cluster it with biKmeans and plot the result.

    Parameters:
        filename - path of the data file, passed to loadDataSet
        k - number of clusters passed to biKmeans

    Returns:
        None (the figure is shown with plt.show())
    """
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    datMat = np.asmatrix(loadDataSet(filename))
    centList, clusterAssment = biKmeans(datMat, k)
    # Cluster index of every sample as a flat integer array.
    labels = np.asarray(clusterAssment[:, 0]).ravel().astype(int)
    points = np.asarray(datMat)
    # The first three styles are the ones used originally for clusters 0-2;
    # the rest extend the plot to larger k. The two tuples have different
    # lengths so the colour/marker pairs repeat only after 30 clusters.
    colors = ('b', 'r', 'c', 'g', 'm', 'y')
    markers = ('*', 'D', '>', 'o', 's')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Plot the samples of every cluster (not just the first three).
    for cluster in range(len(centList)):
        members = points[labels == cluster]
        ax.scatter(members[:, 0], members[:, 1], s=20,
                   c=colors[cluster % len(colors)],
                   marker=markers[cluster % len(markers)], alpha=.5)
    # Plot the centroids; ravel() copes with both 1 x n matrices and flat lists.
    for cent in centList:
        coords = np.asarray(cent).ravel()
        ax.scatter(coords[0], coords[1], s=100, c='k', marker='+', alpha=.5)
    plt.title('DataSet')
    plt.xlabel('X')
    plt.show()
263
+
264
+
265
if __name__ == '__main__':
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    datMat = np.asmatrix(loadDataSet('testSet2.txt'))
    # NOTE: plotDataSet() loads and clusters the file again on its own, so
    # this result is only available for direct inspection and is not plotted.
    centList, myNewAssments = biKmeans(datMat, 3)
    plotDataSet('testSet2.txt', 3)
269
+
0 commit comments