
Commit 4c53d27

Add files via upload
1 parent 46c5d27 commit 4c53d27

File tree

3 files changed: +329 -0 lines changed


K_Means_Project2/Figure_1.png

13.2 KB

K_Means_Project2/K_Means.py

Lines changed: 269 additions & 0 deletions
@@ -0,0 +1,269 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 3 13:53:40 2018

@author: wzy
"""
import matplotlib.pyplot as plt
import numpy as np

"""
Function description: read the data from a text file into Python

Parameters:
    fileName - name of the file

Returns:
    dataMat - data matrix

Modify:
    2018-08-02
"""
def loadDataSet(fileName):
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        # each line holds one tab-separated sample
        curLine = line.strip().split('\t')
        # convert every field to float
        fltLine = list(map(float, curLine))
        dataMat.append(fltLine)
    return dataMat


"""
Function description: compute the Euclidean distance between two data vectors

Parameters:
    vecA - data vector A
    vecB - data vector B

Returns:
    the Euclidean distance between the two vectors

Modify:
    2018-08-02
"""
def distEclud(vecA, vecB):
    return np.sqrt(np.sum(np.power(vecA - vecB, 2)))


"""
Function description: randomly initialize k centroids (each centroid lies within the bounds of the data)

Parameters:
    dataSet - input data set
    k - number of centroids to pick

Returns:
    centroids - the k initialized centroid vectors

Modify:
    2018-08-02
"""
def randCent(dataSet, k):
    # number of features per sample
    n = np.shape(dataSet)[1]
    # initialize a (k, n) matrix of zeros
    centroids = np.mat(np.zeros((k, n)))
    # loop over every feature dimension
    for j in range(n):
        # minimum and maximum of this column
        minJ = np.min(dataSet[:, j])
        maxJ = np.max(dataSet[:, j])
        # range of this column (max - min)
        rangeJ = float(maxJ - minJ)
        # the j-th coordinate of each centroid is a random value inside (min, max);
        # np.random.rand(k, 1) draws a (k, 1) array of samples from a uniform
        # distribution over [0, 1)
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    # return the k initialized centroid vectors
    return centroids


"""
Function description: k-means clustering algorithm

Parameters:
    dataSet - data set to cluster
    k - number of centroids to pick
    distMeas - distance measure, default is the Euclidean distance distEclud()
    createCent - method used to pick the k initial centroids, default is randCent()

Returns:
    centroids - the k cluster centroids
    clusterAssment - cluster assignments and squared errors

Modify:
    2018-08-02
"""
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    # number of samples in the data set
    m = np.shape(dataSet)[0]
    # initialize an (m, 2) matrix of zeros: column 0 holds the cluster index,
    # column 1 the squared error of the sample
    clusterAssment = np.mat(np.zeros((m, 2)))
    # create the k initial centroid vectors
    centroids = createCent(dataSet, k)
    # flag indicating whether any cluster assignment changed
    clusterChanged = True
    # keep running the assignment/update steps until no assignment changes
    while clusterChanged:
        # reset the flag at the start of each pass
        clusterChanged = False
        # loop over every sample vector
        for i in range(m):
            # initialize the minimum distance to +inf and its cluster index to -1
            minDist = float('inf')
            minIndex = -1
            # loop over the k centroids
            for j in range(k):
                # Euclidean distance from the sample to this centroid
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                # keep the closest centroid seen so far
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            # if the assignment of sample i changed, keep iterating
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            # update the assignment and squared error of the sample
            clusterAssment[i, :] = minIndex, minDist**2
        # print the current k-means centroids
        # print(centroids)
        # loop over every centroid
        for cent in range(k):
            # filter out the samples currently assigned to this centroid
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            # their column-wise mean (axis=0) becomes the new centroid vector
            centroids[cent, :] = np.mean(ptsInClust, axis=0)
    # return the k centroids together with the assignments and errors
    return centroids, clusterAssment


"""
Function description: bisecting k-means clustering algorithm

Parameters:
    dataSet - data set to cluster
    k - number of centroids to pick
    distMeas - distance measure, default is the Euclidean distance distEclud()

Returns:
    centList - the k cluster centroids
    clusterAssment - cluster assignments and squared errors

Modify:
    2018-08-03
"""
def biKmeans(dataSet, k, distMeas=distEclud):
    # number of samples in the data set
    m = np.shape(dataSet)[0]
    # initialize an (m, 2) matrix of zeros
    clusterAssment = np.mat(np.zeros((m, 2)))
    # mean of every column of the data set, as a list
    centroid0 = np.mean(dataSet, axis=0).tolist()[0]
    # start with a single cluster that contains the whole data set
    centList = [centroid0]
    # squared distance of every sample to that single centroid
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.mat(centroid0), dataSet[j, :])**2
    # keep splitting until there are k clusters
    while (len(centList) < k):
        # reset the lowest total squared error to +inf
        lowerSSE = float('inf')
        # try splitting every existing cluster
        for i in range(len(centList)):
            # filter out the samples that belong to cluster i
            ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == i)[0], :]
            # split this cluster in two with k-means and get the result and its error
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            # sum of squared errors of the two clusters produced by the split
            sseSplit = np.sum(splitClustAss[:, 1])
            # sum of squared errors of the samples that do not belong to cluster i
            sseNotSplit = np.sum(clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            # print both error terms
            print('sseSplit = %f, and notSplit = %f' % (sseSplit, sseNotSplit))
            # if splitting cluster i gives a lower total error than the best so far
            if (sseSplit + sseNotSplit) < lowerSSE:
                # cluster i becomes the cluster to split in this round
                bestCentToSplit = i
                # the two centroid vectors produced by the split
                bestNewCents = centroidMat
                # copy of the assignments and errors of the split samples
                bestClustAss = splitClustAss.copy()
                # the total error after this split becomes the new lowest error
                lowerSSE = sseSplit + sseNotSplit
        # samples labelled 1 by the 2-means split are given a brand-new cluster
        # index equal to the current number of clusters
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        # samples labelled 0 keep the index of the cluster that was split, so the
        # cluster indices stay consecutive with no gaps
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        # print which cluster was split in this round
        print('the bestCentToSplit is %d' % bestCentToSplit)
        # print how many samples were in the cluster that was split
        print('the len of bestClustAss is %d' % len(bestClustAss))
        # replace the centroid of the split cluster with the first new centroid
        centList[bestCentToSplit] = bestNewCents[0, :]
        # append the second new centroid as a new cluster
        centList.append(bestNewCents[1, :])
        # update the assignments and squared errors of the samples that took part
        # in the 2-means split
        clusterAssment[np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    # return the clustering result
    return centList, clusterAssment


"""
Function description: plot the data set

Parameters:
    filename - name of the file
    k - number of centroids to pick

Returns:
    None

Modify:
    2018-08-01
"""
def plotDataSet(filename, k):
    # load the data
    datMat = np.mat(loadDataSet(filename))
    # run bisecting k-means with the given k
    centList, clusterAssment = biKmeans(datMat, k)
    clusterAssment = clusterAssment.tolist()
    # one coordinate list per cluster (the plot handles up to three clusters)
    xcord = [[], [], []]
    ycord = [[], [], []]
    datMat = datMat.tolist()
    m = len(clusterAssment)
    # sort the samples into the coordinate lists by cluster index
    for i in range(m):
        if int(clusterAssment[i][0]) == 0:
            xcord[0].append(datMat[i][0])
            ycord[0].append(datMat[i][1])
        elif int(clusterAssment[i][0]) == 1:
            xcord[1].append(datMat[i][0])
            ycord[1].append(datMat[i][1])
        elif int(clusterAssment[i][0]) == 2:
            xcord[2].append(datMat[i][0])
            ycord[2].append(datMat[i][1])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # plot the samples of each cluster
    ax.scatter(xcord[0], ycord[0], s=20, c='b', marker='*', alpha=.5)
    ax.scatter(xcord[1], ycord[1], s=20, c='r', marker='D', alpha=.5)
    ax.scatter(xcord[2], ycord[2], s=20, c='c', marker='>', alpha=.5)
    # plot the centroids
    for i in range(k):
        ax.scatter(centList[i].tolist()[0][0], centList[i].tolist()[0][1], s=100, c='k', marker='+', alpha=.5)
    # ax.scatter(centList[0].tolist()[0][0], centList[0].tolist()[0][1], s=100, c='k', marker='+', alpha=.5)
    # ax.scatter(centList[1].tolist()[0][0], centList[1].tolist()[0][1], s=100, c='k', marker='+', alpha=.5)
    # ax.scatter(centList[2].tolist()[0][0], centList[2].tolist()[0][1], s=100, c='k', marker='+', alpha=.5)
    plt.title('DataSet')
    plt.xlabel('X')
    plt.show()


if __name__ == '__main__':
    datMat = np.mat(loadDataSet('testSet2.txt'))
    centList, myNewAssments = biKmeans(datMat, 3)
    plotDataSet('testSet2.txt', 3)
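
A minimal sketch (not part of the commit) of how the module above might be exercised outside of plotDataSet, comparing the total squared error for a few values of k; it assumes K_Means.py and testSet2.txt sit in the working directory, and the loop variable k and the name total_sse are purely illustrative:

import numpy as np
from K_Means import loadDataSet, biKmeans  # assumes this commit's K_Means.py is importable

datMat = np.mat(loadDataSet('testSet2.txt'))
# the second column of clusterAssment stores each sample's squared error,
# so the total SSE is simply the column sum
for k in range(2, 6):
    centList, clusterAssment = biKmeans(datMat, k)
    total_sse = float(np.sum(clusterAssment[:, 1]))
    print('k = %d, total SSE = %.4f' % (k, total_sse))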

K_Means_Project2/testSet2.txt

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
3.275154 2.957587
-3.344465 2.603513
0.355083 -3.376585
1.852435 3.547351
-2.078973 2.552013
-0.993756 -0.884433
2.682252 4.007573
-3.087776 2.878713
-1.565978 -1.256985
2.441611 0.444826
-0.659487 3.111284
-0.459601 -2.618005
2.177680 2.387793
-2.920969 2.917485
-0.028814 -4.168078
3.625746 2.119041
-3.912363 1.325108
-0.551694 -2.814223
2.855808 3.483301
-3.594448 2.856651
0.421993 -2.372646
1.650821 3.407572
-2.082902 3.384412
-0.718809 -2.492514
4.513623 3.841029
-4.822011 4.607049
-0.656297 -1.449872
1.919901 4.439368
-3.287749 3.918836
-1.576936 -2.977622
3.598143 1.975970
-3.977329 4.900932
-1.791080 -2.184517
3.914654 3.559303
-1.910108 4.166946
-1.226597 -3.317889
1.148946 3.345138
-2.113864 3.548172
0.845762 -3.589788
2.629062 3.535831
-1.640717 2.990517
-1.881012 -2.485405
4.606999 3.510312
-4.366462 4.023316
0.765015 -3.001270
3.121904 2.173988
-4.025139 4.652310
-0.559558 -3.840539
4.376754 4.863579
-1.874308 4.032237
-0.089337 -3.026809
3.997787 2.518662
-3.082978 2.884822
0.845235 -3.454465
1.327224 3.358778
-2.889949 3.596178
-0.966018 -2.839827
2.960769 3.079555
-3.275518 1.577068
0.639276 -3.412840
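
For reference, testSet2.txt holds 60 two-dimensional samples, one per line, with the two coordinates separated by whitespace (loadDataSet splits on a tab character). A quick, illustrative sanity check, assuming the same working directory as above:

from K_Means import loadDataSet

data = loadDataSet('testSet2.txt')
# expect 60 rows of 2 floats each
print(len(data), len(data[0]))  # 60 2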
