Skip to content

Commit 9bff172

Browse files
Create KMeans.py
1 parent 3b7b31c commit 9bff172

File tree

1 file changed

+122
-0
lines changed

1 file changed

+122
-0
lines changed

Chapter_10 KMeans/KMeans.py

Lines changed: 122 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,122 @@
1+
# coding:UTF-8
2+
'''
3+
Date:20160923
4+
@author: zhaozhiyong
5+
'''
6+
import numpy as np
7+
8+
def load_data(file_path):
    '''Load a tab-separated numeric data file.

    input:  file_path(string): location of the data file; every non-blank
                               line holds the tab-separated feature values
                               of one sample
    output: data(mat): one row per sample, every value parsed as a float
    '''
    data = []
    # The context manager closes the file even when a value fails to parse.
    with open(file_path) as f:
        for line in f:
            line = line.strip()
            if not line:  # tolerate blank lines, e.g. a trailing newline
                continue
            # convert the textual features of this row to floats
            data.append([float(x) for x in line.split("\t")])
    # np.asmatrix is what np.mat aliased; np.mat itself is gone in NumPy 2.x.
    return np.asmatrix(data)
23+
24+
def distance(vecA, vecB):
    '''Squared Euclidean distance between two row vectors.

    input:  vecA(mat): coordinates of point A (1 x n matrix)
            vecB(mat): coordinates of point B (1 x n matrix)
    output: (float): squared distance between A and B
    '''
    delta = vecA - vecB
    # For np.matrix operands `*` is a matrix product, so a (1 x n) row
    # times its (n x 1) transpose yields a 1 x 1 matrix.
    squared = delta * delta.T
    return squared[0, 0]
32+
33+
def randCent(data, k):
    '''Randomly initialise the cluster centres.

    Every coordinate of every centre is drawn uniformly between the
    minimum and the maximum of the corresponding column of `data`.

    input:  data(mat): training data, one sample per row
            k(int): number of clusters
    output: centroids(mat): k x n matrix of initial cluster centres
    '''
    n = np.shape(data)[1]  # number of features
    # np.asmatrix instead of np.mat: the latter was removed in NumPy 2.0.
    centroids = np.asmatrix(np.zeros((k, n)))
    # `range` instead of `xrange` so the function runs on Python 2 and 3.
    for j in range(n):
        minJ = np.min(data[:, j])
        rangeJ = np.max(data[:, j]) - minJ
        # One (k, 1) draw per column, in column order, so that results
        # for a given np.random seed stay reproducible.
        centroids[:, j] = minJ + np.random.rand(k, 1) * rangeJ
    return centroids
48+
49+
def kmeans(data, k, centroids):
    '''Run the K-Means algorithm until the assignments stop changing.

    input:  data(mat): training data, one sample per row
            k(int): number of clusters
            centroids(mat): initial cluster centres (k x n); UPDATED IN
                            PLACE, so after the call it holds the trained
                            cluster centres
    output: subCenter(mat): m x 2 matrix; column 0 is the index of the
                            cluster each sample belongs to, column 1 the
                            squared Euclidean distance from the sample to
                            that cluster's centre
    '''
    m = np.shape(data)[0]  # number of samples
    samples = np.asarray(data, dtype=float)
    subCenter = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so the first pass always registers a
    # change; otherwise samples whose nearest centre is cluster 0 look
    # "unchanged" and the loop can stop before it has converged.
    subCenter[:, 0] = -1
    change = True  # whether another pass is required
    while change:
        change = False
        # Snapshot of the centres used for this assignment pass.
        centers = np.array(centroids, dtype=float)
        for i in range(m):
            minDist = np.inf  # smallest squared distance seen so far
            minIndex = 0      # cluster that produced it
            for j in range(k):
                diff = samples[i] - centers[j]
                dist = np.dot(diff, diff)  # squared Euclidean distance
                if dist < minDist:
                    minDist = dist
                    minIndex = j
            if subCenter[i, 0] != minIndex:
                change = True
            # Always store the distance so that it refers to the current
            # centre even when the label itself did not change.
            subCenter[i, 0] = minIndex
            subCenter[i, 1] = minDist
        # Recompute every centre as the mean of the samples assigned to it.
        labels = np.asarray(subCenter[:, 0]).ravel()
        for j in range(k):
            members = samples[labels == j]
            if members.shape[0] == 0:
                # NumPy division by zero yields NaN instead of raising, so
                # an empty cluster must be detected explicitly; its centre
                # is left where it was.
                print(" r is zero")
                continue
            centroids[j, :] = members.mean(axis=0)
    return subCenter
89+
90+
def save_result(file_name, source):
    '''Write a matrix to a text file, one row per line.

    The values of a row are converted with str() and separated by tabs.

    input:  file_name(string): name of the file to write (overwritten)
            source(mat): two-dimensional data to save
    output: none
    '''
    m, n = np.shape(source)
    # The context manager closes (and flushes) the file even on error.
    with open(file_name, "w") as f:
        for i in range(m):
            f.write("\t".join(str(source[i, j]) for j in range(n)) + "\n")
104+
105+
if __name__ == "__main__":
    # Script entry point: cluster the samples in data.txt into k groups and
    # write the assignments to "sub" and the cluster centres to "center".
    # print(...) with a single string argument behaves identically on
    # Python 2 and Python 3, unlike the print statement.
    k = 4  # number of cluster centres
    file_path = "data.txt"
    # 1. load the data
    print("---------- 1.load data ------------")
    data = load_data(file_path)
    # 2. randomly initialise the k cluster centres
    print("---------- 2.random center ------------")
    centroids = randCent(data, k)
    # 3. run the clustering (kmeans updates `centroids` in place)
    print("---------- 3.kmeans ------------")
    subCenter = kmeans(data, k, centroids)
    # 4. save the cluster assignment of every sample
    print("---------- 4.save subCenter ------------")
    save_result("sub", subCenter)
    # 5. save the cluster centres
    print("---------- 5.save centroids ------------")
    save_result("center", centroids)

0 commit comments

Comments
 (0)