Skip to content

Commit bea6716

Browse files
authored
Add files via upload
1 parent 1aaee50 commit bea6716

File tree

1 file changed

+305
-0
lines changed

1 file changed

+305
-0
lines changed

charpter17_kmeans/kmeans.ipynb

Lines changed: 305 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,305 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"### kmeans"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 1,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import numpy as np\n",
17+
"from sklearn.cluster import KMeans"
18+
]
19+
},
20+
{
21+
"cell_type": "code",
22+
"execution_count": 2,
23+
"metadata": {},
24+
"outputs": [
25+
{
26+
"data": {
27+
"text/plain": [
28+
"array([[0, 2],\n",
29+
" [0, 0],\n",
30+
" [1, 0],\n",
31+
" [5, 0],\n",
32+
" [5, 2]])"
33+
]
34+
},
35+
"execution_count": 2,
36+
"metadata": {},
37+
"output_type": "execute_result"
38+
}
39+
],
40+
"source": [
41+
"X = np.array([[0,2],[0,0],[1,0],[5,0],[5,2]])\n",
42+
"X"
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": 14,
48+
"metadata": {},
49+
"outputs": [
50+
{
51+
"name": "stdout",
52+
"output_type": "stream",
53+
"text": [
54+
"[1 1 1 0 0]\n"
55+
]
56+
}
57+
],
58+
"source": [
59+
"from sklearn.cluster import KMeans\n",
# Fit scikit-learn's K-means on the toy data as a reference for the NumPy
# implementation further down. n_init is pinned explicitly because its default
# has changed in newer scikit-learn releases; fixing it (together with
# random_state) keeps this cell's result stable across library versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
print(kmeans.labels_)
62+
]
63+
},
64+
{
65+
"cell_type": "code",
66+
"execution_count": 4,
67+
"metadata": {},
68+
"outputs": [
69+
{
70+
"data": {
71+
"text/plain": [
72+
"array([1, 0])"
73+
]
74+
},
75+
"execution_count": 4,
76+
"metadata": {},
77+
"output_type": "execute_result"
78+
}
79+
],
80+
"source": [
81+
"kmeans.predict([[0, 0], [12, 3]])"
82+
]
83+
},
84+
{
85+
"cell_type": "code",
86+
"execution_count": 5,
87+
"metadata": {},
88+
"outputs": [
89+
{
90+
"data": {
91+
"text/plain": [
92+
"array([[5. , 1. ],\n",
93+
" [0.33333333, 0.66666667]])"
94+
]
95+
},
96+
"execution_count": 5,
97+
"metadata": {},
98+
"output_type": "execute_result"
99+
}
100+
],
101+
"source": [
102+
"kmeans.cluster_centers_"
103+
]
104+
},
105+
{
106+
"cell_type": "code",
107+
"execution_count": 6,
108+
"metadata": {},
109+
"outputs": [
110+
{
111+
"name": "stdout",
112+
"output_type": "stream",
113+
"text": [
114+
"5.0\n"
115+
]
116+
}
117+
],
118+
"source": [
119+
"import numpy as np\n",
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Both inputs are converted to float arrays before subtracting, so unsigned
    integer inputs cannot wrap around, and their shapes are checked so that
    extra coordinates in one argument are never silently ignored.

    Args:
        x1: Sequence or NumPy array of numeric coordinates.
        x2: Sequence or NumPy array of numeric coordinates, same shape as x1.

    Returns:
        The square root of the sum of squared coordinate differences, summed
        along the first axis (a NumPy float for 1-D inputs).

    Raises:
        ValueError: If x1 and x2 do not have the same shape.
    """
    a = np.asarray(x1, dtype=float)
    b = np.asarray(x2, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            "x1 and x2 must have the same shape, got %s and %s" % (a.shape, b.shape)
        )
    # Sum over the first axis only, matching the element-by-element
    # accumulation of the original loop.
    return np.sqrt(np.sum((a - b) ** 2, axis=0))
127+
"\n",
128+
"print(euclidean_distance(X[0], X[4]))"
129+
]
130+
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": 7,
134+
"metadata": {},
135+
"outputs": [],
136+
"source": [
def centroids_init(k, X):
    """Choose k initial centroids by sampling rows of X at random.

    Rows are drawn without replacement so that two centroids never start on
    the same sample; duplicate starting centroids would leave one cluster
    empty after the first assignment step. Sampling falls back to "with
    replacement" only when k exceeds the number of samples, which keeps such
    calls working as before.

    Args:
        k: Number of centroids to create.
        X: 2-D array of shape (m, n) holding m samples with n features.

    Returns:
        Float array of shape (k, n) whose rows are copies of sampled rows of X.
    """
    m, n = X.shape
    # One draw for all k indices; uniqueness is guaranteed whenever k <= m.
    indices = np.random.choice(m, size=k, replace=k > m)
    centroids = np.zeros((k, n))
    centroids[:] = X[indices]
    return centroids
146+
]
147+
},
148+
{
149+
"cell_type": "code",
150+
"execution_count": 8,
151+
"metadata": {},
152+
"outputs": [],
153+
"source": [
def closest_centroid(sample, centroids):
    """Return the index of the centroid nearest to `sample`.

    Distances are measured with euclidean_distance. When several centroids are
    equally close the first one wins, and index 0 is returned if `centroids`
    is empty.

    Args:
        sample: Coordinates of a single sample.
        centroids: Iterable of centroid coordinates.

    Returns:
        Integer position of the nearest centroid within `centroids`.
    """
    best_index, best_distance = 0, float('inf')
    for index, candidate in enumerate(centroids):
        current = euclidean_distance(sample, candidate)
        # Only a strictly smaller distance replaces the current best.
        if current < best_distance:
            best_index, best_distance = index, current
    return best_index
165+
]
166+
},
167+
{
168+
"cell_type": "code",
169+
"execution_count": 9,
170+
"metadata": {},
171+
"outputs": [],
172+
"source": [
def build_clusters(centroids, k, X):
    """Assign every sample to its nearest centroid.

    Args:
        centroids: Current centroid coordinates, one entry per cluster.
        k: Number of clusters.
        X: Iterable of samples.

    Returns:
        A list of k lists; entry i holds the indices (positions in X) of the
        samples whose nearest centroid is centroid i.
    """
    clusters = []
    for _ in range(k):
        clusters.append([])
    for sample_index, sample in enumerate(X):
        nearest = closest_centroid(sample, centroids)
        clusters[nearest].append(sample_index)
    return clusters
181+
]
182+
},
183+
{
184+
"cell_type": "code",
185+
"execution_count": 10,
186+
"metadata": {},
187+
"outputs": [],
188+
"source": [
def calculate_centroids(clusters, k, X):
    """Compute new centroids as the mean of each cluster's member samples.

    An empty cluster has no mean (np.mean over zero rows yields NaN and emits
    a RuntimeWarning), so its centroid is instead re-seeded on a randomly
    chosen row of X. If X has no rows at all, that centroid is left at zero.

    Args:
        clusters: List of index lists, one per cluster, indexing rows of X.
        k: Number of clusters.
        X: 2-D array of samples (rows) by features (columns).

    Returns:
        Float array of shape (k, n_features) with one centroid per row.
    """
    n = X.shape[1]
    centroids = np.zeros((k, n))
    for i, cluster in enumerate(clusters):
        if len(cluster) == 0:
            # Re-seed instead of averaging nothing, which would produce NaN.
            if X.shape[0] > 0:
                centroids[i] = X[np.random.randint(X.shape[0])]
            continue
        centroids[i] = np.mean(X[cluster], axis=0)
    return centroids
198+
]
199+
},
200+
{
201+
"cell_type": "code",
202+
"execution_count": 11,
203+
"metadata": {},
204+
"outputs": [],
205+
"source": [
def get_cluster_labels(clusters, X):
    """Turn per-cluster index lists into one label per sample.

    Args:
        clusters: List of index lists; entry i holds the sample indices that
            belong to cluster i.
        X: Array of samples; only its first dimension (sample count) is used.

    Returns:
        1-D float array of length X.shape[0] where position j holds the index
        of the cluster containing sample j. Samples that appear in no cluster
        keep the initial value 0.
    """
    labels = np.zeros(X.shape[0])
    for label, members in enumerate(clusters):
        # Write the label to all members of this cluster in one indexed store.
        labels[list(members)] = label
    return labels
213+
]
214+
},
215+
{
216+
"cell_type": "code",
217+
"execution_count": 12,
218+
"metadata": {},
219+
"outputs": [],
220+
"source": [
def kmeans(X, k, max_iterations):
    """Cluster the rows of X into k groups with the K-means procedure.

    The steps defined above are combined: initialise centroids, then
    repeatedly assign samples to the nearest centroid and recompute the
    centroids, stopping early once the centroids no longer change.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        k: Number of clusters.
        max_iterations: Upper bound on the number of assign/update rounds.

    Returns:
        The per-sample cluster labels produced by get_cluster_labels.
    """
    # 1. Initialise the centroids.
    centroids = centroids_init(k, X)
    clusters = None
    for _ in range(max_iterations):
        # 2. Assign every sample to its nearest current centroid.
        clusters = build_clusters(centroids, k, X)
        # Remember the centroids used for this assignment.
        prev_centroids = centroids
        # 3. Recompute the centroids from the new assignment.
        centroids = calculate_centroids(clusters, k, X)
        # 4. Converged once the centroids did not move at all.
        if np.array_equal(centroids, prev_centroids):
            break
    if clusters is None:
        # The loop never ran (max_iterations <= 0); label samples by the
        # initial centroids instead of failing on an unbound variable.
        clusters = build_clusters(centroids, k, X)
    # Return the final cluster label of every sample.
    return get_cluster_labels(clusters, X)
239+
]
240+
},
241+
{
242+
"cell_type": "code",
243+
"execution_count": 13,
244+
"metadata": {},
245+
"outputs": [
246+
{
247+
"name": "stdout",
248+
"output_type": "stream",
249+
"text": [
250+
"[0. 0. 0. 1. 1.]\n"
251+
]
252+
}
253+
],
254+
"source": [
# Toy dataset: five two-dimensional points.
X = np.array([[0, 2], [0, 0], [1, 0], [5, 0], [5, 2]])
# Cluster into 2 groups, allowing at most 10 iterations.
labels = kmeans(X, k=2, max_iterations=10)
# Print the cluster label assigned to each sample.
print(labels)
261+
]
262+
},
263+
{
264+
"cell_type": "code",
265+
"execution_count": null,
266+
"metadata": {},
267+
"outputs": [],
268+
"source": []
269+
}
270+
],
271+
"metadata": {
272+
"kernelspec": {
273+
"display_name": "Python 3",
274+
"language": "python",
275+
"name": "python3"
276+
},
277+
"language_info": {
278+
"codemirror_mode": {
279+
"name": "ipython",
280+
"version": 3
281+
},
282+
"file_extension": ".py",
283+
"mimetype": "text/x-python",
284+
"name": "python",
285+
"nbconvert_exporter": "python",
286+
"pygments_lexer": "ipython3",
287+
"version": "3.7.3"
288+
},
289+
"toc": {
290+
"base_numbering": 1,
291+
"nav_menu": {},
292+
"number_sections": true,
293+
"sideBar": true,
294+
"skip_h1_title": false,
295+
"title_cell": "Table of Contents",
296+
"title_sidebar": "Contents",
297+
"toc_cell": false,
298+
"toc_position": {},
299+
"toc_section_display": true,
300+
"toc_window_display": false
301+
}
302+
},
303+
"nbformat": 4,
304+
"nbformat_minor": 4
305+
}

0 commit comments

Comments
 (0)