Add files via upload

luwill · web-flow · commit bea67162bbc8 · 2021-06-11T21:35:10.000+08:00
diff --git a/charpter17_kmeans/kmeans.ipynb b/charpter17_kmeans/kmeans.ipynb
@@ -0,0 +1,305 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### kmeans"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from sklearn.cluster import KMeans"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[0, 2],\n",
+       "       [0, 0],\n",
+       "       [1, 0],\n",
+       "       [5, 0],\n",
+       "       [5, 2]])"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X = np.array([[0,2],[0,0],[1,0],[5,0],[5,2]])\n",
+    "X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[1 1 1 0 0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.cluster import KMeans\n",
+    "kmeans = KMeans(n_clusters=2, random_state=0).fit(X)\n",
+    "print(kmeans.labels_)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([1, 0])"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "kmeans.predict([[0, 0], [12, 3]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[5.        , 1.        ],\n",
+       "       [0.33333333, 0.66666667]])"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "kmeans.cluster_centers_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "5.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "# 定义欧式距离\n",
+    "def euclidean_distance(x1, x2):\n",
+    "    distance = 0\n",
+    "    # 距离的平方项再开根号\n",
+    "    for i in range(len(x1)):\n",
+    "        distance += pow((x1[i] - x2[i]), 2)\n",
+    "    return np.sqrt(distance)\n",
+    "\n",
+    "print(euclidean_distance(X[0], X[4]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 定义中心初始化函数\n",
+    "def centroids_init(k, X):\n",
+    "    m, n = X.shape\n",
+    "    centroids = np.zeros((k, n))\n",
+    "    for i in range(k):\n",
+    "        # 每一次循环随机选择一个类别中心\n",
+    "        centroid = X[np.random.choice(range(m))]\n",
+    "        centroids[i] = centroid\n",
+    "    return centroids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 定义样本的最近质心点所属的类别索引\n",
+    "def closest_centroid(sample, centroids):\n",
+    "    closest_i = 0\n",
+    "    closest_dist = float('inf')\n",
+    "    for i, centroid in enumerate(centroids):\n",
+    "        # 根据欧式距离判断，选择最小距离的中心点所属类别\n",
+    "        distance = euclidean_distance(sample, centroid)\n",
+    "        if distance < closest_dist:\n",
+    "            closest_i = i\n",
+    "            closest_dist = distance\n",
+    "    return closest_i"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 定义构建类别过程\n",
+    "def build_clusters(centroids, k, X):\n",
+    "    clusters = [[] for _ in range(k)]\n",
+    "    for x_i, x in enumerate(X):\n",
+    "        # 将样本划分到最近的类别区域\n",
+    "        centroid_i = closest_centroid(x, centroids)\n",
+    "        clusters[centroid_i].append(x_i)\n",
+    "    return clusters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 根据上一步聚类结果计算新的中心点\n",
+    "def calculate_centroids(clusters, k, X):\n",
+    "    n = X.shape[1]\n",
+    "    centroids = np.zeros((k, n))\n",
+    "    # 以当前每个类样本的均值为新的中心点\n",
+    "    for i, cluster in enumerate(clusters):\n",
+    "        centroid = np.mean(X[cluster], axis=0)\n",
+    "        centroids[i] = centroid\n",
+    "    return centroids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 获取每个样本所属的聚类类别\n",
+    "def get_cluster_labels(clusters, X):\n",
+    "    y_pred = np.zeros(X.shape[0])\n",
+    "    for cluster_i, cluster in enumerate(clusters):\n",
+    "        for X_i in cluster:\n",
+    "            y_pred[X_i] = cluster_i\n",
+    "    return y_pred"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 根据上述各流程定义kmeans算法流程\n",
+    "def kmeans(X, k, max_iterations):\n",
+    "    # 1.初始化中心点\n",
+    "    centroids = centroids_init(k, X)\n",
+    "    # 遍历迭代求解\n",
+    "    for _ in range(max_iterations):\n",
+    "        # 2.根据当前中心点进行聚类\n",
+    "        clusters = build_clusters(centroids, k, X)\n",
+    "        # 保存当前中心点\n",
+    "        prev_centroids = centroids\n",
+    "        # 3.根据聚类结果计算新的中心点\n",
+    "        centroids = calculate_centroids(clusters, k, X)\n",
+    "        # 4.设定收敛条件为中心点是否发生变化\n",
+    "        diff = centroids - prev_centroids\n",
+    "        if not diff.any():\n",
+    "            break\n",
+    "    # 返回最终的聚类标签\n",
+    "    return get_cluster_labels(clusters, X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[0. 0. 0. 1. 1.]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 测试数据\n",
+    "X = np.array([[0,2],[0,0],[1,0],[5,0],[5,2]])\n",
+    "# 设定聚类类别为2个，最大迭代次数为10次\n",
+    "labels = kmeans(X, 2, 10)\n",
+    "# 打印每个样本所属的类别标签\n",
+    "print(labels)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}