Add files via upload

louwill · web-flow · commit c77e3805251b · 2019-05-12T11:51:53.000+08:00
diff --git a/naive bayes/naive_bayes.ipynb b/naive bayes/naive_bayes.ipynb
@@ -0,0 +1,239 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>x1</th>\n",
+       "      <th>x2</th>\n",
+       "      <th>y</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>S</td>\n",
+       "      <td>-1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>M</td>\n",
+       "      <td>-1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>M</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>S</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "      <td>S</td>\n",
+       "      <td>-1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   x1 x2  y\n",
+       "0   1  S -1\n",
+       "1   1  M -1\n",
+       "2   1  M  1\n",
+       "3   1  S  1\n",
+       "4   1  S -1"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x1 = [1,1,1,1,1,2,2,2,2,2,3,3,3,3,3]\n",
+    "x2 = ['S','M','M','S','S','S','M','M','L','L','L','M','M','L','L']\n",
+    "y = [-1,-1,1,1,-1,-1,-1,1,1,1,1,1,1,1,-1]\n",
+    "\n",
+    "df = pd.DataFrame({'x1':x1, 'x2':x2, 'y':y})\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = df[['x1', 'x2']]\n",
+    "y = df[['y']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nb_fit(X, y):\n",
+    "    # Returns (class labels, class priors P(y), dict of (column, value, label) -> P(x=value | y=label)).\n",
+    "    labels = y[y.columns[0]]\n",
+    "    classes, class_count = labels.unique(), labels.value_counts()\n",
+    "    class_prior = class_count/len(y)\n",
+    "    prior = dict()\n",
+    "    for col in X.columns:\n",
+    "        for j in classes:\n",
+    "            counts = X[(y==j).values][col].value_counts()\n",
+    "            for value in counts.index:\n",
+    "                prior[(col, value, j)] = counts[value]/class_count[j]\n",
+    "    return classes, class_prior, prior"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(array([-1,  1], dtype=int64),  1    0.6\n",
+       " -1    0.4\n",
+       " Name: y, dtype: float64, {('x1', 1, -1): 0.5,\n",
+       "  ('x1', 2, -1): 0.3333333333333333,\n",
+       "  ('x1', 3, -1): 0.16666666666666666,\n",
+       "  ('x1', 3, 1): 0.4444444444444444,\n",
+       "  ('x1', 2, 1): 0.3333333333333333,\n",
+       "  ('x1', 1, 1): 0.2222222222222222,\n",
+       "  ('x2', 'S', -1): 0.5,\n",
+       "  ('x2', 'M', -1): 0.3333333333333333,\n",
+       "  ('x2', 'L', -1): 0.16666666666666666,\n",
+       "  ('x2', 'L', 1): 0.4444444444444444,\n",
+       "  ('x2', 'M', 1): 0.4444444444444444,\n",
+       "  ('x2', 'S', 1): 0.1111111111111111})"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nb_fit(X, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_test = {'x1': 2, 'x2': 'S'}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "classes, class_prior, prior = nb_fit(X, y)\n",
+    "\n",
+    "def predict(X_test):\n",
+    "    # X_test: dict of column -> value; returns the label maximising P(y) * prod P(x | y).\n",
+    "    res = []\n",
+    "    for c in classes:\n",
+    "        p_x_y = 1\n",
+    "        for col, value in X_test.items():\n",
+    "            p_x_y *= prior.get((col, value, c), 0)  # never-seen (value, class) pair -> probability 0, not KeyError\n",
+    "        res.append(class_prior[c]*p_x_y)\n",
+    "    return classes[np.argmax(res)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "测试数据预测类别为： -1\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('测试数据预测类别为：', predict(X_test))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/naive bayes/naive_bayes.py b/naive bayes/naive_bayes.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+'''
+@author: louwill
+@contact: ygnjd2016@gmail.com
+@file: naive_bayes.py
+@time: 2019/5/12 9:41
+'''
+
+import numpy as np
+import pandas as pd
+
+
+class Naive_Bayes:
+    """Naive Bayes classifier for categorical features held in a DataFrame.
+
+    Probabilities are plain relative frequencies; no smoothing is applied.
+    """
+
+    def __init__(self):
+        # Filled in by nb_fit(); None means the model is not fitted yet.
+        self.classes = None
+        self.class_prior = None
+        self.prior = None
+
+    def nb_fit(self, X, y):
+        """Estimate class priors and per-feature conditional probabilities.
+
+        Args:
+            X: DataFrame of categorical feature columns.
+            y: DataFrame whose first column holds the class labels,
+                row-aligned with X.
+
+        Returns:
+            Tuple (classes, class_prior, prior): the distinct labels, a
+            Series of P(y=c) indexed by label, and a dict mapping
+            (column, value, label) to P(column=value | y=label). The same
+            three objects are also stored on the instance.
+        """
+        labels = y[y.columns[0]]
+        classes = labels.unique()
+        class_count = labels.value_counts()
+        # Class prior probabilities.
+        class_prior = class_count / len(y)
+
+        # Class-conditional probabilities.
+        prior = dict()
+        for col in X.columns:
+            for j in classes:
+                # Value frequencies of this feature among rows of class j.
+                p_x_y = X[(labels == j).values][col].value_counts()
+                for i in p_x_y.index:
+                    prior[(col, i, j)] = p_x_y[i] / class_count[j]
+
+        # Keep the fitted parameters on the instance so predict() does not
+        # depend on module-level variables.
+        self.classes = classes
+        self.class_prior = class_prior
+        self.prior = prior
+        return classes, class_prior, prior
+
+    def predict(self, X_test):
+        """Return the most probable class label for a single sample.
+
+        Args:
+            X_test: dict mapping feature column name to the observed value.
+
+        Returns:
+            The element of the fitted classes with the largest
+            P(y=c) * prod P(x_i | y=c).
+
+        Raises:
+            RuntimeError: if nb_fit() has not been called on this instance.
+        """
+        if self.prior is None:
+            raise RuntimeError('nb_fit() must be called before predict()')
+        res = []
+        for c in self.classes:
+            p_x_y = 1
+            for col, value in X_test.items():
+                # A (value, class) pair absent from the training data has
+                # relative frequency 0; .get avoids a KeyError for it.
+                p_x_y *= self.prior.get((col, value, c), 0)
+            res.append(self.class_prior[c] * p_x_y)
+        return self.classes[np.argmax(res)]
+
+
+if __name__ == "__main__":
+    x1 = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]
+    x2 = ['S', 'M', 'M', 'S', 'S', 'S', 'M', 'M', 'L', 'L', 'L', 'M', 'M', 'L', 'L']
+    y = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
+    df = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})
+    X = df[['x1', 'x2']]
+    y = df[['y']]
+    X_test = {'x1': 2, 'x2': 'S'}
+
+    nb = Naive_Bayes()
+    # The fitted parameters are kept on nb, so the return value is not needed.
+    nb.nb_fit(X, y)
+    print('测试数据预测类别为：', nb.predict(X_test))
+