HLJ1997
diff --git a/‎charpter7_decision_tree/CART.ipynb‎
Lines changed: 367 additions & 0 deletions b/‎charpter7_decision_tree/CART.ipynb‎
Lines changed: 367 additions & 0 deletions
@@ -0,0 +1,367 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### CART"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score, mean_squared_error\n",
+    "from utils import feature_split, calculate_gini"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### 定义树结点\n",
+    "class TreeNode():\n",
+    "    def __init__(self, feature_i=None, threshold=None,\n",
+    "                 leaf_value=None, left_branch=None, right_branch=None):\n",
+    "        # 特征索引\n",
+    "        self.feature_i = feature_i          \n",
+    "        # 特征划分阈值\n",
+    "        self.threshold = threshold \n",
+    "        # 叶子节点取值\n",
+    "        self.leaf_value = leaf_value   \n",
+    "        # 左子树\n",
+    "        self.left_branch = left_branch     \n",
+    "        # 右子树\n",
+    "        self.right_branch = right_branch    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### 定义二叉决策树\n",
+    "class BinaryDecisionTree(object):\n",
+    "    ### 决策树初始参数\n",
+    "    def __init__(self, min_samples_split=2, min_gini_impurity=999,\n",
+    "                 max_depth=float(\"inf\"), loss=None):\n",
+    "        # 根结点\n",
+    "        self.root = None  \n",
+    "        # 节点最小分裂样本数\n",
+    "        self.min_samples_split = min_samples_split\n",
+    "        # 节点初始化基尼不纯度\n",
+    "        self.mini_gini_impurity = min_gini_impurity\n",
+    "        # 树最大深度\n",
+    "        self.max_depth = max_depth\n",
+    "        # 基尼不纯度计算函数\n",
+    "        self.gini_impurity_calculation = None\n",
+    "        # 叶子节点值预测函数\n",
+    "        self._leaf_value_calculation = None\n",
+    "        # 损失函数\n",
+    "        self.loss = loss\n",
+    "\n",
+    "    ### 决策树拟合函数\n",
+    "    def fit(self, X, y, loss=None):\n",
+    "        # 递归构建决策树\n",
+    "        self.root = self._build_tree(X, y)\n",
+    "        self.loss=None\n",
+    "\n",
+    "    ### 决策树构建函数\n",
+    "    def _build_tree(self, X, y, current_depth=0):\n",
+    "        # 初始化最小基尼不纯度\n",
+    "        init_gini_impurity = 999\n",
+    "        # 初始化最佳特征索引和阈值\n",
+    "        best_criteria = None    \n",
+    "        # 初始化数据子集\n",
+    "        best_sets = None        \n",
+    "\n",
+    "        # 合并输入和标签\n",
+    "        Xy = np.concatenate((X, y), axis=1)\n",
+    "        # 获取样本数和特征数\n",
+    "        n_samples, n_features = X.shape\n",
+    "        # 设定决策树构建条件\n",
+    "        # 训练样本数量大于节点最小分裂样本数且当前树深度小于最大深度\n",
+    "        if n_samples >= self.min_samples_split and current_depth <= self.max_depth:\n",
+    "            # 遍历计算每个特征的基尼不纯度\n",
+    "            for feature_i in range(n_features):\n",
+    "                # 获取第i特征的所有取值\n",
+    "                feature_values = np.expand_dims(X[:, feature_i], axis=1)\n",
+    "                # 获取第i个特征的唯一取值\n",
+    "                unique_values = np.unique(feature_values)\n",
+    "\n",
+    "                # 遍历取值并寻找最佳特征分裂阈值\n",
+    "                for threshold in unique_values:\n",
+    "                    # 特征节点二叉分裂\n",
+    "                    Xy1, Xy2 = feature_split(Xy, feature_i, threshold)\n",
+    "                    # 如果分裂后的子集大小都不为0\n",
+    "                    if len(Xy1) > 0 and len(Xy2) > 0:\n",
+    "                        # 获取两个子集的标签值\n",
+    "                        y1 = Xy1[:, n_features:]\n",
+    "                        y2 = Xy2[:, n_features:]\n",
+    "\n",
+    "                        # 计算基尼不纯度\n",
+    "                        impurity = self.impurity_calculation(y, y1, y2)\n",
+    "\n",
+    "                        # 获取最小基尼不纯度\n",
+    "                        # 最佳特征索引和分裂阈值\n",
+    "                        if impurity < init_gini_impurity:\n",
+    "                            init_gini_impurity = impurity\n",
+    "                            best_criteria = {\"feature_i\": feature_i, \"threshold\": threshold}\n",
+    "                            best_sets = {\n",
+    "                                \"leftX\": Xy1[:, :n_features],   \n",
+    "                                \"lefty\": Xy1[:, n_features:],   \n",
+    "                                \"rightX\": Xy2[:, :n_features],  \n",
+    "                                \"righty\": Xy2[:, n_features:]   \n",
+    "                                }\n",
+    "        \n",
+    "        # 如果计算的最小不纯度小于设定的最小不纯度\n",
+    "        if init_gini_impurity < self.mini_gini_impurity:\n",
+    "            # 分别构建左右子树\n",
+    "            left_branch = self._build_tree(best_sets[\"leftX\"], best_sets[\"lefty\"], current_depth + 1)\n",
+    "            right_branch = self._build_tree(best_sets[\"rightX\"], best_sets[\"righty\"], current_depth + 1)\n",
+    "            return TreeNode(feature_i=best_criteria[\"feature_i\"], threshold=best_criteria[\n",
+    "                                \"threshold\"], left_branch=left_branch, right_branch=right_branch)\n",
+    "\n",
+    "        # 计算叶子计算取值\n",
+    "        leaf_value = self._leaf_value_calculation(y)\n",
+    "\n",
+    "        return TreeNode(leaf_value=leaf_value)\n",
+    "\n",
+    "    ### 定义二叉树值预测函数\n",
+    "    def predict_value(self, x, tree=None):\n",
+    "        if tree is None:\n",
+    "            tree = self.root\n",
+    "\n",
+    "        # 如果叶子节点已有值，则直接返回已有值\n",
+    "        if tree.leaf_value is not None:\n",
+    "            return tree.leaf_value\n",
+    "\n",
+    "        # 选择特征并获取特征值\n",
+    "        feature_value = x[tree.feature_i]\n",
+    "\n",
+    "        # 判断落入左子树还是右子树\n",
+    "        branch = tree.right_branch\n",
+    "        if isinstance(feature_value, int) or isinstance(feature_value, float):\n",
+    "            if feature_value >= tree.threshold:\n",
+    "                branch = tree.left_branch\n",
+    "        elif feature_value == tree.threshold:\n",
+    "            branch = tree.right_branch\n",
+    "\n",
+    "        # 测试子集\n",
+    "        return self.predict_value(x, branch)\n",
+    "\n",
+    "    ### 数据集预测函数\n",
+    "    def predict(self, X):\n",
+    "        y_pred = [self.predict_value(sample) for sample in X]\n",
+    "        return y_pred"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### CART回归树\n",
+    "class RegressionTree(BinaryDecisionTree):\n",
+    "    def _calculate_variance_reduction(self, y, y1, y2):\n",
+    "        var_tot = np.var(y, axis=0)\n",
+    "        var_y1 = np.var(y1, axis=0)\n",
+    "        var_y2 = np.var(y2, axis=0)\n",
+    "        frac_1 = len(y1) / len(y)\n",
+    "        frac_2 = len(y2) / len(y)\n",
+    "        # 计算方差减少量\n",
+    "        variance_reduction = var_tot - (frac_1 * var_y1 + frac_2 * var_y2)\n",
+    "        \n",
+    "        return sum(variance_reduction)\n",
+    "\n",
+    "    # 节点值取平均\n",
+    "    def _mean_of_y(self, y):\n",
+    "        value = np.mean(y, axis=0)\n",
+    "        return value if len(value) > 1 else value[0]\n",
+    "\n",
+    "    def fit(self, X, y):\n",
+    "        self.impurity_calculation = self._calculate_variance_reduction\n",
+    "        self._leaf_value_calculation = self._mean_of_y\n",
+    "        super(RegressionTree, self).fit(X, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### CART决策树\n",
+    "class ClassificationTree(BinaryDecisionTree):\n",
+    "    ### 定义基尼不纯度计算过程\n",
+    "    def _calculate_gini_impurity(self, y, y1, y2):\n",
+    "        p = len(y1) / len(y)\n",
+    "        gini = calculate_gini(y)\n",
+    "        gini_impurity = p * calculate_gini(y1) + (1-p) * calculate_gini(y2)\n",
+    "        return gini_impurity\n",
+    "    \n",
+    "    ### 多数投票\n",
+    "    def _majority_vote(self, y):\n",
+    "        most_common = None\n",
+    "        max_count = 0\n",
+    "        for label in np.unique(y):\n",
+    "            # 统计多数\n",
+    "            count = len(y[y == label])\n",
+    "            if count > max_count:\n",
+    "                most_common = label\n",
+    "                max_count = count\n",
+    "        return most_common\n",
+    "    \n",
+    "    # 分类树拟合\n",
+    "    def fit(self, X, y):\n",
+    "        self.impurity_calculation = self._calculate_gini_impurity\n",
+    "        self._leaf_value_calculation = self._majority_vote\n",
+    "        super(ClassificationTree, self).fit(X, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.9777777777777777\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn import datasets\n",
+    "data = datasets.load_iris()\n",
+    "X, y = data.data, data.target\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
+    "clf = ClassificationTree()\n",
+    "clf.fit(X_train, y_train)\n",
+    "y_pred = clf.predict(X_test)\n",
+    "\n",
+    "print(accuracy_score(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "clf = DecisionTreeClassifier()\n",
+    "clf.fit(X_train, y_train)\n",
+    "y_pred = clf.predict(X_test)\n",
+    "\n",
+    "print(accuracy_score(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Mean Squared Error: 134.4803289473684\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.datasets import load_boston\n",
+    "X, y = load_boston(return_X_y=True)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
+    "model = RegressionTree()\n",
+    "model.fit(X_train, y_train)\n",
+    "y_pred = model.predict(X_test)\n",
+    "mse = mean_squared_error(y_test, y_pred)\n",
+    "\n",
+    "print(\"Mean Squared Error:\", mse)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Mean Squared Error: 28.75368421052632\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.tree import DecisionTreeRegressor\n",
+    "reg = DecisionTreeRegressor()\n",
+    "reg.fit(X_train, y_train)\n",
+    "y_pred = reg.predict(X_test)\n",
+    "mse = mean_squared_error(y_test, y_pred)\n",
+    "\n",
+    "print(\"Mean Squared Error:\", mse)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}