|
786 | 786 | "pseudocode('Q Learning Agent')" |
787 | 787 | ] |
788 | 788 | }, |
| 789 | + { |
| 790 | + "cell_type": "markdown", |
| 791 | + "metadata": {}, |
| 792 | + "source": [ |
| 793 | + "Let's test the Q-learning agent on the 4\\*3 cell world discussed above." |
| 794 | + ] |
| 795 | + }, |
| 796 | + { |
| 797 | + "cell_type": "code", |
| 798 | + "execution_count": 11, |
| 799 | + "metadata": {}, |
| 800 | + "outputs": [ |
| 801 | + { |
| 802 | + "name": "stdout", |
| 803 | + "output_type": "stream", |
| 804 | + "text": [ |
| 805 | + "Cell \t:\tExpected Utility\n", |
| 806 | + "----------------------------------\n", |
| 807 | + "[1,1] \t:\t0.7447970297243381\n", |
| 808 | + "[1,2] \t:\t0.7824342221178695\n", |
| 809 | + "[1,3] \t:\t0.8183443976230272\n", |
| 810 | + "[2,1] \t:\t0.6603248055209805\n", |
| 811 | + "[2,3] \t:\t0.881618254644549\n", |
| 812 | + "[3,1] \t:\t0.5780849497361085\n", |
| 813 | + "[3,2] \t:\t0.41519898633959246\n", |
| 814 | + "[3,3] \t:\t0.9503633060769898\n", |
| 815 | + "[4,1] \t:\t-0.048891852584282955\n", |
| 816 | + "[4,2] \t:\t-1.0\n", |
| 817 | + "[4,3] \t:\t1.0\n" |
| 818 | + ] |
| 819 | + }, |
| 820 | + { |
| 821 | + "data": { |
| 822 | + "text/plain": [ |
| 823 | + "null" |
| 824 | + ] |
| 825 | + }, |
| 826 | + "execution_count": 11, |
| 827 | + "metadata": {}, |
| 828 | + "output_type": "execute_result" |
| 829 | + } |
| 830 | + ], |
| 831 | + "source": [ |
| 832 | + "import aima.core.environment.cellworld.*;\n", |
| 833 | + "import aima.core.learning.reinforcement.agent.QLearningAgent;\n", |
| 834 | + "import aima.core.learning.reinforcement.example.CellWorldEnvironment;\n", |
| 835 | + "import aima.core.probability.example.MDPFactory;\n", |
| 836 | + "import aima.core.util.JavaRandomizer;\n", |
| 837 | + "\n", |
| 838 | + "import java.util.*;;\n", |
| 839 | + "\n", |
| 840 | + "CellWorld<Double> cw = CellWorldFactory.createCellWorldForFig17_1();;\n", |
| 841 | + "CellWorldEnvironment cwe = new CellWorldEnvironment(\n", |
| 842 | + " cw.getCellAt(1, 1),\n", |
| 843 | + " cw.getCells(),\n", |
| 844 | + " MDPFactory.createTransitionProbabilityFunctionForFigure17_1(cw),\n", |
| 845 | + " new JavaRandomizer());\n", |
| 846 | + "QLearningAgent<Cell<Double>, CellWorldAction> qla = new QLearningAgent<Cell<Double>, CellWorldAction>(MDPFactory.createActionsFunctionForFigure17_1(cw), CellWorldAction.None, 0.2, 1.0, 5, 2.0);\n", |
| 847 | + "cwe.addAgent(qla);\n", |
| 848 | + "qla.reset();\n", |
| 849 | + "cwe.executeTrials(100000);\n", |
| 850 | + "System.out.println(\"Cell\" + \" \\t:\\t\" + \"Expected Utility\");\n", |
| 851 | + "System.out.println(\"----------------------------------\");\n", |
| 852 | + "Map<Cell<Double>, Double> U = qla.getUtility();\n", |
| 853 | + "for(int i = 1; i<=4; i++){\n", |
| 854 | + " for(int j = 1; j<=3; j++){\n", |
| 855 | + " if(i==2 && j==2) continue; //Ignore wall\n", |
| 856 | + " System.out.println(\"[\" + i + \",\" + j + \"]\" + \" \\t:\\t\" + U.get(cw.getCellAt(i,j)));\n", |
| 857 | + " }\n", |
| 858 | + "}" |
| 859 | + ] |
| 860 | + }, |
| 861 | + { |
| 862 | + "cell_type": "markdown", |
| 863 | + "metadata": {}, |
| 864 | + "source": [ |
| 865 | + "The learning curves of the Q-Learning agent for the $4∗3$ cell world are shown below." |
| 866 | + ] |
| 867 | + }, |
| 868 | + { |
| 869 | + "cell_type": "code", |
| 870 | + "execution_count": 10, |
| 871 | + "metadata": {}, |
| 872 | + "outputs": [], |
| 873 | + "source": [ |
| 874 | + "import aima.core.environment.cellworld.*;\n", |
| 875 | + "import aima.core.learning.reinforcement.agent.QLearningAgent;\n", |
| 876 | + "import aima.core.learning.reinforcement.example.CellWorldEnvironment;\n", |
| 877 | + "import aima.core.probability.example.MDPFactory;\n", |
| 878 | + "import aima.core.util.JavaRandomizer;\n", |
| 879 | + "\n", |
| 880 | + "import java.util.*;\n", |
| 881 | + "\n", |
| 882 | + "int numRuns = 20;\n", |
| 883 | + "int numTrialsPerRun = 10000;\n", |
| 884 | + "int rmseTrialsToReport = 500;\n", |
| 885 | + "int reportEveryN = 20;\n", |
| 886 | + "\n", |
| 887 | + "CellWorld<Double> cw = CellWorldFactory.createCellWorldForFig17_1();;\n", |
| 888 | + "CellWorldEnvironment cwe = new CellWorldEnvironment(\n", |
| 889 | + " cw.getCellAt(1, 1),\n", |
| 890 | + " cw.getCells(),\n", |
| 891 | + " MDPFactory.createTransitionProbabilityFunctionForFigure17_1(cw),\n", |
| 892 | + " new JavaRandomizer());\n", |
| 893 | + "QLearningAgent<Cell<Double>, CellWorldAction> qla = new QLearningAgent<Cell<Double>, CellWorldAction>(MDPFactory.createActionsFunctionForFigure17_1(cw), CellWorldAction.None, 0.2, 1.0, 5, 2.0);\n", |
| 894 | + "cwe.addAgent(qla);\n", |
| 895 | + "Map<Integer, List<Map<Cell<Double>, Double>>> runs = new HashMap<Integer, List<Map<Cell<Double>, Double>>>();\n", |
| 896 | + "for (int r = 0; r < numRuns; r++) {\n", |
| 897 | + " qla.reset();\n", |
| 898 | + " List<Map<Cell<Double>, Double>> trials = new ArrayList<Map<Cell<Double>, Double>>();\n", |
| 899 | + " for (int t = 0; t < numTrialsPerRun; t++) {\n", |
| 900 | + " cwe.executeTrial();\n", |
| 901 | + " if (0 == t % reportEveryN) {\n", |
| 902 | + " Map<Cell<Double>, Double> u = qla.getUtility();\n", |
| 903 | + " trials.add(u);\n", |
| 904 | + " }\n", |
| 905 | + " }\n", |
| 906 | + " runs.put(r, trials);\n", |
| 907 | + "}\n", |
| 908 | + "\n", |
| 909 | + "def T = [];\n", |
| 910 | + "def v4_3 = [];\n", |
| 911 | + "def v3_3 = [];\n", |
| 912 | + "def v1_3 = [];\n", |
| 913 | + "def v1_1 = [];\n", |
| 914 | + "def v3_2 = [];\n", |
| 915 | + "def v2_1 = [];\n", |
| 916 | + "double tmp = 0.0;\n", |
| 917 | + "for (int t = 0; t < (numTrialsPerRun/reportEveryN); t++) {\n", |
| 918 | + " T.add(t);\n", |
| 919 | + " Map<Cell<Double>, Double> u = runs.get(numRuns - 1).get(t);\n", |
| 920 | + " tmp = (u.containsKey(cw.getCellAt(4, 3)) ? u.get(cw.getCellAt(4, 3)) : 0.0);\n", |
| 921 | + " v4_3.add(tmp);\n", |
| 922 | + " tmp = (u.containsKey(cw.getCellAt(3, 3)) ? u.get(cw.getCellAt(3, 3)) : 0.0);\n", |
| 923 | + " v3_3.add(tmp);\n", |
| 924 | + " tmp = (u.containsKey(cw.getCellAt(1, 3)) ? u.get(cw.getCellAt(1, 3)) : 0.0);\n", |
| 925 | + " v1_3.add(tmp);\n", |
| 926 | + " tmp = (u.containsKey(cw.getCellAt(1, 1)) ? u.get(cw.getCellAt(1, 1)) : 0.0);\n", |
| 927 | + " v1_1.add(tmp);\n", |
| 928 | + " tmp = (u.containsKey(cw.getCellAt(3, 2)) ? u.get(cw.getCellAt(3, 2)) : 0.0);\n", |
| 929 | + " v3_2.add(tmp);\n", |
| 930 | + " tmp = (u.containsKey(cw.getCellAt(2, 1)) ? u.get(cw.getCellAt(2, 1)) : 0.0);\n", |
| 931 | + " v2_1.add(tmp);\n", |
| 932 | + "}\n", |
| 933 | + "\n", |
| 934 | + "def p1 = new Plot(title: \"Learning Curve\", yLabel: \"Utility estimates\", xLabel: \"Number of trails\");\n", |
| 935 | + "p1 << new Line(x: T, y: v4_3, displayName: \"v4_3\")\n", |
| 936 | + "p1 << new Line(x: T, y: v3_3, displayName: \"v3_3\")\n", |
| 937 | + "p1 << new Line(x: T, y: v1_3, displayName: \"v1_3\")\n", |
| 938 | + "p1 << new Line(x: T, y: v1_1, displayName: \"v1_1\")\n", |
| 939 | + "p1 << new Line(x: T, y: v3_2, displayName: \"v3_2\")\n", |
| 940 | + "p1 << new Line(x: T, y: v2_1, displayName: \"v2_1\")\n", |
| 941 | + "\n", |
| 942 | + "def trails = [];\n", |
| 943 | + "def rmseValues = [];\n", |
| 944 | + "for (int t = 0; t < rmseTrialsToReport; t++) {\n", |
| 945 | + " trails.add(t);\n", |
| 946 | + " double xSsquared = 0;\n", |
| 947 | + " for (int r = 0; r < numRuns; r++) {\n", |
| 948 | + " Map<Cell<Double>, Double> u = runs.get(r).get(t);\n", |
| 949 | + " Double val1_1 = u.get(cw.getCellAt(1, 1));\n", |
| 950 | + " xSsquared += Math.pow(0.705 - val1_1, 2);\n", |
| 951 | + " }\n", |
| 952 | + " double rmse = Math.sqrt(xSsquared/runs.size());\n", |
| 953 | + " rmseValues.add(rmse);\n", |
| 954 | + "}\n", |
| 955 | + "def p2 = new Plot(yLabel: \"RMS error in utility\", xLabel: \"Number of trails\");\n", |
| 956 | + "p2 << new Line(x: trails, y: rmseValues)\n", |
| 957 | + "OutputCell.HIDDEN" |
| 958 | + ] |
| 959 | + }, |
| 960 | + { |
| 961 | + "cell_type": "markdown", |
| 962 | + "metadata": {}, |
| 963 | + "source": [ |
| 964 | + "[![Utility estimates][1]][1]\n", |
| 965 | + "\n", |
| 966 | + "[1]: assets/reinforcement_learning/q_utility_estimates.png" |
| 967 | + ] |
| 968 | + }, |
| 969 | + { |
| 970 | + "cell_type": "markdown", |
| 971 | + "metadata": {}, |
| 972 | + "source": [ |
| 973 | + "[![RMS error in utility][1]][1]\n", |
| 974 | + "\n", |
| 975 | + "[1]: assets/reinforcement_learning/q_RMSerror.png" |
| 976 | + ] |
| 977 | + }, |
789 | 978 | { |
790 | 979 | "cell_type": "code", |
791 | 980 | "execution_count": null, |
|