
Commit e5372d6

tiny update
1 parent ff1c81c · commit e5372d6

3 files changed: 6 additions, 2 deletions


rl/iterative_policy_evaluation.py
2 additions & 0 deletions

@@ -67,6 +67,7 @@ def print_policy(P, g):
 
     if biggest_change < SMALL_ENOUGH:
       break
+  print "values for uniformly random actions:"
   print_values(V, grid)
   print "\n\n"
 
@@ -108,4 +109,5 @@ def print_policy(P, g):
 
     if biggest_change < SMALL_ENOUGH:
      break
+  print "values for fixed policy:"
   print_values(V, grid)
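For context, both hunks sit at the end of the same evaluation loop: values are swept repeatedly until the largest per-state change drops below SMALL_ENOUGH, and the new lines only add a label before each printout. A minimal standalone sketch of that convergence loop (Python 3 here; states, policy, and step are illustrative stand-ins, not the repo's grid_world helpers):

# Sketch of iterative policy evaluation with the same convergence test.
# Assumptions: states is an iterable of states, policy[s] maps action -> probability,
# and step(s, a) returns (next_state, reward) for a deterministic grid world.
SMALL_ENOUGH = 1e-3  # convergence threshold, same role as in the diff
GAMMA = 0.9          # assumed discount factor

def evaluate_policy(states, policy, step):
    V = {s: 0.0 for s in states}
    while True:
        biggest_change = 0.0
        for s in states:
            old_v = V[s]
            new_v = 0.0
            for a, p_a in policy.get(s, {}).items():  # terminal states have no actions
                s2, r = step(s, a)
                new_v += p_a * (r + GAMMA * V[s2])
            V[s] = new_v
            biggest_change = max(biggest_change, abs(old_v - V[s]))
        if biggest_change < SMALL_ENOUGH:
            break
    return V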

rl/policy_iteration_random.py
1 addition & 0 deletions

@@ -15,6 +15,7 @@
 # this grid gives you a reward of -0.1 for every non-terminal state
 # we want to see if this will encourage finding a shorter path to the goal
 grid = negative_grid(step_cost=-1.0)
+# grid = standard_grid()
 
 # print rewards
 print "rewards:"

rl/value_iteration.py
3 additions & 2 deletions

@@ -40,9 +40,9 @@
     # terminal state
     V[s] = 0
 
-# repeat until convergence - will break out when policy does not change
+# repeat until convergence
+# V[s] = max[a]{ sum[s',r] { p(s',r|s,a)[r + gamma*V[s']] } }
 while True:
-
   biggest_change = 0
   for s in states:
     old_v = V[s]
@@ -76,6 +76,7 @@
       best_a = a
   policy[s] = best_a
 
+# our goal here is to verify that we get the same answer as with policy iteration
 print "values:"
 print_values(V, grid)
 print "policy:"
