
Commit e5372d6

tiny update
1 parent ff1c81c · commit e5372d6

3 files changed: 6 additions, 2 deletions


rl/iterative_policy_evaluation.py
2 additions & 0 deletions

@@ -67,6 +67,7 @@ def print_policy(P, g):
 
     if biggest_change < SMALL_ENOUGH:
       break
+  print "values for uniformly random actions:"
   print_values(V, grid)
   print "\n\n"
 
@@ -108,4 +109,5 @@ def print_policy(P, g):
 
     if biggest_change < SMALL_ENOUGH:
      break
+  print "values for fixed policy:"
   print_values(V, grid)
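For context, both hunks sit at the end of the same evaluation loop: values are swept repeatedly until the largest per-state change drops below SMALL_ENOUGH, and the new lines only add a label before each printout. A minimal standalone sketch of that convergence loop (Python 3 here; states, policy, and step are illustrative stand-ins, not the repo's grid_world helpers):

# Sketch of iterative policy evaluation with the same convergence test.
# Assumptions: states is an iterable of states, policy[s] maps action -> probability,
# and step(s, a) returns (next_state, reward) for a deterministic grid world.
SMALL_ENOUGH = 1e-3  # convergence threshold, same role as in the diff
GAMMA = 0.9          # assumed discount factor

def evaluate_policy(states, policy, step):
    V = {s: 0.0 for s in states}
    while True:
        biggest_change = 0.0
        for s in states:
            old_v = V[s]
            new_v = 0.0
            for a, p_a in policy.get(s, {}).items():  # terminal states have no actions
                s2, r = step(s, a)
                new_v += p_a * (r + GAMMA * V[s2])
            V[s] = new_v
            biggest_change = max(biggest_change, abs(old_v - V[s]))
        if biggest_change < SMALL_ENOUGH:
            break
    return V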

rl/policy_iteration_random.py
1 addition & 0 deletions

@@ -15,6 +15,7 @@
 # this grid gives you a reward of -0.1 for every non-terminal state
 # we want to see if this will encourage finding a shorter path to the goal
 grid = negative_grid(step_cost=-1.0)
+# grid = standard_grid()
 
 # print rewards
 print "rewards:"

rl/value_iteration.py
3 additions & 2 deletions

@@ -40,9 +40,9 @@
     # terminal state
     V[s] = 0
 
-# repeat until convergence - will break out when policy does not change
+# repeat until convergence
+# V[s] = max[a]{ sum[s',r] { p(s',r|s,a)[r + gamma*V[s']] } }
 while True:
-
   biggest_change = 0
   for s in states:
     old_v = V[s]
@@ -76,6 +76,7 @@
       best_a = a
   policy[s] = best_a
 
+# our goal here is to verify that we get the same answer as with policy iteration
 print "values:"
 print_values(V, grid)
 print "policy:"
