
Commit f8c609d

1 parent b60ff87 commit f8c609d

4 files changed, +718 -0 lines changed

4 files changed

+718
-0
lines changed
Lines changed: 258 additions & 0 deletions
@@ -0,0 +1,258 @@
#######################################################################
# Copyright (C)                                                       #
# 2016-2018 Shangtong Zhang([email protected])                         #
# 2016 Kenta Shimada([email protected])                                #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from tqdm import tqdm

# world height
WORLD_HEIGHT = 4

# world width
WORLD_WIDTH = 12

# probability for exploration
EPSILON = 0.1

# step size
ALPHA = 0.5

# gamma for Q-Learning and Expected Sarsa
GAMMA = 1

# all possible actions
ACTION_UP = 0
ACTION_DOWN = 1
ACTION_LEFT = 2
ACTION_RIGHT = 3
ACTIONS = [ACTION_UP, ACTION_DOWN, ACTION_LEFT, ACTION_RIGHT]

# start and goal states
START = [3, 0]
GOAL = [3, 11]

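# This is the cliff-walking world of Example 6.6: the agent starts at
# START = [3, 0], the goal is at GOAL = [3, 11], and the cells [3, 1]..[3, 10]
# form the cliff. Every move costs -1; stepping into the cliff (moving down
# from row 2 in columns 1-10, or moving right from START) gives -100 and sends
# the agent back to START. These dynamics are implemented in step() below.
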
def step(state, action):
    i, j = state
    if action == ACTION_UP:
        next_state = [max(i - 1, 0), j]
    elif action == ACTION_LEFT:
        next_state = [i, max(j - 1, 0)]
    elif action == ACTION_RIGHT:
        next_state = [i, min(j + 1, WORLD_WIDTH - 1)]
    elif action == ACTION_DOWN:
        next_state = [min(i + 1, WORLD_HEIGHT - 1), j]
    else:
        assert False

    reward = -1
    if (action == ACTION_DOWN and i == 2 and 1 <= j <= 10) or (
            action == ACTION_RIGHT and state == START):
        reward = -100
        next_state = START

    return next_state, reward

# reward for each action in each state
# actionRewards = np.zeros((WORLD_HEIGHT, WORLD_WIDTH, 4))
# actionRewards[:, :, :] = -1.0
# actionRewards[2, 1:11, ACTION_DOWN] = -100.0
# actionRewards[3, 0, ACTION_RIGHT] = -100.0

# set up destinations for each action in each state
# actionDestination = []
# for i in range(0, WORLD_HEIGHT):
#     actionDestination.append([])
#     for j in range(0, WORLD_WIDTH):
#         destination = dict()
#         destination[ACTION_UP] = [max(i - 1, 0), j]
#         destination[ACTION_LEFT] = [i, max(j - 1, 0)]
#         destination[ACTION_RIGHT] = [i, min(j + 1, WORLD_WIDTH - 1)]
#         if i == 2 and 1 <= j <= 10:
#             destination[ACTION_DOWN] = START
#         else:
#             destination[ACTION_DOWN] = [min(i + 1, WORLD_HEIGHT - 1), j]
#         actionDestination[-1].append(destination)
# actionDestination[3][0][ACTION_RIGHT] = START

# choose an action based on epsilon greedy algorithm
def choose_action(state, q_value):
    if np.random.binomial(1, EPSILON) == 1:
        return np.random.choice(ACTIONS)
    else:
        values_ = q_value[state[0], state[1], :]
        return np.random.choice([action_ for action_, value_ in enumerate(values_) if value_ == np.max(values_)])

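# Under this epsilon-greedy policy, a non-greedy action is taken with
# probability EPSILON / len(ACTIONS), and each greedy action with probability
# (1 - EPSILON) / (number of greedy actions) + EPSILON / len(ACTIONS).
# These are exactly the weights used for the Expected Sarsa target below.
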
# an episode with Sarsa
# @q_value: values for state action pair, will be updated
# @expected: if True, will use expected Sarsa algorithm
# @step_size: step size for updating
# @return: total rewards within this episode
def sarsa(q_value, expected=False, step_size=ALPHA):
    state = START
    action = choose_action(state, q_value)
    rewards = 0.0
    while state != GOAL:
        next_state, reward = step(state, action)
        next_action = choose_action(next_state, q_value)
        rewards += reward
        if not expected:
            target = q_value[next_state[0], next_state[1], next_action]
        else:
            # calculate the expected value of the new state
            target = 0.0
            q_next = q_value[next_state[0], next_state[1], :]
            best_actions = np.argwhere(q_next == np.max(q_next))
            for action_ in ACTIONS:
                if action_ in best_actions:
                    target += ((1.0 - EPSILON) / len(best_actions) + EPSILON / len(ACTIONS)) * q_value[next_state[0], next_state[1], action_]
                else:
                    target += EPSILON / len(ACTIONS) * q_value[next_state[0], next_state[1], action_]
        target *= GAMMA
        q_value[state[0], state[1], action] += step_size * (
                reward + target - q_value[state[0], state[1], action])
        state = next_state
        action = next_action
    return rewards

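# The update above implements
#   Sarsa:          Q(S, A) <- Q(S, A) + alpha * [R + gamma * Q(S', A') - Q(S, A)]
#   Expected Sarsa: Q(S, A) <- Q(S, A) + alpha * [R + gamma * sum_a pi(a|S') * Q(S', a) - Q(S, A)]
# where pi is the epsilon-greedy behavior policy defined in choose_action().
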
# an episode with Q-Learning
# @q_value: values for state action pair, will be updated
# @step_size: step size for updating
# @return: total rewards within this episode
def q_learning(q_value, step_size=ALPHA):
    state = START
    rewards = 0.0
    while state != GOAL:
        action = choose_action(state, q_value)
        next_state, reward = step(state, action)
        rewards += reward
        # Q-Learning update
        q_value[state[0], state[1], action] += step_size * (
                reward + GAMMA * np.max(q_value[next_state[0], next_state[1], :]) -
                q_value[state[0], state[1], action])
        state = next_state
    return rewards

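# The update above implements the off-policy Q-Learning rule
#   Q(S, A) <- Q(S, A) + alpha * [R + gamma * max_a Q(S', a) - Q(S, A)],
# i.e. it bootstraps from the greedy action in S' regardless of the action
# actually taken by the epsilon-greedy behavior policy.
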
# print optimal policy
def print_optimal_policy(q_value):
    optimal_policy = []
    for i in range(0, WORLD_HEIGHT):
        optimal_policy.append([])
        for j in range(0, WORLD_WIDTH):
            if [i, j] == GOAL:
                optimal_policy[-1].append('G')
                continue
            best_action = np.argmax(q_value[i, j, :])
            if best_action == ACTION_UP:
                optimal_policy[-1].append('U')
            elif best_action == ACTION_DOWN:
                optimal_policy[-1].append('D')
            elif best_action == ACTION_LEFT:
                optimal_policy[-1].append('L')
            elif best_action == ACTION_RIGHT:
                optimal_policy[-1].append('R')
    for row in optimal_policy:
        print(row)

# Use multiple runs instead of a single run and a sliding window
# A single run does not produce a smooth reward curve,
# but the optimal policy converges well even with a single run
# Sarsa converges to the safe path, while Q-Learning converges to the optimal path
def figure_6_4():
    # episodes of each run
    episodes = 500

    # perform 50 independent runs
    runs = 50

    rewards_sarsa = np.zeros(episodes)
    rewards_q_learning = np.zeros(episodes)
    for r in tqdm(range(runs)):
        q_sarsa = np.zeros((WORLD_HEIGHT, WORLD_WIDTH, 4))
        q_q_learning = np.copy(q_sarsa)
        for i in range(0, episodes):
            # cut off the value at -100 to draw the figure more elegantly
            # rewards_sarsa[i] += max(sarsa(q_sarsa), -100)
            # rewards_q_learning[i] += max(q_learning(q_q_learning), -100)
            rewards_sarsa[i] += sarsa(q_sarsa)
            rewards_q_learning[i] += q_learning(q_q_learning)

    # average over independent runs
    rewards_sarsa /= runs
    rewards_q_learning /= runs

    # draw reward curves
    plt.plot(rewards_sarsa, label='Sarsa')
    plt.plot(rewards_q_learning, label='Q-Learning')
    plt.xlabel('Episodes')
    plt.ylabel('Sum of rewards during episode')
    plt.ylim([-100, 0])
    plt.legend()

    plt.savefig('../images/figure_6_4.png')
    plt.close()

    # display optimal policy
    print('Sarsa Optimal Policy:')
    print_optimal_policy(q_sarsa)
    print('Q-Learning Optimal Policy:')
    print_optimal_policy(q_q_learning)

# Due to the limited computing capacity of my machine, I can't complete this experiment
# with 100,000 episodes and 50,000 runs to get the fully averaged performance.
# However, even with only 1,000 episodes and 10 runs, the curves still look good.
def figure_6_6():
    step_sizes = np.arange(0.1, 1.1, 0.1)
    episodes = 1000
    runs = 10

    ASY_SARSA = 0
    ASY_EXPECTED_SARSA = 1
    ASY_QLEARNING = 2
    INT_SARSA = 3
    INT_EXPECTED_SARSA = 4
    INT_QLEARNING = 5
    methods = range(0, 6)

    performance = np.zeros((6, len(step_sizes)))
    for run in range(runs):
        for ind, step_size in tqdm(list(zip(range(0, len(step_sizes)), step_sizes))):
            q_sarsa = np.zeros((WORLD_HEIGHT, WORLD_WIDTH, 4))
            q_expected_sarsa = np.copy(q_sarsa)
            q_q_learning = np.copy(q_sarsa)
            for ep in range(episodes):
                sarsa_reward = sarsa(q_sarsa, expected=False, step_size=step_size)
                expected_sarsa_reward = sarsa(q_expected_sarsa, expected=True, step_size=step_size)
                q_learning_reward = q_learning(q_q_learning, step_size=step_size)
                performance[ASY_SARSA, ind] += sarsa_reward
                performance[ASY_EXPECTED_SARSA, ind] += expected_sarsa_reward
                performance[ASY_QLEARNING, ind] += q_learning_reward

                if ep < 100:
                    performance[INT_SARSA, ind] += sarsa_reward
                    performance[INT_EXPECTED_SARSA, ind] += expected_sarsa_reward
                    performance[INT_QLEARNING, ind] += q_learning_reward

    performance[:3, :] /= episodes * runs
    performance[3:, :] /= 100 * runs
    labels = ['Asymptotic Sarsa', 'Asymptotic Expected Sarsa', 'Asymptotic Q-Learning',
              'Interim Sarsa', 'Interim Expected Sarsa', 'Interim Q-Learning']

    for method, label in zip(methods, labels):
        plt.plot(step_sizes, performance[method, :], label=label)
    plt.xlabel('alpha')
    plt.ylabel('reward per episode')
    plt.legend()

    plt.savefig('../images/figure_6_6.png')
    plt.close()

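# Running this script writes ../images/figure_6_4.png and ../images/figure_6_6.png
# and prints the greedy policies found by Sarsa and Q-Learning for Figure 6.4.
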
if __name__ == '__main__':
    figure_6_4()
    figure_6_6()
Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
#######################################################################
# Copyright (C)                                                       #
# 2016-2018 Shangtong Zhang([email protected])                         #
# 2016 Kenta Shimada([email protected])                                #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy

# state A
STATE_A = 0

# state B
STATE_B = 1

# use one terminal state
STATE_TERMINAL = 2

# start from state A
STATE_START = STATE_A

# possible actions in A
ACTION_A_RIGHT = 0
ACTION_A_LEFT = 1

# probability for exploration
EPSILON = 0.1

# step size
ALPHA = 0.1

# discount for max value
GAMMA = 1.0

# possible actions in B; the example allows many, 10 are used here
ACTIONS_B = range(0, 10)

# all possible actions
STATE_ACTIONS = [[ACTION_A_RIGHT, ACTION_A_LEFT], ACTIONS_B]

# state action pair values; if a state is terminal, its value is always 0
INITIAL_Q = [np.zeros(2), np.zeros(len(ACTIONS_B)), np.zeros(1)]

# set up the destination for each state and each action
TRANSITION = [[STATE_TERMINAL, STATE_B], [STATE_TERMINAL] * len(ACTIONS_B)]

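# This is the MDP of Example 6.7 (maximization bias): from A, action right
# terminates with reward 0, while action left moves to B with reward 0. From B,
# every action terminates with a reward drawn from N(-0.1, 1), so the expected
# return of going left is -0.1 and the agent should learn to go right.
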
# choose an action based on epsilon greedy algorithm
def choose_action(state, q_value):
    if np.random.binomial(1, EPSILON) == 1:
        return np.random.choice(STATE_ACTIONS[state])
    else:
        values_ = q_value[state]
        return np.random.choice([action_ for action_, value_ in enumerate(values_) if value_ == np.max(values_)])

# take @action in @state, return the reward
def take_action(state, action):
    if state == STATE_A:
        return 0
    return np.random.normal(-0.1, 1)

# if there are two state-action value arrays, use Double Q-Learning;
# otherwise use normal Q-Learning
def q_learning(q1, q2=None):
    state = STATE_START
    # track the number of left actions taken in state A
    left_count = 0
    while state != STATE_TERMINAL:
        if q2 is None:
            action = choose_action(state, q1)
        else:
            # derive an action from the sum of Q1 and Q2
            action = choose_action(state, [item1 + item2 for item1, item2 in zip(q1, q2)])
        if state == STATE_A and action == ACTION_A_LEFT:
            left_count += 1
        reward = take_action(state, action)
        next_state = TRANSITION[state][action]
        if q2 is None:
            active_q = q1
            target = np.max(active_q[next_state])
        else:
            if np.random.binomial(1, 0.5) == 1:
                active_q = q1
                target_q = q2
            else:
                active_q = q2
                target_q = q1
            best_action = np.random.choice([action_ for action_, value_ in enumerate(active_q[next_state]) if value_ == np.max(active_q[next_state])])
            target = target_q[next_state][best_action]

        # Q-Learning update
        active_q[state][action] += ALPHA * (
                reward + GAMMA * target - active_q[state][action])
        state = next_state
    return left_count

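# With two tables, the update decouples action selection from evaluation:
# with probability 0.5 update Q1 using
#   Q1(S, A) <- Q1(S, A) + alpha * [R + gamma * Q2(S', argmax_a Q1(S', a)) - Q1(S, A)]
# and otherwise update Q2 symmetrically. This removes the positive bias that
# max_a Q(S', a) introduces in ordinary Q-Learning.
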
# Figure 6.7: 1,000 runs may be enough; the number of actions in state B also affects the curves
def figure_6_7():
    # each independent run has 300 episodes
    episodes = 300
    runs = 1000
    left_counts_q = np.zeros((runs, episodes))
    left_counts_double_q = np.zeros((runs, episodes))
    for run in tqdm(range(runs)):
        q = copy.deepcopy(INITIAL_Q)
        q1 = copy.deepcopy(INITIAL_Q)
        q2 = copy.deepcopy(INITIAL_Q)
        for ep in range(0, episodes):
            left_counts_q[run, ep] = q_learning(q)
            left_counts_double_q[run, ep] = q_learning(q1, q2)
    left_counts_q = left_counts_q.mean(axis=0)
    left_counts_double_q = left_counts_double_q.mean(axis=0)

    plt.plot(left_counts_q, label='Q-Learning')
    plt.plot(left_counts_double_q, label='Double Q-Learning')
    plt.plot(np.ones(episodes) * 0.05, label='Optimal')
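    # the epsilon-greedy optimal policy takes left from A only when exploring,
    # i.e. with probability EPSILON / 2 = 0.05, which is the 'Optimal' baseline above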
    plt.xlabel('episodes')
    plt.ylabel('% left actions from A')
    plt.legend()

    plt.savefig('../images/figure_6_7.png')
    plt.close()

if __name__ == '__main__':
    figure_6_7()
