Commit c03550e

Create counterexample.py
1 parent d49ac5f, commit c03550e

1 file changed, 345 additions and 0 deletions:
deep-learning/Deep-Reinforcement-Learning-Complete-Collection/DeepRL-Code/chapter11/counterexample.py

@@ -0,0 +1,345 @@
#######################################################################
# Copyright (C)                                                       #
# 2016 - 2018 Shangtong Zhang([email protected])                      #
# 2016 Kenta Shimada([email protected])                               #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from tqdm import tqdm
from mpl_toolkits.mplot3d.axes3d import Axes3D

# all states: state 0-5 are upper states
STATES = np.arange(0, 7)
# state 6 is lower state
LOWER_STATE = 6
# discount factor
DISCOUNT = 0.99

# each state is represented by a vector of length 8
FEATURE_SIZE = 8
FEATURES = np.zeros((len(STATES), FEATURE_SIZE))
for i in range(LOWER_STATE):
    FEATURES[i, i] = 2
    FEATURES[i, 7] = 1
FEATURES[LOWER_STATE, 6] = 1
FEATURES[LOWER_STATE, 7] = 2
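
# Added note (not part of the original file): this is the standard feature
# encoding of Baird's counterexample -- each upper state i is 2*e_i + e_7 and
# the lower state is e_6 + 2*e_7. Since every reward is zero, the true value
# function is identically zero and is exactly representable by theta = 0.
assert FEATURES.shape == (len(STATES), FEATURE_SIZE)
assert np.allclose(FEATURES.sum(axis=1), 3)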

# all possible actions
DASHED = 0
SOLID = 1
ACTIONS = [DASHED, SOLID]

# reward is always zero
REWARD = 0

# take @action at @state, return the new state
def step(state, action):
    if action == SOLID:
        return LOWER_STATE
    return np.random.choice(STATES[: LOWER_STATE])

# target policy
def target_policy(state):
    return SOLID

# state distribution for the behavior policy
STATE_DISTRIBUTION = np.ones(len(STATES)) / 7
STATE_DISTRIBUTION_MAT = np.matrix(np.diag(STATE_DISTRIBUTION))
# projection matrix for minimizing MSVE
PROJECTION_MAT = np.matrix(FEATURES) * \
                 np.linalg.pinv(np.matrix(FEATURES.T) * STATE_DISTRIBUTION_MAT * np.matrix(FEATURES)) * \
                 np.matrix(FEATURES.T) * \
                 STATE_DISTRIBUTION_MAT
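
# Added note (not part of the original file): PROJECTION_MAT is the projection
# onto the column space of the features under the state distribution,
#     Pi = X (X^T D X)^{-1} X^T D,  with X = FEATURES and D = diag(STATE_DISTRIBUTION).
# A sketch of the same computation with plain ndarrays (np.matrix is discouraged
# in recent NumPy releases) would be:
#     D = np.diag(STATE_DISTRIBUTION)
#     PROJECTION_MAT = FEATURES @ np.linalg.pinv(FEATURES.T @ D @ FEATURES) @ FEATURES.T @ D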

# behavior policy
BEHAVIOR_SOLID_PROBABILITY = 1.0 / 7
def behavior_policy(state):
    if np.random.binomial(1, BEHAVIOR_SOLID_PROBABILITY) == 1:
        return SOLID
    return DASHED

# Semi-gradient off-policy temporal difference
# @state: current state
# @theta: weight for each component of the feature vector
# @alpha: step size
# @return: next state
def semi_gradient_off_policy_TD(state, theta, alpha):
    action = behavior_policy(state)
    next_state = step(state, action)
    # get the importance ratio
    if action == DASHED:
        rho = 0.0
    else:
        rho = 1.0 / BEHAVIOR_SOLID_PROBABILITY
    delta = REWARD + DISCOUNT * np.dot(FEATURES[next_state, :], theta) - \
            np.dot(FEATURES[state, :], theta)
    delta *= rho * alpha
    # derivatives happen to be the same matrix due to the linearity
    theta += FEATURES[state, :] * delta
    return next_state
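
# Added note (not part of the original file): in equation form this is
# off-policy semi-gradient TD(0) with per-step importance sampling,
#     delta_t = R_{t+1} + gamma * theta^T x(S_{t+1}) - theta^T x(S_t)
#     theta  <- theta + alpha * rho_t * delta_t * x(S_t)
# where rho_t = pi(A_t | S_t) / b(A_t | S_t) equals 1 / BEHAVIOR_SOLID_PROBABILITY = 7
# for the solid action and 0 for the dashed action.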

# Semi-gradient DP
# @theta: weight for each component of the feature vector
# @alpha: step size
def semi_gradient_DP(theta, alpha):
    delta = 0.0
    # go through all the states
    for state in STATES:
        expected_return = 0.0
        # compute the Bellman error for each state
        for next_state in STATES:
            if next_state == LOWER_STATE:
                expected_return += REWARD + DISCOUNT * np.dot(theta, FEATURES[next_state, :])
        bellman_error = expected_return - np.dot(theta, FEATURES[state, :])
        # accumulate gradients
        delta += bellman_error * FEATURES[state, :]
    # derivatives happen to be the same matrix due to the linearity
    theta += alpha / len(STATES) * delta
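
# Added note (not part of the original file): since the target policy always
# selects the solid action, the next state under pi is always LOWER_STATE, which
# is why only the next_state == LOWER_STATE term contributes above. Each sweep
# applies the expected update
#     theta <- theta + (alpha / |S|) * sum_s (E_pi[R + gamma * theta^T x(S')] - theta^T x(s)) * x(s)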

# temporal difference with gradient correction
# @state: current state
# @theta: weight of each component of the feature vector
# @weight: auxiliary trace for gradient correction
# @alpha: step size of @theta
# @beta: step size of @weight
def TDC(state, theta, weight, alpha, beta):
    action = behavior_policy(state)
    next_state = step(state, action)
    # get the importance ratio
    if action == DASHED:
        rho = 0.0
    else:
        rho = 1.0 / BEHAVIOR_SOLID_PROBABILITY
    delta = REWARD + DISCOUNT * np.dot(FEATURES[next_state, :], theta) - \
            np.dot(FEATURES[state, :], theta)
    theta += alpha * rho * (delta * FEATURES[state, :] - DISCOUNT * FEATURES[next_state, :] * np.dot(FEATURES[state, :], weight))
    weight += beta * rho * (delta - np.dot(FEATURES[state, :], weight)) * FEATURES[state, :]
    return next_state
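
# Added note (not part of the original file): this is the TDC (GTD(0)) update
# from Chapter 11 of Sutton and Barto. With x = x(S_t) and x' = x(S_{t+1}):
#     theta <- theta + alpha * rho_t * (delta_t * x - gamma * (x^T w) * x')
#     w     <- w     + beta  * rho_t * (delta_t - x^T w) * x
# The auxiliary vector w is a running least-squares estimate of the expected TD
# error as a linear function of the features; the second term of the theta
# update is the correction that makes this an approximate gradient of the MSPBE.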

# expected temporal difference with gradient correction
# @theta: weight of each component of the feature vector
# @weight: auxiliary trace for gradient correction
# @alpha: step size of @theta
# @beta: step size of @weight
def expected_TDC(theta, weight, alpha, beta):
    for state in STATES:
        # When computing the expected update target, if the next state is not the lower state,
        # the importance ratio will be 0, so we can safely ignore this case and
        # assume the next state is always the lower state
        delta = REWARD + DISCOUNT * np.dot(FEATURES[LOWER_STATE, :], theta) - np.dot(FEATURES[state, :], theta)
        rho = 1 / BEHAVIOR_SOLID_PROBABILITY
        # Under the behavior policy the state distribution is uniform, so the probability of each state is 1.0 / len(STATES)
        expected_update_theta = 1.0 / len(STATES) * BEHAVIOR_SOLID_PROBABILITY * rho * (
            delta * FEATURES[state, :] - DISCOUNT * FEATURES[LOWER_STATE, :] * np.dot(weight, FEATURES[state, :]))
        theta += alpha * expected_update_theta
        expected_update_weight = 1.0 / len(STATES) * BEHAVIOR_SOLID_PROBABILITY * rho * (
            delta - np.dot(weight, FEATURES[state, :])) * FEATURES[state, :]
        weight += beta * expected_update_weight

    # if we instead *accumulate* the expected updates and apply them here, the sweep becomes synchronous:
    # theta += alpha * expected_update_theta
    # weight += beta * expected_update_weight
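
# Added note (not part of the original file): in the expected update the factors
# combine as (1 / len(STATES)) * BEHAVIOR_SOLID_PROBABILITY * rho = 1 / len(STATES),
# since rho = 1 / BEHAVIOR_SOLID_PROBABILITY -- the probability of the solid action
# under the behavior policy cancels against the importance ratio, leaving only
# the uniform state weight.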

# interest is 1 for every state
INTEREST = 1

# expected update of ETD
# @theta: weight of each component of the feature vector
# @emphasis: current emphasis
# @alpha: step size of @theta
# @return: expected next emphasis
def expected_emphatic_TD(theta, emphasis, alpha):
    # we perform synchronous update for both theta and emphasis
    expected_update = 0
    expected_next_emphasis = 0.0
    # go through all the states
    for state in STATES:
        # compute rho(t-1)
        if state == LOWER_STATE:
            rho = 1.0 / BEHAVIOR_SOLID_PROBABILITY
        else:
            rho = 0
        # update emphasis
        next_emphasis = DISCOUNT * rho * emphasis + INTEREST
        expected_next_emphasis += next_emphasis
        # When computing the expected update target, if the next state is not the lower state,
        # the importance ratio will be 0, so we can safely ignore this case and
        # assume the next state is always the lower state
        next_state = LOWER_STATE
        delta = REWARD + DISCOUNT * np.dot(FEATURES[next_state, :], theta) - np.dot(FEATURES[state, :], theta)
        expected_update += 1.0 / len(STATES) * BEHAVIOR_SOLID_PROBABILITY * next_emphasis * 1 / BEHAVIOR_SOLID_PROBABILITY * delta * FEATURES[state, :]
    theta += alpha * expected_update
    return expected_next_emphasis / len(STATES)
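
# Added note (not part of the original file): this follows the one-step
# emphatic-TD recursion from Chapter 11 of Sutton and Barto, with interest I_t = 1:
#     M_t = gamma * rho_{t-1} * M_{t-1} + I_t
#     theta <- theta + alpha * M_t * rho_t * delta_t * x(S_t)
# The function applies the expectation of this update under the uniform
# behavior-policy state distribution and returns the average next emphasis.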

# compute RMSVE for a value function parameterized by @theta
# true value function is always 0 in this example
def compute_RMSVE(theta):
    return np.sqrt(np.dot(np.power(np.dot(FEATURES, theta), 2), STATE_DISTRIBUTION))

# compute RMSPBE for a value function parameterized by @theta
# true value function is always 0 in this example
def compute_RMSPBE(theta):
    bellman_error = np.zeros(len(STATES))
    for state in STATES:
        for next_state in STATES:
            if next_state == LOWER_STATE:
                bellman_error[state] += REWARD + DISCOUNT * np.dot(theta, FEATURES[next_state, :]) - np.dot(theta, FEATURES[state, :])
    bellman_error = np.dot(np.asarray(PROJECTION_MAT), bellman_error)
    return np.sqrt(np.dot(np.power(bellman_error, 2), STATE_DISTRIBUTION))
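
# Added note (not part of the original file): since the true value function is
# identically zero, these measures reduce to
#     RMSVE(theta)  = sqrt( sum_s mu(s) * (theta^T x(s))^2 )
#     RMSPBE(theta) = sqrt( sum_s mu(s) * (Pi delta_bar(theta))(s)^2 )
# where mu is the uniform state distribution, Pi is PROJECTION_MAT, and
# delta_bar(theta) is the expected TD error under the target policy.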

figureIndex = 0

# Figure 11.2(left), semi-gradient off-policy TD
def figure_11_2_left():
    # Initialize the theta
    theta = np.ones(FEATURE_SIZE)
    theta[6] = 10

    alpha = 0.01

    steps = 1000
    thetas = np.zeros((FEATURE_SIZE, steps))
    state = np.random.choice(STATES)
    for step in tqdm(range(steps)):
        state = semi_gradient_off_policy_TD(state, theta, alpha)
        thetas[:, step] = theta

    for i in range(FEATURE_SIZE):
        plt.plot(thetas[i, :], label='theta' + str(i + 1))
    plt.xlabel('Steps')
    plt.ylabel('Theta value')
    plt.title('semi-gradient off-policy TD')
    plt.legend()

# Figure 11.2(right), semi-gradient DP
def figure_11_2_right():
    # Initialize the theta
    theta = np.ones(FEATURE_SIZE)
    theta[6] = 10

    alpha = 0.01

    sweeps = 1000
    thetas = np.zeros((FEATURE_SIZE, sweeps))
    for sweep in tqdm(range(sweeps)):
        semi_gradient_DP(theta, alpha)
        thetas[:, sweep] = theta

    for i in range(FEATURE_SIZE):
        plt.plot(thetas[i, :], label='theta' + str(i + 1))
    plt.xlabel('Sweeps')
    plt.ylabel('Theta value')
    plt.title('semi-gradient DP')
    plt.legend()

def figure_11_2():
    plt.figure(figsize=(10, 20))
    plt.subplot(2, 1, 1)
    figure_11_2_left()
    plt.subplot(2, 1, 2)
    figure_11_2_right()

    plt.savefig('../images/figure_11_2.png')
    plt.close()

# Figure 11.6(left), temporal difference with gradient correction
def figure_11_6_left():
    # Initialize the theta
    theta = np.ones(FEATURE_SIZE)
    theta[6] = 10
    weight = np.zeros(FEATURE_SIZE)

    alpha = 0.005
    beta = 0.05

    steps = 1000
    thetas = np.zeros((FEATURE_SIZE, steps))
    RMSVE = np.zeros(steps)
    RMSPBE = np.zeros(steps)
    state = np.random.choice(STATES)
    for step in tqdm(range(steps)):
        state = TDC(state, theta, weight, alpha, beta)
        thetas[:, step] = theta
        RMSVE[step] = compute_RMSVE(theta)
        RMSPBE[step] = compute_RMSPBE(theta)

    for i in range(FEATURE_SIZE):
        plt.plot(thetas[i, :], label='theta' + str(i + 1))
    plt.plot(RMSVE, label='RMSVE')
    plt.plot(RMSPBE, label='RMSPBE')
    plt.xlabel('Steps')
    plt.title('TDC')
    plt.legend()

# Figure 11.6(right), expected temporal difference with gradient correction
def figure_11_6_right():
    # Initialize the theta
    theta = np.ones(FEATURE_SIZE)
    theta[6] = 10
    weight = np.zeros(FEATURE_SIZE)

    alpha = 0.005
    beta = 0.05

    sweeps = 1000
    thetas = np.zeros((FEATURE_SIZE, sweeps))
    RMSVE = np.zeros(sweeps)
    RMSPBE = np.zeros(sweeps)
    for sweep in tqdm(range(sweeps)):
        expected_TDC(theta, weight, alpha, beta)
        thetas[:, sweep] = theta
        RMSVE[sweep] = compute_RMSVE(theta)
        RMSPBE[sweep] = compute_RMSPBE(theta)

    for i in range(FEATURE_SIZE):
        plt.plot(thetas[i, :], label='theta' + str(i + 1))
    plt.plot(RMSVE, label='RMSVE')
    plt.plot(RMSPBE, label='RMSPBE')
    plt.xlabel('Sweeps')
    plt.title('Expected TDC')
    plt.legend()

def figure_11_6():
    plt.figure(figsize=(10, 20))
    plt.subplot(2, 1, 1)
    figure_11_6_left()
    plt.subplot(2, 1, 2)
    figure_11_6_right()

    plt.savefig('../images/figure_11_6.png')
    plt.close()

# Figure 11.7, expected ETD
def figure_11_7():
    # Initialize the theta
    theta = np.ones(FEATURE_SIZE)
    theta[6] = 10

    alpha = 0.03

    sweeps = 1000
    thetas = np.zeros((FEATURE_SIZE, sweeps))
    RMSVE = np.zeros(sweeps)
    emphasis = 0.0
    for sweep in tqdm(range(sweeps)):
        emphasis = expected_emphatic_TD(theta, emphasis, alpha)
        thetas[:, sweep] = theta
        RMSVE[sweep] = compute_RMSVE(theta)

    for i in range(FEATURE_SIZE):
        plt.plot(thetas[i, :], label='theta' + str(i + 1))
    plt.plot(RMSVE, label='RMSVE')
    plt.xlabel('Sweeps')
    plt.title('emphatic TD')
    plt.legend()

    plt.savefig('../images/figure_11_7.png')
    plt.close()

if __name__ == '__main__':
    figure_11_2()
    figure_11_6()
    figure_11_7()
