
Commit 79e160a

Merge branch 'a3c'
2 parents fd07768 + 68a3e85 commit 79e160a

12 files changed: +968 -2 lines changed


PolicyGradient/README.md

Lines changed: 3 additions & 1 deletion

@@ -59,4 +59,6 @@
   - [Solution](Continuous MountainCar Actor Critic Solution.ipynb)
 - Deterministic Policy Gradients for Continuous Action Spaces (WIP)
 - Deep Deterministic Policy Gradients (WIP)
-- Asynchronous Advantage Actor Critic (A3C) (WIP)
+- Asynchronous Advantage Actor Critic (A3C)
+  - Exercise
+  - [Solution](a3c/)

PolicyGradient/a3c/README.md

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@

## Implementation of A3C (Asynchronous Advantage Actor Critic)

#### Running

```
./train.py --model_dir /tmp/a3c --env Breakout-v0 --t_max 5 --eval_every 300 --parallelism 8
```

See `./train.py --help` for a full list of options. Then, monitor training progress in Tensorboard:

```
tensorboard --logdir=/tmp/a3c
```

#### Components

- [`train.py`](train.py) contains the main method to start training.
- [`estimators.py`](estimators.py) contains the Tensorflow graph definitions for the Policy and Value networks.
- [`worker.py`](worker.py) contains code that runs in each worker thread.
- [`policy_monitor.py`](policy_monitor.py) contains code that evaluates the policy network by running an episode and saving rewards to Tensorboard.
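For orientation, the sketch below is an illustrative assumption rather than code from this commit: it shows what the `--t_max 5` option in the command above controls. Each worker runs at most `t_max` environment steps, then converts the collected rewards into bootstrapped n-step returns; these returns (and, for the policy network, the corresponding advantages) are what get fed to the estimators as targets. The function name and the numbers are made up for the example.

```python
def n_step_returns(rewards, bootstrap_value, done, gamma=0.99):
  """Turn a rollout of at most t_max rewards into discounted n-step returns.

  rewards:         rewards collected during the rollout, oldest first
  bootstrap_value: V(s_last) from the value network, used when the rollout
                   was cut off by t_max rather than by the episode ending
  done:            True if the last state of the rollout was terminal
  """
  R = 0.0 if done else bootstrap_value
  returns = []
  for r in reversed(rewards):
    R = r + gamma * R
    returns.append(R)
  return list(reversed(returns))

# A 5-step rollout (t_max=5) that did not reach a terminal state:
print(n_step_returns([0.0, 0.0, 1.0, 0.0, 0.0], bootstrap_value=0.7, done=False))
```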
PolicyGradient/a3c/estimators_test.py

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@

import unittest
import gym
import sys
import os
import numpy as np
import tensorflow as tf

from inspect import getsourcefile
current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0)))
import_path = os.path.abspath(os.path.join(current_path, "../.."))

if import_path not in sys.path:
  sys.path.append(import_path)

# from lib import plotting
from lib.atari.state_processor import StateProcessor
from lib.atari import helpers as atari_helpers
from estimators import ValueEstimator, PolicyEstimator


def make_env():
  return gym.envs.make("Breakout-v0")

VALID_ACTIONS = [0, 1, 2, 3]

class PolicyEstimatorTest(tf.test.TestCase):
  def testPredict(self):
    env = make_env()
    sp = StateProcessor()
    estimator = PolicyEstimator(len(VALID_ACTIONS))

    with self.test_session() as sess:
      sess.run(tf.initialize_all_variables())

      # Generate a state
      state = sp.process(env.reset())
      processed_state = atari_helpers.atari_make_initial_state(state)
      processed_states = np.array([processed_state])

      # Run feeds
      feed_dict = {
        estimator.states: processed_states,
        estimator.targets: [1.0],
        estimator.actions: [1]
      }
      loss = sess.run(estimator.loss, feed_dict)
      pred = sess.run(estimator.predictions, feed_dict)

      # Assertions
      self.assertTrue(loss != 0.0)
      self.assertEqual(pred["probs"].shape, (1, len(VALID_ACTIONS)))
      self.assertEqual(pred["logits"].shape, (1, len(VALID_ACTIONS)))

  def testGradient(self):
    env = make_env()
    sp = StateProcessor()
    estimator = PolicyEstimator(len(VALID_ACTIONS))
    grads = [g for g, _ in estimator.grads_and_vars]

    with self.test_session() as sess:
      sess.run(tf.initialize_all_variables())

      # Generate a state
      state = sp.process(env.reset())
      processed_state = atari_helpers.atari_make_initial_state(state)
      processed_states = np.array([processed_state])

      # Run feeds to get gradients
      feed_dict = {
        estimator.states: processed_states,
        estimator.targets: [1.0],
        estimator.actions: [1]
      }
      grads_ = sess.run(grads, feed_dict)

      # Apply calculated gradients
      grad_feed_dict = { k: v for k, v in zip(grads, grads_) }
      _ = sess.run(estimator.train_op, grad_feed_dict)


class ValueEstimatorTest(tf.test.TestCase):
  def testPredict(self):
    env = make_env()
    sp = StateProcessor()
    estimator = ValueEstimator()

    with self.test_session() as sess:
      sess.run(tf.initialize_all_variables())

      # Generate a state
      state = sp.process(env.reset())
      processed_state = atari_helpers.atari_make_initial_state(state)
      processed_states = np.array([processed_state])

      # Run feeds
      feed_dict = {
        estimator.states: processed_states,
        estimator.targets: [1.0],
      }
      loss = sess.run(estimator.loss, feed_dict)
      pred = sess.run(estimator.predictions, feed_dict)

      # Assertions
      self.assertTrue(loss != 0.0)
      self.assertEqual(pred["logits"].shape, (1,))

  def testGradient(self):
    env = make_env()
    sp = StateProcessor()
    estimator = ValueEstimator()
    grads = [g for g, _ in estimator.grads_and_vars]

    with self.test_session() as sess:
      sess.run(tf.initialize_all_variables())

      # Generate a state
      state = sp.process(env.reset())
      processed_state = atari_helpers.atari_make_initial_state(state)
      processed_states = np.array([processed_state])

      # Run feeds
      feed_dict = {
        estimator.states: processed_states,
        estimator.targets: [1.0],
      }
      grads_ = sess.run(grads, feed_dict)

      # Apply calculated gradients
      grad_feed_dict = { k: v for k, v in zip(grads, grads_) }
      _ = sess.run(estimator.train_op, grad_feed_dict)

if __name__ == '__main__':
  unittest.main()
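A note on the two `testGradient` cases above: they rely on the fact that TensorFlow 1.x graph mode lets you feed values for arbitrary graph tensors, not just placeholders. Evaluating the gradient tensors first and then feeding those values back while running `train_op` applies externally computed gradients, which is the same mechanism an A3C worker can use to push gradients computed from its local rollout into a shared network. Below is a minimal, self-contained sketch of just that trick; the toy variable, loss, and learning rate are made up for illustration and are not part of this commit.

```python
import tensorflow as tf

x = tf.Variable(3.0)
loss = tf.square(x)

optimizer = tf.train.GradientDescentOptimizer(0.1)
grads_and_vars = optimizer.compute_gradients(loss)
grads = [g for g, _ in grads_and_vars]
train_op = optimizer.apply_gradients(grads_and_vars)

with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())
  # 1) Evaluate the gradients "locally"...
  grad_values = sess.run(grads)
  # 2) ...then feed those values back in for the gradient tensors and apply them.
  feed_dict = {g: v for g, v in zip(grads, grad_values)}
  sess.run(train_op, feed_dict=feed_dict)
  print(sess.run(x))  # 3.0 - 0.1 * d(x^2)/dx = 3.0 - 0.1 * 6.0 = 2.4
```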

PolicyGradient/a3c/estimators.py

Lines changed: 172 additions & 0 deletions

@@ -0,0 +1,172 @@

import numpy as np
import tensorflow as tf

def build_shared_network(X, add_summaries=False):
  """
  Builds a 3-layer network conv -> conv -> fc as described
  in the A3C paper. This network is shared by both the policy and value net.

  Args:
    X: Inputs
    add_summaries: If true, add layer summaries to Tensorboard.

  Returns:
    Final layer activations.
  """

  # Two convolutional layers
  conv1 = tf.contrib.layers.conv2d(
    X, 16, 8, 4, activation_fn=tf.nn.relu, scope="conv1")
  conv2 = tf.contrib.layers.conv2d(
    conv1, 32, 4, 2, activation_fn=tf.nn.relu, scope="conv2")

  # Fully connected layer
  fc1 = tf.contrib.layers.fully_connected(
    inputs=tf.contrib.layers.flatten(conv2),
    num_outputs=256,
    scope="fc1")

  if add_summaries:
    tf.contrib.layers.summarize_activation(conv1)
    tf.contrib.layers.summarize_activation(conv2)
    tf.contrib.layers.summarize_activation(fc1)

  return fc1

class PolicyEstimator():
  """
  Policy Function approximator. Given an observation, returns probabilities
  over all possible actions.

  Args:
    num_outputs: Size of the action space.
    reuse: If true, an existing shared network will be re-used.
    trainable: If true we add train ops to the network.
      Actor threads that don't update their local models and don't need
      train ops would set this to false.
  """

  def __init__(self, num_outputs, reuse=False, trainable=True):
    self.num_outputs = num_outputs

    # Placeholders for our input
    # Our inputs are 4 grayscale frames of shape 84, 84 each
    self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
    # The TD target value
    self.targets = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
    # Integer id of which action was selected
    self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

    # Normalize
    X = tf.to_float(self.states) / 255.0
    batch_size = tf.shape(self.states)[0]

    # Graph shared with Value Net
    with tf.variable_scope("shared", reuse=reuse):
      fc1 = build_shared_network(X, add_summaries=(not reuse))

    with tf.variable_scope("policy_net"):
      self.logits = tf.contrib.layers.fully_connected(fc1, num_outputs, activation_fn=None)
      self.probs = tf.nn.softmax(self.logits) + 1e-8

      self.predictions = {
        "logits": self.logits,
        "probs": self.probs
      }

      # We add the policy entropy to the loss to encourage exploration
      self.cross_entropy = -tf.reduce_sum(self.probs * tf.log(self.probs), 1, name="cross_entropy")
      self.cross_entropy_mean = tf.reduce_mean(self.cross_entropy, name="cross_entropy_mean")

      # Get the predictions for the chosen actions only
      gather_indices = tf.range(batch_size) * tf.shape(self.probs)[1] + self.actions
      self.picked_action_probs = tf.gather(tf.reshape(self.probs, [-1]), gather_indices)

      self.losses = - (tf.log(self.picked_action_probs) * self.targets + 0.01 * self.cross_entropy)
      self.loss = tf.reduce_sum(self.losses, name="loss")

      tf.scalar_summary(self.loss.op.name, self.loss)
      tf.scalar_summary(self.cross_entropy_mean.op.name, self.cross_entropy_mean)
      tf.histogram_summary(self.cross_entropy.op.name, self.cross_entropy)

      if trainable:
        # self.optimizer = tf.train.AdamOptimizer(1e-4)
        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
        self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None]
        self.train_op = self.optimizer.apply_gradients(self.grads_and_vars,
          global_step=tf.contrib.framework.get_global_step())

    # Merge summaries from this network and the shared network (but not the value net)
    var_scope_name = tf.get_variable_scope().name
    summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
    sumaries = [s for s in summary_ops if "policy_net" in s.name or "shared" in s.name]
    sumaries = [s for s in summary_ops if var_scope_name in s.name]
    self.summaries = tf.merge_summary(sumaries)


class ValueEstimator():
  """
  Value Function approximator. Returns a value estimate for a batch of observations.

  Args:
    reuse: If true, an existing shared network will be re-used.
    trainable: If true we add train ops to the network.
      Actor threads that don't update their local models and don't need
      train ops would set this to false.
  """

  def __init__(self, reuse=False, trainable=True):
    # Placeholders for our input
    # Our inputs are 4 grayscale frames of shape 84, 84 each
    self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
    # The TD target value
    self.targets = tf.placeholder(shape=[None], dtype=tf.float32, name="y")

    X = tf.to_float(self.states) / 255.0
    batch_size = tf.shape(self.states)[0]

    # Graph shared with Policy Net
    with tf.variable_scope("shared", reuse=reuse):
      fc1 = build_shared_network(X, add_summaries=(not reuse))

    with tf.variable_scope("value_net"):
      self.logits = tf.contrib.layers.fully_connected(
        inputs=fc1,
        num_outputs=1,
        activation_fn=None)
      self.logits = tf.squeeze(self.logits, squeeze_dims=[1], name="logits")

      self.losses = tf.squared_difference(self.logits, self.targets)
      self.loss = tf.reduce_sum(self.losses, name="loss")

      self.predictions = {
        "logits": self.logits
      }

      # Summaries
      prefix = tf.get_variable_scope().name
      tf.scalar_summary(self.loss.name, self.loss)
      tf.scalar_summary("{}/max_value".format(prefix), tf.reduce_max(self.logits))
      tf.scalar_summary("{}/min_value".format(prefix), tf.reduce_min(self.logits))
      tf.scalar_summary("{}/mean_value".format(prefix), tf.reduce_mean(self.logits))
      tf.scalar_summary("{}/reward_max".format(prefix), tf.reduce_max(self.targets))
      tf.scalar_summary("{}/reward_min".format(prefix), tf.reduce_min(self.targets))
      tf.scalar_summary("{}/reward_mean".format(prefix), tf.reduce_mean(self.targets))
      tf.histogram_summary("{}/reward_targets".format(prefix), self.targets)
      tf.histogram_summary("{}/values".format(prefix), self.logits)

      if trainable:
        # self.optimizer = tf.train.AdamOptimizer(1e-4)
        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
        self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None]
        self.train_op = self.optimizer.apply_gradients(self.grads_and_vars,
          global_step=tf.contrib.framework.get_global_step())

    var_scope_name = tf.get_variable_scope().name
    summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
    sumaries = [s for s in summary_ops if "policy_net" in s.name or "shared" in s.name]
    sumaries = [s for s in summary_ops if var_scope_name in s.name]
    self.summaries = tf.merge_summary(sumaries)
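Written out, the two losses defined above are as follows; this is just a sketch of what the graph computes, where $R_i$ stands for whatever is fed into the `targets` placeholder and the entropy coefficient $0.01$ is taken directly from the code.

```latex
% Policy loss: log-probability of the chosen action weighted by the target R_i,
% plus a 0.01-weighted entropy bonus, summed over the batch.
% Value loss: summed squared error between the value head and the targets.
\begin{align*}
L_{\text{policy}} &= -\sum_i \Big( \log \pi(a_i \mid s_i)\, R_i + 0.01\, H\big(\pi(\cdot \mid s_i)\big) \Big),
  & H(\pi) &= -\sum_a \pi(a \mid s)\, \log \pi(a \mid s), \\
L_{\text{value}} &= \sum_i \big( V(s_i) - R_i \big)^2
\end{align*}
```

Because both estimators build their first layers inside the same `shared` variable scope, constructing a `PolicyEstimator` first and then a `ValueEstimator` with `reuse=True` gives the two heads a single set of convolutional and fully connected weights.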
