env = gym.make('CartPole-v1')
-env.seed(args.seed)
+env.reset(seed=args.seed)
torch.manual_seed(args.seed)
@@ -56,7 +56,7 @@ def forward(self, x):
        """
        x = F.relu(self.affine1(x))

-        # actor: choses action to take from state s_t
+        # actor: choses action to take from state s_t
        # by returning probability of each action
        action_prob = F.softmax(self.action_head(x), dim=-1)
@@ -65,7 +65,7 @@ def forward(self, x):

        # return values for both actor and critic as a tuple of 2 values:
        # 1. a list with the probability of each action over the action space
-        # 2. the value from state s_t
+        # 2. the value from state s_t
        return action_prob, state_values
@@ -113,7 +113,7 @@ def finish_episode():
    for (log_prob, value), R in zip(saved_actions, returns):
        advantage = R - value.item()

-        # calculate actor (policy) loss
+        # calculate actor (policy) loss
        policy_losses.append(-log_prob * advantage)

        # calculate critic (value) loss using L1 smooth loss
@@ -141,18 +141,18 @@ def main():
    for i_episode in count(1):

        # reset environment and episode reward
-        state = env.reset()
+        state, _ = env.reset()
        ep_reward = 0

-        # for each episode, only run 9999 steps so that we don't
+        # for each episode, only run 9999 steps so that we don't
        # infinite loop while learning
        for t in range(1, 10000):

            # select action from policy
            action = select_action(state)

            # take the action
-            state, reward, done, _ = env.step(action)
+            state, reward, done, _, _ = env.step(action)

            if args.render:
                env.render()
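The hunks above port the example from the pre-0.26 Gym API to the current Gym/Gymnasium API: seeding moves from env.seed(seed) into env.reset(seed=seed), reset() now returns an (observation, info) pair, and step() returns five values, splitting the old done flag into terminated and truncated. The patched step() line keeps the old done name and discards the truncated flag, which is the smallest change that keeps the example running. Below is a minimal sketch of the new-style interaction loop, assuming the gymnasium package is installed (gym >= 0.26 exposes the same calls); the random policy is only a stand-in for select_action(state).

import gymnasium as gym  # assumption: gymnasium package; gym >= 0.26 behaves the same

env = gym.make('CartPole-v1')

# seeding is done through reset(); env.seed() no longer exists
state, info = env.reset(seed=543)

ep_reward = 0.0
for t in range(1, 10000):
    # a random policy stands in for select_action(state)
    action = env.action_space.sample()

    # step() returns five values: the old done flag is split into terminated/truncated
    state, reward, terminated, truncated, info = env.step(action)
    ep_reward += reward

    if terminated or truncated:
        break

env.close()
print(f'episode ended after {t} steps with return {ep_reward}')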