 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-import torch.autograd as autograd
 from torch.autograd import Variable
+from torch.distributions import Multinomial


 parser = argparse.ArgumentParser(description='PyTorch actor-critic example')
@@ -29,7 +29,9 @@
 torch.manual_seed(args.seed)


-SavedAction = namedtuple('SavedAction', ['action', 'value'])
+SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
+
+
 class Policy(nn.Module):
     def __init__(self):
         super(Policy, self).__init__()
@@ -54,29 +56,28 @@ def forward(self, x):
 def select_action(state):
     state = torch.from_numpy(state).float().unsqueeze(0)
     probs, state_value = model(Variable(state))
-    action = probs.multinomial()
-    model.saved_actions.append(SavedAction(action, state_value))
+    m = Multinomial(probs)
+    action = m.sample()
+    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))
     return action.data


 def finish_episode():
     R = 0
     saved_actions = model.saved_actions
-    value_loss = 0
+    policy_loss, value_loss = 0, 0
     rewards = []
     for r in model.rewards[::-1]:
         R = r + args.gamma * R
         rewards.insert(0, R)
     rewards = torch.Tensor(rewards)
     rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
-    for (action, value), r in zip(saved_actions, rewards):
-        reward = r - value.data[0,0]
-        action.reinforce(reward)
+    for (log_prob, value), r in zip(saved_actions, rewards):
+        reward = r - value.data[0, 0]
+        policy_loss -= (log_prob * reward).sum()
         value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r])))
     optimizer.zero_grad()
-    final_nodes = [value_loss] + list(map(lambda p: p.action, saved_actions))
-    gradients = [torch.ones(1)] + [None] * len(saved_actions)
-    autograd.backward(final_nodes, gradients)
+    (policy_loss + value_loss).backward()
     optimizer.step()
     del model.rewards[:]
     del model.saved_actions[:]
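Taken together, this hunk replaces the removed `action.reinforce()` / `autograd.backward(final_nodes, gradients)` mechanism with explicit log-probabilities from `torch.distributions`: `select_action` stores `log_prob` and the state value, and `finish_episode` folds them into a single differentiable loss. Below is a minimal, self-contained sketch of that pattern, not the file itself: it assumes `model` returns `(action_probs, state_value)` and that `optimizer`, `SavedAction`, `args.gamma`, and the `rewards` / `saved_actions` buffers exist as in the surrounding script, and it uses `Categorical`, which on recent PyTorch releases covers the single-draw case that this diff's `Multinomial(probs)` alias handled.

# Sketch only: `model`, `optimizer`, `SavedAction`, `args`, and the
# `rewards` / `saved_actions` buffers are assumed from elsewhere in the script.
import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Categorical

def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs, state_value = model(state)
    m = Categorical(probs)        # distribution over the discrete actions
    action = m.sample()           # sample one action index
    # keep log pi(a|s) and V(s) so finish_episode can build both losses
    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))
    return action.item()

def finish_episode():
    R, policy_loss, value_loss = 0, 0, 0
    returns = []
    for r in model.rewards[::-1]:             # discounted returns, computed backwards
        R = r + args.gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + np.finfo(np.float32).eps)
    for (log_prob, value), R in zip(model.saved_actions, returns):
        advantage = R - value.item()          # return minus the value baseline
        policy_loss = policy_loss - (log_prob * advantage).sum()
        value_loss = value_loss + F.smooth_l1_loss(value.squeeze(), R)
    optimizer.zero_grad()
    (policy_loss + value_loss).backward()     # one backward pass replaces .reinforce()
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]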
@@ -85,9 +86,9 @@ def finish_episode():
 running_reward = 10
 for i_episode in count(1):
     state = env.reset()
-    for t in range(10000): # Don't infinite loop while learning
+    for t in range(10000):  # Don't infinite loop while learning
         action = select_action(state)
-        state, reward, done, _ = env.step(action[0,0])
+        state, reward, done, _ = env.step(action[0, 0])
         if args.render:
             env.render()
         model.rewards.append(reward)
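This last hunk only adjusts how the sampled action is unpacked before it reaches `env.step`. For reference, a minimal loop matching the sketch above (the classic Gym step API is assumed; with `select_action` returning `action.item()`, the `action[0, 0]` indexing kept by the diff is no longer needed):

# Sketch only: `env`, `model`, and select_action from the snippet above are assumed.
state = env.reset()
for t in range(10000):                 # don't loop forever while learning
    action = select_action(state)      # a plain int when select_action returns action.item()
    state, reward, done, _ = env.step(action)
    model.rewards.append(reward)
    if done:
        break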