@@ -63,15 +63,29 @@ def make_inputs(self) -> None:
 
         if self.policy_model.brain.vector_action_space_type == "continuous":
             action_length = self.policy_model.act_size[0]
+            self.action_in_policy = tf.placeholder(
+                shape=[None, action_length], dtype=tf.float32
+            )
             self.action_in_expert = tf.placeholder(
                 shape=[None, action_length], dtype=tf.float32
             )
             self.expert_action = tf.identity(self.action_in_expert)
+            self.policy_action = tf.identity(self.action_in_policy)
         else:
             action_length = len(self.policy_model.act_size)
+            self.action_in_policy = tf.placeholder(
+                shape=[None, action_length], dtype=tf.int32
+            )
             self.action_in_expert = tf.placeholder(
                 shape=[None, action_length], dtype=tf.int32
             )
+            self.policy_action = tf.concat(
+                [
+                    tf.one_hot(self.action_in_policy[:, i], act_size)
+                    for i, act_size in enumerate(self.policy_model.act_size)
+                ],
+                axis=1,
+            )
             self.expert_action = tf.concat(
                 [
                     tf.one_hot(self.action_in_expert[:, i], act_size)
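The discrete branch above gives the discriminator its own policy_action in the same form as expert_action: an int32 placeholder of shape [batch, num_branches] is expanded into one one-hot vector per branch and concatenated along the feature axis. A minimal standalone sketch of that transform, assuming TensorFlow 1.x (as the tf.placeholder calls imply); the two branch sizes below are made up for illustration:

import numpy as np
import tensorflow as tf  # TensorFlow 1.x API

# Hypothetical action space: two discrete branches with 3 and 2 choices.
act_size = [3, 2]
action_in = tf.placeholder(shape=[None, len(act_size)], dtype=tf.int32)

# One one-hot vector per branch, concatenated along the feature axis,
# mirroring the policy_action / expert_action construction in the diff.
action_one_hot = tf.concat(
    [tf.one_hot(action_in[:, i], size) for i, size in enumerate(act_size)],
    axis=1,
)

with tf.Session() as sess:
    batch = np.array([[0, 1], [2, 0]], dtype=np.int32)  # two sampled actions
    print(sess.run(action_one_hot, feed_dict={action_in: batch}))
    # [[1. 0. 0. 0. 1.]
    #  [0. 0. 1. 1. 0.]]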
@@ -84,6 +98,9 @@ def make_inputs(self) -> None:
         encoded_expert_list = []
 
         if self.policy_model.vec_obs_size > 0:
+            self.vector_in = tf.placeholder(
+                shape=[None, self.policy_model.vec_obs_size], dtype=tf.float32
+            )
             self.obs_in_expert = tf.placeholder(
                 shape=[None, self.policy_model.vec_obs_size], dtype=tf.float32
             )
@@ -92,26 +109,33 @@ def make_inputs(self) -> None:
                     self.policy_model.normalize_vector_obs(self.obs_in_expert)
                 )
                 encoded_policy_list.append(
-                    self.policy_model.normalize_vector_obs(self.policy_model.vector_in)
+                    self.policy_model.normalize_vector_obs(self.vector_in)
                 )
             else:
                 encoded_expert_list.append(self.obs_in_expert)
-                encoded_policy_list.append(self.policy_model.vector_in)
+                encoded_policy_list.append(self.vector_in)
 
         if self.policy_model.vis_obs_size > 0:
             self.expert_visual_in: List[tf.Tensor] = []
+            self.visual_in: List[tf.Tensor] = []
             visual_policy_encoders = []
             visual_expert_encoders = []
             for i in range(self.policy_model.vis_obs_size):
-                # Create input ops for next (t+1) visual observations.
+                # Create input ops for visual observations.
                 visual_input = self.policy_model.create_visual_input(
                     self.policy_model.brain.camera_resolutions[i],
-                    name="visual_observation_" + str(i),
+                    name="gail_visual_observation_" + str(i),
+                )
+                self.visual_in.append(visual_input)
+                # Create input ops for expert visual observations.
+                ex_visual_input = self.policy_model.create_visual_input(
+                    self.policy_model.brain.camera_resolutions[i],
+                    name="expert_visual_observation_" + str(i),
                 )
-                self.expert_visual_in.append(visual_input)
+                self.expert_visual_in.append(ex_visual_input)
 
                 encoded_policy_visual = self.policy_model.create_visual_observation_encoder(
-                    self.policy_model.visual_in[i],
+                    self.visual_in[i],
                     self.encoding_size,
                     LearningModel.swish,
                     1,
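The point of this hunk is that the discriminator now owns its observation inputs (self.vector_in and the gail_visual_observation_* placeholders) instead of borrowing the policy model's vector_in / visual_in, so policy and expert batches can be fed independently of the policy graph. A small self-contained sketch of that pattern, with illustrative names and sizes rather than the toolkit's API:

import numpy as np
import tensorflow as tf  # TensorFlow 1.x API

VEC_OBS_SIZE = 4  # stand-in for self.policy_model.vec_obs_size

# Two placeholders with the same shape but distinct names, mirroring
# self.vector_in (policy batch) and self.obs_in_expert (expert batch).
vector_in = tf.placeholder(
    shape=[None, VEC_OBS_SIZE], dtype=tf.float32, name="gail_vector_in"
)
obs_in_expert = tf.placeholder(
    shape=[None, VEC_OBS_SIZE], dtype=tf.float32, name="gail_obs_in_expert"
)
# Any op that consumes both streams; the real code encodes each list separately.
diff = tf.reduce_mean(vector_in - obs_in_expert)

with tf.Session() as sess:
    policy_batch = np.random.rand(8, VEC_OBS_SIZE).astype(np.float32)
    expert_batch = np.random.rand(8, VEC_OBS_SIZE).astype(np.float32)
    print(sess.run(diff, feed_dict={vector_in: policy_batch, obs_in_expert: expert_batch}))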
@@ -217,10 +241,7 @@ def create_network(self) -> None:
             self.encoded_expert, self.expert_action, self.done_expert, reuse=False
         )
         self.policy_estimate, self.z_mean_policy, _ = self.create_encoder(
-            self.encoded_policy,
-            self.policy_model.selected_actions,
-            self.done_policy,
-            reuse=True,
+            self.encoded_policy, self.policy_action, self.done_policy, reuse=True
         )
         self.discriminator_score = tf.reshape(
             self.policy_estimate, [-1], name="GAIL_reward"
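With dedicated policy placeholders in place, the policy-side estimate goes through the same create_encoder weights as the expert side, only with reuse=True on the second call. A rough sketch of that variable-reuse pattern in TensorFlow 1.x, using a toy linear scorer in place of the real create_encoder:

import numpy as np
import tensorflow as tf  # TensorFlow 1.x API

def score(state_action, reuse):
    # Toy stand-in for create_encoder: reuse=True makes the second call share
    # the variables created by the first, so expert and policy batches are
    # scored by the same discriminator weights.
    with tf.variable_scope("gail_discriminator", reuse=reuse):
        w = tf.get_variable("w", shape=[6, 1])
        b = tf.get_variable("b", shape=[1])
        return tf.nn.sigmoid(tf.matmul(state_action, w) + b)

expert_in = tf.placeholder(shape=[None, 6], dtype=tf.float32)
policy_in = tf.placeholder(shape=[None, 6], dtype=tf.float32)

expert_estimate = score(expert_in, reuse=False)  # creates the variables
policy_estimate = score(policy_in, reuse=True)   # reuses them

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(
        [expert_estimate, policy_estimate],
        feed_dict={
            expert_in: np.random.rand(4, 6).astype(np.float32),
            policy_in: np.random.rand(4, 6).astype(np.float32),
        },
    )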
@@ -233,11 +254,7 @@ def create_gradient_magnitude(self) -> tf.Tensor:
         for off-policy. Compute gradients w.r.t randomly interpolated input.
         """
         expert = [self.encoded_expert, self.expert_action, self.done_expert]
-        policy = [
-            self.encoded_policy,
-            self.policy_model.selected_actions,
-            self.done_policy,
-        ]
+        policy = [self.encoded_policy, self.policy_action, self.done_policy]
         interp = []
         for _expert_in, _policy_in in zip(expert, policy):
             alpha = tf.random_uniform(tf.shape(_expert_in))
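Replacing the policy list here keeps the gradient penalty symmetric: each expert tensor is zipped with its policy counterpart (policy_action now standing in for selected_actions) and blended element-wise with an alpha drawn from U(0, 1). The interpolation itself continues below the visible hunk; a hedged, self-contained sketch of the usual pattern, with a toy quadratic score in place of the real discriminator:

import numpy as np
import tensorflow as tf  # TensorFlow 1.x API

expert_in = tf.placeholder(shape=[None, 3], dtype=tf.float32)
policy_in = tf.placeholder(shape=[None, 3], dtype=tf.float32)

# alpha has the same shape as the inputs, so every element is blended
# independently; interp lies between the expert and policy batches.
alpha = tf.random_uniform(tf.shape(expert_in))
interp = alpha * expert_in + (1.0 - alpha) * policy_in

score = tf.reduce_sum(tf.square(interp), axis=1)  # toy discriminator output
grad = tf.gradients(score, [interp])[0]           # d(score)/d(interp), shape [batch, 3]
grad_magnitude = tf.sqrt(tf.reduce_sum(tf.square(grad), axis=1))

with tf.Session() as sess:
    print(
        sess.run(
            grad_magnitude,
            feed_dict={
                expert_in: np.ones((4, 3), dtype=np.float32),
                policy_in: np.zeros((4, 3), dtype=np.float32),
            },
        )
    )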