diff --git a/rl_coach/architectures/tensorflow_components/heads/ppo_head.py b/rl_coach/architectures/tensorflow_components/heads/ppo_head.py
index 63f95a3ba..54a8049e5 100644
--- a/rl_coach/architectures/tensorflow_components/heads/ppo_head.py
+++ b/rl_coach/architectures/tensorflow_components/heads/ppo_head.py
@@ -110,8 +110,8 @@ def _build_discrete_net(self, input_layer, action_space):
         self.policy_mean = tf.nn.softmax(policy_values, name="policy")
 
         # define the distributions for the policy and the old policy
-        self.policy_distribution = tf.contrib.distributions.Categorical(probs=self.policy_mean)
-        self.old_policy_distribution = tf.contrib.distributions.Categorical(probs=self.old_policy_mean)
+        self.policy_distribution = tf.contrib.distributions.Categorical(probs=self.policy_mean + eps)
+        self.old_policy_distribution = tf.contrib.distributions.Categorical(probs=self.old_policy_mean + eps)
 
         self.output = self.policy_mean
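
For context (not part of the patch): the change guards against softmax outputs that saturate to an exact zero in float32, which makes `Categorical.log_prob` return `-inf` and propagates NaNs through the PPO likelihood ratio and KL terms. Below is a minimal NumPy sketch of the failure mode; it assumes `eps` is the small module-level constant the patch relies on, and the value 1e-15 used here is an assumption, not taken from the source.

```python
import numpy as np

# Assumed value; the actual eps constant defined in ppo_head.py may differ.
eps = 1e-15

# A saturated softmax output: action 1 carries exactly zero probability
# (this happens in float32 once one logit dominates the others enough).
policy_mean = np.array([1.0, 0.0, 0.0], dtype=np.float32)

# Without the epsilon, the log-probability of the zero-probability action
# is -inf, which then poisons the PPO ratio and KL-penalty computations.
with np.errstate(divide="ignore"):
    print(np.log(policy_mean[1]))       # -inf

# With the epsilon added, the log-probability stays finite.
print(np.log(policy_mean[1] + eps))     # about -34.5
```

The trade-off is that each distribution's probabilities become very slightly unnormalized (they sum to 1 + num_actions * eps), a negligible bias exchanged for numerical stability.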