Hotfix 0.6.0a to develop #1589

Merged 9 commits into develop on Jan 11, 2019
Fix for divide-by-zero error with Discrete Actions (#1520)
* Enable buffer padding to be set to a value other than 0

Allows the buffer padding in `AgentBufferField` to be set to a custom value. In particular, 0-padding for `action_masks` causes a divide-by-zero error; masks should be padded with 1's instead.

The padding value is passed as a parameter to the `append` method, so that it can be set on the first `append` after an `AgentBufferField` is instantiated.
ervteng authored Dec 21, 2018
commit 73199e818284afb31a4ecdf268d27af84e3a858e
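
Why does a 0-padded mask divide by zero? When discrete-action probabilities are renormalized against the mask, an all-zero mask row makes the normalizing sum zero. The sketch below is illustrative only: `masked_probs` is a hypothetical helper, not the trainer's actual TensorFlow graph, but the failure mode is the same.

import numpy as np

def masked_probs(logits, mask):
    # Hypothetical helper: zero out masked-off actions, then renormalize.
    probs = np.exp(logits) * mask
    return probs / probs.sum()

logits = np.array([0.5, 1.0, -0.3])
print(masked_probs(logits, np.array([1.0, 0.0, 1.0])))  # valid mask: sums to 1
# An all-zero (0-padded) mask makes probs.sum() == 0, so the division
# produces NaNs (numpy warns; in the trainer this poisons the loss).
print(masked_probs(logits, np.array([0.0, 0.0, 0.0])))  # [nan nan nan]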
ml-agents/mlagents/trainers/buffer.py (19 changes: 17 additions & 2 deletions)
@@ -28,12 +28,27 @@ class AgentBufferField(list):
             AgentBufferField with the append method.
             """

+            def __init__(self):
+                self.padding_value = 0
+                super(Buffer.AgentBuffer.AgentBufferField, self).__init__()
+
             def __str__(self):
                 return str(np.array(self).shape)

+            def append(self, element, padding_value=0):
+                """
+                Adds an element to this list. Also lets you set the padding
+                value on append (e.g. action_masks should be padded with 1.)
+                :param element: The element to append to the list.
+                :param padding_value: The value used to pad when get_batch is called.
+                """
+                super(Buffer.AgentBuffer.AgentBufferField, self).append(element)
+                self.padding_value = padding_value
+
             def extend(self, data):
                 """
-                Ads a list of np.arrays to the end of the list of np.arrays.
+                Adds a list of np.arrays to the end of the list of np.arrays.
                 :param data: The np.array list to append.
                 """
                 self += list(np.array(data))
@@ -99,7 +114,7 @@ def get_batch(self, batch_size=None, training_length=1, sequential=True):
                     raise BufferException("The batch size and training length requested for get_batch were"
                                           " too large given the current number of data points.")
                 tmp_list = []
-                padding = np.array(self[-1]) * 0
+                padding = np.array(self[-1]) * self.padding_value
                 # The padding is the last element scaled by padding_value, so it keeps that element's shape
                 for end in range(len(self), len(self) % training_length, -training_length)[:batch_size]:
                     tmp_list += [np.array(self[end - training_length:end])]
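
A standalone sketch of the patched behavior, with `PaddedField` and `pad_element` as simplified stand-ins for the nested `Buffer.AgentBuffer.AgentBufferField` and the padding line in `get_batch`:

import numpy as np

class PaddedField(list):
    # Simplified stand-in for AgentBufferField; not the ml-agents class.
    def __init__(self):
        self.padding_value = 0
        super().__init__()

    def append(self, element, padding_value=0):
        super().append(element)
        self.padding_value = padding_value

    def pad_element(self):
        # Mirrors the patched line: the last element scaled by padding_value.
        return np.array(self[-1]) * self.padding_value

masks = PaddedField()
masks.append(np.array([1.0, 0.0, 1.0]), padding_value=1)
print(masks.pad_element())  # [1. 0. 1.] -- reuses the last valid mask
# With the old hard-coded * 0, this would be [0. 0. 0.]: a mask that
# disables every action and triggers the divide-by-zero downstream.
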
ml-agents/mlagents/trainers/ppo/trainer.py (2 changes: 1 addition & 1 deletion)
@@ -224,7 +224,7 @@ def add_experiences(self, curr_all_info: AllBrainInfo, next_all_info: AllBrainInfo
                            epsilons[idx])
                    else:
                        self.training_buffer[agent_id]['action_mask'].append(
-                            stored_info.action_masks[idx])
+                            stored_info.action_masks[idx], padding_value=1)
                    a_dist = stored_take_action_outputs['log_probs']
                    value = stored_take_action_outputs['value']
                    self.training_buffer[agent_id]['actions'].append(actions[idx])
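
Note that only the `action_mask` field opts into `padding_value=1`; every other field keeps the default 0, which is a harmless filler for quantities like actions and rewards. A small illustration with made-up values:

import numpy as np

last_reward = np.array([0.7])
last_mask = np.array([1.0, 0.0, 1.0])

print(last_reward * 0)  # [0.] -- neutral filler for a reward sequence
print(last_mask * 0)    # [0. 0. 0.] -- invalid: claims no action is legal
print(last_mask * 1)    # [1. 0. 1.] -- the last valid mask, reused as padding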