
Added logging per Brain of time to update policy, time elapsed during training, time to collect experiences, buffer length, average return per policy #1858


Merged
merged 2 commits on Apr 3, 2019
Added logging per Brain of time to update policy, time elapsed during training, time to collect experiences, buffer length, average return
eshvk committed Apr 2, 2019
commit 4849ef0e188775f51e5351a87634f9ecf8a74f5b
5 changes: 5 additions & 0 deletions docs/Training-ML-Agents.md
@@ -146,6 +146,11 @@ environment, you can set the following command line options when invoking
training doesn't involve visual observations (reading from Pixels). See
[here](https://docs.unity3d.com/Manual/CommandLineArguments.html) for more
details.
* `--debug` - Specify this option to run ML-Agents in debug mode and log Trainer
Metrics to a CSV stored in the `summaries` directory. The metrics stored are:
brain name, time to update policy, time since start of training, time for last
experience collection, number of experiences used for training, and mean return.
This option is currently not available for Imitation Learning.

### Training config file

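To make the documented `--debug` option concrete, here is a hedged sketch of inspecting the resulting Trainer Metrics CSV after a run; the filename pattern under `summaries/` and the `Mean return` column name are assumptions based on the metric list above, not on code shown in this diff.

```python
# Hypothetical post-run inspection of the Trainer Metrics CSV written when
# --debug is passed. The glob pattern and column header are assumptions.
import csv
import glob

for path in glob.glob("summaries/*.csv"):
    with open(path, newline="") as f:
        rows = list(csv.DictReader(f))
    returns = [float(r["Mean return"]) for r in rows if r.get("Mean return")]
    if returns:
        print(path, "average of logged mean returns:", sum(returns) / len(returns))
```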
1 change: 1 addition & 0 deletions ml-agents/mlagents/trainers/__init__.py
@@ -3,6 +3,7 @@
from .curriculum import *
from .meta_curriculum import *
from .models import *
from .trainer_metrics import *
from .trainer import *
from .policy import *
from .trainer_controller import *
3 changes: 2 additions & 1 deletion ml-agents/mlagents/trainers/bc/offline_trainer.py
@@ -15,7 +15,8 @@
class OfflineBCTrainer(BCTrainer):
"""The OfflineBCTrainer is an implementation of Offline Behavioral Cloning."""

def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
def __init__(self, brain, trainer_parameters, training, load, seed,
run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).
3 changes: 2 additions & 1 deletion ml-agents/mlagents/trainers/bc/online_trainer.py
@@ -14,7 +14,8 @@
class OnlineBCTrainer(BCTrainer):
"""The OnlineBCTrainer is an implementation of Online Behavioral Cloning."""

def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
def __init__(self, brain, trainer_parameters, training, load, seed,
run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).
12 changes: 5 additions & 7 deletions ml-agents/mlagents/trainers/bc/trainer.py
@@ -3,7 +3,6 @@
# Contains an implementation of Behavioral Cloning Algorithm

import logging
import os

import numpy as np
import tensorflow as tf
@@ -19,7 +18,8 @@
class BCTrainer(Trainer):
"""The BCTrainer is an implementation of Behavioral Cloning."""

def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
def __init__(self, brain, trainer_parameters, training, load, seed,
run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).
@@ -28,22 +28,20 @@ def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
:param seed: The seed the model will be initialized with
:param run_id: The identifier of the current run
"""
super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id)
super(BCTrainer, self).__init__(brain, trainer_parameters, training,
run_id)
self.policy = BCPolicy(seed, brain, trainer_parameters, load)
self.n_sequences = 1
self.cumulative_rewards = {}
self.episode_steps = {}
self.stats = {'Losses/Cloning Loss': [], 'Environment/Episode Length': [],
'Environment/Cumulative Reward': []}

self.summary_path = trainer_parameters['summary_path']
self.batches_per_epoch = trainer_parameters['batches_per_epoch']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)


self.demonstration_buffer = Buffer()
self.evaluation_buffer = Buffer()
self.summary_writer = tf.summary.FileWriter(self.summary_path)
Contributor
Did you test that BC is still training and logging to TensorBoard?

Contributor Author
Just tried it; it looks like it is still running and logging.

@property
def parameters(self):
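The summary-directory creation and `FileWriter` setup removed from `BCTrainer.__init__` presumably move into the shared `Trainer` base class, which is not shown in these hunks. The following is only a minimal sketch of what that shared initialization could look like, reusing the `summary_path` key from the surrounding diff; names beyond that are illustrative.

```python
# Hypothetical shared setup in the Trainer base class; the real base-class
# change is not visible in this diff.
import os
import tensorflow as tf

class Trainer(object):
    def __init__(self, brain, trainer_parameters, training, run_id):
        self.brain_name = brain.brain_name  # assumes brain exposes brain_name
        self.trainer_parameters = trainer_parameters
        self.is_training = training
        self.run_id = run_id
        # Create the summaries directory once here instead of in every subclass.
        self.summary_path = trainer_parameters['summary_path']
        if not os.path.exists(self.summary_path):
            os.makedirs(self.summary_path)
        self.summary_writer = tf.summary.FileWriter(self.summary_path)
```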
23 changes: 14 additions & 9 deletions ml-agents/mlagents/trainers/learn.py
@@ -46,7 +46,6 @@ def run_training(sub_id: int, run_seed: int, run_options, process_queue):
fast_simulation = not bool(run_options['--slow'])
no_graphics = run_options['--no-graphics']
trainer_config_path = run_options['<trainer-config-path>']

# Recognize and use docker volume if one is passed as an argument
if not docker_target_name:
model_path = './models/{run_id}'.format(run_id=run_id)
@@ -79,7 +78,8 @@ def run_training(sub_id: int, run_seed: int, run_options, process_queue):
tc = TrainerController(model_path, summaries_dir, run_id + '-' + str(sub_id),
save_freq, maybe_meta_curriculum,
load_model, train_model,
keep_checkpoints, lesson, external_brains, run_seed)
keep_checkpoints, lesson, external_brains,
run_seed)

# Signal that environment has been launched.
process_queue.put(True)
@@ -155,9 +155,9 @@ def init_environment(env_path, docker_target_name, no_graphics, worker_id, fast_
if docker_training and env_path is not None:
"""
Comments for future maintenance:
Some OS/VM instances (e.g. COS GCP Image) mount filesystems
with COS flag which prevents execution of the Unity scene,
to get around this, we will copy the executable into the
Some OS/VM instances (e.g. COS GCP Image) mount filesystems
with COS flag which prevents execution of the Unity scene,
to get around this, we will copy the executable into the
container.
"""
# Navigate in docker path and find env_path and copy it.
@@ -175,7 +175,7 @@ def main():
def main():
try:
print('''

▄▄▄▓▓▓▓
╓▓▓▓▓▓▓█▓▓▓▓▓
,▄▄▄m▀▀▀' ,▓▓▓▀▓▓▄ ▓▓▓ ▓▓▌
@@ -193,7 +193,6 @@ def main():
except:
print('\n\n\tUnity Technologies\n')

logger = logging.getLogger('mlagents.trainers')
_USAGE = '''
Usage:
mlagents-learn <trainer-config-path> [options]
@@ -206,18 +205,24 @@ def main():
--lesson=<n> Start learning from this lesson [default: 0].
--load Whether to load the model or randomly initialize [default: False].
--run-id=<path> The directory name for model and summary statistics [default: ppo].
--num-runs=<n> Number of concurrent training sessions [default: 1].
--num-runs=<n> Number of concurrent training sessions [default: 1].
--save-freq=<n> Frequency at which to save model [default: 50000].
--seed=<n> Random seed used for training [default: -1].
--slow Whether to run the game at training speed [default: False].
--train Whether to train model, or only run inference [default: False].
--worker-id=<n> Number to add to communication port (5005) [default: 0].
--docker-target-name=<dt> Docker volume to store training-specific files [default: None].
--no-graphics Whether to run the environment in no-graphics mode [default: False].
--debug Whether to run ML-Agents in debug mode with detailed logging [default: False].
'''

options = docopt(_USAGE)
logger.info(options)
trainer_logger = logging.getLogger('mlagents.trainers')
env_logger = logging.getLogger('mlagents.envs')
trainer_logger.info(options)
if options['--debug']:
trainer_logger.setLevel('DEBUG')
env_logger.setLevel('DEBUG')
num_runs = int(options['--num-runs'])
seed = int(options['--seed'])

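The `--debug` wiring added to `main()` only flips two logger levels. A small self-contained sketch of the same pattern follows, using only the standard library; the `configure_logging` helper and the `__main__` demo are illustrative, not part of `learn.py`.

```python
# Standalone illustration of the --debug flag-to-logger wiring used above.
import logging

def configure_logging(debug: bool) -> None:
    trainer_logger = logging.getLogger('mlagents.trainers')
    env_logger = logging.getLogger('mlagents.envs')
    # setLevel accepts string level names, matching the calls in the diff.
    level = 'DEBUG' if debug else 'INFO'
    trainer_logger.setLevel(level)
    env_logger.setLevel(level)

if __name__ == '__main__':
    logging.basicConfig()  # attach a handler so the demo message is visible
    configure_logging(debug=True)
    logging.getLogger('mlagents.trainers').debug('debug logging enabled')
```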
23 changes: 12 additions & 11 deletions ml-agents/mlagents/trainers/ppo/trainer.py
@@ -3,7 +3,6 @@
# Contains an implementation of PPO as described (https://arxiv.org/abs/1707.06347).

import logging
import os
from collections import deque

import numpy as np
@@ -20,7 +19,8 @@
class PPOTrainer(Trainer):
"""The PPOTrainer is an implementation of the PPO algorithm."""

def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id):
def __init__(self, brain, reward_buff_cap, trainer_parameters, training,
load, seed, run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).
@@ -29,7 +29,8 @@ def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, s
:param seed: The seed the model will be initialized with
:param run_id: The identifier of the current run
"""
super(PPOTrainer, self).__init__(brain, trainer_parameters, training, run_id)
super(PPOTrainer, self).__init__(brain, trainer_parameters,
training, run_id)
self.param_keys = ['batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units', 'lambd',
'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers',
'time_horizon', 'sequence_length', 'summary_freq', 'use_recurrent',
@@ -56,11 +57,6 @@ def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, s
self.cumulative_rewards = {}
self._reward_buffer = deque(maxlen=reward_buff_cap)
self.episode_steps = {}
self.summary_path = trainer_parameters['summary_path']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)

self.summary_writer = tf.summary.FileWriter(self.summary_path)

def __str__(self):
return '''Hyperparameters for the PPO Trainer of brain {0}: \n{1}'''.format(
@@ -275,6 +271,8 @@ def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo

self.training_buffer[agent_id].reset_agent()
if info.local_done[l]:
self.cumulative_returns_since_policy_update.append(self.
cumulative_rewards.get(agent_id, 0))
self.stats['Environment/Cumulative Reward'].append(
self.cumulative_rewards.get(agent_id, 0))
self.reward_buffer.appendleft(self.cumulative_rewards.get(agent_id, 0))
@@ -289,7 +287,7 @@ def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo

def end_episode(self):
"""
A signal that the Episode has ended. The buffer must be reset.
A signal that the Episode has ended. The buffer must be reset.
Get only called when the academy resets.
"""
self.training_buffer.reset_local_buffers()
@@ -313,13 +311,16 @@ def update_policy(self):
"""
Uses demonstration_buffer to update the policy.
"""
self.trainer_metrics.end_experience_collection_timer()
self.trainer_metrics.start_policy_update_timer(number_experiences=len(self.training_buffer.update_buffer['actions']),
mean_return = float(np.mean(self.cumulative_returns_since_policy_update)))
n_sequences = max(int(self.trainer_parameters['batch_size'] / self.policy.sequence_length), 1)
value_total, policy_total, forward_total, inverse_total = [], [], [], []
advantages = self.training_buffer.update_buffer['advantages'].get_batch()
self.training_buffer.update_buffer['advantages'].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10))
num_epoch = self.trainer_parameters['num_epoch']
for k in range(num_epoch):
for _ in range(num_epoch):
self.training_buffer.update_buffer.shuffle()
buffer = self.training_buffer.update_buffer
for l in range(len(self.training_buffer.update_buffer['actions']) // n_sequences):
@@ -337,7 +338,7 @@ def update_policy(self):
self.stats['Losses/Forward Loss'].append(np.mean(forward_total))
self.stats['Losses/Inverse Loss'].append(np.mean(inverse_total))
self.training_buffer.reset_update_buffer()

self.trainer_metrics.end_policy_update()

def discount_rewards(r, gamma=0.99, value_next=0.0):
"""
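The `trainer_metrics` module that `update_policy` calls into (`end_experience_collection_timer`, `start_policy_update_timer`, `end_policy_update`) is not included in the hunks shown here. As an assumption about its shape only, a minimal recorder with the same call sites might look like this; the real `mlagents.trainers.trainer_metrics` implementation may differ.

```python
# Hypothetical metrics recorder matching the call sites in update_policy;
# the actual mlagents.trainers.trainer_metrics implementation may differ.
import csv
import time

class TrainerMetrics:
    FIELD_NAMES = ['Brain name', 'Time to update policy',
                   'Time since start of training',
                   'Time for last experience collection',
                   'Number of experiences used for training', 'Mean return']

    def __init__(self, path, brain_name):
        self.path = path
        self.brain_name = brain_name
        self.rows = []
        self.start_time = time.time()
        self.exp_start = None
        self.update_start = None
        self.last_exp_duration = 0.0
        self.num_experiences = 0
        self.mean_return = 0.0

    def start_experience_collection_timer(self):
        self.exp_start = time.time()

    def end_experience_collection_timer(self):
        if self.exp_start is not None:
            self.last_exp_duration = time.time() - self.exp_start
            self.exp_start = None

    def start_policy_update_timer(self, number_experiences, mean_return):
        self.num_experiences = number_experiences
        self.mean_return = mean_return
        self.update_start = time.time()

    def end_policy_update(self):
        if self.update_start is None:
            return
        delta_update = time.time() - self.update_start
        self.rows.append([self.brain_name, delta_update,
                          time.time() - self.start_time,
                          self.last_exp_duration,
                          self.num_experiences, self.mean_return])
        self.update_start = None

    def write_training_metrics(self):
        # One CSV per brain, written into the summaries directory by the caller.
        with open(self.path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(self.FIELD_NAMES)
            writer.writerows(self.rows)
```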
1 change: 1 addition & 0 deletions ml-agents/mlagents/trainers/tests/test_learn.py
@@ -20,6 +20,7 @@ def basic_options():
'--slow': False,
'--no-graphics': False,
'<trainer-config-path>': 'basic_path',
'--debug': False,
}


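The test change above only adds `'--debug': False` to the mocked docopt options. For context, a standalone check that a docopt usage string with such a flag parses the way that dictionary assumes; the usage text here is a trimmed, illustrative subset of the real `_USAGE` string in `learn.py`, not a copy of it.

```python
# Verifies docopt's default-False behavior for a boolean --debug flag.
from docopt import docopt

_USAGE = '''
Usage:
  mlagents-learn <trainer-config-path> [options]

Options:
  --debug    Whether to run ML-Agents in debug mode with detailed logging [default: False].
'''

def test_debug_flag_parsing():
    assert docopt(_USAGE, argv=['basic_path'])['--debug'] is False
    assert docopt(_USAGE, argv=['basic_path', '--debug'])['--debug'] is True

if __name__ == '__main__':
    test_debug_flag_parsing()
    print('ok')
```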