
Added logging per Brain of time to update policy, time elapsed during training, time to collect experiences, buffer length, average return per policy #1858


Merged
merged 2 commits on Apr 3, 2019
Added logging per Brain of time to update policy, time elapsed during training, time to collect experiences, buffer length, average return
eshvk committed Apr 2, 2019
commit 4849ef0e188775f51e5351a87634f9ecf8a74f5b
5 changes: 5 additions & 0 deletions docs/Training-ML-Agents.md
@@ -146,6 +146,11 @@ environment, you can set the following command line options when invoking
training doesn't involve visual observations (reading from Pixels). See
[here](https://docs.unity3d.com/Manual/CommandLineArguments.html) for more
details.
* `--debug` - Specify this option to run ML-Agents in debug mode and log Trainer
Metrics to a CSV stored in the `summaries` directory. The metrics stored are:
brain name, time to update policy, time since start of training, time for last
experience collection, number of experiences used for training, and mean return.
This option is currently not available for Imitation Learning.

### Training config file

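To make the documented `--debug` option concrete, here is a hedged sketch of inspecting the resulting Trainer Metrics CSV after a run; the filename pattern under `summaries/` and the `Mean return` column name are assumptions based on the metric list above, not on code shown in this diff.

```python
# Hypothetical post-run inspection of the Trainer Metrics CSV written when
# --debug is passed. The glob pattern and column header are assumptions.
import csv
import glob

for path in glob.glob("summaries/*.csv"):
    with open(path, newline="") as f:
        rows = list(csv.DictReader(f))
    returns = [float(r["Mean return"]) for r in rows if r.get("Mean return")]
    if returns:
        print(path, "average of logged mean returns:", sum(returns) / len(returns))
```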
1 change: 1 addition & 0 deletions ml-agents/mlagents/trainers/__init__.py
@@ -3,6 +3,7 @@
from .curriculum import *
from .meta_curriculum import *
from .models import *
from .trainer_metrics import *
from .trainer import *
from .policy import *
from .trainer_controller import *
3 changes: 2 additions & 1 deletion ml-agents/mlagents/trainers/bc/offline_trainer.py
@@ -15,7 +15,8 @@
class OfflineBCTrainer(BCTrainer):
"""The OfflineBCTrainer is an implementation of Offline Behavioral Cloning."""

def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
def __init__(self, brain, trainer_parameters, training, load, seed,
run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).
3 changes: 2 additions & 1 deletion ml-agents/mlagents/trainers/bc/online_trainer.py
@@ -14,7 +14,8 @@
class OnlineBCTrainer(BCTrainer):
"""The OnlineBCTrainer is an implementation of Online Behavioral Cloning."""

def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
def __init__(self, brain, trainer_parameters, training, load, seed,
run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).
12 changes: 5 additions & 7 deletions ml-agents/mlagents/trainers/bc/trainer.py
@@ -3,7 +3,6 @@
# Contains an implementation of Behavioral Cloning Algorithm

import logging
import os

import numpy as np
import tensorflow as tf
@@ -19,7 +18,8 @@
class BCTrainer(Trainer):
"""The BCTrainer is an implementation of Behavioral Cloning."""

def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
def __init__(self, brain, trainer_parameters, training, load, seed,
run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).
@@ -28,22 +28,20 @@ def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
:param seed: The seed the model will be initialized with
:param run_id: The identifier of the current run
"""
super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id)
super(BCTrainer, self).__init__(brain, trainer_parameters, training,
run_id)
self.policy = BCPolicy(seed, brain, trainer_parameters, load)
self.n_sequences = 1
self.cumulative_rewards = {}
self.episode_steps = {}
self.stats = {'Losses/Cloning Loss': [], 'Environment/Episode Length': [],
'Environment/Cumulative Reward': []}

self.summary_path = trainer_parameters['summary_path']
self.batches_per_epoch = trainer_parameters['batches_per_epoch']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)


self.demonstration_buffer = Buffer()
self.evaluation_buffer = Buffer()
self.summary_writer = tf.summary.FileWriter(self.summary_path)
Contributor
Did you test that BC is still training and logging to TensorBoard?

Contributor Author
Just tried it; it looks like it is still running and logging.

@property
def parameters(self):
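The summary-directory creation and `FileWriter` setup removed from `BCTrainer.__init__` presumably move into the shared `Trainer` base class, which is not shown in these hunks. The following is only a minimal sketch of what that shared initialization could look like, reusing the `summary_path` key from the surrounding diff; names beyond that are illustrative.

```python
# Hypothetical shared setup in the Trainer base class; the real base-class
# change is not visible in this diff.
import os
import tensorflow as tf

class Trainer(object):
    def __init__(self, brain, trainer_parameters, training, run_id):
        self.brain_name = brain.brain_name  # assumes brain exposes brain_name
        self.trainer_parameters = trainer_parameters
        self.is_training = training
        self.run_id = run_id
        # Create the summaries directory once here instead of in every subclass.
        self.summary_path = trainer_parameters['summary_path']
        if not os.path.exists(self.summary_path):
            os.makedirs(self.summary_path)
        self.summary_writer = tf.summary.FileWriter(self.summary_path)
```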
23 changes: 14 additions & 9 deletions ml-agents/mlagents/trainers/learn.py
@@ -46,7 +46,6 @@ def run_training(sub_id: int, run_seed: int, run_options, process_queue):
fast_simulation = not bool(run_options['--slow'])
no_graphics = run_options['--no-graphics']
trainer_config_path = run_options['<trainer-config-path>']

# Recognize and use docker volume if one is passed as an argument
if not docker_target_name:
model_path = './models/{run_id}'.format(run_id=run_id)
@@ -79,7 +78,8 @@ def run_training(sub_id: int, run_seed: int, run_options, process_queue):
tc = TrainerController(model_path, summaries_dir, run_id + '-' + str(sub_id),
save_freq, maybe_meta_curriculum,
load_model, train_model,
keep_checkpoints, lesson, external_brains, run_seed)
keep_checkpoints, lesson, external_brains,
run_seed)

# Signal that environment has been launched.
process_queue.put(True)
@@ -155,9 +155,9 @@ def init_environment(env_path, docker_target_name, no_graphics, worker_id, fast_
if docker_training and env_path is not None:
"""
Comments for future maintenance:
Some OS/VM instances (e.g. COS GCP Image) mount filesystems
with COS flag which prevents execution of the Unity scene,
to get around this, we will copy the executable into the
Some OS/VM instances (e.g. COS GCP Image) mount filesystems
with COS flag which prevents execution of the Unity scene,
to get around this, we will copy the executable into the
container.
"""
# Navigate in docker path and find env_path and copy it.
@@ -175,7 +175,7 @@ def main():
def main():
try:
print('''

▄▄▄▓▓▓▓
╓▓▓▓▓▓▓█▓▓▓▓▓
,▄▄▄m▀▀▀' ,▓▓▓▀▓▓▄ ▓▓▓ ▓▓▌
@@ -193,7 +193,6 @@ def main():
except:
print('\n\n\tUnity Technologies\n')

logger = logging.getLogger('mlagents.trainers')
_USAGE = '''
Usage:
mlagents-learn <trainer-config-path> [options]
@@ -206,18 +205,24 @@ def main():
--lesson=<n> Start learning from this lesson [default: 0].
--load Whether to load the model or randomly initialize [default: False].
--run-id=<path> The directory name for model and summary statistics [default: ppo].
--num-runs=<n> Number of concurrent training sessions [default: 1].
--num-runs=<n> Number of concurrent training sessions [default: 1].
--save-freq=<n> Frequency at which to save model [default: 50000].
--seed=<n> Random seed used for training [default: -1].
--slow Whether to run the game at training speed [default: False].
--train Whether to train model, or only run inference [default: False].
--worker-id=<n> Number to add to communication port (5005) [default: 0].
--docker-target-name=<dt> Docker volume to store training-specific files [default: None].
--no-graphics Whether to run the environment in no-graphics mode [default: False].
--debug Whether to run ML-Agents in debug mode with detailed logging [default: False].
'''

options = docopt(_USAGE)
logger.info(options)
trainer_logger = logging.getLogger('mlagents.trainers')
env_logger = logging.getLogger('mlagents.envs')
trainer_logger.info(options)
if options['--debug']:
trainer_logger.setLevel('DEBUG')
env_logger.setLevel('DEBUG')
num_runs = int(options['--num-runs'])
seed = int(options['--seed'])

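The `--debug` wiring added to `main()` only flips two logger levels. A small self-contained sketch of the same pattern follows, using only the standard library; the `configure_logging` helper and the `__main__` demo are illustrative, not part of `learn.py`.

```python
# Standalone illustration of the --debug flag-to-logger wiring used above.
import logging

def configure_logging(debug: bool) -> None:
    trainer_logger = logging.getLogger('mlagents.trainers')
    env_logger = logging.getLogger('mlagents.envs')
    # setLevel accepts string level names, matching the calls in the diff.
    level = 'DEBUG' if debug else 'INFO'
    trainer_logger.setLevel(level)
    env_logger.setLevel(level)

if __name__ == '__main__':
    logging.basicConfig()  # attach a handler so the demo message is visible
    configure_logging(debug=True)
    logging.getLogger('mlagents.trainers').debug('debug logging enabled')
```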
23 changes: 12 additions & 11 deletions ml-agents/mlagents/trainers/ppo/trainer.py
@@ -3,7 +3,6 @@
# Contains an implementation of PPO as described (https://arxiv.org/abs/1707.06347).

import logging
import os
from collections import deque

import numpy as np
@@ -20,7 +19,8 @@
class PPOTrainer(Trainer):
"""The PPOTrainer is an implementation of the PPO algorithm."""

def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id):
def __init__(self, brain, reward_buff_cap, trainer_parameters, training,
load, seed, run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).
@@ -29,7 +29,8 @@ def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, s
:param seed: The seed the model will be initialized with
:param run_id: The identifier of the current run
"""
super(PPOTrainer, self).__init__(brain, trainer_parameters, training, run_id)
super(PPOTrainer, self).__init__(brain, trainer_parameters,
training, run_id)
self.param_keys = ['batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units', 'lambd',
'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers',
'time_horizon', 'sequence_length', 'summary_freq', 'use_recurrent',
@@ -56,11 +57,6 @@ def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, s
self.cumulative_rewards = {}
self._reward_buffer = deque(maxlen=reward_buff_cap)
self.episode_steps = {}
self.summary_path = trainer_parameters['summary_path']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)

self.summary_writer = tf.summary.FileWriter(self.summary_path)

def __str__(self):
return '''Hyperparameters for the PPO Trainer of brain {0}: \n{1}'''.format(
@@ -275,6 +271,8 @@ def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo

self.training_buffer[agent_id].reset_agent()
if info.local_done[l]:
self.cumulative_returns_since_policy_update.append(self.
cumulative_rewards.get(agent_id, 0))
self.stats['Environment/Cumulative Reward'].append(
self.cumulative_rewards.get(agent_id, 0))
self.reward_buffer.appendleft(self.cumulative_rewards.get(agent_id, 0))
@@ -289,7 +287,7 @@ def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo

def end_episode(self):
"""
A signal that the Episode has ended. The buffer must be reset.
A signal that the Episode has ended. The buffer must be reset.
Get only called when the academy resets.
"""
self.training_buffer.reset_local_buffers()
@@ -313,13 +311,16 @@ def update_policy(self):
"""
Uses demonstration_buffer to update the policy.
"""
self.trainer_metrics.end_experience_collection_timer()
self.trainer_metrics.start_policy_update_timer(number_experiences=len(self.training_buffer.update_buffer['actions']),
mean_return = float(np.mean(self.cumulative_returns_since_policy_update)))
n_sequences = max(int(self.trainer_parameters['batch_size'] / self.policy.sequence_length), 1)
value_total, policy_total, forward_total, inverse_total = [], [], [], []
advantages = self.training_buffer.update_buffer['advantages'].get_batch()
self.training_buffer.update_buffer['advantages'].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10))
num_epoch = self.trainer_parameters['num_epoch']
for k in range(num_epoch):
for _ in range(num_epoch):
self.training_buffer.update_buffer.shuffle()
buffer = self.training_buffer.update_buffer
for l in range(len(self.training_buffer.update_buffer['actions']) // n_sequences):
@@ -337,7 +338,7 @@ def update_policy(self):
self.stats['Losses/Forward Loss'].append(np.mean(forward_total))
self.stats['Losses/Inverse Loss'].append(np.mean(inverse_total))
self.training_buffer.reset_update_buffer()

self.trainer_metrics.end_policy_update()

def discount_rewards(r, gamma=0.99, value_next=0.0):
"""
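The `trainer_metrics` module that `update_policy` calls into (`end_experience_collection_timer`, `start_policy_update_timer`, `end_policy_update`) is not included in the hunks shown here. As an assumption about its shape only, a minimal recorder with the same call sites might look like this; the real `mlagents.trainers.trainer_metrics` implementation may differ.

```python
# Hypothetical metrics recorder matching the call sites in update_policy;
# the actual mlagents.trainers.trainer_metrics implementation may differ.
import csv
import time

class TrainerMetrics:
    FIELD_NAMES = ['Brain name', 'Time to update policy',
                   'Time since start of training',
                   'Time for last experience collection',
                   'Number of experiences used for training', 'Mean return']

    def __init__(self, path, brain_name):
        self.path = path
        self.brain_name = brain_name
        self.rows = []
        self.start_time = time.time()
        self.exp_start = None
        self.update_start = None
        self.last_exp_duration = 0.0
        self.num_experiences = 0
        self.mean_return = 0.0

    def start_experience_collection_timer(self):
        self.exp_start = time.time()

    def end_experience_collection_timer(self):
        if self.exp_start is not None:
            self.last_exp_duration = time.time() - self.exp_start
            self.exp_start = None

    def start_policy_update_timer(self, number_experiences, mean_return):
        self.num_experiences = number_experiences
        self.mean_return = mean_return
        self.update_start = time.time()

    def end_policy_update(self):
        if self.update_start is None:
            return
        delta_update = time.time() - self.update_start
        self.rows.append([self.brain_name, delta_update,
                          time.time() - self.start_time,
                          self.last_exp_duration,
                          self.num_experiences, self.mean_return])
        self.update_start = None

    def write_training_metrics(self):
        # One CSV per brain, written into the summaries directory by the caller.
        with open(self.path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(self.FIELD_NAMES)
            writer.writerows(self.rows)
```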
1 change: 1 addition & 0 deletions ml-agents/mlagents/trainers/tests/test_learn.py
@@ -20,6 +20,7 @@ def basic_options():
'--slow': False,
'--no-graphics': False,
'<trainer-config-path>': 'basic_path',
'--debug': False,
}


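The test change above only adds `'--debug': False` to the mocked docopt options. For context, a standalone check that a docopt usage string with such a flag parses the way that dictionary assumes; the usage text here is a trimmed, illustrative subset of the real `_USAGE` string in `learn.py`, not a copy of it.

```python
# Verifies docopt's default-False behavior for a boolean --debug flag.
from docopt import docopt

_USAGE = '''
Usage:
  mlagents-learn <trainer-config-path> [options]

Options:
  --debug    Whether to run ML-Agents in debug mode with detailed logging [default: False].
'''

def test_debug_flag_parsing():
    assert docopt(_USAGE, argv=['basic_path'])['--debug'] is False
    assert docopt(_USAGE, argv=['basic_path', '--debug'])['--debug'] is True

if __name__ == '__main__':
    test_debug_flag_parsing()
    print('ok')
```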