2 changes: 2 additions & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -13,6 +13,8 @@ and this project adheres to
### Minor Changes
#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- Added support for Elo as a curriculum learning completion criterion. (#5646)

### Bug Fixes
#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
46 changes: 46 additions & 0 deletions config/poca/SoccerTwosCurriculum.yaml
@@ -0,0 +1,46 @@
behaviors:
  SoccerTwos:
    trainer_type: poca
    hyperparameters:
      batch_size: 2048
      buffer_size: 20480
      learning_rate: 0.0003
      beta: 0.005
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3
      learning_rate_schedule: constant
    network_settings:
      normalize: false
      hidden_units: 512
      num_layers: 2
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.99
        strength: 1.0
    keep_checkpoints: 5
    max_steps: 50000000
    time_horizon: 1000
    summary_freq: 10000
    self_play:
      save_steps: 50000
      team_change: 200000
      swap_steps: 2000
      window: 10
      play_against_latest_model_ratio: 0.5
      initial_elo: 1200.0
environment_parameters:
  ball_touch:
    curriculum:
      - name: Lesson0 # The '-' is important as this is a list
        completion_criteria:
          measure: Elo
          behavior: SoccerTwos
          signal_smoothing: false
          min_lesson_length: 100
          threshold: 1250.0
        value: 1.0
      - name: Lesson1 # The '-' is important as this is a list
        value: 0.0

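For context, a minimal sketch (not part of this diff) of how a config like the one above resolves into the new Elo-based criterion, using the same `RunOptions.from_dict` path the tests in this PR exercise. The `environment_parameters`, `curriculum`, and `completion_criteria` attribute names are assumptions based on the settings classes this PR touches.

```python
import yaml
from mlagents.trainers.settings import CompletionCriteriaSettings, RunOptions

# Parse the curriculum config shown above (path taken from this PR).
with open("config/poca/SoccerTwosCurriculum.yaml") as f:
    run_options = RunOptions.from_dict(yaml.safe_load(f))

# Lesson0 of the ball_touch parameter should carry the Elo-based criterion.
lesson0 = run_options.environment_parameters["ball_touch"].curriculum[0]
criteria = lesson0.completion_criteria
assert criteria.measure == CompletionCriteriaSettings.MeasureType.ELO
assert criteria.threshold == 1250.0
```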
4 changes: 4 additions & 0 deletions ml-agents/mlagents/trainers/environment_parameter_manager.py
@@ -137,6 +137,7 @@ def update_lessons(
trainer_steps: Dict[str, int],
trainer_max_steps: Dict[str, int],
trainer_reward_buffer: Dict[str, List[float]],
trainer_elo_score: Optional[Dict[str, float]],
) -> Tuple[bool, bool]:
"""
Given progress metrics, calculates if at least one environment parameter is
@@ -148,6 +149,8 @@ def update_lessons(
of training steps this behavior's trainer has performed.
:param trainer_reward_buffer: A dictionary from behavior_name to the list of
the most recent episode returns for this behavior's trainer.
:param trainer_elo_score: A dictionary from behavior_name to the current Elo
score of that behavior's trainer, or None if no trainer is using self-play.
:returns: A tuple of two booleans : (True if any lesson has changed, True if
environment needs to reset)
"""
@@ -169,6 +172,7 @@ def update_lessons(
float(trainer_steps[behavior_to_consider])
/ float(trainer_max_steps[behavior_to_consider]),
trainer_reward_buffer[behavior_to_consider],
trainer_elo_score[behavior_to_consider] if trainer_elo_score else None,
self._smoothed_values[param_name],
)
self._smoothed_values[param_name] = new_smoothing
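Callers of `update_lessons` now supply a fourth dictionary with each behavior's current Elo (or None when self-play is off). A minimal sketch of the updated call, mirroring the keyword-argument style used in the tests below; the manager construction and the SoccerTwos numbers are illustrative assumptions.

```python
# Sketch only: param_manager is an EnvironmentParameterManager built from a
# config with an Elo-based completion criterion (e.g. the YAML above).
# Pass trainer_elo_score=None when no trainer uses self-play.
updated, needs_reset = param_manager.update_lessons(
    trainer_steps={"SoccerTwos": 500_000},
    trainer_max_steps={"SoccerTwos": 50_000_000},
    trainer_reward_buffer={"SoccerTwos": [0.25] * 120},
    trainer_elo_score={"SoccerTwos": 1260.0},
)
if updated and needs_reset:
    # The trainer controller resets the environment so the new lesson's
    # parameter values take effect immediately.
    pass
```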
12 changes: 10 additions & 2 deletions ml-agents/mlagents/trainers/settings.py
@@ -491,6 +491,7 @@ class CompletionCriteriaSettings:
class MeasureType(Enum):
PROGRESS: str = "progress"
REWARD: str = "reward"
ELO: str = "Elo"

behavior: str
measure: MeasureType = attr.ib(default=MeasureType.REWARD)
@@ -516,7 +517,7 @@ def _check_threshold_value(self, attribute, value):
)

def need_increment(
self, progress: float, reward_buffer: List[float], smoothing: float
self, progress: float, reward_buffer: List[float], elo_score: Optional[float], smoothing: float
) -> Tuple[bool, float]:
"""
Given measures, this method returns a boolean indicating if the lesson
@@ -528,7 +529,7 @@ def need_increment(
if self.measure == CompletionCriteriaSettings.MeasureType.PROGRESS:
if progress > self.threshold:
return True, smoothing
if self.measure == CompletionCriteriaSettings.MeasureType.REWARD:
elif self.measure == CompletionCriteriaSettings.MeasureType.REWARD:
if len(reward_buffer) < 1:
return False, smoothing
measure = np.mean(reward_buffer)
@@ -539,6 +540,13 @@ def need_increment(
smoothing = measure
if measure > self.threshold:
return True, smoothing
elif self.measure == CompletionCriteriaSettings.MeasureType.ELO:
if elo_score is None:
raise TrainerConfigError(
"Elo isn't a valid completion criteria measure if not using self-play."
)
if elo_score > self.threshold:
return True, smoothing
return False, smoothing


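To illustrate the new branch in isolation, a small sketch calling `need_increment` directly with the Elo measure. Constructing `CompletionCriteriaSettings` by keyword is an assumption based on the attrs fields shown above, and `min_lesson_length=0` keeps the episode-count check out of the way.

```python
from mlagents.trainers.settings import CompletionCriteriaSettings

criteria = CompletionCriteriaSettings(
    behavior="SoccerTwos",
    measure=CompletionCriteriaSettings.MeasureType.ELO,
    threshold=1250.0,
    min_lesson_length=0,
)

# Below the threshold: stay on the current lesson, smoothing unchanged.
below = criteria.need_increment(
    progress=0.1, reward_buffer=[], elo_score=1200.0, smoothing=0.0
)
print(below)  # (False, 0.0)

# Above the threshold: advance to the next lesson.
above = criteria.need_increment(
    progress=0.1, reward_buffer=[], elo_score=1300.0, smoothing=0.0
)
print(above)  # (True, 0.0)
```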
39 changes: 34 additions & 5 deletions ml-agents/mlagents/trainers/tests/test_env_param_manager.py
@@ -131,7 +131,7 @@ def test_curriculum_conversion():
assert lesson.value.max_value == 3


test_bad_curriculum_no_competion_criteria_config_yaml = """
test_bad_curriculum_no_completion_criteria_config_yaml = """
environment_parameters:
param_1:
curriculum:
@@ -154,7 +154,7 @@ def test_curriculum_conversion():
"""


test_bad_curriculum_all_competion_criteria_config_yaml = """
test_bad_curriculum_all_completion_criteria_config_yaml = """
environment_parameters:
param_1:
curriculum:
@@ -175,6 +175,14 @@ def test_curriculum_conversion():
require_reset: true
value: 2
- name: Lesson3
completion_criteria:
measure: Elo
behavior: fake_behavior
threshold: 1300
min_lesson_length: 100
require_reset: true
value: 3
- name: Lesson4
completion_criteria:
measure: reward
behavior: fake_behavior
@@ -192,14 +200,14 @@ def test_curriculum_conversion():
def test_curriculum_raises_no_completion_criteria_conversion():
with pytest.raises(TrainerConfigError):
RunOptions.from_dict(
yaml.safe_load(test_bad_curriculum_no_competion_criteria_config_yaml)
yaml.safe_load(test_bad_curriculum_no_completion_criteria_config_yaml)
)


def test_curriculum_raises_all_completion_criteria_conversion():
with pytest.warns(TrainerConfigWarning):
run_options = RunOptions.from_dict(
yaml.safe_load(test_bad_curriculum_all_competion_criteria_config_yaml)
yaml.safe_load(test_bad_curriculum_all_completion_criteria_config_yaml)
)

param_manager = EnvironmentParameterManager(
@@ -209,19 +217,36 @@ def test_curriculum_raises_all_completion_criteria_conversion():
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1200.0}, #TODO: trainer_elo_scores aren't set properly for tests
) == (True, True)
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (True, True)
assert param_manager.get_current_lesson_number() == {"param_1": 2}
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (False, False)
assert param_manager.get_current_lesson_number() == {"param_1": 2}

assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1500.0},
) == (True, True)
assert param_manager.get_current_lesson_number() == {"param_1": 3}
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1500.0},
) == (False, False)  # No lesson left to advance to
assert param_manager.get_current_lesson_number() == {"param_1": 3}

test_everything_config_yaml = """
environment_parameters:
@@ -279,17 +304,20 @@ def test_create_manager():
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 99},
trainer_elo_score={"fake_behavior": 1200.0},
) == (False, False)
# Not enough episodes reward
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (False, False)
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (True, True)
assert param_manager.get_current_lesson_number() == {
"param_1": 1,
@@ -310,6 +338,7 @@ def test_create_manager():
trainer_steps={"fake_behavior": 700},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [0] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (True, False)
assert param_manager.get_current_samplers() == {
"param_1": UniformSettings(seed=1337 + 2, min_value=1, max_value=3),
7 changes: 6 additions & 1 deletion ml-agents/mlagents/trainers/trainer_controller.py
@@ -211,10 +211,15 @@ def reset_env_if_ready(self, env: EnvManager) -> None:
reward_buff = {k: list(t.reward_buffer) for (k, t) in self.trainers.items()}
curr_step = {k: int(t.get_step) for (k, t) in self.trainers.items()}
max_step = {k: int(t.get_max_steps) for (k, t) in self.trainers.items()}
# Only self-play trainers expose current_elo; if any trainer lacks it,
# fall back to None rather than a partial dictionary.
try:
curr_elo = {k: float(t.current_elo) for (k, t) in self.trainers.items()}
except AttributeError:
curr_elo = None

# Attempt to increment the lessons of the brains who
# were ready.
updated, param_must_reset = self.param_manager.update_lessons(
curr_step, max_step, reward_buff
curr_step, max_step, reward_buff, curr_elo
)
if updated:
for trainer in self.trainers.values():