2 changes: 2 additions & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -13,6 +13,8 @@ and this project adheres to
### Minor Changes
#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- Added support for Elo as a curriculum learning completion criterion. (#5646)

### Bug Fixes
#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
46 changes: 46 additions & 0 deletions config/poca/SoccerTwosCurriculum.yaml
@@ -0,0 +1,46 @@
behaviors:
  SoccerTwos:
    trainer_type: poca
    hyperparameters:
      batch_size: 2048
      buffer_size: 20480
      learning_rate: 0.0003
      beta: 0.005
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3
      learning_rate_schedule: constant
    network_settings:
      normalize: false
      hidden_units: 512
      num_layers: 2
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.99
        strength: 1.0
    keep_checkpoints: 5
    max_steps: 50000000
    time_horizon: 1000
    summary_freq: 10000
    self_play:
      save_steps: 50000
      team_change: 200000
      swap_steps: 2000
      window: 10
      play_against_latest_model_ratio: 0.5
      initial_elo: 1200.0
environment_parameters:
  ball_touch:
    curriculum:
      - name: Lesson0 # The '-' is important as this is a list
        completion_criteria:
          measure: Elo
          behavior: SoccerTwos
          signal_smoothing: false
          min_lesson_length: 100
          threshold: 1250.0
        value: 1.0
      - name: Lesson1 # The '-' is important as this is a list
        value: 0.0

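For context, a minimal sketch (not part of this diff) of how a config like the one above resolves into the new Elo-based criterion, using the same `RunOptions.from_dict` path the tests in this PR exercise. The `environment_parameters`, `curriculum`, and `completion_criteria` attribute names are assumptions based on the settings classes this PR touches.

```python
import yaml
from mlagents.trainers.settings import CompletionCriteriaSettings, RunOptions

# Parse the curriculum config shown above (path taken from this PR).
with open("config/poca/SoccerTwosCurriculum.yaml") as f:
    run_options = RunOptions.from_dict(yaml.safe_load(f))

# Lesson0 of the ball_touch parameter should carry the Elo-based criterion.
lesson0 = run_options.environment_parameters["ball_touch"].curriculum[0]
criteria = lesson0.completion_criteria
assert criteria.measure == CompletionCriteriaSettings.MeasureType.ELO
assert criteria.threshold == 1250.0
```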
4 changes: 4 additions & 0 deletions ml-agents/mlagents/trainers/environment_parameter_manager.py
@@ -137,6 +137,7 @@ def update_lessons(
trainer_steps: Dict[str, int],
trainer_max_steps: Dict[str, int],
trainer_reward_buffer: Dict[str, List[float]],
trainer_elo_score: Optional[Dict[str, float]],
) -> Tuple[bool, bool]:
"""
Given progress metrics, calculates if at least one environment parameter is
@@ -148,6 +149,8 @@ def update_lessons(
of training steps this behavior's trainer has performed.
:param trainer_reward_buffer: A dictionary from behavior_name to the list of
the most recent episode returns for this behavior's trainer.
:param trainer_elo_score: A dictionary from behavior_name to the current Elo
score of that behavior's trainer, or None if no trainer is using self-play.
:returns: A tuple of two booleans : (True if any lesson has changed, True if
environment needs to reset)
"""
@@ -169,6 +172,7 @@ def update_lessons(
float(trainer_steps[behavior_to_consider])
/ float(trainer_max_steps[behavior_to_consider]),
trainer_reward_buffer[behavior_to_consider],
trainer_elo_score[behavior_to_consider] if trainer_elo_score else None,
self._smoothed_values[param_name],
)
self._smoothed_values[param_name] = new_smoothing
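Callers of `update_lessons` now supply a fourth dictionary with each behavior's current Elo (or None when self-play is off). A minimal sketch of the updated call, mirroring the keyword-argument style used in the tests below; the manager construction and the SoccerTwos numbers are illustrative assumptions.

```python
# Sketch only: param_manager is an EnvironmentParameterManager built from a
# config with an Elo-based completion criterion (e.g. the YAML above).
# Pass trainer_elo_score=None when no trainer uses self-play.
updated, needs_reset = param_manager.update_lessons(
    trainer_steps={"SoccerTwos": 500_000},
    trainer_max_steps={"SoccerTwos": 50_000_000},
    trainer_reward_buffer={"SoccerTwos": [0.25] * 120},
    trainer_elo_score={"SoccerTwos": 1260.0},
)
if updated and needs_reset:
    # The trainer controller resets the environment so the new lesson's
    # parameter values take effect immediately.
    pass
```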
12 changes: 10 additions & 2 deletions ml-agents/mlagents/trainers/settings.py
@@ -491,6 +491,7 @@ class CompletionCriteriaSettings:
class MeasureType(Enum):
PROGRESS: str = "progress"
REWARD: str = "reward"
ELO: str = "Elo"

behavior: str
measure: MeasureType = attr.ib(default=MeasureType.REWARD)
@@ -516,7 +517,7 @@ def _check_threshold_value(self, attribute, value):
)

def need_increment(
self, progress: float, reward_buffer: List[float], smoothing: float
self, progress: float, reward_buffer: List[float], elo_score: Optional[float], smoothing: float
) -> Tuple[bool, float]:
"""
Given measures, this method returns a boolean indicating if the lesson
@@ -528,7 +529,7 @@ def need_increment(
if self.measure == CompletionCriteriaSettings.MeasureType.PROGRESS:
if progress > self.threshold:
return True, smoothing
if self.measure == CompletionCriteriaSettings.MeasureType.REWARD:
elif self.measure == CompletionCriteriaSettings.MeasureType.REWARD:
if len(reward_buffer) < 1:
return False, smoothing
measure = np.mean(reward_buffer)
@@ -539,6 +540,13 @@ def need_increment(
smoothing = measure
if measure > self.threshold:
return True, smoothing
elif self.measure == CompletionCriteriaSettings.MeasureType.ELO:
if elo_score is None:
raise TrainerConfigError(
"Elo isn't a valid completion criteria measure if not using self-play."
)
if elo_score > self.threshold:
return True, smoothing
return False, smoothing


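To illustrate the new branch in isolation, a small sketch calling `need_increment` directly with the Elo measure. Constructing `CompletionCriteriaSettings` by keyword is an assumption based on the attrs fields shown above, and `min_lesson_length=0` keeps the episode-count check out of the way.

```python
from mlagents.trainers.settings import CompletionCriteriaSettings

criteria = CompletionCriteriaSettings(
    behavior="SoccerTwos",
    measure=CompletionCriteriaSettings.MeasureType.ELO,
    threshold=1250.0,
    min_lesson_length=0,
)

# Below the threshold: stay on the current lesson, smoothing unchanged.
below = criteria.need_increment(
    progress=0.1, reward_buffer=[], elo_score=1200.0, smoothing=0.0
)
print(below)  # (False, 0.0)

# Above the threshold: advance to the next lesson.
above = criteria.need_increment(
    progress=0.1, reward_buffer=[], elo_score=1300.0, smoothing=0.0
)
print(above)  # (True, 0.0)
```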
39 changes: 34 additions & 5 deletions ml-agents/mlagents/trainers/tests/test_env_param_manager.py
@@ -131,7 +131,7 @@ def test_curriculum_conversion():
assert lesson.value.max_value == 3


test_bad_curriculum_no_competion_criteria_config_yaml = """
test_bad_curriculum_no_completion_criteria_config_yaml = """
environment_parameters:
param_1:
curriculum:
@@ -154,7 +154,7 @@ def test_curriculum_conversion():
"""


test_bad_curriculum_all_competion_criteria_config_yaml = """
test_bad_curriculum_all_completion_criteria_config_yaml = """
environment_parameters:
param_1:
curriculum:
@@ -175,6 +175,14 @@ def test_curriculum_conversion():
require_reset: true
value: 2
- name: Lesson3
completion_criteria:
measure: Elo
behavior: fake_behavior
threshold: 1300
min_lesson_length: 100
require_reset: true
value: 3
- name: Lesson4
completion_criteria:
measure: reward
behavior: fake_behavior
@@ -192,14 +200,14 @@ def test_curriculum_conversion():
def test_curriculum_raises_no_completion_criteria_conversion():
with pytest.raises(TrainerConfigError):
RunOptions.from_dict(
yaml.safe_load(test_bad_curriculum_no_competion_criteria_config_yaml)
yaml.safe_load(test_bad_curriculum_no_completion_criteria_config_yaml)
)


def test_curriculum_raises_all_completion_criteria_conversion():
with pytest.warns(TrainerConfigWarning):
run_options = RunOptions.from_dict(
yaml.safe_load(test_bad_curriculum_all_competion_criteria_config_yaml)
yaml.safe_load(test_bad_curriculum_all_completion_criteria_config_yaml)
)

param_manager = EnvironmentParameterManager(
@@ -209,19 +217,36 @@ def test_curriculum_raises_all_completion_criteria_conversion():
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1200.0}, #TODO: trainer_elo_scores aren't set properly for tests
) == (True, True)
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (True, True)
assert param_manager.get_current_lesson_number() == {"param_1": 2}
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (False, False)
assert param_manager.get_current_lesson_number() == {"param_1": 2}

assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1500.0},
) == (True, True)
assert param_manager.get_current_lesson_number() == {"param_1": 3}
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1500.0},
) == (False, False)  # No lesson left to advance to
assert param_manager.get_current_lesson_number() == {"param_1": 3}

test_everything_config_yaml = """
environment_parameters:
@@ -279,17 +304,20 @@ def test_create_manager():
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 99},
trainer_elo_score={"fake_behavior": 1200.0},
) == (False, False)
# Not enough episodes reward
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (False, False)
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (True, True)
assert param_manager.get_current_lesson_number() == {
"param_1": 1,
@@ -310,6 +338,7 @@ def test_create_manager():
trainer_steps={"fake_behavior": 700},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [0] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (True, False)
assert param_manager.get_current_samplers() == {
"param_1": UniformSettings(seed=1337 + 2, min_value=1, max_value=3),
7 changes: 6 additions & 1 deletion ml-agents/mlagents/trainers/trainer_controller.py
@@ -211,10 +211,15 @@ def reset_env_if_ready(self, env: EnvManager) -> None:
reward_buff = {k: list(t.reward_buffer) for (k, t) in self.trainers.items()}
curr_step = {k: int(t.get_step) for (k, t) in self.trainers.items()}
max_step = {k: int(t.get_max_steps) for (k, t) in self.trainers.items()}
# Only self-play trainers expose current_elo; if any trainer lacks it,
# fall back to None rather than a partial dictionary.
try:
curr_elo = {k: float(t.current_elo) for (k, t) in self.trainers.items()}
except AttributeError:
curr_elo = None

# Attempt to increment the lessons of the brains who
# were ready.
updated, param_must_reset = self.param_manager.update_lessons(
curr_step, max_step, reward_buff
curr_step, max_step, reward_buff, curr_elo
)
if updated:
for trainer in self.trainers.values():