Pettingzoo wrapper changed to gymnasium interface #6211


Open · wants to merge 10 commits into base: develop
Switched to gymnasium interface
- added a dependency on gymnasium instead of gym
- adjusted the Unity gym interface to return `truncated` on step (see the usage sketch below)
- adjusted the Unity gym interface to accept `seed` and `options` on reset (not used)
- adjusted the Unity gym interface to no longer accept `mode` on render (not used anyway)
- relaxed the numpy, pettingzoo, and Python version requirements
a_zap committed May 28, 2025
commit 48a29df751f9c357826c00807c73abfed2caf64d
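For context, a minimal sketch of driving the wrapper through the updated gymnasium-style API described by this commit. The `file_name=None` Editor attach and the `uint8_visual` flag are illustrative assumptions, not something the commit prescribes:

```python
# Usage sketch for the gymnasium-style API (assumes a running Unity Editor;
# file_name=None attaches to it, a build path would work as well).
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper

unity_env = UnityEnvironment(file_name=None)
env = UnityToGymWrapper(unity_env, uint8_visual=True)

observation, info = env.reset(seed=0)  # reset now returns (obs, info)
terminated = truncated = False
while not (terminated or truncated):
    action = env.action_space.sample()
    # step now returns a 5-tuple: obs, reward, terminated, truncated, info
    observation, reward, terminated, truncated, info = env.step(action)
env.close()
```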
19 changes: 12 additions & 7 deletions docs/Python-Gym-API-Documentation.md
@@ -59,18 +59,22 @@ Environment initialization
#### reset

```python
| reset() -> Union[List[np.ndarray], np.ndarray]
| reset(*, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) -> Tuple[np.ndarray, Dict]
```

Resets the state of the environment and returns an initial observation.
Returns: observation (object/list): the initial observation of the
space.
Resets the state of the environment and returns an initial observation and info.

**Returns**:

- `observation` _object/list_ - the initial observation of the
space.
- `info` _dict_ - contains auxiliary diagnostic information.

<a name="mlagents_envs.envs.unity_gym_env.UnityToGymWrapper.step"></a>
#### step

```python
| step(action: List[Any]) -> GymStepResult
| step(action: Any) -> GymStepResult
```

Run one timestep of the environment's dynamics. When end of
@@ -86,14 +90,15 @@ Accepts an action and returns a tuple (observation, reward, done, info).

- `observation` _object/list_ - agent's observation of the current environment
- `reward` _float/list_ - amount of reward returned after previous action
- `done` _boolean/list_ - whether the episode has ended.
- `terminated` _boolean/list_ - whether the episode has ended by termination.
- `truncated` _boolean/list_ - whether the episode has ended by truncation.
- `info` _dict_ - contains auxiliary diagnostic information.

<a name="mlagents_envs.envs.unity_gym_env.UnityToGymWrapper.render"></a>
#### render

```python
| render(mode="rgb_array")
| render()
```

Return the latest visual observations.
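Continuing the usage sketch from above, the no-arg `render()` simply hands back the most recent visual observation rather than drawing a fresh frame:

```python
# env as constructed in the earlier sketch; render() takes no mode argument now
frame = env.render()  # latest visual observation, not a freshly rendered frame
print(frame.shape, frame.dtype)
```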
44 changes: 30 additions & 14 deletions ml-agents-envs/mlagents_envs/envs/unity_gym_env.py
@@ -3,8 +3,8 @@
import numpy as np
from typing import Any, Dict, List, Optional, Tuple, Union

import gym
from gym import error, spaces
import gymnasium as gym
from gymnasium import error, spaces

from mlagents_envs.base_env import ActionTuple, BaseEnv
from mlagents_envs.base_env import DecisionSteps, TerminalSteps
@@ -20,7 +20,7 @@ class UnityGymException(error.Error):


logger = logging_util.get_logger(__name__)
GymStepResult = Tuple[np.ndarray, float, bool, Dict]
GymStepResult = Tuple[np.ndarray, float, bool, bool, Dict]


class UnityToGymWrapper(gym.Env):
@@ -151,21 +151,26 @@ def __init__(
else:
self._observation_space = list_spaces[0] # only return the first one

def reset(self) -> Union[List[np.ndarray], np.ndarray]:
"""Resets the state of the environment and returns an initial observation.
Returns: observation (object/list): the initial observation of the
def reset(
self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None
) -> Tuple[np.ndarray, Dict]:
"""Resets the state of the environment and returns an initial observation and info.
Returns:
observation (object/list): the initial observation of the
space.
info (dict): contains auxiliary diagnostic information.
"""
super().reset(seed=seed, options=options)
self._env.reset()
decision_step, _ = self._env.get_steps(self.name)
n_agents = len(decision_step)
self._check_agents(n_agents)
self.game_over = False

res: GymStepResult = self._single_step(decision_step)
return res[0]
return res[0], res[4]

def step(self, action: List[Any]) -> GymStepResult:
def step(self, action: Any) -> GymStepResult:
"""Run one timestep of the environment's dynamics. When end of
episode is reached, you are responsible for calling `reset()`
to reset this environment's state.
@@ -175,14 +180,15 @@ def step(self, action: List[Any]) -> GymStepResult:
Returns:
observation (object/list): agent's observation of the current environment
reward (float/list) : amount of reward returned after previous action
done (boolean/list): whether the episode has ended.
terminated (boolean/list): whether the episode has ended by termination.
truncated (boolean/list): whether the episode has ended by truncation.
info (dict): contains auxiliary diagnostic information.
"""
if self.game_over:
raise UnityGymException(
"You are calling 'step()' even though this environment has already "
"returned done = True. You must always call 'reset()' once you "
"receive 'done = True'."
"returned `terminated` or `truncated` as True. You must always call 'reset()' once you "
"receive `terminated` or `truncated` as True."
)
if self._flattener is not None:
# Translate action into list
Expand Down Expand Up @@ -227,9 +233,19 @@ def _single_step(self, info: Union[DecisionSteps, TerminalSteps]) -> GymStepResu
visual_obs = self._get_vis_obs_list(info)
self.visual_obs = self._preprocess_single(visual_obs[0][0])

done = isinstance(info, TerminalSteps)
if isinstance(info, TerminalSteps):
interrupted = info.interrupted
terminated, truncated = not interrupted, interrupted
else:
terminated, truncated = False, False

return (default_observation, info.reward[0], done, {"step": info})
return (
default_observation,
info.reward[0],
terminated,
truncated,
{"step": info},
)

def _preprocess_single(self, single_visual_obs: np.ndarray) -> np.ndarray:
if self.uint8_visual:
@@ -276,7 +292,7 @@ def _get_vec_obs_size(self) -> int:
result += obs_spec.shape[0]
return result

def render(self, mode="rgb_array"):
def render(self):
"""
Return the latest visual observations.
Note that it will not render a new frame of the environment.
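One way to validate the new 5-tuple contract end-to-end is gymnasium's bundled environment checker. This is an assumption about how you might test the change, with a hypothetical build path, not part of the PR:

```python
# Validate the wrapper against the gymnasium API (hypothetical build path).
from gymnasium.utils.env_checker import check_env

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper

unity_env = UnityEnvironment(file_name="Builds/3DBall")  # hypothetical path
env = UnityToGymWrapper(unity_env)
try:
    check_env(env, skip_render_check=True)  # raises if reset/step break the API
finally:
    env.close()
```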
8 changes: 4 additions & 4 deletions ml-agents-envs/setup.py
@@ -58,12 +58,12 @@ def run(self):
"Pillow>=4.2.1",
"protobuf>=3.6,<3.21",
"pyyaml>=3.1.0",
"gym>=0.21.0",
"pettingzoo==1.15.0",
"numpy>=1.23.5,<1.24.0",
"gymnasium>=0.25.0",
"pettingzoo>=1.15.0",
"numpy>=1.23.5,<2.0",
"filelock>=3.4.0",
],
python_requires=">=3.10.1,<=3.10.12",
python_requires=">=3.9,<4",
# TODO: Remove this once mypy stops having spurious setuptools issues.
cmdclass={"verify": VerifyVersionCommand}, # type: ignore
)
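A throwaway way to confirm an installed environment satisfies the relaxed pins above, using only the standard library; this check script is an illustration, not part of the PR:

```python
# Print installed versions of the relaxed dependencies (not part of this PR).
from importlib.metadata import PackageNotFoundError, version

for dist in ("gymnasium", "pettingzoo", "numpy"):
    try:
        print(f"{dist}=={version(dist)}")
    except PackageNotFoundError:
        print(f"{dist} is not installed")
```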