@@ -1,6 +1,8 @@
-import yaml
 import math
+import random
 import tempfile
+import pytest
+import yaml
 from typing import Any, Dict
 
 
@@ -31,21 +33,25 @@ class Simple1DEnvironment(BaseUnityEnvironment):
     it reaches -1. The position is incremented by the action amount (clamped to [-step_size, step_size]).
     """
 
-    def __init__(self):
+    def __init__(self, use_discrete):
+        super().__init__()
+        self.discrete = use_discrete
         self._brains: Dict[str, BrainParameters] = {}
         self._brains[BRAIN_NAME] = BrainParameters(
             brain_name=BRAIN_NAME,
             vector_observation_space_size=OBS_SIZE,
             num_stacked_vector_observations=1,
             camera_resolutions=[],
-            vector_action_space_size=[1],
+            vector_action_space_size=[2] if use_discrete else [1],
             vector_action_descriptions=["moveDirection"],
-            vector_action_space_type=1,  # "continuous"
+            vector_action_space_type=0 if use_discrete else 1,
         )
 
         # state
         self.position = 0.0
         self.step_count = 0
+        self.random = random.Random(str(self._brains))
+        self.goal = self.random.choice([-1, 1])
 
     def step(
         self,
@@ -56,21 +62,23 @@ def step(
     ) -> AllBrainInfo:
         assert vector_action is not None
 
-        delta = vector_action[BRAIN_NAME][0][0]
+        if self.discrete:
+            act = vector_action[BRAIN_NAME][0][0]
+            delta = 1 if act else -1
+        else:
+            delta = vector_action[BRAIN_NAME][0][0]
         delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
         self.position += delta
         self.position = clamp(self.position, -1, 1)
         self.step_count += 1
         done = self.position >= 1.0 or self.position <= -1.0
         if done:
-            reward = SUCCESS_REWARD * self.position
+            reward = SUCCESS_REWARD * self.position * self.goal
         else:
             reward = -TIME_PENALTY
 
         agent_info = AgentInfoProto(
-            stacked_vector_observation=[self.position] * OBS_SIZE,
-            reward=reward,
-            done=done,
+            stacked_vector_observation=[self.goal] * OBS_SIZE, reward=reward, done=done
         )
 
         if done:
@@ -85,6 +93,7 @@ def step(
     def _reset_agent(self):
         self.position = 0.0
         self.step_count = 0
+        self.goal = self.random.choice([-1, 1])
 
     def reset(
         self,
@@ -95,7 +104,7 @@ def reset(
         self._reset_agent()
 
         agent_info = AgentInfoProto(
-            stacked_vector_observation=[self.position] * OBS_SIZE,
+            stacked_vector_observation=[self.goal] * OBS_SIZE,
             done=False,
             max_step_reached=False,
         )
@@ -121,7 +130,7 @@ def close(self):
         pass
 
 
-def test_simple():
+def _check_environment_trains(env):
     config = """
     default:
         trainer: ppo
@@ -167,11 +176,16 @@ def test_simple():
     )
 
     # Begin training
-    env = Simple1DEnvironment()
     env_manager = SimpleEnvManager(env)
     trainer_config = yaml.safe_load(config)
     tc.start_learning(env_manager, trainer_config)
 
     for brain_name, mean_reward in tc._get_measure_vals().items():
         assert not math.isnan(mean_reward)
         assert mean_reward > 0.99
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+def test_simple_rl(use_discrete):
+    env = Simple1DEnvironment(use_discrete=use_discrete)
+    _check_environment_trains(env)
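
For context, here is a minimal sketch of exercising the modified environment by hand, outside the trainer loop. This is not part of the diff: it assumes the module-level constants `BRAIN_NAME`, `OBS_SIZE`, and `STEP_SIZE` defined earlier in the file, and uses the same action-dict indexing as `step` above.

```python
# Illustrative only; mirrors the action handling added in this diff.
env = Simple1DEnvironment(use_discrete=True)

# Discrete branch: action 1 moves the agent by +STEP_SIZE, action 0 by -STEP_SIZE.
# The observation is now the goal direction (+1 or -1) rather than the position.
info = env.step(vector_action={BRAIN_NAME: [[1]]})

# Continuous branch: the raw float action is clamped to [-STEP_SIZE, STEP_SIZE].
cont_env = Simple1DEnvironment(use_discrete=False)
info = cont_env.step(vector_action={BRAIN_NAME: [[0.05]]})
```

The parametrized test covers both branches, so something like `pytest -k test_simple_rl` (run against this test module) should exercise the discrete and continuous variants in one go.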