import yaml
import pytest

from mlagents.trainers.upgrade_config import convert_behaviors, remove_nones, convert
from mlagents.trainers.settings import RewardSignalType
from mlagents.trainers.ppo.trainer import PPOSettings, TRAINER_NAME as PPO_TRAINER_NAME
from mlagents.trainers.sac.trainer import SACSettings, TRAINER_NAME as SAC_TRAINER_NAME

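# Tests for upgrade_config, which converts old flat-style trainer configs (plus
# curriculum and parameter-randomization files) into the newer layout with
# "behaviors" and "environment_parameters" sections.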

BRAIN_NAME = "testbehavior"

# Override one value from each settings category so the conversion can be spot-checked
BATCH_SIZE = 256
HIDDEN_UNITS = 32
SUMMARY_FREQ = 500

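# Old-style PPO trainer config: a flat "default" section plus a per-behavior
# override, in the format consumed by convert_behaviors().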
PPO_CONFIG = f"""
    default:
        trainer: ppo
        batch_size: 1024
        beta: 5.0e-3
        buffer_size: 10240
        epsilon: 0.2
        hidden_units: 128
        lambd: 0.95
        learning_rate: 3.0e-4
        learning_rate_schedule: linear
        beta_schedule: constant
        epsilon_schedule: linear
        max_steps: 5.0e5
        memory_size: 256
        normalize: false
        num_epoch: 3
        num_layers: 2
        time_horizon: 64
        sequence_length: 64
        summary_freq: 10000
        use_recurrent: false
        vis_encode_type: simple
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99

    {BRAIN_NAME}:
        trainer: ppo
        batch_size: {BATCH_SIZE}
        beta: 5.0e-3
        buffer_size: 64
        epsilon: 0.2
        hidden_units: {HIDDEN_UNITS}
        lambd: 0.95
        learning_rate: 5.0e-3
        max_steps: 2500
        memory_size: 256
        normalize: false
        num_epoch: 3
        num_layers: 2
        time_horizon: 64
        sequence_length: 64
        summary_freq: {SUMMARY_FREQ}
        use_recurrent: false
        reward_signals:
            curiosity:
                strength: 1.0
                gamma: 0.99
                encoding_size: 128
    """

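# Old-style SAC trainer config, mirroring PPO_CONFIG above.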
SAC_CONFIG = f"""
    default:
        trainer: sac
        batch_size: 128
        buffer_size: 50000
        buffer_init_steps: 0
        hidden_units: 128
        init_entcoef: 1.0
        learning_rate: 3.0e-4
        learning_rate_schedule: constant
        max_steps: 5.0e5
        memory_size: 256
        normalize: false
        num_update: 1
        train_interval: 1
        num_layers: 2
        time_horizon: 64
        sequence_length: 64
        summary_freq: 10000
        tau: 0.005
        use_recurrent: false
        vis_encode_type: simple
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99

    {BRAIN_NAME}:
        trainer: sac
        batch_size: {BATCH_SIZE}
        buffer_size: 64
        buffer_init_steps: 100
        hidden_units: {HIDDEN_UNITS}
        init_entcoef: 0.01
        learning_rate: 3.0e-4
        max_steps: 1000
        memory_size: 256
        normalize: false
        num_update: 1
        train_interval: 1
        num_layers: 1
        time_horizon: 64
        sequence_length: 64
        summary_freq: {SUMMARY_FREQ}
        tau: 0.005
        use_recurrent: false
        curiosity_enc_size: 128
        demo_path: None
        vis_encode_type: simple
        reward_signals:
            curiosity:
                strength: 1.0
                gamma: 0.99
                encoding_size: 128
    """

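# Old-style curriculum config keyed by behavior name; convert() folds each listed
# parameter into a curriculum under "environment_parameters".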
CURRICULUM = """

  BigWallJump:
    measure: progress
    thresholds: [0.1, 0.3, 0.5]
    min_lesson_length: 200
    signal_smoothing: true
    parameters:
      big_wall_min_height: [0.0, 4.0, 6.0, 8.0]
      big_wall_max_height: [4.0, 7.0, 8.0, 8.0]
  SmallWallJump:
    measure: progress
    thresholds: [0.1, 0.3, 0.5]
    min_lesson_length: 100
    signal_smoothing: true
    parameters:
      small_wall_height: [1.5, 2.0, 2.5, 4.0]
      """

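# Old-style parameter-randomization ("sampler") config; convert() maps each entry to
# an environment parameter with sampler_type / sampler_parameters keys.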
RANDOMIZATION = """
  resampling-interval: 5000
  mass:
    sampler-type: uniform
    min_value: 0.5
    max_value: 10
  gravity:
    sampler-type: uniform
    min_value: 7
    max_value: 12
  scale:
    sampler-type: uniform
    min_value: 0.75
    max_value: 3
    """


@pytest.mark.parametrize("use_recurrent", [True, False])
@pytest.mark.parametrize("trainer_type", [PPO_TRAINER_NAME, SAC_TRAINER_NAME])
def test_convert_behaviors(trainer_type, use_recurrent):
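    """Old per-behavior configs should convert to TrainerSettings-style entries for
    both PPO and SAC, with and without recurrence, preserving overridden values."""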
    if trainer_type == PPO_TRAINER_NAME:
        trainer_config = PPO_CONFIG
        trainer_settings_type = PPOSettings
    else:
        trainer_config = SAC_CONFIG
        trainer_settings_type = SACSettings

    old_config = yaml.safe_load(trainer_config)
    old_config[BRAIN_NAME]["use_recurrent"] = use_recurrent
    new_config = convert_behaviors(old_config)

    # convert_behaviors should have produced a TrainerSettings object for BRAIN_NAME
    trainer_settings = new_config[BRAIN_NAME]

    # The values overridden for BRAIN_NAME should be preserved, with the defaults
    # applied for everything else
    assert trainer_settings.trainer_type == trainer_type
    assert isinstance(trainer_settings.hyperparameters, trainer_settings_type)
    assert trainer_settings.hyperparameters.batch_size == BATCH_SIZE
    assert trainer_settings.network_settings.hidden_units == HIDDEN_UNITS
    assert RewardSignalType.CURIOSITY in trainer_settings.reward_signals


def test_convert():
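    """Full conversion: behaviors, curriculum, and randomization should all land
    under the new "behaviors" / "environment_parameters" layout."""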
    old_behaviors = yaml.safe_load(PPO_CONFIG)
    old_curriculum = yaml.safe_load(CURRICULUM)
    old_sampler = yaml.safe_load(RANDOMIZATION)
    config = convert(old_behaviors, old_curriculum, old_sampler)
    assert BRAIN_NAME in config["behaviors"]
    assert "big_wall_min_height" in config["environment_parameters"]

    curriculum = config["environment_parameters"]["big_wall_min_height"]["curriculum"]
    assert len(curriculum) == 4
    for i, expected_value in enumerate([0.0, 4.0, 6.0, 8.0]):
        assert curriculum[i][f"Lesson{i}"]["value"] == expected_value
    for i, threshold in enumerate([0.1, 0.3, 0.5]):
        criteria = curriculum[i][f"Lesson{i}"]["completion_criteria"]
        assert criteria["threshold"] == threshold
        assert criteria["behavior"] == "BigWallJump"
        assert criteria["signal_smoothing"]
        assert criteria["min_lesson_length"] == 200
        assert criteria["measure"] == "progress"

    assert "gravity" in config["environment_parameters"]
    gravity = config["environment_parameters"]["gravity"]
    assert gravity["sampler_type"] == "uniform"
    assert gravity["sampler_parameters"]["min_value"] == 7
    assert gravity["sampler_parameters"]["max_value"] == 12


def test_remove_nones():
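    """remove_nones() should strip None-valued keys at both nesting levels."""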
    dict_with_nones = {"hello": {"hello2": 2, "hello3": None}, "hello4": None}
    dict_without_nones = {"hello": {"hello2": 2}}
    output = remove_nones(dict_with_nones)
    assert output == dict_without_nones
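

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the test suite): how the functions under test
# could be applied to old-style config files on disk. The helper name and its
# file-path arguments are hypothetical; the real upgrade_config module ships its
# own command-line entry point, which should be preferred.
def _example_upgrade_from_files(trainer_path, curriculum_path, sampler_path):
    with open(trainer_path) as f:
        old_behaviors = yaml.safe_load(f)
    with open(curriculum_path) as f:
        old_curriculum = yaml.safe_load(f)
    with open(sampler_path) as f:
        old_sampler = yaml.safe_load(f)
    # convert() returns the new-style structure with "behaviors" and
    # "environment_parameters" top-level keys (see test_convert above). Writing it
    # back to YAML is left to upgrade_config itself, since the converted behavior
    # entries may not be plain dicts.
    return convert(old_behaviors, old_curriculum, old_sampler)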