File size: 6,624 Bytes
079c32c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import os
from easydict import EasyDict
# Directory that contains this config file.
module_path = os.path.dirname(__file__)

# Number of parallel environments used for data collection / evaluation.
collector_env_num = 8
evaluator_env_num = 8
# Capacity of the replay buffer that stores expert demonstrations.
expert_replay_buffer_size = 5000

# ---------------------------------------------------------------------------
# agent config
# ---------------------------------------------------------------------------
lunarlander_r2d3_config = {
    'exp_name': 'lunarlander_r2d3_ppoexpert_seed0',
    'env': {
        # NOTE(review): the original file carried a comment here about shared
        # memory for the 'subprocess' env manager, but no ``manager`` entry is
        # set in this config (the expert config in this file does set one).
        # Confirm whether the omission is intended.
        'collector_env_num': collector_env_num,
        'evaluator_env_num': evaluator_env_num,
        'env_id': 'LunarLander-v2',
        'n_evaluator_episode': 8,
        'stop_value': 200,
    },
    'policy': {
        'cuda': True,
        'on_policy': False,
        'priority': True,
        'priority_IS_weight': True,
        'model': {
            'obs_shape': 8,
            'action_shape': 4,
            'encoder_hidden_size_list': [128, 128, 512],
        },
        'discount_factor': 0.997,
        'nstep': 5,
        'burnin_step': 2,
        # (int) The whole sequence length used to unroll the RNN network minus
        # the timesteps of the burn-in part, i.e.
        # <the whole sequence length> = <unroll_len> = <burnin_step> + <learn_unroll_len>
        'learn_unroll_len': 40,
        'learn': {
            # According to the r2d3 paper, the actor parameter update interval
            # is 400 environment timesteps, and in each collect phase we
            # collect 32 sequence samples; with a sample sequence length of
            # 100 this gives 32*100/400=8, hence update_per_collect=8 in most
            # environments.
            # NOTE(review): in this file burnin_step + learn_unroll_len is
            # 2 + 40 = 42, not 100, so the arithmetic above does not describe
            # this exact setting -- confirm the intended value.
            'value_rescale': True,
            'update_per_collect': 8,
            'batch_size': 64,
            'learning_rate': 0.0005,
            'target_update_theta': 0.001,
            # DQfD related parameters
            'lambda1': 1.0,  # n-step return
            'lambda2': 1.0,  # supervised loss
            'lambda3': 1e-5,  # L2; it is very important to set the Adam optimizer optim_type='adamw'.
            'lambda_one_step_td': 1,  # 1-step return
            'margin_function': 0.8,  # margin function in JE, implemented here as a constant
            'per_train_iter_k': 0,  # TODO(pu)
        },
        'collect': {
            # NOTE: It is important to set traj_len_inf=True here, to make
            # sure self._traj_len=INF in serial_sample_collector.py.
            # In a sequence-based policy, for each collect_env we want to
            # collect data of length self._traj_len=INF unless the episode
            # enters the 'done' state.
            # In each collect phase, we collect a total of <n_sample> sequence samples.
            'n_sample': 32,
            'traj_len_inf': True,
            'env_num': collector_env_num,
            # The hyperparameter pho, the demo ratio, controls the proportion
            # of data coming from expert demonstrations versus from the
            # agent's own experience.
            'pho': 0.25,  # TODO(pu)
        },
        'eval': {'env_num': evaluator_env_num},
        'other': {
            'eps': {
                'type': 'exp',
                'start': 0.95,
                'end': 0.05,
                'decay': 100000,
            },
            'replay_buffer': {
                'replay_buffer_size': 100000,
                # (float) How much prioritization is used:
                # 0 means no prioritization while 1 means full prioritization.
                'alpha': 0.6,  # priority exponent, default=0.6
                # (float) How much correction is used:
                # 0 means no correction while 1 means full correction.
                'beta': 0.4,
            },
        },
    },
}
# Convert the plain dict into an EasyDict and publish the same object under
# the generic name ``main_config`` (the name handed to the pipeline in the
# ``__main__`` guard of this file).
main_config = lunarlander_r2d3_config = EasyDict(lunarlander_r2d3_config)
# Creation config for the agent: names the env type (plus the module that has
# to be imported for it), the env manager type and the policy type.
lunarlander_r2d3_create_config = {
    'env': {
        'type': 'lunarlander',
        'import_names': ['dizoo.box2d.lunarlander.envs.lunarlander_env'],
    },
    'env_manager': {'type': 'subprocess'},
    'policy': {'type': 'r2d3'},
}
# Convert the creation dict into an EasyDict and publish the same object
# under the generic name ``create_config``.
create_config = lunarlander_r2d3_create_config = EasyDict(lunarlander_r2d3_create_config)

# ---------------------------------------------------------------------------
# expert config
# ---------------------------------------------------------------------------
# Main config of the expert side, i.e. the policy that collects the expert
# demonstration trajectories consumed by r2d3.
expert_lunarlander_r2d3_config = {
    'exp_name': 'expert_lunarlander_r2d3_ppoexpert_seed0',
    'env': {
        # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess'
        'manager': {'shared_memory': True, 'reset_inplace': True},
        'collector_env_num': collector_env_num,
        'evaluator_env_num': evaluator_env_num,
        # NOTE(review): the agent config in this file uses 8 evaluator
        # episodes; 5 here is fewer than evaluator_env_num (8) -- confirm
        # this is intended.
        'n_evaluator_episode': 5,
        'stop_value': 200,
    },
    'policy': {
        'cuda': True,
        'on_policy': False,
        'priority': True,
        'model': {
            'obs_shape': 8,
            'action_shape': 4,
            'encoder_hidden_size_list': [128, 128, 64],  # ppo
        },
        'discount_factor': 0.997,
        'burnin_step': 2,
        'nstep': 5,
        'learn': {'expert_replay_buffer_size': expert_replay_buffer_size},
        'collect': {
            # Users should add their own model path here. The model path
            # should lead to a model; an absolute path is recommended.
            # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``.
            'model_path': 'model_path_placeholder',
            # Cut trajectories into pieces of length "unroll_len", which
            # should be set to self._sequence_len of r2d2.
            # NOTE: must equal self._sequence_len in the r2d2 policy; the
            # agent config in this file uses burnin_step=2 and
            # learn_unroll_len=40, i.e. 42.
            'unroll_len': 42,
            'env_num': collector_env_num,
        },
        'eval': {'env_num': evaluator_env_num},
        'other': {
            'replay_buffer': {
                'replay_buffer_size': expert_replay_buffer_size,
                # (float) How much prioritization is used:
                # 0 means no prioritization while 1 means full prioritization.
                'alpha': 0.9,  # priority exponent, default=0.6
                # (float) How much correction is used:
                # 0 means no correction while 1 means full correction.
                'beta': 0.4,
            },
        },
    },
}
# Convert the expert dict into an EasyDict and publish the same object under
# the generic name ``expert_main_config``.
expert_main_config = expert_lunarlander_r2d3_config = EasyDict(expert_lunarlander_r2d3_config)
# Creation config for the expert side: same env and env manager as the agent,
# but with the trajectory-collecting policy type.
expert_lunarlander_r2d3_create_config = {
    'env': {
        'type': 'lunarlander',
        'import_names': ['dizoo.box2d.lunarlander.envs.lunarlander_env'],
    },
    'env_manager': {'type': 'subprocess'},
    # This policy is designed to collect off-ppo expert trajectories for r2d3.
    'policy': {'type': 'offppo_collect_traj'},
}
# Convert the expert creation dict into an EasyDict and publish the same
# object under the generic name ``expert_create_config``.
expert_create_config = expert_lunarlander_r2d3_create_config = EasyDict(
    expert_lunarlander_r2d3_create_config
)
if __name__ == "__main__":
    # Script entry point: launch the R2D3 serial pipeline with the agent
    # configs and the expert configs defined in this file, using seed 0.
    # The import lives inside the guard, so it only happens when this file is
    # executed directly, not when the config objects are imported elsewhere.
    from ding.entry import serial_pipeline_r2d3

    serial_pipeline_r2d3(
        [main_config, create_config],
        [expert_main_config, expert_create_config],
        seed=0,
    )
|