File size: 11,834 Bytes
079c32c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 |
from typing import Union, Optional, List, Any, Tuple
import os
import pickle
import numpy as np
import torch
from functools import partial
from copy import deepcopy
from ding.config import compile_config, read_config
from ding.worker import SampleSerialCollector, InteractionSerialEvaluator, EpisodeSerialCollector
from ding.envs import create_env_manager, get_vec_env_setting
from ding.policy import create_policy
from ding.torch_utils import to_device, to_ndarray
from ding.utils import set_pkg_seed
from ding.utils.data import offline_data_save_type
from ding.rl_utils import get_nstep_return_data
from ding.utils.data import default_collate
def eval(
        input_cfg: Union[str, Tuple[dict, dict]],
        seed: int = 0,
        env_setting: Optional[List[Any]] = None,
        model: Optional[torch.nn.Module] = None,
        state_dict: Optional[dict] = None,
        load_path: Optional[str] = None,
        replay_path: Optional[str] = None,
) -> float:
    """
    Overview:
        Evaluation-only entry point: build the evaluator environments and the policy in eval mode,
        load the policy weights, run the evaluator once and report the mean episode return.
        Replay saving is enabled on the environments when a replay path is available.
    Arguments:
        - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): A ``str`` is handed to ``read_config`` as a config
            path; anything else is deep-copied and unpacked as ``(user_config, create_cfg)``.
        - seed (:obj:`int`): Seed given to the config compiler, the env manager and ``set_pkg_seed``.
        - env_setting (:obj:`Optional[List[Any]]`): Three elements; the first is used as the env constructor
            and the third as the list of evaluator env configs (the second is ignored here).
            When ``None`` they are obtained from ``get_vec_env_setting``.
        - model (:obj:`Optional[torch.nn.Module]`): Model instance forwarded to ``create_policy``.
        - state_dict (:obj:`Optional[dict]`): State dict loaded into the eval-mode policy. When ``None`` it is
            read from a checkpoint file with ``torch.load``.
        - load_path (:obj:`Optional[str]`): Checkpoint path used when ``state_dict`` is ``None``; defaults to
            ``cfg.policy.learn.learner.load_path``.
        - replay_path (:obj:`Optional[str]`): Replay output path; the argument takes priority over
            ``cfg.env.replay_path``.
    Returns:
        - episode_return (:obj:`float`): ``np.mean`` of the ``eval_episode_return`` entries reported by the
            evaluator.
    """
    # Resolve the (user config, create config) pair from either a path or an in-memory tuple.
    if isinstance(input_cfg, str):
        user_cfg, create_cfg = read_config(input_cfg)
    else:
        user_cfg, create_cfg = deepcopy(input_cfg)
    custom_env_cls = env_setting[0] if env_setting is not None else None
    cfg = compile_config(
        user_cfg,
        seed=seed,
        env=custom_env_cls,
        auto=True,
        create_cfg=create_cfg,
        save_cfg=True,
        save_path='eval_config.py',
    )

    # Evaluator environments: one factory per evaluator env config.
    if env_setting is not None:
        env_fn, _, eval_env_cfgs = env_setting
    else:
        env_fn, _, eval_env_cfgs = get_vec_env_setting(cfg.env, collect=False)
    env_factories = []
    for single_env_cfg in eval_env_cfgs:
        env_factories.append(partial(env_fn, cfg=single_env_cfg))
    evaluator_env = create_env_manager(cfg.env.manager, env_factories)
    evaluator_env.seed(seed, dynamic_seed=False)

    # Explicit argument wins over the value stored in the config.
    replay_target = replay_path if replay_path is not None else cfg.env.get('replay_path', None)
    if replay_target:
        evaluator_env.enable_save_replay(replay_target)

    # Policy in eval mode, with weights taken from the argument or from a checkpoint on disk.
    set_pkg_seed(seed, use_cuda=cfg.policy.cuda)
    policy = create_policy(cfg.policy, model=model, enable_field=['eval'])
    weights = state_dict
    if weights is None:
        ckpt_path = cfg.policy.learn.learner.load_path if load_path is None else load_path
        weights = torch.load(ckpt_path, map_location='cpu')
    policy.eval_mode.load_state_dict(weights)
    evaluator = InteractionSerialEvaluator(cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode)

    # Run the evaluation and summarise it.
    _, episode_info = evaluator.eval()
    mean_return = np.mean(episode_info['eval_episode_return'])
    print('Eval is over! The performance of your RL policy is {}'.format(mean_return))
    return mean_return
def collect_demo_data(
        input_cfg: Union[str, Tuple[dict, dict]],
        seed: int,
        collect_count: int,
        expert_data_path: Optional[str] = None,
        env_setting: Optional[List[Any]] = None,
        model: Optional[torch.nn.Module] = None,
        state_dict: Optional[dict] = None,
        state_dict_path: Optional[str] = None,
) -> None:
    r"""
    Overview:
        Collect demonstration data (``collect_count`` samples) with a trained policy and save it.
    Arguments:
        - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): A ``str`` is handed to ``read_config`` as a config
            path; anything else is deep-copied and unpacked as ``(user_config, create_cfg)``.
        - seed (:obj:`int`): Random seed.
        - collect_count (:obj:`int`): Number of samples requested from the collector (``n_sample``).
        - expert_data_path (:obj:`Optional[str]`): File path the expert demo data is written to. When ``None``,
            ``cfg.policy.collect.save_path`` is used.
        - env_setting (:obj:`Optional[List[Any]]`): Three elements; the first is used as the env constructor
            and the second as the list of collector env configs (the third is ignored here).
            When ``None`` they are obtained from ``get_vec_env_setting``.
        - model (:obj:`Optional[torch.nn.Module]`): Model instance forwarded to ``create_policy``.
        - state_dict (:obj:`Optional[dict]`): State dict loaded into the collect-mode policy.
        - state_dict_path (:obj:`Optional[str]`): Path of a state dict file, loaded with ``torch.load`` when
            ``state_dict`` is ``None``. One of ``state_dict`` / ``state_dict_path`` must be given.
    Raises:
        - AssertionError: Both ``state_dict`` and ``state_dict_path`` are ``None``.
    """
    if isinstance(input_cfg, str):
        cfg, create_cfg = read_config(input_cfg)
    else:
        cfg, create_cfg = deepcopy(input_cfg)
    env_fn = None if env_setting is None else env_setting[0]
    cfg = compile_config(
        cfg,
        seed=seed,
        env=env_fn,
        auto=True,
        create_cfg=create_cfg,
        save_cfg=True,
        save_path='collect_demo_data_config.py'
    )
    if expert_data_path is None:
        expert_data_path = cfg.policy.collect.save_path

    # Create components: env, policy, collector
    if env_setting is None:
        env_fn, collector_env_cfg, _ = get_vec_env_setting(cfg.env, eval_=False)
    else:
        env_fn, collector_env_cfg, _ = env_setting
    collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg])
    collector_env.seed(seed)
    set_pkg_seed(seed, use_cuda=cfg.policy.cuda)
    policy = create_policy(cfg.policy, model=model, enable_field=['collect', 'eval'])

    # The plain collect-mode interface is used for the demonstrations. For policies such as DQN, whose
    # collect mode is eps-greedy, exploration is switched off below by passing ``eps=0`` to the collector.
    collect_demo_policy = policy.collect_mode
    if state_dict is None:
        assert state_dict_path is not None, "either `state_dict` or `state_dict_path` must be provided"
        state_dict = torch.load(state_dict_path, map_location='cpu')
    collect_demo_policy.load_state_dict(state_dict)
    collector = SampleSerialCollector(cfg.policy.collect.collector, collector_env, collect_demo_policy)

    # Only policies whose config declares an ``eps`` entry receive the ``eps`` keyword.
    policy_kwargs = {'eps': 0.} if hasattr(cfg.policy.other, 'eps') else None

    # Let's collect some expert demonstrations
    exp_data = collector.collect(n_sample=collect_count, policy_kwargs=policy_kwargs)
    if cfg.policy.cuda:
        # Move the collected data to CPU before it is written out.
        exp_data = to_device(exp_data, 'cpu')
    # Save data transitions.
    offline_data_save_type(exp_data, expert_data_path, data_type=cfg.policy.collect.get('data_type', 'naive'))
    print('Collect demo data successfully')
def collect_episodic_demo_data(
        input_cfg: Union[str, Tuple[dict, dict]],
        seed: int,
        collect_count: int,
        expert_data_path: str,
        env_setting: Optional[List[Any]] = None,
        model: Optional[torch.nn.Module] = None,
        state_dict: Optional[dict] = None,
        state_dict_path: Optional[str] = None,
) -> None:
    r"""
    Overview:
        Collect episodic demonstration data (``collect_count`` episodes) with a trained policy and save it.
    Arguments:
        - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): A ``str`` is handed to ``read_config`` as a config
            path; anything else is deep-copied and unpacked as ``(user_config, create_cfg)``.
        - seed (:obj:`int`): Random seed.
        - collect_count (:obj:`int`): Number of episodes requested from the collector (``n_episode``).
        - expert_data_path (:obj:`str`): File path the expert demo data is written to.
        - env_setting (:obj:`Optional[List[Any]]`): Three elements; the first is used as the env constructor
            and the second as the list of collector env configs (the third is ignored here).
            When ``None`` they are obtained from ``get_vec_env_setting``.
        - model (:obj:`Optional[torch.nn.Module]`): Model instance forwarded to ``create_policy``.
        - state_dict (:obj:`Optional[dict]`): State dict loaded into the collect-mode policy.
        - state_dict_path (:obj:`Optional[str]`): Path of a state dict file, loaded with ``torch.load`` when
            ``state_dict`` is ``None``. One of ``state_dict`` / ``state_dict_path`` must be given.
    Raises:
        - AssertionError: Both ``state_dict`` and ``state_dict_path`` are ``None``.
    """
    if isinstance(input_cfg, str):
        cfg, create_cfg = read_config(input_cfg)
    else:
        cfg, create_cfg = deepcopy(input_cfg)
    env_fn = None if env_setting is None else env_setting[0]
    cfg = compile_config(
        cfg,
        collector=EpisodeSerialCollector,
        seed=seed,
        env=env_fn,
        auto=True,
        create_cfg=create_cfg,
        save_cfg=True,
        save_path='collect_demo_data_config.py'
    )

    # Create components: env, policy, collector
    if env_setting is None:
        env_fn, collector_env_cfg, _ = get_vec_env_setting(cfg.env, eval_=False)
    else:
        env_fn, collector_env_cfg, _ = env_setting
    collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg])
    collector_env.seed(seed)
    set_pkg_seed(seed, use_cuda=cfg.policy.cuda)
    policy = create_policy(cfg.policy, model=model, enable_field=['collect', 'eval'])

    collect_demo_policy = policy.collect_mode
    if state_dict is None:
        assert state_dict_path is not None, "either `state_dict` or `state_dict_path` must be provided"
        state_dict = torch.load(state_dict_path, map_location='cpu')
    collect_demo_policy.load_state_dict(state_dict)
    collector = EpisodeSerialCollector(cfg.policy.collect.collector, collector_env, collect_demo_policy)

    # Only policies whose config declares an ``eps`` entry receive the ``eps`` keyword; it is forced to 0.
    policy_kwargs = {'eps': 0.} if hasattr(cfg.policy.other, 'eps') else None

    # Let's collect some expert demonstrations
    exp_data = collector.collect(n_episode=collect_count, policy_kwargs=policy_kwargs)
    if cfg.policy.cuda:
        # Move the collected data to CPU before it is written out.
        exp_data = to_device(exp_data, 'cpu')
    # Save data transitions.
    offline_data_save_type(exp_data, expert_data_path, data_type=cfg.policy.collect.get('data_type', 'naive'))
    print('Collect episodic demo data successfully')
def episode_to_transitions(data_path: str, expert_data_path: str, nstep: int) -> None:
    r"""
    Overview:
        Load pickled episodic data, convert every episode into n-step transitions with
        ``get_nstep_return_data`` and save the flattened result.
    Arguments:
        - data_path (:obj:`str`): Path of the pickle file holding the episodes.
        - expert_data_path (:obj:`str`): File path the resulting transitions are written to.
        - nstep (:obj:`int`): {s_{t}, a_{t}, s_{t+n}}.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
    with open(data_path, 'rb') as pkl_file:
        episodes = pickle.load(pkl_file)  # expected to be a list of episodes (see the collector output)
    # Flatten the per-episode n-step transitions into one list, keeping episode order.
    transitions = [
        transition for episode in episodes for transition in get_nstep_return_data(episode, nstep)
    ]
    offline_data_save_type(transitions, expert_data_path)
def episode_to_transitions_filter(
        data_path: str, expert_data_path: str, nstep: int, min_episode_return: float
) -> None:
    r"""
    Overview:
        Load pickled episodic data, keep only the episodes whose return (sum of the per-step ``reward``
        tensors) is not less than ``min_episode_return``, convert them into n-step transitions with
        ``get_nstep_return_data`` and save the flattened result.
    Arguments:
        - data_path (:obj:`str`): Path of the pickle file holding the episodes.
        - expert_data_path (:obj:`str`): File path the resulting transitions are written to.
        - nstep (:obj:`int`): {s_{t}, a_{t}, s_{t+n}}.
        - min_episode_return (:obj:`float`): Episodes whose return is strictly below this threshold are dropped.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
    with open(data_path, 'rb') as f:
        episodes = pickle.load(f)  # expected to be a list of episodes, each a sequence of step dicts
    post_process_data = []
    for episode in episodes:
        if len(episode) == 0:
            # torch.stack raises on an empty list, and an empty episode has no transitions to contribute.
            continue
        episode_return = torch.stack([step['reward'] for step in episode], dim=0).sum()
        if episode_return < min_episode_return:
            continue
        post_process_data.extend(get_nstep_return_data(episode, nstep))
    offline_data_save_type(
        post_process_data,
        expert_data_path,
    )
|