Second Push

05c9ac2 over 1 year ago

9.79 kB

	import os
	from typing import List, Tuple
	import numpy as np
	from mlagents.trainers.buffer import AgentBuffer, BufferKey
	from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
	AgentInfoActionPairProto,
	)
	from mlagents.trainers.trajectory import ObsUtil
	from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto
	from mlagents_envs.base_env import BehaviorSpec
	from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
	from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
	DemonstrationMetaProto,
	)
	from mlagents_envs.timers import timed, hierarchical_timer
	from google.protobuf.internal.decoder import _DecodeVarint32 # type: ignore
	from google.protobuf.internal.encoder import _EncodeVarint # type: ignore


	INITIAL_POS = 33
	SUPPORTED_DEMONSTRATION_VERSIONS = frozenset([0, 1])


	@timed
	def make_demo_buffer(
	pair_infos: List[AgentInfoActionPairProto],
	behavior_spec: BehaviorSpec,
	sequence_length: int,
	) -> AgentBuffer:
	# Create and populate buffer using experiences
	demo_raw_buffer = AgentBuffer()
	demo_processed_buffer = AgentBuffer()
	for idx, current_pair_info in enumerate(pair_infos):
	if idx > len(pair_infos) - 2:
	break
	next_pair_info = pair_infos[idx + 1]
	current_decision_step, current_terminal_step = steps_from_proto(
	[current_pair_info.agent_info], behavior_spec
	)
	next_decision_step, next_terminal_step = steps_from_proto(
	[next_pair_info.agent_info], behavior_spec
	)
	previous_action = (
	np.array(
	pair_infos[idx].action_info.vector_actions_deprecated, dtype=np.float32
	)
	* 0
	)
	if idx > 0:
	previous_action = np.array(
	pair_infos[idx - 1].action_info.vector_actions_deprecated,
	dtype=np.float32,
	)

	next_done = len(next_terminal_step) == 1
	next_reward = 0
	if len(next_terminal_step) == 1:
	next_reward = next_terminal_step.reward[0]
	else:
	next_reward = next_decision_step.reward[0]
	current_obs = None
	if len(current_terminal_step) == 1:
	current_obs = list(current_terminal_step.values())[0].obs
	else:
	current_obs = list(current_decision_step.values())[0].obs

	demo_raw_buffer[BufferKey.DONE].append(next_done)
	demo_raw_buffer[BufferKey.ENVIRONMENT_REWARDS].append(next_reward)
	for i, obs in enumerate(current_obs):
	demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs)
	if (
	len(current_pair_info.action_info.continuous_actions) == 0
	and len(current_pair_info.action_info.discrete_actions) == 0
	):
	if behavior_spec.action_spec.continuous_size > 0:
	demo_raw_buffer[BufferKey.CONTINUOUS_ACTION].append(
	current_pair_info.action_info.vector_actions_deprecated
	)
	else:
	demo_raw_buffer[BufferKey.DISCRETE_ACTION].append(
	current_pair_info.action_info.vector_actions_deprecated
	)
	else:
	if behavior_spec.action_spec.continuous_size > 0:
	demo_raw_buffer[BufferKey.CONTINUOUS_ACTION].append(
	current_pair_info.action_info.continuous_actions
	)
	if behavior_spec.action_spec.discrete_size > 0:
	demo_raw_buffer[BufferKey.DISCRETE_ACTION].append(
	current_pair_info.action_info.discrete_actions
	)
	demo_raw_buffer[BufferKey.PREV_ACTION].append(previous_action)
	if next_done:
	demo_raw_buffer.resequence_and_append(
	demo_processed_buffer, batch_size=None, training_length=sequence_length
	)
	demo_raw_buffer.reset_agent()
	demo_raw_buffer.resequence_and_append(
	demo_processed_buffer, batch_size=None, training_length=sequence_length
	)
	return demo_processed_buffer


	@timed
	def demo_to_buffer(
	file_path: str, sequence_length: int, expected_behavior_spec: BehaviorSpec = None
	) -> Tuple[BehaviorSpec, AgentBuffer]:
	"""
	Loads demonstration file and uses it to fill training buffer.
	:param file_path: Location of demonstration file (.demo).
	:param sequence_length: Length of trajectories to fill buffer.
	:return:
	"""
	behavior_spec, info_action_pair, _ = load_demonstration(file_path)
	demo_buffer = make_demo_buffer(info_action_pair, behavior_spec, sequence_length)
	if expected_behavior_spec:
	# check action dimensions in demonstration match
	if behavior_spec.action_spec != expected_behavior_spec.action_spec:
	raise RuntimeError(
	"The actions {} in demonstration do not match the policy's {}.".format(
	behavior_spec.action_spec, expected_behavior_spec.action_spec
	)
	)
	# check observations match
	if len(behavior_spec.observation_specs) != len(
	expected_behavior_spec.observation_specs
	):
	raise RuntimeError(
	"The demonstrations do not have the same number of observations as the policy."
	)
	else:
	for i, (demo_obs, policy_obs) in enumerate(
	zip(
	behavior_spec.observation_specs,
	expected_behavior_spec.observation_specs,
	)
	):
	if demo_obs.shape != policy_obs.shape:
	raise RuntimeError(
	f"The shape {demo_obs} for observation {i} in demonstration \
	do not match the policy's {policy_obs}."
	)
	return behavior_spec, demo_buffer


	def get_demo_files(path: str) -> List[str]:
	"""
	Retrieves the demonstration file(s) from a path.
	:param path: Path of demonstration file or directory.
	:return: List of demonstration files

	Raises errors if \|path\| is invalid.
	"""
	if os.path.isfile(path):
	if not path.endswith(".demo"):
	raise ValueError("The path provided is not a '.demo' file.")
	return [path]
	elif os.path.isdir(path):
	paths = [
	os.path.join(path, name)
	for name in os.listdir(path)
	if name.endswith(".demo")
	]
	if not paths:
	raise ValueError("There are no '.demo' files in the provided directory.")
	return paths
	else:
	raise FileNotFoundError(
	f"The demonstration file or directory {path} does not exist."
	)


	@timed
	def load_demonstration(
	file_path: str,
	) -> Tuple[BehaviorSpec, List[AgentInfoActionPairProto], int]:
	"""
	Loads and parses a demonstration file.
	:param file_path: Location of demonstration file (.demo).
	:return: BrainParameter and list of AgentInfoActionPairProto containing demonstration data.
	"""

	# First 32 bytes of file dedicated to meta-data.
	file_paths = get_demo_files(file_path)
	behavior_spec = None
	brain_param_proto = None
	info_action_pairs = []
	total_expected = 0
	for _file_path in file_paths:
	with open(_file_path, "rb") as fp:
	with hierarchical_timer("read_file"):
	data = fp.read()
	next_pos, pos, obs_decoded = 0, 0, 0
	while pos < len(data):
	next_pos, pos = _DecodeVarint32(data, pos)
	if obs_decoded == 0:
	meta_data_proto = DemonstrationMetaProto()
	meta_data_proto.ParseFromString(data[pos : pos + next_pos])
	if (
	meta_data_proto.api_version
	not in SUPPORTED_DEMONSTRATION_VERSIONS
	):
	raise RuntimeError(
	f"Can't load Demonstration data from an unsupported version ({meta_data_proto.api_version})"
	)
	total_expected += meta_data_proto.number_steps
	pos = INITIAL_POS
	if obs_decoded == 1:
	brain_param_proto = BrainParametersProto()
	brain_param_proto.ParseFromString(data[pos : pos + next_pos])
	pos += next_pos
	if obs_decoded > 1:
	agent_info_action = AgentInfoActionPairProto()
	agent_info_action.ParseFromString(data[pos : pos + next_pos])
	if behavior_spec is None:
	behavior_spec = behavior_spec_from_proto(
	brain_param_proto, agent_info_action.agent_info
	)
	info_action_pairs.append(agent_info_action)
	if len(info_action_pairs) == total_expected:
	break
	pos += next_pos
	obs_decoded += 1
	if not behavior_spec:
	raise RuntimeError(
	f"No BrainParameters found in demonstration file at {file_path}."
	)
	return behavior_spec, info_action_pairs, total_expected


	def write_delimited(f, message):
	msg_string = message.SerializeToString()
	msg_size = len(msg_string)
	_EncodeVarint(f.write, msg_size)
	f.write(msg_string)


	def write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos):
	with open(demo_path, "wb") as f:
	# write metadata
	write_delimited(f, meta_data_proto)
	f.seek(INITIAL_POS)
	write_delimited(f, brain_param_proto)

	for agent in agent_info_protos:
	write_delimited(f, agent)