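"""Utilities for building webdataset pipelines over M4 webdocument and image/caption-pair shards."""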
import logging
import random
import webdataset as wds
from webdataset.tariterators import group_by_keys, tar_file_expander, url_opener
from m4.training.types import DatasetTypes
meta_prefix = "__"
meta_suffix = "__"
logger = logging.getLogger(__name__)
trace = False
def webdoc_valid_sample(sample):
"""Check whether a sample is valid.
:param sample: sample to be checked
"""
return (
sample is not None
and isinstance(sample, dict)
and len(list(sample.keys())) > 0
and not sample.get("__bad__", False)
and sample_has_all_files(sample)
)
def sample_has_all_files(current_sample):
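    """Check that every file listed in the sample's `metadata.value` (one name per line) is present in the sample."""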
meta = current_sample.get("metadata.value", None)
if meta is None:
return False
meta = meta.decode("utf-8")
if len(meta) == 0:
return False
target_file_list = meta.split("\n")
fname_keys = [key for key in current_sample.keys() if key.endswith(".fname")]
fnames = [current_sample[key] for key in fname_keys]
    return all(fname in fnames for fname in target_file_list)
class ImageDecoder:
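    """Decode raw image bytes into a loaded PIL image (PIL is imported lazily at call time)."""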
def __call__(self, bytes_):
import io
import PIL.Image
img = PIL.Image.open(io.BytesIO(bytes_))
img.load()
return img
# Taken from https://github.com/mlfoundations/open_clip/blob/c48111dacac55db24878af229d8a5662c03e6f1c/src/training/data.py#L180-L183
def log_and_continue(exn):
"""Call in an exception handler to ignore any exception, issue a warning, and continue."""
logging.warning(f"Handling webdataset error ({repr(exn)}). Ignoring.")
return True
# Adapt group_by_keys to our webdocument format, in which each sample contains several text and image files
# https://github.com/webdataset/webdataset/blob/039d74319ae55e5696dcef89829be9671802cf70/webdataset/tariterators.py#L195-L250
def group_by_keys_interleaved(data, handler=log_and_continue):
"""Return function over iterator that groups key, value pairs into samples."""
current_sample = None
for filesample in data:
try:
assert isinstance(filesample, dict)
fname, value = filesample["fname"], filesample["data"]
fname = fname.strip("./")
            if fname.endswith(".metadata.txt"):
                prefix, data_type, extension = fname.split(".")
                # Metadata files carry no per-document index.
                idx = None
                suffix = data_type
            else:
                prefix, idx, data_type, extension = fname.split(".")
                if data_type not in ["text", "image"]:
                    raise ValueError(f"{fname}: unknown data type {data_type}")
                suffix = idx
if trace:
print(
f"prefix: {prefix}, idx: {idx}, data_type: {data_type}, extension: {extension}, keys:"
f" {current_sample.keys() if isinstance(current_sample, dict) else None}"
)
if prefix is None:
continue
if current_sample is None or prefix != current_sample["__key__"]:
valid = webdoc_valid_sample(current_sample)
if valid:
yield current_sample
elif current_sample is not None:
logging.warning(f"{fname}: invalid sample {current_sample} ignored")
current_sample = dict(__key__=prefix, __url__=filesample["__url__"])
if suffix in current_sample:
raise ValueError(f"{fname}: duplicate file name in tar file {suffix} {current_sample.keys()}")
current_sample[f"{suffix}.value"] = value
current_sample[f"{suffix}.type"] = data_type
current_sample[f"{suffix}.fname"] = fname
except Exception as exn:
exn.args = exn.args + (filesample.get("stream"), filesample.get("url"))
if handler(exn):
continue
else:
break
if webdoc_valid_sample(current_sample):
yield current_sample
def _tarfile_to_webdocument_samples(src, handler=log_and_continue):
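    """Open tar shards, expand their members, and group them into webdocument samples."""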
streams = url_opener(src, handler=handler)
files = tar_file_expander(streams, handler=handler)
samples = group_by_keys_interleaved(files, handler=handler)
return samples
tarfile_to_webdocument_samples = wds.filters.pipelinefilter(_tarfile_to_webdocument_samples)
def _collate_texts_and_images_webdocument(data, handler=log_and_continue):
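    """Turn indexed `{idx}.value`/`{idx}.type` entries into parallel `texts` and `images` lists; None marks the missing modality at each position."""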
for sample in data:
try:
            max_example_index = max(
                [int(key.split(".")[0]) for key in sample.keys() if key.endswith(".value") and key != "metadata.value"]
            )
            texts = [None for _ in range(max_example_index + 1)]
            images = [None for _ in range(max_example_index + 1)]
            for idx in range(max_example_index + 1):
if f"{idx}.value" not in sample:
continue
if "text" in sample[f"{idx}.type"]:
texts[idx] = sample[f"{idx}.value"]
elif "image" in sample[f"{idx}.type"]:
images[idx] = sample[f"{idx}.value"]
else:
raise ValueError(f"Unknown data type: {sample[f'{idx}.type']}")
example = {"__key__": sample["__key__"], "__url__": sample["__url__"], "texts": texts, "images": images}
yield example
except Exception as exn:
exn.args = exn.args + (sample.get("stream"), sample.get("url"))
if handler(exn):
continue
else:
break
collate_texts_and_images_webdocument = wds.filters.pipelinefilter(_collate_texts_and_images_webdocument)
def _decode_image_and_text_webdocument(data, handler=log_and_continue):
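    """Decode image bytes with PIL and text bytes as UTF-8, keeping None placeholders untouched."""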
image_decoder = ImageDecoder()
for sample in data:
try:
sample["images"] = [image_decoder(image) if image is not None else None for image in sample["images"]]
sample["texts"] = [text.decode("utf-8") if text is not None else None for text in sample["texts"]]
yield sample
except Exception as exn:
exn.args = exn.args + (sample.get("stream"), sample.get("url"))
if handler(exn):
continue
else:
break
decode_image_and_text_webdocument = wds.filters.pipelinefilter(_decode_image_and_text_webdocument)
def collate_dicts(samples):
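    """Collate a list of sample dicts into a single dict of lists, keyed by the first sample's keys."""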
keys = samples[0].keys()
batched_samples = {key: [sample[key] for sample in samples] for key in keys}
return batched_samples
def get_webdocuments_webdataset(
urls,
batch_size,
shuffle_initial_urls_list=False,
shuffle_before_split_by_node_buffer_size=100,
shuffle_before_split_by_worker_buffer_size=100,
shuffle_after_tarfile_to_samples_buffer_size=100,
shuffle_after_batching_buffer_size=1000,
):
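    """Build a webdataset pipeline over webdocument shards.

    Shards are optionally shuffled, split by node and by worker, expanded into
    webdocument samples, decoded, and batched. Passing None for any of the
    shuffle buffer sizes disables that shuffle stage.
    """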
if shuffle_initial_urls_list:
random.shuffle(urls)
pipeline_list = [wds.SimpleShardList(urls)]
if shuffle_before_split_by_node_buffer_size is not None:
pipeline_list.append(wds.shuffle(shuffle_before_split_by_node_buffer_size))
pipeline_list.append(wds.split_by_node)
if shuffle_before_split_by_worker_buffer_size is not None:
pipeline_list.append(wds.shuffle(shuffle_before_split_by_worker_buffer_size))
pipeline_list.extend(
[
wds.split_by_worker,
tarfile_to_webdocument_samples(),
]
)
if shuffle_after_tarfile_to_samples_buffer_size is not None:
pipeline_list.append(wds.shuffle(shuffle_after_tarfile_to_samples_buffer_size))
pipeline_list.extend(
[
collate_texts_and_images_webdocument(),
decode_image_and_text_webdocument(),
wds.batched(batch_size, collation_fn=collate_dicts, partial=True),
]
)
if shuffle_after_batching_buffer_size is not None:
pipeline_list.append(wds.shuffle(shuffle_after_batching_buffer_size))
dataset = wds.DataPipeline(pipeline_list)
return dataset
def split_keep_2(x):
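    """Split a tar member name into (sample key, extension), keeping only the first two dot-separated fields."""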
x = x.strip("./")
    parts = x.split(".")
    return parts[0], parts[1]
def _tarfile_to_pair_samples(src, handler=log_and_continue):
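    """Open tar shards, expand their members, and group them into image/text pair samples."""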
streams = url_opener(src, handler=handler)
files = tar_file_expander(streams, handler=handler)
samples = group_by_keys(files, keys=split_keep_2, handler=handler)
return samples
tarfile_to_pair_samples = wds.filters.pipelinefilter(_tarfile_to_pair_samples)
def _decode_image_and_text_pairs(data, handler=log_and_continue):
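    """Decode the `image` bytes with PIL and the `text` bytes as UTF-8 for each pair sample."""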
image_decoder = ImageDecoder()
for sample in data:
try:
sample["image"] = image_decoder(sample["image"])
sample["text"] = sample["text"].decode("utf-8")
yield sample
except Exception as exn:
exn.args = exn.args + (sample.get("stream"), sample.get("url"))
if handler(exn):
continue
else:
break
decode_image_and_text_pairs = wds.filters.pipelinefilter(_decode_image_and_text_pairs)
def get_image_caption_pairs_webdataset(
urls,
batch_size,
shuffle_initial_urls_list=False,
shuffle_before_split_by_node_buffer_size=100,
shuffle_before_split_by_worker_buffer_size=100,
shuffle_after_tarfile_to_samples_buffer_size=100,
shuffle_after_batching_buffer_size=1000,
):
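    """Build a webdataset pipeline over image/caption-pair shards, mirroring get_webdocuments_webdataset."""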
if shuffle_initial_urls_list:
random.shuffle(urls)
pipeline_list = [wds.SimpleShardList(urls)]
if shuffle_before_split_by_node_buffer_size is not None:
pipeline_list.append(wds.shuffle(shuffle_before_split_by_node_buffer_size))
pipeline_list.append(wds.split_by_node)
if shuffle_before_split_by_worker_buffer_size is not None:
pipeline_list.append(wds.shuffle(shuffle_before_split_by_worker_buffer_size))
pipeline_list.extend(
[
wds.split_by_worker,
tarfile_to_pair_samples(handler=log_and_continue),
]
)
if shuffle_after_tarfile_to_samples_buffer_size is not None:
pipeline_list.append(wds.shuffle(shuffle_after_tarfile_to_samples_buffer_size))
pipeline_list.extend(
[
decode_image_and_text_pairs(),
wds.batched(batch_size, collation_fn=collate_dicts, partial=True), # todo: check if partial is needed
]
)
if shuffle_after_batching_buffer_size is not None:
pipeline_list.append(wds.shuffle(shuffle_after_batching_buffer_size))
dataset = wds.DataPipeline(pipeline_list)
return dataset
def get_webdataset(
urls,
ds_type: DatasetTypes,
batch_size: int,
shuffle_initial_urls_list,
shuffle_before_split_by_node_buffer_size,
shuffle_before_split_by_worker_buffer_size,
shuffle_after_tarfile_to_samples_buffer_size,
shuffle_after_batching_buffer_size,
):
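    """Dispatch to the webdocument or image/caption-pair pipeline based on `ds_type`.

    Illustrative usage (shard URLs and batch size are placeholders):

        dataset = get_webdataset(
            urls=["shard-000.tar", "shard-001.tar"],
            ds_type=DatasetTypes.WEB_DOCUMENTS,
            batch_size=8,
            shuffle_initial_urls_list=True,
            shuffle_before_split_by_node_buffer_size=100,
            shuffle_before_split_by_worker_buffer_size=100,
            shuffle_after_tarfile_to_samples_buffer_size=100,
            shuffle_after_batching_buffer_size=1000,
        )
    """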
if ds_type == DatasetTypes.WEB_DOCUMENTS:
return get_webdocuments_webdataset(
urls,
batch_size,
shuffle_initial_urls_list,
shuffle_before_split_by_node_buffer_size,
shuffle_before_split_by_worker_buffer_size,
shuffle_after_tarfile_to_samples_buffer_size,
shuffle_after_batching_buffer_size,
)
elif ds_type == DatasetTypes.IMAGE_CAPTION_PAIRS:
return get_image_caption_pairs_webdataset(
urls,
batch_size,
shuffle_initial_urls_list,
shuffle_before_split_by_node_buffer_size,
shuffle_before_split_by_worker_buffer_size,
shuffle_after_tarfile_to_samples_buffer_size,
shuffle_after_batching_buffer_size,
)
else:
raise ValueError(f"Unknown dataset type: {ds_type}")
def check_webdataset_command(command):
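    """Accept non-S3 shard URLs as-is; for S3 URLs, require a `pipe:bash` command that invokes get_file.sh and ends in `.tar`."""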
if "s3:/" not in command:
return True
command = command.strip()
if not command.startswith("pipe:bash"):
return False
if not command.endswith(".tar"):
return False
if "get_file.sh" not in command:
return False
return True