import torch

from . import BaseWrapperDataset, data_utils
from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel


class AddTargetDataset(BaseWrapperDataset):
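    """Dataset wrapper that attaches a label (e.g. a transcription) to each
    item of the wrapped dataset, for tasks such as speech recognition.

    Labels may be stored compressed (see ``text_compression_level``) and are
    decompressed on access. When ``batch_targets`` is set, :meth:`collater`
    pads the targets into a single tensor and records per-sample lengths;
    when ``add_to_input`` is set, it additionally builds EOS-delimited
    ``prev_output_tokens`` for teacher forcing.

    Example (illustrative sketch; ``speech_dataset``, ``transcripts`` and
    ``dictionary`` are hypothetical stand-ins for a wrapped audio dataset,
    its labels, and a fairseq ``Dictionary``)::

        dataset = AddTargetDataset(
            speech_dataset,
            transcripts,
            pad=dictionary.pad(),
            eos=dictionary.eos(),
            batch_targets=True,
            process_label=lambda lbl: dictionary.encode_line(
                lbl, append_eos=False
            ),
        )
    """
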
    def __init__(
        self,
        dataset,
        labels,
        pad,
        eos,
        batch_targets,
        process_label=None,
        label_len_fn=None,
        add_to_input=False,
        text_compression_level=TextCompressionLevel.none,
    ):
        super().__init__(dataset)
        self.labels = labels
        self.batch_targets = batch_targets
        self.pad = pad
        self.eos = eos
        self.process_label = process_label
        # size() calls label_len_fn unconditionally, so fall back to plain
        # string length when no custom length function is provided.
        self.label_len_fn = label_len_fn if label_len_fn is not None else len
        self.add_to_input = add_to_input
        self.text_compressor = TextCompressor(level=text_compression_level)

    def get_label(self, index, process_fn=None):
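        """Return the decompressed label at ``index``, passed through
        ``process_fn`` (e.g. tokenization into a tensor) when given."""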
        lbl = self.labels[index]
        lbl = self.text_compressor.decompress(lbl)
        return lbl if process_fn is None else process_fn(lbl)

    def __getitem__(self, index):
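        """Return the wrapped item with its processed label attached under
        the ``"label"`` key."""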
        item = self.dataset[index]
        item["label"] = self.get_label(index, process_fn=self.process_label)
        return item

    def size(self, index):
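        """Return a ``(source size, label length)`` pair, used for
        length-based filtering and batching."""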
        sz = self.dataset.size(index)
        own_sz = self.label_len_fn(self.get_label(index))
        return sz, own_sz

    def collater(self, samples):
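        """Merge ``samples`` into a mini-batch.

        Extends the batch produced by the wrapped dataset's collater with
        ``target`` and ``ntokens``; when ``batch_targets`` is set, targets
        are padded into one tensor and ``target_lengths`` is recorded, and
        any ``prev_output_tokens`` built for ``add_to_input`` are padded as
        well."""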
        collated = self.dataset.collater(samples)
        if len(collated) == 0:
            return collated
        # The wrapped collater may have dropped samples (e.g. invalid audio),
        # so only keep labels for the ids that survived.
        indices = set(collated["id"].tolist())
        target = [s["label"] for s in samples if s["id"] in indices]

        if self.add_to_input:
            # Build teacher-forcing inputs: prev_output_tokens is the target
            # shifted right with EOS prepended, while the target itself gets
            # EOS appended.
            eos = torch.LongTensor([self.eos])
            prev_output_tokens = [torch.cat([eos, t], dim=-1) for t in target]
            target = [torch.cat([t, eos], dim=-1) for t in target]
            collated["net_input"]["prev_output_tokens"] = prev_output_tokens

        if self.batch_targets:
            collated["target_lengths"] = torch.LongTensor([len(t) for t in target])
            target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False)
            collated["ntokens"] = collated["target_lengths"].sum().item()
            # net_input is a dict, so look the key up with .get() rather than
            # getattr(), which would always fall through to the default here.
            if collated["net_input"].get("prev_output_tokens") is not None:
                collated["net_input"]["prev_output_tokens"] = data_utils.collate_tokens(
                    collated["net_input"]["prev_output_tokens"],
                    pad_idx=self.pad,
                    left_pad=False,
                )
        else:
            collated["ntokens"] = sum(len(t) for t in target)

        collated["target"] = target
        return collated

    def filter_indices_by_size(self, indices, max_sizes):
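        """Filter out indices whose source or label size exceeds
        ``max_sizes``; returns the kept indices and the ignored ones."""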
        indices, ignored = data_utils._filter_by_size_dynamic(
            indices, self.size, max_sizes
        )
        return indices, ignored