kul-speech-lab
/

whisper_large_CGN

Automatic Speech Recognition

Generated from Trainer

Inference Endpoints

Model card Files Files and versions Community

whisper_large_CGN / scripts /subs-annot.py

Jakob Poncelet

Scripts

b9e18f6 over 1 year ago

2.96 kB

	import numpy as np
	import kaldiio

	import datasets
	from transformers.utils import logging


	# Module-level logger, created through the `logging` helper imported from transformers.utils.
	logger = logging.get_logger(__name__)

	_DESCRIPTION = "Annotated Subtitles"

	_FILEPATHS = {
	"fbank_pitch": "/esat/spchtemp/scratch/jponcele/espnet2/dump/fbank_pitch/subs_annot",
	"raw": "/esat/spchtemp/scratch/jponcele/espnet2/dump/raw/subs_annot"
	}

	_FEATURES_NAME = {
	"fbank_pitch": "feats.scp",
	"raw": "wav.scp"
	}


	class CGNConfig(datasets.BuilderConfig):
	    """Builder configuration whose version is always pinned to 2.6.1."""

	    def __init__(self, **kwargs):
	        """Create the configuration.

	        Args:
	            **kwargs: keyword arguments forwarded unchanged to
	                ``datasets.BuilderConfig`` (the builder in this file passes
	                ``name`` and ``description``). ``version`` must not be
	                supplied, since it is set here.
	        """
	        pinned_version = datasets.Version("2.6.1", "")
	        super().__init__(version=pinned_version, **kwargs)


	class CGN(datasets.GeneratorBasedBuilder):
	    """Dataset builder for the annotated-subtitles data.

	    Every example pairs an utterance id with its transcript (read from the
	    ``text`` file of the data directory) and with the matching entry of the
	    scp index file selected by the active config name.
	    """

	    # Batch size used by the `datasets` writer (see the datasets builder docs).
	    DEFAULT_WRITER_BATCH_SIZE = 256
	    DEFAULT_CONFIG_NAME = "raw"
	    BUILDER_CONFIGS = [
	        CGNConfig(name="raw", description="All Components"),
	    ]

	    def _info(self):
	        """Return the dataset metadata: three string features per example."""
	        return datasets.DatasetInfo(
	            description=_DESCRIPTION,
	            features=datasets.Features(
	                {
	                    "audio": datasets.Value("string"),
	                    "text": datasets.Value("string"),
	                    "id": datasets.Value("string"),
	                }
	            ),
	            supervised_keys=("text",),
	        )

	    def _split_generators(self, _):
	        """Return the single ``test`` split; the argument is ignored.

	        Nothing is downloaded: the data is read from the fixed directories
	        listed in ``_FILEPATHS``.
	        """
	        return [
	            datasets.SplitGenerator(
	                name="test",
	                gen_kwargs={},
	            )
	        ]

	    @staticmethod
	    def _read_transcripts(path):
	        """Parse a ``text`` file into an ordered ``{utterance_id: transcript}`` dict.

	        Each line is right-stripped and cut at its first space. Lines with
	        no space left after stripping (an id without a transcript, or a
	        blank line) are dropped.
	        """
	        transcripts = {}
	        with open(path, "r") as handle:
	            for line in handle:
	                uttid, sep, transcript = line.rstrip().partition(" ")
	                if sep:
	                    transcripts[uttid] = transcript
	        return transcripts

	    @staticmethod
	    def _read_scp(path):
	        """Parse an scp file into ``{utterance_id: rest_of_line}``.

	        Blank lines are ignored. Lines that hold an id but nothing else are
	        skipped with a warning instead of aborting the whole load with a
	        ``ValueError``.
	        """
	        entries = {}
	        with open(path, "r") as handle:
	            for line in handle:
	                fields = line.strip().split(maxsplit=1)
	                if len(fields) != 2:
	                    if fields:
	                        logger.warning("Skipping malformed line in %s: %r", path, line)
	                    continue
	                uttid, location = fields
	                entries[uttid] = location
	        return entries

	    def _generate_examples(self):
	        """Yield ``(key, example)`` pairs for every transcript that has an scp entry.

	        ``key`` is the position of the utterance in the ``text`` file's
	        parsed order; utterances without an scp entry are logged and
	        skipped, so their keys are simply absent. The ``audio`` field holds
	        the raw scp entry as a string (presumably a wav path or a Kaldi
	        pipe command; confirm against the data).
	        """
	        config_name = self.config.name
	        data_dir = _FILEPATHS[config_name]

	        transcripts = self._read_transcripts(f"{data_dir}/text")
	        scp_entries = self._read_scp(f"{data_dir}/{_FEATURES_NAME[config_name]}")

	        for key, (uttid, transcript) in enumerate(transcripts.items()):
	            wav = scp_entries.get(uttid)
	            if wav is None:
	                logger.warning("Missing utterance: %s", uttid)
	                continue

	            yield key, {"audio": wav, "text": transcript, "id": uttid}