# whisper_large_CGN / scripts /subs-annot.py
# Jakob Poncelet
# Scripts
# b9e18f6
# raw
# history blame
# 2.96 kB
import numpy as np
import kaldiio
import datasets
from transformers.utils import logging
# Module-level logger, created through the logging helper imported from
# `transformers.utils` rather than the standard-library `logging` module.
logger = logging.get_logger(__name__)
# Free-text description reported in the dataset info.
_DESCRIPTION = "Annotated Subtitles"

# Data directory per feature type; looked up with the builder config name.
_FILEPATHS = {
    "fbank_pitch": "/esat/spchtemp/scratch/jponcele/espnet2/dump/fbank_pitch/subs_annot",
    "raw": "/esat/spchtemp/scratch/jponcele/espnet2/dump/raw/subs_annot",
}

# File name of the per-utterance index inside the data directory, per feature
# type.
_FEATURES_NAME = {
    "fbank_pitch": "feats.scp",
    "raw": "wav.scp",
}
class CGNConfig(datasets.BuilderConfig):
    """Builder configuration whose version is fixed at 2.6.1."""

    def __init__(self, **kwargs):
        """Create the configuration with a pinned dataset version.

        Args:
            **kwargs: keyword arguments forwarded unchanged to
                `datasets.BuilderConfig` (for example `name` and
                `description`). `version` is always supplied here, so it
                must not be passed by the caller.
        """
        pinned_version = datasets.Version("2.6.1", "")
        super().__init__(version=pinned_version, **kwargs)
class CGN(datasets.GeneratorBasedBuilder):
    """Dataset builder pairing transcripts with their index-file entries.

    For the selected config the builder reads a `text` file and an index file
    (named by `_FEATURES_NAME`) from the directory given by `_FILEPATHS`, and
    yields one example for every utterance id found in both. The "audio"
    field carries the index entry as a plain string; no audio or features are
    loaded or decoded by this builder.
    """

    DEFAULT_WRITER_BATCH_SIZE = 256
    DEFAULT_CONFIG_NAME = "raw"
    BUILDER_CONFIGS = [
        CGNConfig(name="raw", description="All Components"),
    ]

    def _info(self):
        """Return the dataset metadata: three string columns (audio, text, id)."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "audio": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("text",),
        )

    def _split_generators(self, _):
        """Return the single "test" split.

        The positional argument is ignored: the data is read from the fixed
        local paths in `_FILEPATHS`, so nothing is downloaded or extracted.
        """
        return [
            datasets.SplitGenerator(
                name="test",
                gen_kwargs={},
            )
        ]

    @staticmethod
    def _read_transcripts(path):
        """Parse a `<uttid> <transcript>` file into an ordered dict.

        The id is everything before the first space and the transcript is
        everything after it (trailing whitespace removed). Lines without a
        space, i.e. without a transcript, are skipped. If an id occurs more
        than once, the last transcript wins.
        """
        transcripts = {}
        with open(path, "r", encoding="utf-8") as handle:
            for line in handle:
                uttid, sep, transcript = line.rstrip().partition(" ")
                if sep:
                    transcripts[uttid] = transcript
        return transcripts

    @staticmethod
    def _read_index(path):
        """Parse a `<uttid> <entry>` index file into a dict.

        Fields are separated by the first run of whitespace. Blank lines and
        lines holding only an id are skipped instead of aborting the whole
        load; an utterance dropped this way is later reported as missing.
        """
        entries = {}
        with open(path, "r", encoding="utf-8") as handle:
            for line in handle:
                fields = line.strip().split(maxsplit=1)
                if len(fields) == 2:
                    uttid, entry = fields
                    entries[uttid] = entry
        return entries

    def _generate_examples(self):
        """Yield `(key, example)` pairs for the configured data directory.

        Yields:
            A tuple of an integer key and a dict with the fields "audio"
            (the index entry string), "text" (the transcript) and "id" (the
            utterance id).

        Raises:
            KeyError: if the config name has no entry in `_FILEPATHS` or
                `_FEATURES_NAME`.
            OSError: if the transcript or index file cannot be opened.
        """
        config_name = self.config.name
        data_dir = _FILEPATHS[config_name]

        transcripts = self._read_transcripts(f"{data_dir}/text")
        index = self._read_index(f"{data_dir}/{_FEATURES_NAME[config_name]}")

        # The key is the utterance's position in the transcript file, so
        # skipped utterances leave gaps but keys remain unique.
        for key, (uttid, transcript) in enumerate(transcripts.items()):
            entry = index.get(uttid)
            if entry is None:
                logger.warning(f"Missing utterance: {uttid}")
                continue
            yield key, {"audio": entry, "text": transcript, "id": uttid}