File size: 2,956 Bytes
b9e18f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import numpy as np
import kaldiio
import datasets
from transformers.utils import logging
# Module-level logger, obtained through the transformers logging utilities.
logger = logging.get_logger(__name__)
_DESCRIPTION = "Corpus Gesproken Nederlands"
_FILEPATHS = {
"fbank_pitch": "/esat/spchtemp/scratch/jponcele/espnet2/dump/fbank_pitch/dev_s",
"raw": "/esat/spchtemp/scratch/jponcele/espnet2/dump/raw/dev_s"
}
_FEATURES_NAME = {
"fbank_pitch": "feats.scp",
"raw": "wav.scp"
}
class CGNConfig(datasets.BuilderConfig):
    """Builder configuration for the CGN dataset, pinned to version 2.6.1."""

    def __init__(self, **kwargs):
        """Create a CGN builder configuration.

        Args:
            **kwargs: keyword arguments passed through unchanged to
                ``datasets.BuilderConfig.__init__`` (the builder in this
                file supplies ``name`` and ``description``). ``version``
                is set here and therefore must not be passed by the caller.
        """
        cgn_version = datasets.Version("2.6.1", "")
        super().__init__(version=cgn_version, **kwargs)
class CGN(datasets.GeneratorBasedBuilder):
    """Dataset builder for the Corpus Gesproken Nederlands (CGN) ``dev_s`` dump.

    The data directory is selected from ``_FILEPATHS`` by the active config
    name. It is expected to contain a ``text`` file (``<utt-id> <transcript>``
    per line) and an scp index file named by ``_FEATURES_NAME``
    (``<utt-id> <location>`` per line). Examples carry the scp entry as a
    plain string; no audio or feature data is loaded by this builder.
    """

    DEFAULT_WRITER_BATCH_SIZE = 256
    DEFAULT_CONFIG_NAME = "raw"
    BUILDER_CONFIGS = [
        CGNConfig(name="raw", description="All Components"),
    ]

    def _info(self):
        """Return the dataset metadata: three string features per example."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "audio": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("text",),
        )

    def _split_generators(self, _):
        """Return the single ``"test"`` split.

        Args:
            _: unused; presumably the download manager passed in by the
                ``datasets`` framework. Nothing is downloaded because the
                data is read from the fixed paths in ``_FILEPATHS``.
        """
        return [
            datasets.SplitGenerator(
                name="test",
                gen_kwargs={},
            )
        ]

    @staticmethod
    def _read_transcripts(path):
        """Parse a ``text`` file into an ordered ``{utt_id: transcript}`` dict.

        The utterance id is everything before the first single space; the
        remainder of the right-stripped line is the transcript. Lines without
        a transcript (no space after right-stripping) are skipped. When an
        utterance id occurs more than once, the last transcript wins.
        """
        transcripts = {}
        # Explicit UTF-8 so decoding does not depend on the process locale.
        with open(path, "r", encoding="utf-8") as handle:
            for line in handle:
                uttid, sep, transcript = line.rstrip().partition(" ")
                if not sep:
                    continue
                transcripts[uttid] = transcript
        return transcripts

    @staticmethod
    def _read_scp(path):
        """Parse an scp index file into a ``{utt_id: location}`` dict.

        Each line is split once on whitespace into the utterance id and the
        rest of the line. Blank lines are ignored and lines with an id but no
        location are skipped with a warning, instead of aborting the whole
        build with a ``ValueError``.
        """
        entries = {}
        with open(path, "r", encoding="utf-8") as handle:
            for line in handle:
                fields = line.strip().split(maxsplit=1)
                if len(fields) != 2:
                    if fields:
                        logger.warning(f"Skipping malformed scp line: {fields[0]}")
                    continue
                uttid, location = fields
                entries[uttid] = location
        return entries

    def _generate_examples(self):
        """Yield ``(key, example)`` pairs for the configured data directory.

        Yields:
            ``(key, {"audio": ..., "text": ..., "id": ...})`` where ``key`` is
            the position of the utterance in the ``text`` file's parsed
            entries, ``audio`` is the raw scp entry for the utterance,
            ``text`` its transcript and ``id`` the utterance id. Utterances
            with a transcript but no scp entry are logged and skipped; their
            key is not reused.
        """
        config_name = self.config.name
        data_dir = _FILEPATHS[config_name]

        transcripts = self._read_transcripts(f"{data_dir}/text")
        locations = self._read_scp(f"{data_dir}/{_FEATURES_NAME[config_name]}")

        for key, (uttid, transcript) in enumerate(transcripts.items()):
            location = locations.get(uttid)
            if location is None:
                logger.warning(f"Missing utterance: {uttid}")
                continue
            yield key, {"audio": location, "text": transcript, "id": uttid}
|