# whisper-large-mgb2-ar-16k / split_xml_mgb2.py
# Zaid's picture
# Training in progress, step 2000
# d9d7490
from pathlib import Path
import soundfile as sf
import xml.etree.ElementTree as ET
# Which portion of the corpus is being prepared: "train" or "dev".
split = "train"

# Location of the extracted mgb2 archive; point this at your own copy.
archive_path = Path("data/train")

# Inputs: the wav directory and the directory of utf8 xml segment files.
wav_dir = archive_path.joinpath("wav")
segments_file = archive_path.joinpath("xml", "utf8")

# Outputs: one wav directory and one txt directory under dataset/<split>.
output_wav_dir, output_txt_dir = (
    archive_path.joinpath("dataset", split, kind) for kind in ("wav", "txt")
)

# Make sure both output directories exist before anything is written to them.
for _out_dir in (output_wav_dir, output_txt_dir):
    _out_dir.mkdir(parents=True, exist_ok=True)
# Sample rate (Hz) used both to convert segment times into frame offsets and
# to write the extracted clips.
# NOTE(review): the rate returned by sf.read is discarded, so the source
# recordings are assumed to already be 16 kHz -- confirm for your data.
SAMPLE_RATE = 16_000

# for all xml segments files under utf8 directory from archive
for s_file in segments_file.glob("*.xml"):
    tree = ET.parse(str(s_file))
    root = tree.getroot()
    head = root[0]
    segments = root[1][0]

    # get the name of the wav file from the recording tag; reset it for every
    # xml file so that a file without a recording tag cannot silently reuse
    # the name left over from the previous xml file
    file_name = None
    for child in head:
        if child.tag == "recording":
            print(child.attrib)
            file_name = child.attrib.get("filename")
    if file_name is None:
        print(f"skipping {s_file}: no recording filename found")
        continue

    # the source recording is the same for every segment of this xml file
    wav_path = wav_dir / f"{file_name}.wav"

    # get the start and end times from the segment under segments tag
    # and join the text from each segment to construct the transcript
    for segment in segments:
        start_time = int(float(segment.attrib.get("starttime")) * SAMPLE_RATE)
        end_time = int(float(segment.attrib.get("endtime")) * SAMPLE_RATE)
        # an element without text content has .text set to None, which
        # str.join cannot handle, so such elements are left out
        text = " ".join(x.text for x in segment if x.text is not None)

        # now store the meta data and the correctly sampled wav file in the
        # correct output directories
        segment_name = f"{file_name}_seg{start_time}_{end_time}"
        sound, _ = sf.read(wav_path, start=start_time, stop=end_time)
        sf.write(output_wav_dir / f"{segment_name}.wav", sound, SAMPLE_RATE)
        # write_text closes the file for us; the encoding is pinned to UTF-8
        # so the transcript does not depend on the platform default encoding
        (output_txt_dir / f"{segment_name}.txt").write_text(text, encoding="utf-8")