VISOR-GPT / train /scripts /prepare_s2t_data_manifest.py
szukevin's picture
upload
7900c16
raw
history blame
1.99 kB
#!/usr/bin/env python3
"""
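Helper script to generate a speech2text dataset manifest: one tab-separated
line (transcript, audio file path) per audio file found under the root directory.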
"""
import argparse
import os
import glob
import random
def _read_transcripts(trans_path):
    """Parse one ``*.trans.txt`` file into ``{utterance_id: lowercased text}``.

    Each non-blank line is split on whitespace: the first field is the
    utterance id, the remaining fields are re-joined with single spaces and
    lowercased. Blank lines are ignored.

    Raises:
        FileNotFoundError: if ``trans_path`` does not exist.
    """
    texts = {}
    with open(trans_path, "r", encoding="utf-8") as trans_f:
        for tline in trans_f:
            items = tline.split()
            if not items:
                # A blank (or whitespace-only) line carries no utterance id;
                # indexing items[0] would raise IndexError.
                continue
            texts[items[0]] = " ".join(items[1:]).lower()
    return texts


def main():
    """Write a ``<transcript>\\t<audio path>`` manifest for audio files under ``root``.

    Command-line arguments:
        root            directory searched recursively for ``*.<ext>`` files.
        --output-file   manifest path (required); parent directories are created.
        --for_finetune  also write a ``text\\twav_path`` header line first.
        --sample-ratio  each file is kept when a seeded random draw does not
                        exceed this value (default 1.0 keeps everything).
        --ext           audio file extension to look for (default ``flac``).
        --seed          seed of the sampling random generator (default 7).

    For an audio file in ``.../<A>/<B>/``, the transcript is looked up in
    ``.../<A>/<B>/<A>-<B>.trans.txt`` under the key given by the audio file
    name up to its first dot.

    Raises:
        FileNotFoundError: if the transcript file of an audio directory is missing.
        KeyError: if an audio file has no entry in its transcript file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("root")
    parser.add_argument("--output-file", required=True)
    parser.add_argument("--for_finetune", action='store_true')
    parser.add_argument("--sample-ratio", type=float, default=1.0)
    parser.add_argument("--ext", default="flac")
    parser.add_argument("--seed", type=int, default=7)
    args = parser.parse_args()

    rand = random.Random(args.seed)
    os.makedirs(os.path.dirname(os.path.realpath(args.output_file)), exist_ok=True)
    dir_path = os.path.realpath(args.root)
    search_path = os.path.join(dir_path, "**/*." + args.ext)

    # Cache of parsed transcript files, keyed by audio directory, so each
    # transcript file is read only once.
    transcriptions = {}

    with open(args.output_file, "w", encoding="utf-8") as out_file:
        if args.for_finetune:
            print("text" + "\t" + "wav_path", file=out_file)

        # glob yields paths in arbitrary filesystem order; sorting makes both
        # the seeded sampling below and the manifest order reproducible.
        for fname in sorted(glob.iglob(search_path, recursive=True)):
            if rand.random() > args.sample_ratio:
                continue

            file_path = os.path.realpath(fname)
            audio_dir = os.path.dirname(file_path)

            if audio_dir not in transcriptions:
                parts = audio_dir.split(os.path.sep)
                trans_name = f"{parts[-2]}-{parts[-1]}.trans.txt"
                # audio_dir is already absolute, so no root prefix is needed.
                trans_path = os.path.join(audio_dir, trans_name)
                try:
                    transcriptions[audio_dir] = _read_transcripts(trans_path)
                except FileNotFoundError as err:
                    # Explicit exception instead of assert: asserts are
                    # stripped when Python runs with -O.
                    raise FileNotFoundError(
                        f"transcript file not found for {file_path}: {trans_path}"
                    ) from err

            utt_id = os.path.basename(file_path).split(".")[0]
            try:
                text = transcriptions[audio_dir][utt_id]
            except KeyError:
                raise KeyError(
                    f"no transcript entry for '{utt_id}' (audio file {file_path})"
                ) from None

            print(text + "\t" + file_path, file=out_file)
if __name__ == "__main__":
    # Build the manifest only when this file is executed as a script,
    # not when it is imported as a module.
    main()