import argparse
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf

# from transformers import AutoProcessor, AutoModelForPreTraining
# processor = AutoProcessor.from_pretrained("patrickvonplaten/mms-1b")
# model = AutoModelForPreTraining.from_pretrained("patrickvonplaten/mms-1b")

# Matches a trailing parenthesised token (no whitespace inside) at the end of
# a hypothesis line; it is stripped before the text is shown.
_TRAILING_TAG = re.compile(r"\(\S+\)$")


def process(audio, model, lang, format):
    """Run ``infer.py`` on a batch of audio files and collect the hypotheses.

    A temporary manifest directory (``dev.tsv``, ``dev.uid``, ``dev.ltr``,
    ``dev.wrd``) is written, ``python infer.py`` is launched as a subprocess
    with that directory as both data and results path, and the resulting
    ``hypo.word`` file is read back.

    Args:
        audio: Iterable of audio file paths. Paths are written under a ``/``
            root line in ``dev.tsv`` -- presumably they must be absolute;
            TODO confirm against ``infer.py``.
        model: Value passed as ``common_eval.path``.
        lang: Language code; passed as ``dataset.gen_subset="<lang>:dev"``.
        format: Value passed as ``common_eval.post_process``.

    Returns:
        List of hypothesis strings, one per line of ``hypo.word``. Each
        result is also printed to stdout, as before.

    Raises:
        FileNotFoundError: If inference did not produce ``hypo.word``.
    """
    # Materialise once under a new name: the original loop rebound ``audio``
    # to the last path, corrupting every later ``len(audio)`` / ``audio[ii]``.
    paths = [str(p) for p in audio]
    if not paths:
        return []

    with tempfile.TemporaryDirectory() as tmpdir:
        print(">>> preparing tmp manifest dir ...", file=sys.stderr)
        tmpdir = Path(tmpdir)
        with open(tmpdir / "dev.tsv", "w") as fw:
            fw.write("/\n")
            for path in paths:
                # Context manager so the sound file handle is closed.
                with sf.SoundFile(path) as snd:
                    nsample = snd.frames
                fw.write(f"{path}\t{nsample}\n")
        with open(tmpdir / "dev.uid", "w") as fw:
            fw.writelines(f"{path}\n" for path in paths)
        with open(tmpdir / "dev.ltr", "w") as fw:
            fw.write("d u m m y | d u m m y\n" * len(paths))
        with open(tmpdir / "dev.wrd", "w") as fw:
            fw.write("dummy dummy\n" * len(paths))

        # Argument list + explicit env instead of a shell string: no shell
        # quoting pitfalls and no injection through interpolated values.
        cmd = [
            "python",
            "infer.py",
            "-m",
            "decoding.type=viterbi",
            "dataset.max_tokens=4000000",
            "distributed_training.distributed_world_size=1",
            f"common_eval.path='{model}'",
            f"task.data={tmpdir}",
            f"dataset.gen_subset={lang}:dev",
            f"common_eval.post_process={format}",
            f"decoding.results_path={tmpdir}",
        ]
        env = {
            **os.environ,
            "PYTHONPATH": ".",
            "PREFIX": "INFER",
            "HYDRA_FULL_ERROR": "1",
        }
        print(">>> loading model & running inference ...", file=sys.stderr)
        completed = subprocess.run(cmd, env=env, stdout=subprocess.DEVNULL)
        if completed.returncode != 0:
            print(
                f">>> inference exited with status {completed.returncode}",
                file=sys.stderr,
            )

        hypotheses = []
        with open(tmpdir / "hypo.word") as fr:
            for path, line in zip(paths, fr):
                hypo = _TRAILING_TAG.sub("", line).strip()
                print(f'===============\nInput: {path}\nOutput: {hypo}')
                hypotheses.append(hypo)
        return hypotheses


def transcribe(audio):
    """Gradio callback: transcribe one recording and return the text.

    Args:
        audio: File path of the recording as supplied by the Audio input, or
            ``None`` when nothing has been recorded yet.

    Returns:
        The hypotheses joined by newlines, or an empty string if there is
        no audio.
    """
    if audio is None:
        return ""
    model = "base_300m.pt"
    lang = "eng"
    format = "letter"
    hypotheses = process(np.ravel(audio), model, lang, format)
    return "\n".join(hypotheses)


demo = gr.Interface(
    title='MetaAI (Facebook Research) MMS (Massively Multilingual Speech) ASR',
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath")
    ],
    outputs=[
        "textbox"
    ],
    live=True,
)
demo.launch()