|
import gradio as gr |
|
import librosa |
|
import soundfile as sf |
|
import torch |
|
import warnings |
|
import os |
|
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer |
|
|
|
# Silence every warning category process-wide so library warnings do not
# clutter the application log.
warnings.filterwarnings(action="ignore", category=Warning)
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
|
from fastapi import FastAPI, HTTPException, File |
|
|
|
from transformers import pipeline |
|
|
|
|
|
# Speech-recognition pipelines, created once at import time. Both use the
# same chunking settings (chunk_length_s=20, stride_length_s=(3, 3)) and
# differ only in the checkpoint they load.
pipe_300m = pipeline(
    model="Finnish-NLP/wav2vec2-xlsr-300m-finnish-lm",
    stride_length_s=(3, 3),
    chunk_length_s=20,
)

pipe_1b = pipeline(
    model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",
    stride_length_s=(3, 3),
    chunk_length_s=20,
)
|
|
|
|
|
|
|
# Prefer the GPU for the case-correction model when CUDA is available.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Checkpoint that supplies the tokenizer for the case-correction model.
model_checkpoint = 'Finnish-NLP/t5x-small-nl24-finnish'

# Hub access token read from the environment; None when the variable is unset.
_hf_token = os.environ.get('hf_token')

tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_auth_token=_hf_token,
)

# Seq2seq model used to produce the case-corrected transcript.
model = AutoModelForSeq2SeqLM.from_pretrained(
    'Finnish-NLP/case_correction_model',
    from_flax=False,
    torch_dtype=torch.float32,
    use_auth_token=_hf_token,
)
model = model.to(device)
|
|
|
|
|
|
|
def asr_transcript(audio, audio_microphone, model_params):
    """Transcribe an audio clip and produce a case-corrected version.

    A microphone recording takes precedence over an uploaded file when both
    are supplied.

    Args:
        audio: Uploaded audio; an object whose ``.name`` attribute is handed
            to the speech-recognition pipeline, or None.
        audio_microphone: Recorded audio; same kind of object as ``audio``,
            or None.
        model_params: Selects the pipeline: ``"300 million"`` or
            ``"1 billion"``.

    Returns:
        A ``(recognized_text, case_corrected_text)`` tuple. When the request
        cannot be processed, both elements carry the same human-readable
        message, so the two output components always receive a value.
    """
    missing_audio_message = "Please provide audio by uploading file or by recording audio with microphone by pressing Record (And allow usage of microphone)"
    invalid_file_message = "File not valid"

    selected = audio_microphone if audio_microphone else audio

    if selected is None and audio_microphone is None:
        return missing_audio_message, missing_audio_message

    # A falsy-but-not-None input is reported as invalid. Return one message
    # per output component instead of a bare string.
    if not selected:
        return invalid_file_message, invalid_file_message

    # Validate the model choice before doing any work; an unknown value used
    # to fall through and crash when indexing the empty placeholder string.
    if model_params == "300 million":
        asr_pipeline = pipe_300m
    elif model_params == "1 billion":
        asr_pipeline = pipe_1b
    else:
        unknown_model_message = f"Unknown speech recognition model: {model_params!r}"
        return unknown_model_message, unknown_model_message

    recognized_text = asr_pipeline(selected.name)['text']

    input_ids = tokenizer(recognized_text, return_tensors="pt").input_ids.to(device)
    # NOTE(review): max_length=128 bounds the generated sequence; long
    # transcripts may come back truncated -- confirm this is intended.
    outputs = model.generate(input_ids, max_length=128)
    case_corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return recognized_text, case_corrected_text
|
|
|
# Web UI: two optional audio sources (file upload and microphone) plus a
# model-size selector feed asr_transcript; its two return values fill the
# two textboxes.
gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Finnish Automatic Speech-Recognition",
    description="Upload an audio clip, and let AI do the hard work of transcribing",
    inputs=[
        gr.inputs.Audio(
            label="Upload Audio File",
            type="file",
            optional=True,
        ),
        gr.inputs.Audio(
            source="microphone",
            type="file",
            optional=True,
            label="Record",
        ),
        gr.inputs.Dropdown(
            choices=["300 million", "1 billion"],
            type="value",
            default="1 billion",
            label="Select speech recognition model parameter amount",
            optional=False,
        ),
    ],
    outputs=[
        gr.outputs.Textbox(label="Recognized speech"),
        gr.outputs.Textbox(label="Recognized speech with case correction and punctuation"),
    ],
)

# Start serving the interface.
gradio_ui.launch()