Spaces:

Finnish-NLP
/

Finnish-Automatic-Speech-Recognition

Sleeping

App Files Files Community

Finnish-Automatic-Speech-Recognition / app.py

RASMUS

Update app.py

3d187fe verified 6 months ago

raw

history blame contribute delete

4.2 kB

	import gradio as gr
	import librosa
	import soundfile as sf
	import torch
	import warnings
	import os
	from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer

	warnings.filterwarnings("ignore")

	#load wav2vec2 tokenizer and model

	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

	from fastapi import FastAPI, HTTPException, File

	from transformers import pipeline




	# Pick the compute device once so that both the speech recognition pipelines
	# and the casing/punctuation model end up on the same hardware.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	# transformers pipelines take a CUDA ordinal (0 = first GPU) or -1 for CPU;
	# the integer form is used because it is accepted by old and new releases.
	_pipeline_device = 0 if device.type == "cuda" else -1

	# Arguments shared by all three speech recognition pipelines: the same
	# chunk/stride settings the pipelines were originally created with, plus the
	# device so they no longer stay on the CPU when a GPU is available.
	_asr_pipeline_kwargs = dict(
	    chunk_length_s=20,
	    stride_length_s=(3, 3),
	    device=_pipeline_device,
	)

	pipe_95m = pipeline(model="Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned", **_asr_pipeline_kwargs)
	pipe_300m = pipeline(model="Finnish-NLP/wav2vec2-large-uralic-voxpopuli-v2-finnish", **_asr_pipeline_kwargs)
	pipe_1b = pipeline(model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2", **_asr_pipeline_kwargs)

	# Sequence-to-sequence model that restores casing and punctuation in the
	# transcript produced by the selected speech recognition pipeline.
	model_checkpoint = 'Finnish-NLP/t5-small-nl24-casing-punctuation-correction'
	# NOTE(review): the tokenizer is loaded without the `hf_token` credential while
	# the model below passes it - confirm this asymmetry is intended.
	tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
	model = AutoModelForSeq2SeqLM.from_pretrained(
	    model_checkpoint,
	    from_flax=False,
	    torch_dtype=torch.float32,
	    use_auth_token=os.environ.get('hf_token'),
	).to(device)

	# define speech-to-text function
	def asr_transcript(audio, audio_microphone, model_params):
	    """Transcribe an audio clip and restore casing and punctuation.

	    The microphone recording takes precedence over the uploaded file when
	    both are supplied.

	    Args:
	        audio: Uploaded audio, either an object exposing the file path as
	            ``.name`` or the path itself as a string; ``None`` when absent.
	        audio_microphone: Microphone recording, in the same form as ``audio``.
	        model_params: Which speech recognition model to run: "95 million",
	            "300 million" or "1 billion".

	    Returns:
	        A 2-tuple ``(raw_transcript, corrected_transcript)``. When the input
	        cannot be processed, both elements carry a user-facing message
	        instead, so the caller always receives two values.
	    """
	    source = audio_microphone if audio_microphone else audio

	    if source is None and audio_microphone is None:
	        return (
	            "Please provide audio (wav or mp3) by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)",
	            "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)",
	        )

	    if not source:
	        # Two values are returned because the UI declares two output boxes.
	        return "File not valid", "File not valid"

	    # The pipelines are looked up only after the input checks so that invalid
	    # requests are rejected without touching the loaded models.
	    if model_params == "1 billion":
	        recognizer = pipe_1b
	    elif model_params == "300 million":
	        recognizer = pipe_300m
	    elif model_params == "95 million":
	        recognizer = pipe_95m
	    else:
	        message = f"Unknown speech recognition model: {model_params}"
	        return message, message

	    # Accept both a plain path string and a file-like object with `.name`.
	    audio_path = source if isinstance(source, str) else source.name
	    transcript = recognizer(audio_path)["text"]

	    if not transcript.strip():
	        # Nothing was recognised, so there is nothing to correct.
	        return transcript, transcript

	    input_ids = tokenizer(transcript, return_tensors="pt").input_ids.to(device)
	    # NOTE(review): generation is capped at max_length=128, so the corrected
	    # text of a long transcript may come back truncated - confirm acceptable.
	    outputs = model.generate(input_ids, max_length=128)
	    case_corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
	    return transcript, case_corrected_text

	# Web UI: two optional audio inputs (file upload and microphone) plus a model
	# selector, wired to asr_transcript, which fills the two output text boxes.
	# NOTE(review): `gr.inputs` / `gr.outputs` and `type="file"` belong to the
	# older Gradio API - confirm the pinned Gradio version still provides them.
	gradio_ui = gr.Interface(
	    fn=asr_transcript,
	    title="Finnish Automatic Speech Recognition",
	    description="Upload an audio clip or record from browser using microphone, and let AI do the hard work of transcribing.",
	    article="""
	This demo includes 2 kinds of models that are run together. First selected ASR model does speech recognition which produces lowercase text without punctuation.
	After that we run a sequence-to-sequence model which tries to correct casing and punctuation which produces the final output.
	You can select one of three speech recognition models listed below

	1. 1 billion, best accuracy but slowest by big margin. Based on multilingual wav2vec2-xlsr model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2
	2. 300 million, on par in accuracy with 1. but a lot faster. Based on Uralic wav2vec2 model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-large-uralic-voxpopuli-v2-finnish
	3. 95 million, almost as accurate as 1. but really much faster. Based on Finnish wav2vec2 model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned

	More info about the casing+punctuation correction model can be found here https://huggingface.co/Finnish-NLP/t5-small-nl24-casing-punctuation-correction
	""",
	    inputs=[
	        gr.inputs.Audio(label="Upload Audio File", type="file", optional=True),
	        gr.inputs.Audio(source="microphone", type="file", optional=True, label="Record from microphone"),
	        gr.inputs.Dropdown(
	            choices=["95 million", "300 million", "1 billion"],
	            type="value",
	            default="300 million",
	            label="Select speech recognition model parameter amount",
	            optional=False,
	        ),
	    ],
	    outputs=[
	        gr.outputs.Textbox(label="Recognized speech"),
	        gr.outputs.Textbox(label="Recognized speech with case correction and punctuation"),
	    ],
	)

	gradio_ui.launch()