RasmusToivanen
committed on
Commit
•
af31d45
1
Parent(s):
ab09d2c
add article, change to gradio 3, remove 300m model
Browse files
app.py
CHANGED
@@ -19,18 +19,16 @@ from transformers import pipeline
|
|
19 |
|
20 |
|
21 |
|
22 |
-
|
23 |
-
pipe_94m = pipeline(model="Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned",chunk_length_s=20, stride_length_s=(3, 3))
|
24 |
pipe_1b = pipeline(model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",chunk_length_s=20, stride_length_s=(3, 3))
|
25 |
|
26 |
|
27 |
|
28 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
29 |
-
model_checkpoint = 'Finnish-NLP/
|
30 |
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_auth_token=os.environ.get('hf_token'))
|
31 |
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_flax=False, torch_dtype=torch.float32, use_auth_token=os.environ.get('hf_token')).to(device)
|
32 |
|
33 |
-
|
34 |
# define speech-to-text function
|
35 |
def asr_transcript(audio, audio_microphone, model_params):
|
36 |
|
@@ -38,16 +36,14 @@ def asr_transcript(audio, audio_microphone, model_params):
|
|
38 |
audio = audio_microphone if audio_microphone else audio
|
39 |
|
40 |
if audio == None and audio_microphone == None:
|
41 |
-
return "Please provide audio by uploading file or by recording audio
|
42 |
text = ""
|
43 |
|
44 |
if audio:
|
45 |
-
if model_params == "1 billion
|
46 |
text = pipe_1b(audio.name)
|
47 |
-
elif model_params == "
|
48 |
-
text =
|
49 |
-
elif model_params == "300 million multi":
|
50 |
-
text = pipe_300m(audio.name)
|
51 |
|
52 |
input_ids = tokenizer(text['text'], return_tensors="pt").input_ids.to(device)
|
53 |
outputs = model.generate(input_ids, max_length=128)
|
@@ -58,9 +54,19 @@ def asr_transcript(audio, audio_microphone, model_params):
|
|
58 |
|
59 |
gradio_ui = gr.Interface(
|
60 |
fn=asr_transcript,
|
61 |
-
title="Finnish
|
62 |
-
description="Upload an audio clip, and let AI do the hard work of transcribing",
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
outputs=[gr.outputs.Textbox(label="Recognized speech"),gr.outputs.Textbox(label="Recognized speech with case correction and punctuation")]
|
65 |
)
|
66 |
|
|
|
19 |
|
20 |
|
21 |
|
22 |
+
pipe_95m = pipeline(model="Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned",chunk_length_s=20, stride_length_s=(3, 3))
|
|
|
23 |
pipe_1b = pipeline(model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",chunk_length_s=20, stride_length_s=(3, 3))
|
24 |
|
25 |
|
26 |
|
27 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
28 |
+
model_checkpoint = 'Finnish-NLP/t5-small-nl24-casing-punctuation-correction'
|
29 |
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_auth_token=os.environ.get('hf_token'))
|
30 |
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_flax=False, torch_dtype=torch.float32, use_auth_token=os.environ.get('hf_token')).to(device)
|
31 |
|
|
|
32 |
# define speech-to-text function
|
33 |
def asr_transcript(audio, audio_microphone, model_params):
|
34 |
|
|
|
36 |
audio = audio_microphone if audio_microphone else audio
|
37 |
|
38 |
if audio == None and audio_microphone == None:
|
39 |
+
return "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)", "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)"
|
40 |
text = ""
|
41 |
|
42 |
if audio:
|
43 |
+
if model_params == "1 billion":
|
44 |
text = pipe_1b(audio.name)
|
45 |
+
elif model_params == "95 million":
|
46 |
+
text = pipe_95m(audio.name)
|
|
|
|
|
47 |
|
48 |
input_ids = tokenizer(text['text'], return_tensors="pt").input_ids.to(device)
|
49 |
outputs = model.generate(input_ids, max_length=128)
|
|
|
54 |
|
55 |
gradio_ui = gr.Interface(
|
56 |
fn=asr_transcript,
|
57 |
+
title="Finnish Automatic Speech Recognition",
|
58 |
+
description="Upload an audio clip or record from browser using microphone, and let AI do the hard work of transcribing.",
|
59 |
+
article = """
|
60 |
+
This demo includes 2 kinds of models that are run together. First selected ASR model does speech recognition which produces lowercase text without punctuation.
|
61 |
+
After that we run a sequence-to-sequence model which tries to correct casing and punctuation which produces the final output.
|
62 |
+
You can select one of two speech recognition models listed below
|
63 |
+
|
64 |
+
1. 1 billion, best accuracy but slowest by big margin. Based on multilingual wav2vec2-xlsr model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2
|
65 |
+
2. 95 million, almost as accurate as 1. but really much faster. Based on finnish wav2vec2-xlsr model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned
|
66 |
+
|
67 |
+
More info about the casing+punctuation correction model can be found here https://huggingface.co/Finnish-NLP/t5-small-nl24-casing-punctuation-correction
|
68 |
+
""",
|
69 |
+
inputs=[gr.inputs.Audio(label="Upload Audio File", type="file", optional=True), gr.inputs.Audio(source="microphone", type="file", optional=True, label="Record from microphone"), gr.inputs.Dropdown(choices=["95 million","1 billion"], type="value", default="1 billion", label="Select speech recognition model parameter amount", optional=False)],
|
70 |
outputs=[gr.outputs.Textbox(label="Recognized speech"),gr.outputs.Textbox(label="Recognized speech with case correction and punctuation")]
|
71 |
)
|
72 |
|