Update app.py
Browse files
app.py
CHANGED
@@ -7,14 +7,13 @@ import tempfile
|
|
7 |
import uuid
|
8 |
|
9 |
import torch
|
10 |
-
import transformers
|
11 |
|
12 |
from nemo.collections.asr.models import ASRModel
|
13 |
from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
|
14 |
from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
|
15 |
|
16 |
SAMPLE_RATE = 16000 # Hz
|
17 |
-
MAX_AUDIO_MINUTES =
|
18 |
|
19 |
model = ASRModel.from_pretrained("nvidia/canary-1b")
|
20 |
model.eval()
|
@@ -41,8 +40,6 @@ frame_asr = FrameBatchMultiTaskAED(
|
|
41 |
|
42 |
amp_dtype = torch.float16
|
43 |
|
44 |
-
llm_pipeline = transformers.pipeline("text-generation", model="meta-llama/Meta-Llama-3-8B", model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")
|
45 |
-
|
46 |
def convert_audio(audio_filepath, tmpdir, utt_id):
|
47 |
"""
|
48 |
Convert all files to monochannel 16 kHz wav files.
|
@@ -147,13 +144,28 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
|
|
147 |
# add logic to make sure dropdown menus only suggest valid combos
|
148 |
def on_src_or_tgt_lang_change(src_lang_value, tgt_lang_value, pnc_value):
|
149 |
"""Callback function for when src_lang or tgt_lang dropdown menus are changed.
|
150 |
-
|
151 |
Args:
|
152 |
src_lang_value(string), tgt_lang_value (string), pnc_value(bool) - the current
|
153 |
chosen "values" of each Gradio component
|
154 |
Returns:
|
155 |
src_lang, tgt_lang, pnc - these are the new Gradio components that will be displayed
|
156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
"""
|
158 |
|
159 |
if src_lang_value == "English" and tgt_lang_value == "English":
|
@@ -237,12 +249,16 @@ with gr.Blocks(
|
|
237 |
theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
|
238 |
) as demo:
|
239 |
|
240 |
-
gr.HTML("<h1 style='text-align: center'>
|
241 |
|
242 |
with gr.Row():
|
243 |
with gr.Column():
|
244 |
gr.HTML(
|
245 |
-
"<p>Upload an audio file or record with your microphone.</p>"
|
|
|
|
|
|
|
|
|
246 |
)
|
247 |
|
248 |
audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
|
@@ -267,15 +283,31 @@ with gr.Blocks(
|
|
267 |
)
|
268 |
|
269 |
with gr.Column():
|
|
|
270 |
gr.HTML("<p>Run the model.</p>")
|
271 |
|
272 |
go_button = gr.Button(
|
273 |
value="Run model",
|
274 |
variant="primary", # make "primary" so it stands out (default is "secondary")
|
275 |
)
|
276 |
-
|
277 |
-
|
278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
279 |
|
280 |
go_button.click(
|
281 |
fn=transcribe,
|
@@ -283,12 +315,6 @@ with gr.Blocks(
|
|
283 |
outputs = [model_output_text_box]
|
284 |
)
|
285 |
|
286 |
-
# model_output_text_box.change(
|
287 |
-
# fn=llm_pipeline,
|
288 |
-
# inputs = [model_output_text_box],
|
289 |
-
# outputs = [llm_output_text_box]
|
290 |
-
# )
|
291 |
-
|
292 |
# call on_src_or_tgt_lang_change whenever src_lang or tgt_lang dropdown menus are changed
|
293 |
src_lang.change(
|
294 |
fn=on_src_or_tgt_lang_change,
|
@@ -300,7 +326,7 @@ with gr.Blocks(
|
|
300 |
inputs=[src_lang, tgt_lang, pnc],
|
301 |
outputs=[src_lang, tgt_lang, pnc],
|
302 |
)
|
303 |
-
|
304 |
-
|
305 |
demo.queue()
|
306 |
-
demo.launch(
|
|
|
7 |
import uuid
|
8 |
|
9 |
import torch
|
|
|
10 |
|
11 |
from nemo.collections.asr.models import ASRModel
|
12 |
from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
|
13 |
from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
|
14 |
|
15 |
SAMPLE_RATE = 16000 # Hz
|
16 |
+
MAX_AUDIO_MINUTES = 10 # won't try to transcribe if longer than this
|
17 |
|
18 |
model = ASRModel.from_pretrained("nvidia/canary-1b")
|
19 |
model.eval()
|
|
|
40 |
|
41 |
amp_dtype = torch.float16
|
42 |
|
|
|
|
|
43 |
def convert_audio(audio_filepath, tmpdir, utt_id):
|
44 |
"""
|
45 |
Convert all files to monochannel 16 kHz wav files.
|
|
|
144 |
# add logic to make sure dropdown menus only suggest valid combos
|
145 |
def on_src_or_tgt_lang_change(src_lang_value, tgt_lang_value, pnc_value):
|
146 |
"""Callback function for when src_lang or tgt_lang dropdown menus are changed.
|
|
|
147 |
Args:
|
148 |
src_lang_value(string), tgt_lang_value (string), pnc_value(bool) - the current
|
149 |
chosen "values" of each Gradio component
|
150 |
Returns:
|
151 |
src_lang, tgt_lang, pnc - these are the new Gradio components that will be displayed
|
152 |
+
|
153 |
+
Note: I found the required logic is easier to understand if you think about the possible src & tgt langs as
|
154 |
+
a matrix, e.g. with English, Spanish, French, German as the langs, and only transcription in the same language,
|
155 |
+
and X -> English and English -> X translation being allowed, the matrix looks like the diagram below ("Y" means it is
|
156 |
+
allowed to go into that state).
|
157 |
+
It is easier to understand the code if you think about which state you are in, given the current src_lang_value and
|
158 |
+
tgt_lang_value, and then which states you can go to from there.
|
159 |
+
tgt lang
|
160 |
+
- |EN |ES |FR |DE
|
161 |
+
------------------
|
162 |
+
EN| Y | Y | Y | Y
|
163 |
+
------------------
|
164 |
+
src ES| Y | Y | |
|
165 |
+
lang ------------------
|
166 |
+
FR| Y | | Y |
|
167 |
+
------------------
|
168 |
+
DE| Y | | | Y
|
169 |
"""
|
170 |
|
171 |
if src_lang_value == "English" and tgt_lang_value == "English":
|
|
|
249 |
theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
|
250 |
) as demo:
|
251 |
|
252 |
+
gr.HTML("<h1 style='text-align: center'>NeMo Canary model: Transcribe & Translate audio</h1>")
|
253 |
|
254 |
with gr.Row():
|
255 |
with gr.Column():
|
256 |
gr.HTML(
|
257 |
+
"<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
|
258 |
+
|
259 |
+
"<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. "
|
260 |
+
"You can transcribe longer files locally with this NeMo "
|
261 |
+
"<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
|
262 |
)
|
263 |
|
264 |
audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
|
|
|
283 |
)
|
284 |
|
285 |
with gr.Column():
|
286 |
+
|
287 |
gr.HTML("<p>Run the model.</p>")
|
288 |
|
289 |
go_button = gr.Button(
|
290 |
value="Run model",
|
291 |
variant="primary", # make "primary" so it stands out (default is "secondary")
|
292 |
)
|
293 |
+
|
294 |
+
model_output_text_box = gr.Textbox(
|
295 |
+
label="Model Output",
|
296 |
+
elem_id="model_output_text_box",
|
297 |
+
)
|
298 |
+
llm_output_text_box = gr.Textbox(
|
299 |
+
label="LLM Output",
|
300 |
+
elem_id="llm_output_text_box",
|
301 |
+
)
|
302 |
+
|
303 |
+
with gr.Row():
|
304 |
+
|
305 |
+
gr.HTML(
|
306 |
+
"<p style='text-align: center'>"
|
307 |
+
"🤗 <a href='https://huggingface.co/nvidia/canary-1b' target='_blank'>Canary model</a> | "
|
308 |
+
"🧑‍💻 <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
|
309 |
+
"</p>"
|
310 |
+
)
|
311 |
|
312 |
go_button.click(
|
313 |
fn=transcribe,
|
|
|
315 |
outputs = [model_output_text_box]
|
316 |
)
|
317 |
|
|
|
|
|
|
|
|
|
|
|
|
|
318 |
# call on_src_or_tgt_lang_change whenever src_lang or tgt_lang dropdown menus are changed
|
319 |
src_lang.change(
|
320 |
fn=on_src_or_tgt_lang_change,
|
|
|
326 |
inputs=[src_lang, tgt_lang, pnc],
|
327 |
outputs=[src_lang, tgt_lang, pnc],
|
328 |
)
|
329 |
+
|
330 |
+
|
331 |
demo.queue()
|
332 |
+
demo.launch()
|