almncarlo committed
Commit 4c49282 • 1 Parent(s): 8a6e8fa

Update app.py

Files changed (1): app.py (+46, -20)

app.py CHANGED
@@ -7,14 +7,13 @@ import tempfile
 import uuid
 
 import torch
-import transformers
 
 from nemo.collections.asr.models import ASRModel
 from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
 
 SAMPLE_RATE = 16000 # Hz
-MAX_AUDIO_MINUTES = 180 # wont try to transcribe if longer than this
+MAX_AUDIO_MINUTES = 10 # wont try to transcribe if longer than this
 
 model = ASRModel.from_pretrained("nvidia/canary-1b")
 model.eval()
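Reviewer note: lowering MAX_AUDIO_MINUTES from 180 to 10 only matters because, per the comment, the app refuses to transcribe longer inputs. A minimal sketch of such a duration guard, assuming the soundfile package is available; the helper name check_audio_duration and the error wording are illustrative, not part of this commit:

import gradio as gr
import soundfile as sf

MAX_AUDIO_MINUTES = 10  # as set in this commit

def check_audio_duration(audio_filepath):
    # soundfile reads only the file header, so this stays cheap even for long files
    duration_min = sf.info(audio_filepath).duration / 60.0
    if duration_min > MAX_AUDIO_MINUTES:
        # gr.Error surfaces the message in the Gradio UI instead of crashing the worker
        raise gr.Error(
            f"This demo transcribes at most {MAX_AUDIO_MINUTES} minutes of audio; "
            f"got {duration_min:.1f} minutes."
        )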
@@ -41,8 +40,6 @@ frame_asr = FrameBatchMultiTaskAED(
 
 amp_dtype = torch.float16
 
-llm_pipeline = transformers.pipeline("text-generation", model="meta-llama/Meta-Llama-3-8B", model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")
-
 def convert_audio(audio_filepath, tmpdir, utt_id):
     """
     Convert all files to monochannel 16 kHz wav files.
@@ -147,13 +144,28 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
 # add logic to make sure dropdown menus only suggest valid combos
 def on_src_or_tgt_lang_change(src_lang_value, tgt_lang_value, pnc_value):
     """Callback function for when src_lang or tgt_lang dropdown menus are changed.
-
     Args:
         src_lang_value(string), tgt_lang_value (string), pnc_value(bool) - the current
             chosen "values" of each Gradio component
     Returns:
         src_lang, tgt_lang, pnc - these are the new Gradio components that will be displayed
-
+
+    Note: I found the required logic is easier to understand if you think about the possible src & tgt langs as
+    a matrix, e.g. with English, Spanish, French, German as the langs, and only transcription in the same language,
+    and X -> English and English -> X translation being allowed, the matrix looks like the diagram below ("Y" means it is
+    allowed to go into that state).
+    It is easier to understand the code if you think about which state you are in, given the current src_lang_value and
+    tgt_lang_value, and then which states you can go to from there.
+                  tgt lang
+             - |EN |ES |FR |DE
+            ------------------
+         EN | Y | Y | Y | Y
+            ------------------
+    src  ES | Y | Y |   |
+    lang    ------------------
+         FR | Y |   | Y |
+            ------------------
+         DE | Y |   |   | Y
     """
 
     if src_lang_value == "English" and tgt_lang_value == "English":
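The matrix in the new docstring can be written down directly as a set of allowed (src, tgt) pairs, which makes the callback's dropdown updates easy to sanity-check. A minimal sketch; the names ALLOWED_PAIRS and valid_tgt_langs are illustrative and do not appear in app.py:

LANGS = ["English", "Spanish", "French", "German"]

# Allowed (src, tgt) pairs per the docstring matrix: same-language transcription,
# plus X -> English and English -> X translation.
ALLOWED_PAIRS = {
    (src, tgt)
    for src in LANGS
    for tgt in LANGS
    if src == tgt or src == "English" or tgt == "English"
}

def valid_tgt_langs(src_lang_value):
    """Target languages reachable from the current source language."""
    return [tgt for tgt in LANGS if (src_lang_value, tgt) in ALLOWED_PAIRS]

assert valid_tgt_langs("Spanish") == ["English", "Spanish"]
assert valid_tgt_langs("English") == LANGS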
@@ -237,12 +249,16 @@ with gr.Blocks(
     theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
 ) as demo:
 
-    gr.HTML("<h1 style='text-align: center'>MyAlexa</h1>")
+    gr.HTML("<h1 style='text-align: center'>NeMo Canary model: Transcribe & Translate audio</h1>")
 
     with gr.Row():
         with gr.Column():
             gr.HTML(
-                "<p>Upload an audio file or record with your microphone.</p>"
+                "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
+
+                "<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. "
+                "You can transcribe longer files locally with this NeMo "
+                "<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
             )
 
             audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
@@ -267,15 +283,31 @@ with gr.Blocks(
             )
 
         with gr.Column():
+
             gr.HTML("<p>Run the model.</p>")
 
             go_button = gr.Button(
                 value="Run model",
                 variant="primary", # make "primary" so it stands out (default is "secondary")
             )
-
-            model_output_text_box = gr.Textbox(label="Transcribed Text", elem_id="model_output_text_box")
-            # llm_output_text_box = gr.Textbox(label="MyAlexa's Answer", elem_id="llm_output_text_box")
+
+            model_output_text_box = gr.Textbox(
+                label="Model Output",
+                elem_id="model_output_text_box",
+            )
+            llm_output_text_box = gr.Textbox(
+                label="LLM Output",
+                elem_id="llm_output_text_box",
+            )
+
+    with gr.Row():
+
+        gr.HTML(
+            "<p style='text-align: center'>"
+            "🐤 <a href='https://huggingface.co/nvidia/canary-1b' target='_blank'>Canary model</a> | "
+            "🧑‍💻 <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
+            "</p>"
+        )
 
     go_button.click(
         fn=transcribe,
@@ -283,12 +315,6 @@ with gr.Blocks(
         outputs = [model_output_text_box]
     )
 
-    # model_output_text_box.change(
-    #     fn=llm_pipeline,
-    #     inputs = [model_output_text_box],
-    #     outputs = [llm_output_text_box]
-    # )
-
     # call on_src_or_tgt_lang_change whenever src_lang or tgt_lang dropdown menus are changed
     src_lang.change(
         fn=on_src_or_tgt_lang_change,
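Reviewer note: the commit removes the Meta-Llama-3-8B pipeline and this commented-out wiring, yet the new llm_output_text_box added above is left with nothing writing into it. If the chaining is restored later, the pipeline cannot be passed to .change() directly: transformers text-generation pipelines return a list of dicts, while a gr.Textbox output expects a string. A hedged sketch of a working hookup, assuming the same model and sufficient GPU memory; the wrapper answer_with_llm and the generation parameters are illustrative:

import torch
import transformers

llm_pipeline = transformers.pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

def answer_with_llm(transcript):
    # The pipeline returns [{"generated_text": "..."}]; unwrap it for the Textbox.
    outputs = llm_pipeline(transcript, max_new_tokens=128, do_sample=False)
    return outputs[0]["generated_text"]

# Inside the gr.Blocks context:
# model_output_text_box.change(
#     fn=answer_with_llm,
#     inputs=[model_output_text_box],
#     outputs=[llm_output_text_box],
# )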
@@ -300,7 +326,7 @@ with gr.Blocks(
         inputs=[src_lang, tgt_lang, pnc],
         outputs=[src_lang, tgt_lang, pnc],
     )
-
-
+
+
 demo.queue()
-demo.launch(share=True)
+demo.launch()