Spaces:

almncarlo
/

myalexa

Runtime error

App Files Files Community

almncarlo committed on May 4

Commit

4c49282

•

1 Parent(s): 8a6e8fa

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -20

app.py CHANGED Viewed

@@ -7,14 +7,13 @@ import tempfile
 import uuid
 import torch
-import transformers
 from nemo.collections.asr.models import ASRModel
 from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
 SAMPLE_RATE = 16000 # Hz
-MAX_AUDIO_MINUTES = 180 # wont try to transcribe if longer than this
 model = ASRModel.from_pretrained("nvidia/canary-1b")
 model.eval()
@@ -41,8 +40,6 @@ frame_asr = FrameBatchMultiTaskAED(
 amp_dtype = torch.float16
-llm_pipeline = transformers.pipeline("text-generation", model="meta-llama/Meta-Llama-3-8B", model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")
 def convert_audio(audio_filepath, tmpdir, utt_id):
 	"""
 	Convert all files to monochannel 16 kHz wav files.
@@ -147,13 +144,28 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
 # add logic to make sure dropdown menus only suggest valid combos
 def on_src_or_tgt_lang_change(src_lang_value, tgt_lang_value, pnc_value):
 	"""Callback function for when src_lang or tgt_lang dropdown menus are changed.
 	Args:
 		src_lang_value(string), tgt_lang_value (string), pnc_value(bool) - the current
 			chosen "values" of each Gradio component
 	Returns:
 		src_lang, tgt_lang, pnc - these are the new Gradio components that will be displayed
 	"""
 	if src_lang_value == "English" and tgt_lang_value == "English":
@@ -237,12 +249,16 @@ with gr.Blocks(
 	theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
 ) as demo:
-	gr.HTML("<h1 style='text-align: center'>MyAlexa</h1>")
 	with gr.Row():
 		with gr.Column():
 			gr.HTML(
-				"<p>Upload an audio file or record with your microphone.</p>"
 			)
 			audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
@@ -267,15 +283,31 @@ with gr.Blocks(
 				)
 		with gr.Column():
 			gr.HTML("<p>Run the model.</p>")
 			go_button = gr.Button(
 				value="Run model",
 				variant="primary", # make "primary" so it stands out (default is "secondary")
 			)
-            model_output_text_box = gr.Textbox(label="Transcribed Text", elem_id="model_output_text_box")
-            # llm_output_text_box = gr.Textbox(label="MyAlexa's Answer", elem_id="llm_output_text_box")
 	go_button.click(
 		fn=transcribe,
@@ -283,12 +315,6 @@ with gr.Blocks(
 		outputs = [model_output_text_box]
 	)
-    # model_output_text_box.change(
-    #     fn=llm_pipeline,
-    #     inputs = [model_output_text_box],
-    #     outputs = [llm_output_text_box]
-    # )
 	# call on_src_or_tgt_lang_change whenever src_lang or tgt_lang dropdown menus are changed
 	src_lang.change(
 		fn=on_src_or_tgt_lang_change,
@@ -300,7 +326,7 @@ with gr.Blocks(
 		inputs=[src_lang, tgt_lang, pnc],
 		outputs=[src_lang, tgt_lang, pnc],
 	)
 demo.queue()
-demo.launch(share=True)

 import uuid
 import torch
 from nemo.collections.asr.models import ASRModel
 from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
 SAMPLE_RATE = 16000 # Hz
+MAX_AUDIO_MINUTES = 10 # won't try to transcribe if longer than this
 model = ASRModel.from_pretrained("nvidia/canary-1b")
 model.eval()
 amp_dtype = torch.float16
 def convert_audio(audio_filepath, tmpdir, utt_id):
 	"""
 	Convert all files to monochannel 16 kHz wav files.
 # add logic to make sure dropdown menus only suggest valid combos
 def on_src_or_tgt_lang_change(src_lang_value, tgt_lang_value, pnc_value):
 	"""Callback function for when src_lang or tgt_lang dropdown menus are changed.
 	Args:
 		src_lang_value(string), tgt_lang_value (string), pnc_value(bool) - the current
 			chosen "values" of each Gradio component
 	Returns:
 		src_lang, tgt_lang, pnc - these are the new Gradio components that will be displayed
+	Note: I found the required logic is easier to understand if you think about the possible src & tgt langs as
+	a matrix, e.g. with English, Spanish, French, German as the langs, and only transcription in the same language,
+	and X -> English and English -> X translation being allowed, the matrix looks like the diagram below ("Y" means it is
+	allowed to go into that state).
+	It is easier to understand the code if you think about which state you are in, given the current src_lang_value and
+	tgt_lang_value, and then which states you can go to from there.
+			tgt lang
+			- |EN |ES |FR |DE
+			------------------
+			EN| Y | Y | Y | Y
+			------------------
+		src 	ES| Y | Y |   |
+		lang	------------------
+			FR| Y |   | Y |
+			------------------
+			DE| Y |   |   | Y
 	"""
 	if src_lang_value == "English" and tgt_lang_value == "English":
 	theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
 ) as demo:
+	gr.HTML("<h1 style='text-align: center'>NeMo Canary model: Transcribe & Translate audio</h1>")
 	with gr.Row():
 		with gr.Column():
 			gr.HTML(
+				"<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
+				"<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. "
+				"You can transcribe longer files locally with this NeMo "
+				"<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
 			)
 			audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
 				)
 		with gr.Column():
 			gr.HTML("<p>Run the model.</p>")
 			go_button = gr.Button(
 				value="Run model",
 				variant="primary", # make "primary" so it stands out (default is "secondary")
 			)
+			model_output_text_box = gr.Textbox(
+				label="Model Output",
+				elem_id="model_output_text_box",
+			)
+            llm_output_text_box = gr.Textbox(
+				label="LLM Output",
+				elem_id="llm_output_text_box",
+			)
+	with gr.Row():
+		gr.HTML(
+			"<p style='text-align: center'>"
+				"🐤 <a href='https://huggingface.co/nvidia/canary-1b' target='_blank'>Canary model</a> | "
+				"🧑‍💻 <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
+			"</p>"
+		)
 	go_button.click(
 		fn=transcribe,
 		outputs = [model_output_text_box]
 	)
 	# call on_src_or_tgt_lang_change whenever src_lang or tgt_lang dropdown menus are changed
 	src_lang.change(
 		fn=on_src_or_tgt_lang_change,
 		inputs=[src_lang, tgt_lang, pnc],
 		outputs=[src_lang, tgt_lang, pnc],
 	)
 demo.queue()
+demo.launch()