Baghdad99 committed on
Commit
88de73c
1 Parent(s): 6ade673

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -10
app.py CHANGED
@@ -17,17 +17,15 @@ def translate_speech(audio_input):
17
  # Load the audio file as a floating point time series
18
  audio_data, sample_rate = librosa.load(audio_input, sr=None)
19
 
20
- # Prepare the input dictionary
21
- input_dict = pipe.tokenizer(audio_data, return_tensors="pt", padding=True)
22
 
23
- # Use the speech recognition model to get the logits
24
- logits = pipe.model(input_dict.input_values.to("cuda")).logits
25
-
26
- # Get the predicted IDs
27
- pred_ids = torch.argmax(logits, dim=-1)[0]
28
-
29
- # Decode the predicted IDs to get the transcription
30
- transcription = pipe.tokenizer.decode(pred_ids)
31
 
32
  # Use the translation pipeline to translate the transcription
33
  translated_text = translator(transcription, return_tensors="pt")
@@ -58,6 +56,7 @@ def translate_speech(audio_input):
58
 
59
  return 16000, synthesised_speech
60
 
 
61
  # Define the Gradio interface
62
  iface = gr.Interface(
63
  fn=translate_speech,
 
17
  # Load the audio file as a floating point time series
18
  audio_data, sample_rate = librosa.load(audio_input, sr=None)
19
 
20
+ # Use the speech recognition pipeline to transcribe the audio
21
+ output = pipe(audio_data)
22
 
23
+ # Check if the output contains 'text'
24
+ if 'text' in output:
25
+ transcription = output["text"]
26
+ else:
27
+ print("The output does not contain 'text'")
28
+ return
 
 
29
 
30
  # Use the translation pipeline to translate the transcription
31
  translated_text = translator(transcription, return_tensors="pt")
 
56
 
57
  return 16000, synthesised_speech
58
 
59
+
60
  # Define the Gradio interface
61
  iface = gr.Interface(
62
  fn=translate_speech,