Update app.py
Browse files
app.py
CHANGED
@@ -24,36 +24,29 @@ def translate_speech(audio):
|
|
24 |
print(f"Output: {output}") # Print the output to see what it contains
|
25 |
|
26 |
# Check if the output contains 'text'
|
27 |
-
if 'text' in output:
|
28 |
-
transcription = output["text"]
|
29 |
else:
|
30 |
print("The output does not contain 'text'")
|
31 |
return
|
32 |
|
33 |
# Use the translation pipeline to translate the transcription
|
34 |
-
translated_text = translator(transcription
|
35 |
print(f"Translated text: {translated_text}") # Print the translated text to see what it contains
|
36 |
|
37 |
-
# Check if the translated text contains 'generated_token_ids'
|
38 |
-
if 'generated_token_ids' in translated_text[0]:
|
39 |
-
# Decode the tokens into text
|
40 |
-
translated_text_str = translator.tokenizer.decode(translated_text[0]['generated_token_ids'])
|
41 |
-
else:
|
42 |
-
print("The translated text does not contain 'generated_token_ids'")
|
43 |
-
return
|
44 |
-
|
45 |
# Use the VITS model to synthesize the translated text into speech
|
46 |
-
inputs = tokenizer(
|
47 |
with torch.no_grad():
|
48 |
-
output = model(**inputs)
|
49 |
|
50 |
# Save the synthesized speech to a WAV file
|
51 |
scipy.io.wavfile.write("synthesized_speech.wav", rate=model.config.sampling_rate, data=output.float().numpy())
|
52 |
|
53 |
-
|
54 |
-
|
|
|
55 |
|
56 |
-
return 16000,
|
57 |
|
58 |
# Define the Gradio interface
|
59 |
iface = gr.Interface(
|
|
|
24 |
print(f"Output: {output}") # Print the output to see what it contains
|
25 |
|
26 |
# Check if the output contains 'text'
|
27 |
+
if 'text' in output[0]:
|
28 |
+
transcription = output[0]["text"]
|
29 |
else:
|
30 |
print("The output does not contain 'text'")
|
31 |
return
|
32 |
|
33 |
# Use the translation pipeline to translate the transcription
|
34 |
+
translated_text = translator(transcription)
|
35 |
print(f"Translated text: {translated_text}") # Print the translated text to see what it contains
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
# Use the VITS model to synthesize the translated text into speech
|
38 |
+
inputs = tokenizer(translated_text[0]['translation_text'], return_tensors="pt")
|
39 |
with torch.no_grad():
|
40 |
+
output = model.generate(**inputs)
|
41 |
|
42 |
# Save the synthesized speech to a WAV file
|
43 |
scipy.io.wavfile.write("synthesized_speech.wav", rate=model.config.sampling_rate, data=output.float().numpy())
|
44 |
|
45 |
+
print("Translated text:", translated_text[0]['translation_text'])
|
46 |
+
print("Synthesized speech data shape:", output.shape)
|
47 |
+
print("Sampling rate:", model.config.sampling_rate)
|
48 |
|
49 |
+
return 16000, output.numpy()
|
50 |
|
51 |
# Define the Gradio interface
|
52 |
iface = gr.Interface(
|