Baghdad99 committed on
Commit
a63c502
1 Parent(s): 7b2ef40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -16
app.py CHANGED
@@ -24,36 +24,29 @@ def translate_speech(audio):
24
  print(f"Output: {output}") # Print the output to see what it contains
25
 
26
  # Check if the output contains 'text'
27
- if 'text' in output:
28
- transcription = output["text"]
29
  else:
30
  print("The output does not contain 'text'")
31
  return
32
 
33
  # Use the translation pipeline to translate the transcription
34
- translated_text = translator(transcription, return_tensors="pt")
35
  print(f"Translated text: {translated_text}") # Print the translated text to see what it contains
36
 
37
- # Check if the translated text contains 'generated_token_ids'
38
- if 'generated_token_ids' in translated_text[0]:
39
- # Decode the tokens into text
40
- translated_text_str = translator.tokenizer.decode(translated_text[0]['generated_token_ids'])
41
- else:
42
- print("The translated text does not contain 'generated_token_ids'")
43
- return
44
-
45
  # Use the VITS model to synthesize the translated text into speech
46
- inputs = tokenizer(translated_text_str, return_tensors="pt")
47
  with torch.no_grad():
48
- output = model(**inputs).waveform
49
 
50
  # Save the synthesized speech to a WAV file
51
  scipy.io.wavfile.write("synthesized_speech.wav", rate=model.config.sampling_rate, data=output.float().numpy())
52
 
53
- # Scale the audio data to the range of int16 format
54
- synthesised_speech = (output * 32767).astype(np.int16)
 
55
 
56
- return 16000, synthesised_speech
57
 
58
  # Define the Gradio interface
59
  iface = gr.Interface(
 
24
  print(f"Output: {output}") # Print the output to see what it contains
25
 
26
  # Check if the output contains 'text'
27
+ if 'text' in output[0]:
28
+ transcription = output[0]["text"]
29
  else:
30
  print("The output does not contain 'text'")
31
  return
32
 
33
  # Use the translation pipeline to translate the transcription
34
+ translated_text = translator(transcription)
35
  print(f"Translated text: {translated_text}") # Print the translated text to see what it contains
36
 
 
 
 
 
 
 
 
 
37
  # Use the VITS model to synthesize the translated text into speech
38
+ inputs = tokenizer(translated_text[0]['translation_text'], return_tensors="pt")
39
  with torch.no_grad():
40
+ output = model.generate(**inputs)
41
 
42
  # Save the synthesized speech to a WAV file
43
  scipy.io.wavfile.write("synthesized_speech.wav", rate=model.config.sampling_rate, data=output.float().numpy())
44
 
45
+ print("Translated text:", translated_text[0]['translation_text'])
46
+ print("Synthesized speech data shape:", output.shape)
47
+ print("Sampling rate:", model.config.sampling_rate)
48
 
49
+ return 16000, output.numpy()
50
 
51
  # Define the Gradio interface
52
  iface = gr.Interface(