oyemade committed on
Commit
f0854dc
1 Parent(s): e17e609

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -0
app.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch
4
+
5
+ from datasets import load_dataset
6
+ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
7
+
8
# Run every model on the first CUDA device when one is available, else on CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"


# Automatic-speech-recognition pipeline used to transcribe the incoming audio.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model="oyemade/w2v-bert-2.0-yoruba-colab-CV16.1",
    device=device,
)


# Text-to-speech stack: text processor, SpeechT5 model and HiFi-GAN vocoder.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# One fixed x-vector from the validation split conditions the synthesised
# voice; unsqueeze(0) adds a leading dimension in front of the vector.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
speaker_embeddings = torch.tensor(
    embeddings_dataset[7306]["xvector"]
).unsqueeze(0)


# Text translation pipeline configured from "yor_Latn" to "eng_Latn".
translation_model = pipeline(
    task="translation",
    model="facebook/nllb-200-distilled-600M",
    src_lang="yor_Latn",
    tgt_lang="eng_Latn",
    device=device,
)
25
+
26
+
27
def translate(audio):
    """Transcribe ``audio`` and translate the resulting transcript.

    The audio is passed to ``asr_pipe``; the ``"text"`` field of its output is
    fed to ``translation_model``, and the ``"translation_text"`` of the first
    returned item is the result.
    """
    transcript = asr_pipe(audio)["text"]
    first_result = translation_model(transcript)[0]
    return first_result["translation_text"]
33
+
34
def synthesise(text):
    """Generate speech for ``text`` and return the waveform as a CPU tensor."""
    input_ids = processor(text=text, return_tensors="pt")["input_ids"]
    # Inputs and the speaker embedding are moved to the same device as the model.
    waveform = model.generate_speech(
        input_ids.to(device),
        speaker_embeddings.to(device),
        vocoder=vocoder,
    )
    return waveform.cpu()
38
+
39
def speech_to_speech_translation(audio):
    """Translate spoken input and return it as synthesised speech.

    Args:
        audio: Audio input, forwarded unchanged to ``translate``.

    Returns:
        A ``(sample_rate, samples)`` tuple: the integer ``16000`` and the
        synthesised waveform as a NumPy ``int16`` array.
    """
    # translate() accepts exactly one argument (the audio); passing any extra
    # positional argument raises TypeError before any work is done.
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Scale the waveform by 32767 and cast to 16-bit integer samples.
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return 16000, synthesised_speech
45
+
46
# Gradio UI: a microphone recording (delivered as a file path) goes in, and the
# generated speech (a NumPy audio value) comes out.
iface = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title="Neoform AI: Yoruba Speech to English Speech",
    description=(
        "Demo for Yoruba speech translated to English Speech. NOTE: If you get an ERROR after pressing submit, give the audio some secs to load then try again."
    ),
)

iface.launch()