Spaces:
Runtime error
Runtime error
anyantudre
committed on
Upload 5 files
Browse files- app.py +56 -0
- requirements.txt +8 -0
- speech_to_text.py +46 -0
- text_to_speech.py +40 -0
- translation.py +10 -0
app.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import scipy
|
3 |
+
import gradio as gr
|
4 |
+
|
5 |
+
from transformers import set_seed, pipeline
|
6 |
+
from transformers import VitsTokenizer, VitsModel
|
7 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
8 |
+
from datasets import load_dataset, Audio
|
9 |
+
|
10 |
+
import speech_to_text, text_to_speech, translation
|
11 |
+
|
12 |
+
# ISO 639-3 codes offered by the speech tabs.
language_list = ['mos', 'fra', 'eng']

# NLLB-200 language codes offered by the translation tab.
nllb_language_list = ["eng_Latn", "fra_Latn", "mos_Latn"]

# Sample rate paired with synthesized waveforms.
# NOTE(review): the MMS-TTS checkpoints are published as 16 kHz models --
# confirm against `model.config.sampling_rate` if other checkpoints are added.
MMS_TTS_SAMPLE_RATE = 16_000


def synthesize_for_audio(text, language):
    """Adapt text_to_speech.synthesize_facebook to gr.Audio's output format.

    A gr.Audio output accepts a (sample_rate, samples) tuple (or a file
    path), not a bare array, so the raw waveform is paired with its sample
    rate here. A result that is already a tuple is passed through untouched.
    """
    result = text_to_speech.synthesize_facebook(text, language)
    if isinstance(result, tuple):
        return result
    return MMS_TTS_SAMPLE_RATE, result


demo = gr.Blocks()

# Speech-to-text tab: audio clip + language code -> transcript.
mms_stt = gr.Interface(
    fn=speech_to_text.transcribe,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath"),
        gr.Dropdown(language_list, label="Language"),
    ],
    outputs="text",
    title="Speech-to-text",
)

# Text-to-speech tab: text + language code -> playable audio.
mms_tts = gr.Interface(
    fn=synthesize_for_audio,
    inputs=[
        gr.Text(label="Input text"),
        gr.Dropdown(language_list, label="Language"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
    ],
    title="Text-to-speech",
)

# Translation tab: text + source/target NLLB codes -> translated text.
mms_translate = gr.Interface(
    fn=translation.translation,
    inputs=[
        gr.Textbox(label="Text", placeholder="Yaa sõama"),
        gr.Dropdown(label="Source Language", choices=nllb_language_list),
        gr.Dropdown(label="Target Language", choices=nllb_language_list),
    ],
    outputs=["text"],
    examples=[["Building a translation demo with Gradio is so easy!", "eng_Latn", "mos_Latn"]],
    title="Translation Demo",
)

# Group the three interfaces as tabs of a single app.
with demo:
    gr.TabbedInterface(
        [mms_translate, mms_tts, mms_stt],
        ["Translation", "Text-to-speech", "Speech-to-text"],
    )

demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
datasets
|
2 |
+
librosa
|
3 |
+
pycountry
|
4 |
+
scipy
|
5 |
+
sentencepiece
|
6 |
+
transformers
|
7 |
+
torch
|
8 |
+
gradio
|
speech_to_text.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import librosa
|
2 |
+
import torch
|
3 |
+
from transformers import Wav2Vec2ForCTC, AutoProcessor
|
4 |
+
from transformers import set_seed
|
5 |
+
import time
|
6 |
+
|
7 |
+
|
8 |
+
_STT_MODEL_ID = "facebook/mms-1b-all"

# Single-slot cache holding the processor/model of the most recently used
# language, so consecutive requests in one language do not reload the
# checkpoint. Only one entry is kept to bound memory use.
_STT_LOADED = {}


def _load_stt_components(target_lang):
    '''Return the (processor, model) pair for target_lang, reloading only on a language change.'''
    if _STT_LOADED.get("lang") != target_lang:
        # Drop the previous entry first so it can be released before the
        # next checkpoint is loaded.
        _STT_LOADED.clear()
        processor = AutoProcessor.from_pretrained(_STT_MODEL_ID, target_lang=target_lang)
        model = Wav2Vec2ForCTC.from_pretrained(
            _STT_MODEL_ID, target_lang=target_lang, ignore_mismatched_sizes=True
        )
        _STT_LOADED.update(lang=target_lang, processor=processor, model=model)
    return _STT_LOADED["processor"], _STT_LOADED["model"]


def transcribe(fp: str, target_lang: str) -> str:
    '''
    For given audio file, transcribe it.

    Parameters
    ----------
    fp: str
        The file path to the audio file.
    target_lang: str
        The ISO-3 code of the target language.

    Returns
    ----------
    transcript: str
        The transcribed text.

    Raises
    ----------
    ValueError
        If no audio file path or no language code is given.
    '''
    # Fail fast, before any model loading, when the form was submitted empty.
    if not fp:
        raise ValueError("No audio provided: record or upload a clip first.")
    if not target_lang:
        raise ValueError("No language selected.")

    # Ensure replicability
    set_seed(555)
    start_time = time.time()

    # Load (or reuse) the transcription model for this language
    processor, model = _load_stt_components(target_lang)

    # Process the audio; librosa resamples to the 16 kHz rate passed to the processor
    signal, _ = librosa.load(fp, sr=16000)
    inputs = processor(signal, sampling_rate=16_000, return_tensors="pt")

    # Inference
    with torch.no_grad():
        logits = model(**inputs).logits

    # Pick the highest-scoring token per frame for the first batch item, then decode
    ids = torch.argmax(logits, dim=-1)[0]
    transcript = processor.decode(ids)

    print("Time elapsed: ", int(time.time() - start_time), " seconds")
    return transcript
|
text_to_speech.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import torch
|
3 |
+
from transformers import set_seed
|
4 |
+
from transformers import VitsTokenizer, VitsModel
|
5 |
+
|
6 |
+
# (tokenizer, model) pairs already loaded, keyed by ISO-3 code, so repeated
# requests in the same language do not reload the checkpoint.
_TTS_COMPONENTS = {}


def _load_tts_components(iso3):
    '''Return the cached (tokenizer, model) pair for iso3, loading it on first use.'''
    if iso3 not in _TTS_COMPONENTS:
        checkpoint = f"facebook/mms-tts-{iso3}"
        _TTS_COMPONENTS[iso3] = (
            VitsTokenizer.from_pretrained(checkpoint),
            VitsModel.from_pretrained(checkpoint),
        )
    return _TTS_COMPONENTS[iso3]


def synthesize_facebook(s: str, iso3: str):
    '''
    For given text, speak it.

    Parameters
    ----------
    s: str
        The written text.
    iso3: str
        The ISO-3 code of the text's language.

    Returns
    ----------
    synth: numpy.ndarray
        The synthesized waveform of the first batch item. The sample rate
        is not returned; callers that need it must supply it themselves.

    Raises
    ----------
    ValueError
        If the text is blank or no language code is given.
    '''
    # Fail fast, before any model loading, when the form was submitted empty.
    if not s or not s.strip():
        raise ValueError("No text provided to synthesize.")
    if not iso3:
        raise ValueError("No language selected.")

    # Ensure replicability
    set_seed(555)
    start_time = time.time()

    # Load (or reuse) the synthesizer for this language
    tokenizer, model = _load_tts_components(iso3)

    inputs = tokenizer(text=s, return_tensors="pt")

    # Inference
    with torch.no_grad():
        outputs = model(**inputs)

    synth = outputs.waveform[0]

    print("Time elapsed: ", int(time.time() - start_time), " seconds")
    return synth.numpy()
|
translation.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
|
2 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
3 |
+
|
4 |
+
# Load the NLLB-200 distilled (600M) tokenizer and model once, at import time;
# translation() below passes these same objects to every pipeline it builds.
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
|
5 |
+
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
|
6 |
+
|
7 |
+
|
8 |
+
def translation(text, src_lang, tgt_lang):
    '''
    Translate text between two NLLB-200 languages.

    Parameters
    ----------
    text: str
        The text to translate.
    src_lang: str
        The NLLB language code of the text (e.g. "eng_Latn").
    tgt_lang: str
        The NLLB language code to translate into.

    Returns
    ----------
    translated: str
        The translated text, or an empty string when the input is blank.

    Raises
    ----------
    ValueError
        If the source or target language is not given.
    '''
    # Nothing to translate: skip the model instead of decoding an empty input.
    if not text or not text.strip():
        return ""
    if not src_lang or not tgt_lang:
        raise ValueError("Both a source and a target language must be selected.")

    # The pipeline wraps the module-level model/tokenizer, so building it per
    # call does not reload any weights.
    trans_pipe = pipeline(
        "translation",
        model=model,
        tokenizer=tokenizer,
        src_lang=src_lang,
        tgt_lang=tgt_lang,
        max_length=400,
    )
    return trans_pipe(text)[0]["translation_text"]
|