anyantudre committed on
Commit
e41ca58
·
verified ·
1 Parent(s): 7f1969e

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +56 -0
  2. requirements.txt +8 -0
  3. speech_to_text.py +46 -0
  4. text_to_speech.py +40 -0
  5. translation.py +10 -0
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import scipy
3
+ import gradio as gr
4
+
5
+ from transformers import set_seed, pipeline
6
+ from transformers import VitsTokenizer, VitsModel
7
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
8
+ from datasets import load_dataset, Audio
9
+
10
+ import speech_to_text, text_to_speech, translation
11
+
12
# ISO 639-3 codes offered by the speech-to-text and text-to-speech tabs.
language_list = ['mos', 'fra', 'eng']

# Language codes offered by the translation tab; the source and target
# dropdowns share the same choices.
_translation_codes = ["eng_Latn", "fra_Latn", "mos_Latn"]

# Sampling rate reported to Gradio together with the synthesized samples.
# NOTE(review): 16 kHz is the VitsConfig default; confirm it matches
# `model.config.sampling_rate` for every MMS-TTS checkpoint used here.
_TTS_SAMPLING_RATE = 16_000


def _synthesize_for_gradio(text, iso3):
    """Run text-to-speech and shape the result for a numpy ``gr.Audio`` output.

    ``text_to_speech.synthesize_facebook`` returns only the waveform samples,
    whereas a ``gr.Audio`` output component expects ``(sample_rate, samples)``.
    """
    waveform = text_to_speech.synthesize_facebook(text, iso3)
    return _TTS_SAMPLING_RATE, waveform


demo = gr.Blocks()

# Tab 1: audio file or microphone recording -> transcript.
mms_stt = gr.Interface(
    fn=speech_to_text.transcribe,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath"),
        gr.Dropdown(language_list, label="Language"),
    ],
    outputs="text",
    title="Speech-to-text",
)

# Tab 2: text -> synthesized audio.
mms_tts = gr.Interface(
    fn=_synthesize_for_gradio,
    inputs=[
        gr.Text(label="Input text"),
        gr.Dropdown(language_list, label="Language"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
    ],
    title="Text-to-speech",
)

# Tab 3: text translation between the supported languages.
mms_translate = gr.Interface(
    fn=translation.translation,
    inputs=[
        gr.Textbox(label="Text", placeholder="Yaa sõama"),
        gr.Dropdown(label="Source Language", choices=_translation_codes),
        gr.Dropdown(label="Target Language", choices=_translation_codes),
    ],
    outputs=["text"],
    examples=[["Building a translation demo with Gradio is so easy!", "eng_Latn", "mos_Latn"]],
    title="Translation Demo",
)

# Group the three interfaces as tabs of a single Blocks app.
with demo:
    gr.TabbedInterface(
        [mms_translate, mms_tts, mms_stt],
        ["Translation", "Text-to-speech", "Speech-to-text"],
    )

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ datasets
2
+ librosa
3
+ pycountry
4
+ scipy
5
+ sentencepiece
6
+ transformers
7
+ torch
8
+ gradio
speech_to_text.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import torch
3
+ from transformers import Wav2Vec2ForCTC, AutoProcessor
4
+ from transformers import set_seed
5
+ import time
6
+
7
+
8
# Single-slot cache {target_lang: (processor, model)} holding the most
# recently used language, so consecutive requests in the same language do not
# reload the checkpoint from scratch.
_TRANSCRIBER_CACHE = {}


def _load_transcriber(target_lang):
    """Return the (processor, model) pair for ``target_lang``, loading it on a cache miss."""
    if target_lang not in _TRANSCRIBER_CACHE:
        # Drop the previously cached language first so that at most one copy
        # of the model is kept alive by the cache.
        _TRANSCRIBER_CACHE.clear()

        model_id = "facebook/mms-1b-all"
        processor = AutoProcessor.from_pretrained(model_id, target_lang=target_lang)
        model = Wav2Vec2ForCTC.from_pretrained(
            model_id,
            target_lang=target_lang,
            ignore_mismatched_sizes=True,
        )
        _TRANSCRIBER_CACHE[target_lang] = (processor, model)
    return _TRANSCRIBER_CACHE[target_lang]


def transcribe(fp:str, target_lang:str) -> str:
    '''
    For given audio file, transcribe it.

    Parameters
    ----------
    fp: str
        The file path to the audio file.
    target_lang:str
        The ISO-3 code of the target language.

    Returns
    ----------
    transcript:str
        The transcribed text.

    Raises
    ----------
    ValueError
        If no audio path or no target language is given.
    '''
    # Fail fast with a readable message before any model is loaded; the UI
    # passes None when nothing was recorded/uploaded or selected.
    if not fp:
        raise ValueError("No audio provided: record or upload a clip before transcribing.")
    if not target_lang:
        raise ValueError("No language selected: choose a language before transcribing.")

    # Ensure replicability
    set_seed(555)
    start_time = time.time()

    # Load (or reuse) the transcription model for this language
    processor, model = _load_transcriber(target_lang)

    # Process the audio: librosa resamples to 16 kHz on load, and the same
    # rate is then declared to the processor.
    signal, sampling_rate = librosa.load(fp, sr=16000)
    inputs = processor(signal, sampling_rate=sampling_rate, return_tensors="pt")

    # Inference
    with torch.no_grad():
        outputs = model(**inputs).logits

    # Greedy decoding: most likely token per frame for the single batch item.
    ids = torch.argmax(outputs, dim=-1)[0]
    transcript = processor.decode(ids)

    print("Time elapsed: ", int(time.time() - start_time), " seconds")
    return transcript
text_to_speech.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import torch
3
+ from transformers import set_seed
4
+ from transformers import VitsTokenizer, VitsModel
5
+
6
# Loaded (tokenizer, model) pairs keyed by ISO-3 code, so each language's
# checkpoint is fetched and initialised only on its first request.
_SYNTHESIZER_CACHE = {}


def _load_synthesizer(iso3):
    """Return the (tokenizer, model) pair for ``iso3``, loading it on a cache miss."""
    if iso3 not in _SYNTHESIZER_CACHE:
        checkpoint = f"facebook/mms-tts-{iso3}"
        _SYNTHESIZER_CACHE[iso3] = (
            VitsTokenizer.from_pretrained(checkpoint),
            VitsModel.from_pretrained(checkpoint),
        )
    return _SYNTHESIZER_CACHE[iso3]


def synthesize_facebook(s:str, iso3:str) -> "numpy.ndarray":
    '''
    For given text, speak it.

    Parameters
    ----------
    s: str
        The written text.
    iso3:str
        The ISO-3 code of the text's language.

    Returns
    ----------
    synth:numpy.ndarray
        The synthesized audio samples (first waveform of the model output).
        The sampling rate is not included in the return value.

    Raises
    ----------
    ValueError
        If the text is empty/blank or no language code is given.
    '''
    # Fail fast with a readable message before any checkpoint is loaded; the
    # UI passes None/"" when the fields are left empty.
    if not s or not s.strip():
        raise ValueError("No text provided: enter some text to synthesize.")
    if not iso3:
        raise ValueError("No language selected: choose a language before synthesizing.")

    # Ensure replicability
    set_seed(555)
    start_time = time.time()

    # Load (or reuse) the synthesizer for this language
    tokenizer, model = _load_synthesizer(iso3)

    inputs = tokenizer(text=s, return_tensors="pt")

    # Inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Keep the waveform of the single batch item.
    synth = outputs.waveform[0]

    print("Time elapsed: ", int(time.time() - start_time), " seconds")
    return synth.numpy()
translation.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
+
4
# Checkpoint shared by the tokenizer and the model; both are loaded once at
# import time.
_NLLB_CHECKPOINT = "facebook/nllb-200-distilled-600M"

tokenizer = AutoTokenizer.from_pretrained(_NLLB_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_NLLB_CHECKPOINT)
6
+
7
+
8
def translation(text, src_lang, tgt_lang):
    """Translate ``text`` from ``src_lang`` into ``tgt_lang``.

    Parameters
    ----------
    text:
        The text to translate.
    src_lang:
        Language code of ``text``, forwarded to the translation pipeline.
    tgt_lang:
        Language code to translate into, forwarded to the translation pipeline.

    Returns
    ----------
    The ``"translation_text"`` field of the first pipeline result.
    """
    # A pipeline is built per call around the module-level model/tokenizer,
    # configured for this language pair and capped at max_length=400.
    translator = pipeline(
        "translation",
        model=model,
        tokenizer=tokenizer,
        src_lang=src_lang,
        tgt_lang=tgt_lang,
        max_length=400,
    )
    results = translator(text)
    first_result = results[0]
    return first_result["translation_text"]