# NOTE: the lines that originally preceded this file ("Spaces:", "Runtime
# error", a file-size line, commit hashes, and a column of line numbers) were
# web-page residue captured during extraction, not part of the program; they
# have been reduced to this comment so the module parses.
import gradio as gr
import torch
import uuid
import json
import librosa
import os
import tempfile
import soundfile as sf
import scipy.io.wavfile as wav
from transformers import VitsModel, AutoTokenizer, set_seed
from nemo.collections.asr.models import EncDecMultiTaskModel
# Constants
SAMPLE_RATE = 16000 # Hz — rate Canary expects for input audio; also used when writing resampled WAVs
# Load ASR model
# NOTE: runs at import time and downloads ~GBs on first use (network required).
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
decode_cfg = canary_model.cfg.decoding
# Beam size 1 (greedy-equivalent decoding) — fastest option for an interactive demo.
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)
# Function to convert audio to text using ASR
def gen_text(audio_filepath, action, source_lang, target_lang):
    """Transcribe or translate an audio file with the Canary model.

    Args:
        audio_filepath: Path to the input audio; any format librosa can read.
        action: Canary task name, e.g. "asr" or "s2t_translation".
        source_lang: Language code of the spoken audio.
        target_lang: Language code of the desired output text (ignored for
            plain ASR, where the target equals the source).

    Returns:
        The predicted text for the utterance.

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio_filepath is None:
        raise gr.Error("Please provide some input audio.")
    utt_id = uuid.uuid4()
    with tempfile.TemporaryDirectory() as workdir:
        # Load as mono and bring the audio to the model's 16 kHz rate.
        samples, native_sr = librosa.load(audio_filepath, sr=None, mono=True)
        if native_sr != SAMPLE_RATE:
            samples = librosa.resample(samples, orig_sr=native_sr, target_sr=SAMPLE_RATE)
        wav_path = os.path.join(workdir, f"{utt_id}.wav")
        sf.write(wav_path, samples, SAMPLE_RATE)
        # Describe the request as a one-line NeMo manifest.
        manifest = {
            "audio_filepath": wav_path,
            "taskname": action,
            "source_lang": source_lang,
            "target_lang": source_lang if action == "asr" else target_lang,
            "pnc": "no",
            "answer": "predict",
            "duration": str(len(samples) / SAMPLE_RATE),
        }
        manifest_path = os.path.join(workdir, f"{utt_id}.json")
        with open(manifest_path, "w") as fout:
            json.dump(manifest, fout)
        # Transcribe before the temporary directory (and the WAV) disappears.
        return canary_model.transcribe(manifest_path)[0]
# Function to convert text to speech using TTS
def gen_speech(text, lang):
    """Synthesize speech for *text* with the MMS VITS model for *lang*.

    Args:
        text: Text to synthesize.
        lang: ISO 639-3 code selecting the checkpoint ("eng", "deu", ...).

    Returns:
        Tuple of (sample_rate, waveform) — a numpy float waveform suitable
        for a gr.Audio(type="numpy") output.
    """
    set_seed(555)  # Make it deterministic
    model = f"facebook/mms-tts-{lang}"
    # Load the TTS model on demand (each language is a separate checkpoint).
    tts_model = VitsModel.from_pretrained(model)
    tts_tokenizer = AutoTokenizer.from_pretrained(model)
    input_text = tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = tts_model(**input_text)
    waveform_np = outputs.waveform[0].cpu().numpy()
    # Fix: report the rate the checkpoint actually synthesized at rather than
    # assuming the global constant. MMS VITS configs carry sampling_rate
    # (16 kHz for MMS, so this stays backward compatible), and falling back
    # to SAMPLE_RATE preserves the old behavior if the attribute is absent.
    sample_rate = getattr(tts_model.config, "sampling_rate", SAMPLE_RATE)
    return sample_rate, waveform_np
# Main function for speech-to-speech translation
def speech_to_speech_translation(audio_filepath, source_lang, target_lang):
    """Translate speech in *source_lang* audio into synthesized *target_lang* speech.

    Fix: the Gradio dropdowns deliver display names ("English", "French"),
    which the original passed straight into the models — Canary got an
    unknown language tag and gen_speech built a nonexistent model id like
    "facebook/mms-tts-French". Map names to the codes each model expects:
    Canary uses two-letter codes, MMS TTS uses ISO 639-3 codes.
    NOTE(review): Canary-1b advertises en/de/es/fr — confirm against its model card.

    Args:
        audio_filepath: Path to the recorded/uploaded audio.
        source_lang: Display name (or raw code) of the spoken language.
        target_lang: Display name (or raw code) of the output language.

    Returns:
        Tuple of (sample_rate, waveform) from gen_speech.
    """
    # Display name -> (Canary code, MMS TTS ISO 639-3 code).
    codes = {
        "English": ("en", "eng"),
        "German": ("de", "deu"),
        "Spanish": ("es", "spa"),
        "French": ("fr", "fra"),
    }
    # Pass unknown values through untouched so callers that already supply
    # raw codes keep working (backward compatible).
    src_code, _ = codes.get(source_lang, (source_lang, source_lang))
    tgt_code, tts_code = codes.get(target_lang, (target_lang, target_lang))
    translation = gen_text(audio_filepath, "s2t_translation", src_code, tgt_code)
    sample_rate, synthesized_speech = gen_speech(translation, tts_code)
    return sample_rate, synthesized_speech
# Supported languages: display name -> ISO 639-3 code (the code used by the
# facebook/mms-tts-* checkpoints).
LANGUAGES = dict(
    English="eng",
    German="deu",
    Spanish="spa",
    French="fra",
)
# Create Gradio interface
# Fix: removed the stray " |" that trailed demo.launch() (extraction artifact,
# a SyntaxError as written).
demo = gr.Blocks()
with demo:
    gr.Markdown("# Multilingual Speech-to-Speech Translation")
    gr.Markdown("Translate speech from one language to another.")
    # Shared language selectors used by both input tabs.
    with gr.Row():
        source_lang = gr.Dropdown(choices=list(LANGUAGES.keys()), value="English", label="Source Language")
        target_lang = gr.Dropdown(choices=list(LANGUAGES.keys()), value="French", label="Target Language")
    with gr.Tabs():
        # Tab 1: record from the microphone.
        with gr.TabItem("Microphone"):
            mic_input = gr.Audio(source="microphone", type="filepath")
            mic_output = gr.Audio(label="Generated Speech", type="numpy")
            mic_button = gr.Button("Translate")
        # Tab 2: upload an audio file.
        with gr.TabItem("Audio File"):
            file_input = gr.Audio(source="upload", type="filepath")
            file_output = gr.Audio(label="Generated Speech", type="numpy")
            file_button = gr.Button("Translate")
    # Both buttons run the same pipeline; only the audio source differs.
    mic_button.click(
        speech_to_speech_translation,
        inputs=[mic_input, source_lang, target_lang],
        outputs=mic_output
    )
    file_button.click(
        speech_to_speech_translation,
        inputs=[file_input, source_lang, target_lang],
        outputs=file_output
    )
demo.launch()