# LangTransAudio / app.py
import streamlit as st
from transformers import MarianMTModel, MarianTokenizer, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import soundfile as sf
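# Pipeline overview: English input is translated with a MarianMT model, the result is
# spoken with Microsoft's SpeechT5 TTS model plus a HiFi-GAN vocoder, and the audio is
# played back in the Streamlit UI. Run locally with: streamlit run app.py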
# Load (and cache) the MarianMT translation model and tokenizer for a target language.
# NOTE: this assumes a Helsinki-NLP/opus-mt-en-<lang> checkpoint exists on the Hub for
# every language offered in the UI (it does for "ur" and "hi"; verify for "bn").
@st.cache_resource
def load_translation_model(target_lang):
    model_name = f"Helsinki-NLP/opus-mt-en-{target_lang}"
    return MarianMTModel.from_pretrained(model_name), MarianTokenizer.from_pretrained(model_name)
# Load the SpeechT5 text-to-speech model, processor, and HiFi-GAN vocoder.
# The vocoder is required to turn the generated spectrogram into an audible waveform.
tts_model_name = "microsoft/speecht5_tts"
tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_model_name)
processor = SpeechT5Processor.from_pretrained(tts_model_name)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# Load a speaker embedding (x-vector) that defines the synthetic voice
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset["xvector"][0]).unsqueeze(0)
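# Any other row of the x-vector dataset gives a different voice, e.g. the speaker
# used in the Hugging Face SpeechT5 examples:
# speaker_embeddings = torch.tensor(embeddings_dataset["xvector"][7306]).unsqueeze(0)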
# Translate English text into the selected target language
def translate_text(text, target_lang):
    model, tokenizer = load_translation_model(target_lang)
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated = model.generate(**inputs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
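# Example usage (illustrative; the output depends on the downloaded checkpoint):
# urdu_text = translate_text("Good morning", "ur")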
# Synthesize speech from text and save it to a WAV file.
# NOTE: microsoft/speecht5_tts is an English-only checkpoint, so output quality for
# Urdu/Hindi/Bengali text (non-Latin scripts) may be poor; target_lang is unused here.
def synthesize_speech(text, target_lang):
    inputs = processor(text=text, return_tensors="pt")
    # The HiFi-GAN vocoder converts the generated spectrogram into a waveform
    speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    # Save the speech to a 16 kHz WAV file
    output_path = "output.wav"
    sf.write(output_path, speech.numpy(), samplerate=16000)
    # Check that the audio file was written correctly
    try:
        with open(output_path, "rb") as f:
            audio_data = f.read()
        if not audio_data:
            st.error("Error: The audio file is empty.")
        else:
            st.success("Audio generated successfully.")
    except Exception as e:
        st.error(f"Error reading the audio file: {e}")
    return output_path
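# Example usage: writes (and overwrites) output.wav in the working directory and
# returns its path, e.g. wav_path = synthesize_speech(translated_text, target_lang)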
# Streamlit UI
st.title("Language Translator with Speech Synthesis")
# Input text
text_input = st.text_input("Enter text in English:")

# Language selection (UI label -> ISO 639-1 language code)
lang_codes = {"Urdu (ur)": "ur", "Hindi (hi)": "hi", "Bengali (bn)": "bn"}
target_lang_label = st.selectbox("Select Target Language:", list(lang_codes.keys()))
# Translate button
if st.button("Translate"):
    if not text_input.strip():
        st.warning("Please enter some text to translate.")
    else:
        target_lang = lang_codes[target_lang_label]
        translated_text = translate_text(text_input, target_lang)
        st.text_area("Translated text:", value=translated_text, height=100)
        audio_file = synthesize_speech(translated_text, target_lang)
        st.audio(audio_file)