# LangTransAudio / app.py

import streamlit as st
import torch
import soundfile as sf
from transformers import MarianMTModel, MarianTokenizer
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# One MarianMT checkpoint per English->X target language. opus-mt-en-ur and
# opus-mt-en-hi are published Helsinki-NLP checkpoints; the Bengali ID is
# assumed to follow the same naming pattern (verify it exists on the Hub).
TRANSLATION_MODELS = {
    "Urdu": "Helsinki-NLP/opus-mt-en-ur",
    "Hindi": "Helsinki-NLP/opus-mt-en-hi",
    "Bengali": "Helsinki-NLP/opus-mt-en-bn",
}

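# Downloading and instantiating a checkpoint on every Streamlit rerun is slow.
# st.cache_resource (available in Streamlit >= 1.18; an assumption about the
# deployed version) keeps one loaded copy per model name across reruns.
@st.cache_resource
def load_translation_model(model_name):
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model
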
# Load the text-to-speech model and its HiFi-GAN vocoder. generate_speech
# returns a mel spectrogram unless a vocoder is supplied, so HiFi-GAN is needed
# to get an audible waveform. Note: speecht5_tts is trained on English speech,
# so Urdu/Hindi/Bengali text will not be pronounced correctly.
tts_model_name = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(tts_model_name)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_model_name)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

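# SpeechT5 conditions on an x-vector speaker embedding; without one the output
# quality is unreliable. The CMU ARCTIC x-vector dataset below is the one used
# in the Hugging Face SpeechT5 examples; index 7306 is an arbitrary choice of
# speaker, not anything prescribed by this app.
from datasets import load_dataset
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
default_speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
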
# Translate English text into the selected target language
def translate_text(text, target_lang):
    if target_lang not in TRANSLATION_MODELS:
        return "Error: Target language not supported."
    tokenizer, model = load_translation_model(TRANSLATION_MODELS[target_lang])
    tokens = tokenizer(text, return_tensors="pt", padding=True)
    translated_tokens = model.generate(**tokens)
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

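# Example usage (assumes the relevant checkpoint downloads successfully):
#   translate_text("Hello, how are you?", "Urdu")  -> Urdu translation string
#   translate_text("Hello", "French")              -> "Error: Target language not supported."
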
# Synthesize speech from text and write it to a WAV file
def synthesize_speech(text, speaker_embeddings=None):
    if speaker_embeddings is None:
        speaker_embeddings = default_speaker_embeddings
    inputs = processor(text=text, return_tensors="pt")
    with torch.no_grad():
        # generate_speech expects input_ids, not the whole BatchEncoding;
        # the vocoder converts the predicted spectrogram into a waveform
        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    sf.write("output.wav", speech.numpy(), samplerate=16000)  # SpeechT5 outputs 16 kHz audio
    return "output.wav"

# Streamlit app
st.title("Language Translator with TTS")
st.write("Enter the text you want to translate and hear the translation.")

# Select target language (kept in sync with the translation model mapping)
target_language = st.selectbox("Select Target Language", list(TRANSLATION_MODELS))

# Text input
text_to_translate = st.text_input("Enter text here")

if st.button("Translate and Generate Audio"):
    # Perform translation
    translated_text = translate_text(text_to_translate, target_language)
    st.write(f"Translated text ({target_language}): {translated_text}")
    # Generate speech only if translation succeeded
    if translated_text and not translated_text.startswith("Error"):
        audio_file = synthesize_speech(translated_text)
        st.audio(audio_file)