import streamlit as st
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)
from datasets import load_dataset
import torch
import soundfile as sf

# Define the translation model and tokenizer.
# NOTE: this checkpoint translates English -> Urdu only; the Hindi and Bengali
# options offered below would each need a matching MarianMT checkpoint.
model_name = "Helsinki-NLP/opus-mt-en-ur"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Define the TTS model, processor, and vocoder.
# NOTE: SpeechT5 is trained on English speech, so pronunciation of translated
# (non-English) text will be unreliable.
tts_model_name = "microsoft/speecht5_tts"
tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_model_name)
processor = SpeechT5Processor.from_pretrained(tts_model_name)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load a speaker embedding (x-vector) that selects the TTS voice
speaker_embeddings = torch.tensor(
    load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")["xvector"][0]
).unsqueeze(0)


# Function to translate text
def translate_text(text, target_lang):
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated = model.generate(**inputs)
    translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    return translated_text


# Function to synthesize speech and save it to a WAV file
def synthesize_speech(text, target_lang):
    inputs = processor(text=text, return_tensors="pt")
    # Without a vocoder, generate_speech returns a mel spectrogram rather than
    # a waveform, so the vocoder is needed to get playable audio.
    speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Save the speech to a file
    output_path = "output.wav"
    sf.write(output_path, speech.numpy(), samplerate=16000)

    # Check if the audio file was generated correctly
    try:
        with open(output_path, "rb") as f:
            audio_data = f.read()
        if not audio_data:
            st.error("Error: The audio file is empty.")
        else:
            st.success("Audio generated successfully.")
    except Exception as e:
        st.error(f"Error reading the audio file: {e}")

    return output_path


# Streamlit UI
st.title("Language Translator with Speech Synthesis")

# Input text
text_input = st.text_input("Enter text in English:")
if text_input:
    st.session_state.text_input = text_input

# Language selection
target_lang = st.selectbox(
    "Select Target Language:",
    ["Urdu (ur)", "Hindi (hi)", "Bengali (bn)"],
)

# Translate button
if st.button("Translate"):
    source_text = st.session_state.get("text_input", "")
    if not source_text:
        st.warning("Please enter some text to translate.")
    else:
        # Map the display label to its language code
        if target_lang == "Urdu (ur)":
            target_lang = "ur"
        elif target_lang == "Hindi (hi)":
            target_lang = "hi"
        elif target_lang == "Bengali (bn)":
            target_lang = "bn"

        translated_text = translate_text(source_text, target_lang)
        st.text_area("Translated text:", value=translated_text, height=100)

        audio_file = synthesize_speech(translated_text, target_lang)
        st.audio(audio_file)

        # Clear input for new text
        st.session_state.text_input = ""
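
# Optional sketch: Streamlit reruns this whole script on every interaction, so
# the checkpoints above are reloaded on each button click. One way to avoid
# that is to cache the loading with st.cache_resource; the loader functions
# below are illustrative (not part of the original script) and would replace
# the module-level from_pretrained() calls above.
@st.cache_resource
def load_translation_components(name="Helsinki-NLP/opus-mt-en-ur"):
    # Loaded once per process and reused across Streamlit reruns
    return MarianMTModel.from_pretrained(name), MarianTokenizer.from_pretrained(name)


@st.cache_resource
def load_tts_components(name="microsoft/speecht5_tts"):
    proc = SpeechT5Processor.from_pretrained(name)
    tts = SpeechT5ForTextToSpeech.from_pretrained(name)
    voc = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    emb = torch.tensor(
        load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")["xvector"][0]
    ).unsqueeze(0)
    return proc, tts, voc, emb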