import streamlit as st
import torch
from io import BytesIO

from transformers import (
    MarianMTModel,
    MarianTokenizer,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
)
from datasets import load_dataset
from scipy.io.wavfile import write as write_wav

# Supported target languages (code -> display name)
languages = {
    'es': 'Spanish',
    'fr': 'French',
    'de': 'German',
    'ur': 'Urdu',
    'hi': 'Hindi',
    'bn': 'Bengali'
}

# Streamlit app
st.title("Real-Time Language Translator and TTS")

# Text input
text_to_translate = st.text_area("Enter text to translate:", "Hello, how are you?")

# Language selection (show the language name, keep the code as the value)
target_language = st.selectbox(
    "Select target language:",
    list(languages.keys()),
    format_func=lambda code: languages[code],
)


@st.cache_resource
def load_translation_model(target_lang):
    # Load and cache the MarianMT model/tokenizer for English -> target language
    model_name = f"Helsinki-NLP/opus-mt-en-{target_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model


def translate_text(text, target_lang):
    tokenizer, model = load_translation_model(target_lang)

    # Translate text
    encoded_text = tokenizer.encode(text, return_tensors="pt")
    translated = model.generate(encoded_text)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text


@st.cache_resource
def load_tts_components():
    # Load and cache the SpeechT5 processor, model, and HiFi-GAN vocoder
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Load speaker embeddings (x-vectors) used to condition the voice
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
    return processor, model, vocoder, speaker_embeddings


def synthesize_speech(text):
    processor, model, vocoder, speaker_embeddings = load_tts_components()

    # Synthesize speech; the vocoder turns the predicted spectrogram into a waveform
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Convert the speech tensor to a numpy array and write it to an in-memory WAV file
    audio_np = speech.squeeze().cpu().numpy()
    sample_rate = 16000  # SpeechT5 generates 16 kHz audio
    audio_buffer = BytesIO()
    write_wav(audio_buffer, sample_rate, audio_np)
    audio_buffer.seek(0)
    return audio_buffer


if st.button("Translate and Synthesize Speech"):
    # Perform translation
    translated_text = translate_text(text_to_translate, target_language)
    st.write(f"Translated text ({languages[target_language]}): {translated_text}")

    # Perform text-to-speech
    audio_bytes = synthesize_speech(translated_text)
    st.audio(audio_bytes, format="audio/wav")
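
# ------------------------------------------------------------------
# Usage note (a minimal sketch, assuming a standard pip environment;
# the filename is hypothetical):
#
#   pip install streamlit transformers datasets torch scipy sentencepiece
#   streamlit run app.py
#
# `sentencepiece` is required by MarianTokenizer; the other packages
# correspond to the imports above.
# ------------------------------------------------------------------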