File size: 2,663 Bytes
d4d81e1
017af4b
44a1495
f4e544b
44a1495
d4d81e1
44a1495
 
 
 
d4d81e1
017af4b
b8a60ec
017af4b
 
58a8314
 
017af4b
1829ce2
b8a60ec
1829ce2
44a1495
 
 
d4d81e1
 
44a1495
 
017af4b
 
44a1495
017af4b
 
 
58a8314
 
 
 
 
 
 
 
 
 
 
44a1495
017af4b
d4d81e1
44a1495
 
 
 
 
 
 
 
 
 
 
 
a51a9d1
44a1495
 
 
 
 
 
 
 
1829ce2
44a1495
 
 
 
 
390d4d0
b8a60ec
44a1495
6f0da1b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import streamlit as st
from transformers import MarianMTModel, MarianTokenizer, SpeechT5Processor, SpeechT5ForTextToSpeech
from datasets import load_dataset
import torch
import soundfile as sf

# Define the translation model and tokenizer
# NOTE(review): this pair is hard-coded to English->Urdu, while the UI below
# also offers Hindi and Bengali — a single en-ur model cannot serve those;
# confirm intended language coverage.
model_name = "Helsinki-NLP/opus-mt-en-ur"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Define the TTS model and processor
tts_model_name = "microsoft/speecht5_tts"
tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_model_name)
processor = SpeechT5Processor.from_pretrained(tts_model_name)

# Load speaker embeddings
# First x-vector from the CMU ARCTIC validation split, unsqueezed to add a
# batch dimension — presumably a fixed (English) speaker voice for SpeechT5;
# verify this is the desired voice. Downloads the dataset on first run.
speaker_embeddings = torch.tensor(load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")["xvector"][0]).unsqueeze(0)

# Cache of (model, tokenizer) pairs keyed by target language code, seeded
# with the module-level English->Urdu pair so the original behavior for
# "ur" is unchanged and its weights are not reloaded.
_translation_models = {"ur": (model, tokenizer)}

def _get_translation_pair(target_lang):
    """Return a (MarianMTModel, MarianTokenizer) pair for English -> *target_lang*.

    Loads ``Helsinki-NLP/opus-mt-en-<target_lang>`` lazily on first use and
    caches it, so each language's weights are downloaded/loaded at most once.
    """
    if target_lang not in _translation_models:
        pair_name = f"Helsinki-NLP/opus-mt-en-{target_lang}"
        _translation_models[target_lang] = (
            MarianMTModel.from_pretrained(pair_name),
            MarianTokenizer.from_pretrained(pair_name),
        )
    return _translation_models[target_lang]

# Function to translate text
def translate_text(text, target_lang):
    """Translate English *text* into the language given by *target_lang*.

    Parameters:
        text: source sentence(s) in English.
        target_lang: ISO-639-1 code ("ur", "hi", "bn", ...) selecting the
            MarianMT model. Previously this argument was ignored and every
            request was translated with the hard-coded en-ur model.

    Returns:
        The translated text as a single string (first beam output).
    """
    mt_model, mt_tokenizer = _get_translation_pair(target_lang)
    inputs = mt_tokenizer(text, return_tensors="pt", padding=True)
    translated = mt_model.generate(**inputs)
    translated_text = mt_tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    return translated_text

# Lazily-loaded HiFi-GAN vocoder. Without a vocoder,
# SpeechT5ForTextToSpeech.generate_speech() returns a mel spectrogram —
# NOT a waveform — so writing its output straight to a .wav file yields
# unintelligible audio. The vocoder converts the spectrogram to speech.
_vocoder = None

# Function to synthesize speech
def synthesize_speech(text, target_lang):
    """Synthesize *text* to speech and save it to ``output.wav``.

    Parameters:
        text: the text to speak.
        target_lang: accepted for interface compatibility but currently
            unused — SpeechT5 here is driven by a fixed speaker embedding;
            NOTE(review): it was trained primarily on English, so non-English
            text may be pronounced poorly — confirm acceptable quality.

    Returns:
        Path of the written WAV file ("output.wav").
    """
    global _vocoder
    if _vocoder is None:
        # Function-scope import keeps the module importable even if this
        # symbol is unavailable until synthesis is first requested.
        from transformers import SpeechT5HifiGan
        _vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    inputs = processor(text=text, return_tensors="pt")
    # Inference only — no_grad avoids building an autograd graph.
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=_vocoder
        )

    # Save the speech to a file (SpeechT5 operates at 16 kHz).
    output_path = "output.wav"
    sf.write(output_path, speech.numpy(), samplerate=16000)

    # Check if the audio file was generated correctly
    try:
        with open(output_path, 'rb') as f:
            audio_data = f.read()
        if not audio_data:
            st.error("Error: The audio file is empty.")
        else:
            st.success("Audio generated successfully.")
    except Exception as e:
        st.error(f"Error reading the audio file: {e}")

    return output_path

# Streamlit UI
st.title("Language Translator with Speech Synthesis")

# Input text — read the widget value directly. The previous session_state
# round-trip only stored the text when it was non-empty, so clicking
# "Translate" with an empty box crashed on a missing session_state key,
# and the post-translate reset of session_state never cleared the widget
# (the widget has no key) — both are removed.
text_input = st.text_input("Enter text in English:")

# Display label -> ISO language code for the translation model.
LANG_CODES = {
    "Urdu (ur)": "ur",
    "Hindi (hi)": "hi",
    "Bengali (bn)": "bn",
}

# Language selection
target_choice = st.selectbox("Select Target Language:", list(LANG_CODES))

# Translate button
if st.button("Translate"):
    if not text_input.strip():
        # Guard: nothing to translate yet.
        st.warning("Please enter some text to translate.")
    else:
        target_lang = LANG_CODES[target_choice]

        translated_text = translate_text(text_input, target_lang)
        st.text_area("Translated text:", value=translated_text, height=100)

        audio_file = synthesize_speech(translated_text, target_lang)
        st.audio(audio_file)