# Page header captured with the file: author Vinay15, commit 72e2358 (verified), message "Update app.py".
import gradio as gr
import torch
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech
# Checkpoint identifier shared by the text processor and the TTS model, so
# both are always loaded from the same fine-tuned Italian repository.
model_id = "Vinay15/speecht5_finetuned_voxpopuli_it"

# Text front-end (processor) and acoustic model come from the fine-tuned
# checkpoint; the vocoder that turns the model output into a waveform is
# loaded from a separate, fixed checkpoint.
processor = SpeechT5Processor.from_pretrained(model_id)
model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# A single fixed speaker is used for every request: entry 7440 of the
# x-vector dataset, with a leading dimension added by unsqueeze(0).
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"])
speaker_embeddings = speaker_embeddings.unsqueeze(0)
# Accent folding table applied to the input text before tokenisation.
# Each entry is a (source_character, replacement_character) pair; the two
# strings below are aligned position by position.
replacements = list(zip("àèéìòù", "aeeiou"))
# Text-to-speech synthesis function
def synthesize_speech(text):
    """Convert Italian text to speech with the module-level model and vocoder.

    Args:
        text: The sentence to speak. Accented vowels listed in
            ``replacements`` are folded to their unaccented form before
            tokenisation.

    Returns:
        A ``(sample_rate, audio)`` tuple where ``sample_rate`` is always
        16000 and ``audio`` is a NumPy array holding the waveform. For
        empty or whitespace-only input the array has length zero.
    """
    # Nothing to say: skip the model entirely instead of synthesising from
    # an empty prompt. An integer dtype is returned here on the assumption
    # that the audio component peak-normalises float input, which would
    # fail on a zero-length array -- TODO confirm against the Gradio version
    # in use.
    if not text or not text.strip():
        return (16000, torch.zeros(0, dtype=torch.int16).numpy())

    # Fold accented characters according to the module-level table.
    for src, dst in replacements:
        text = text.replace(src, dst)

    inputs = processor(text=text, return_tensors="pt")

    # Clip the token sequence to the number of text positions the model
    # supports, so very long inputs are truncated rather than failing.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    speech = model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)

    # Hand the waveform back as (sample_rate, audio_array).
    return (16000, speech.cpu().numpy())
# Heading shown at the top of the Gradio page.
title = "Fine-tuning TTS for Italian as a Regional Language Using SpeechT5"

# Markdown body shown under the title. This is a plain string: it contains
# no placeholders, so no f-string prefix is needed.
description = """
This Space generates speech in Italian, a regional language, using a fine-tuned SpeechT5 model from Hugging Face.
Italian is considered a regional language because it is primarily spoken within Italy and a few Italian-speaking regions in
other countries, such as Switzerland, San Marino, Vatican City, and areas in Croatia and Slovenia.
With about 85 million speakers worldwide, Italian's regional usage contrasts with the global reach of languages like English or Spanish.
**Fine-Tuned Model Preparation:** This model has been fine-tuned using the VoxPopuli Italian dataset to optimize SpeechT5 for
Italian pronunciation, intonation, and fluency. The fine-tuning process involved preprocessing the text data to ensure accurate
Italian accents and phonetics, resulting in high-quality Italian speech synthesis.
The fine-tuned model is available [here](https://huggingface.co/Vinay15/speecht5_finetuned_voxpopuli_it).
**Note:** Processing time may vary based on sentence length. Longer sentences may take more time to process and generate audio.
For more details, visit the [GitHub repository](https://github.com/Vinay152003/Fine-tuning-TTS-for-a-Italian-it-Language) and review the project [report](https://drive.google.com/file/d/1cvNPkuFlTZAu1iDaagCwVRGXFd6r6vqi/view?usp=sharing).
"""
# Build the web UI: one text box in, one audio player out, wired to
# synthesize_speech. The example sentences are wrapped one per row because
# the interface takes a single input value.
interface = gr.Interface(
    fn=synthesize_speech,
    title=title,
    description=description,
    inputs=gr.Textbox(label="Input Text", placeholder="Enter Italian text here..."),
    outputs=gr.Audio(label="Generated Speech"),
    examples=[
        [sentence]
        for sentence in (
            "Questa è una dimostrazione di sintesi vocale in italiano.",
            "Benvenuti alla nostra piattaforma di sintesi vocale!",
            "Il modello è stato addestrato per parlare l'italiano in modo naturale e fluido.",
            "Oggi il tempo è bello e il sole splende.",
            "La città di Roma è una delle destinazioni turistiche più popolari al mondo.",
        )
    ],
)

# Start serving the interface.
interface.launch()