"""Gradio demo: Italian text-to-speech with a fine-tuned SpeechT5 model."""

import gradio as gr
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Hub id of the fine-tuned checkpoint; both the acoustic model and the
# processor below are loaded from it.
model_id = "Vinay15/speecht5_finetuned_voxpopuli_it"

# Row of the x-vector dataset used as the single, fixed speaker embedding.
# NOTE(review): which speaker this row corresponds to cannot be told from
# this file - confirm against the dataset before changing it.
SPEAKER_EMBEDDING_INDEX = 7440

model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
# Passed to model.generate_speech() as its `vocoder` argument.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# unsqueeze(0) prepends a dimension of size 1 to the x-vector tensor.
speaker_embeddings = torch.tensor(
    embeddings_dataset[SPEAKER_EMBEDDING_INDEX]["xvector"]
).unsqueeze(0)

processor = SpeechT5Processor.from_pretrained(model_id)

# (accented, plain) character pairs. synthesize_speech() applies them in
# order with str.replace before the text is tokenized. Only these six
# lowercase accented vowels are covered.
replacements = [
    ("à", "a"),
    ("è", "e"),
    ("é", "e"),
    ("ì", "i"),
    ("ò", "o"),
    ("ù", "u"),
]

def synthesize_speech(text):
    """Turn *text* into audio with the fine-tuned SpeechT5 model.

    Every pair in the module-level ``replacements`` list is applied to the
    text first. The result is tokenized by ``processor`` and passed to
    ``model.generate_speech`` together with the fixed ``speaker_embeddings``
    and ``vocoder``.

    Args:
        text: The sentence to synthesize.

    Returns:
        A ``(16000, samples)`` tuple: the integer 16000 followed by the
        generated speech converted to a NumPy array. This is the value the
        ``gr.Audio`` output of the interface receives.
    """
    for src, dst in replacements:
        text = text.replace(src, dst)

    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )

    # NOTE(review): 16000 is presumably the sample rate of the vocoder
    # output, in the (rate, data) form gr.Audio accepts - confirm.
    return (16000, speech.cpu().numpy())


title = "Fine-tuning TTS for Italian as a Regional Language Using SpeechT5"

# Markdown text passed to gr.Interface as `description`. A plain string is
# used because nothing is interpolated into it.
description = """
This Space generates speech in Italian, a regional language, using a fine-tuned SpeechT5 model from Hugging Face.
Italian is considered a regional language because it is primarily spoken within Italy and a few Italian-speaking regions in
other countries, such as Switzerland, San Marino, Vatican City, and areas in Croatia and Slovenia.
With about 85 million speakers worldwide, Italian's regional usage contrasts with the global reach of languages like English or Spanish.

**Fine-Tuned Model Preparation:** This model has been fine-tuned using the VoxPopuli Italian dataset to optimize SpeechT5 for
Italian pronunciation, intonation, and fluency. The fine-tuning process involved preprocessing the text data to ensure accurate
Italian accents and phonetics, resulting in high-quality Italian speech synthesis.

The fine-tuned model is available [here](https://huggingface.co/Vinay15/speecht5_finetuned_voxpopuli_it).

**Note:** Processing time may vary based on sentence length. Longer sentences may take more time to process and generate audio.

For more details, visit the [GitHub repository](https://github.com/Vinay152003/Fine-tuning-TTS-for-a-Italian-it-Language) and review the project [report](https://drive.google.com/file/d/1cvNPkuFlTZAu1iDaagCwVRGXFd6r6vqi/view?usp=sharing).
"""

# One text box in, one audio player out, wired to synthesize_speech().
# `examples` holds one-element rows because the interface has a single input.
interface = gr.Interface(
    fn=synthesize_speech,
    inputs=gr.Textbox(label="Input Text", placeholder="Enter Italian text here..."),
    outputs=gr.Audio(label="Generated Speech"),
    title=title,
    description=description,
    examples=[
        ["Questa è una dimostrazione di sintesi vocale in italiano."],
        ["Benvenuti alla nostra piattaforma di sintesi vocale!"],
        ["Il modello è stato addestrato per parlare l'italiano in modo naturale e fluido."],
        ["Oggi il tempo è bello e il sole splende."],
        ["La città di Roma è una delle destinazioni turistiche più popolari al mondo."],
    ],
)

interface.launch()