"""Gradio demo: Persian text-to-speech with SpeechT5 and x-vector speaker embeddings.

The script loads a slice of a Farsi speech dataset (used as a bank of reference
speakers), a fine-tuned SpeechT5 text-to-spectrogram model, a HiFi-GAN vocoder
and a speechbrain x-vector speaker encoder, then exposes a Gradio interface
that synthesizes the entered text in the voice of the selected dataset sample.
"""

import math
import os
import random
import tempfile
from functools import lru_cache

import gradio as gr
import librosa
import noisereduce as nr
import numpy as np
import soundfile as sf
import torch
from datasets import Audio, load_dataset
from PersianG2p import Persian_g2p_converter
from pydub import AudioSegment
from scipy.io import wavfile
from speechbrain.inference.classifiers import EncoderClassifier
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForTextToSpectrogram,
    AutoProcessor,
    SpeechT5HifiGan,
)

# Sample rate (Hz) used for the reference audio and for the audio this script writes.
TARGET_SAMPLE_RATE = 16_000
# Loudness (dBFS) every synthesized sentence is normalized to.
TARGET_DBFS = -20.0
# Number of dataset rows kept as selectable reference speakers.
NUM_SPEAKERS = 1000


def resample_audio(example):
    """Resample one dataset example's audio to ``TARGET_SAMPLE_RATE``.

    Args:
        example: Dataset row with an ``"audio"`` dict holding ``"array"`` and
            ``"sampling_rate"``.

    Returns:
        The same row, with the audio array and sampling rate replaced.
    """
    audio = example["audio"]
    audio["array"] = librosa.resample(
        audio["array"],
        orig_sr=audio["sampling_rate"],
        target_sr=TARGET_SAMPLE_RATE,
    )
    audio["sampling_rate"] = TARGET_SAMPLE_RATE
    return example


def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and disable cuDNN autotuning."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False


# --- Module-level setup (same order as the original script) -----------------

dataset = load_dataset(
    "pourmand1376/asr-farsi-youtube-chunked-10-seconds", split="test"
)
dataset = dataset.select(range(NUM_SPEAKERS))
# NOTE(review): the imported `Audio` feature could probably do this resampling
# via `cast_column` instead of an eager `map` -- confirm before switching.
dataset = dataset.map(resample_audio)

set_seed(997)

processor = AutoProcessor.from_pretrained("Alidr79/speecht5_v3_youtube")
model = AutoModelForTextToSpectrogram.from_pretrained("Alidr79/speecht5_v3_youtube")

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

PersianG2Pconverter = Persian_g2p_converter(use_large=True)


def create_speaker_embedding(waveform):
    """Return the L2-normalized speaker embedding of ``waveform`` as a NumPy array."""
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


def denoise_audio(audio, sr):
    """Run ``noisereduce`` noise reduction on ``audio`` sampled at ``sr`` Hz."""
    return nr.reduce_noise(y=audio, sr=sr)


def match_target_amplitude(sound, target_dBFS):
    """Return ``sound`` with gain applied so its loudness equals ``target_dBFS``.

    A clip whose ``dBFS`` is infinite (pure silence) is returned unchanged,
    because the required gain would itself be infinite.
    """
    if math.isinf(sound.dBFS):
        return sound
    return sound.apply_gain(target_dBFS - sound.dBFS)


@lru_cache(maxsize=32)
def _speaker_embedding_for(speaker_index):
    """Build the (1, dim) speaker-embedding tensor for one dataset row.

    Cached so that a multi-sentence request does not re-encode the same
    reference audio once per sentence.
    """
    # Index the dataset once; the original indexed it twice per call.
    audio = dataset[speaker_index]["audio"]
    waveform = audio["array"]
    if audio["sampling_rate"] != TARGET_SAMPLE_RATE:
        waveform = librosa.resample(
            waveform,
            orig_sr=audio["sampling_rate"],
            target_sr=TARGET_SAMPLE_RATE,
        )
    embedding = create_speaker_embedding(waveform)
    return torch.tensor(embedding).unsqueeze(0)


def tts_fn(slider_value, input_text):
    """Synthesize one sentence in the voice of dataset sample ``slider_value``.

    Args:
        slider_value: Index of the reference speaker in ``dataset``. Coerced to
            ``int`` because UI sliders may deliver floats.
        input_text: Sentence to synthesize.

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is a flat ``int16`` array.

    Raises:
        RuntimeError: If the normalized audio is not at ``TARGET_SAMPLE_RATE``.
    """
    speaker_embedding = _speaker_embedding_for(int(slider_value))

    text = PersianG2Pconverter.transliterate(input_text, tidy=False, secret=True)
    print("sentence:", input_text)
    print("sentence phonemes:", text)

    with torch.no_grad():
        inputs = processor(text=text, return_tensors="pt")
        spectrogram = model.generate_speech(
            inputs["input_ids"],
            speaker_embedding,
            minlenratio=2,
            maxlenratio=4,
            threshold=0.3,
        )
        speech = vocoder(spectrogram)

    speech = speech.numpy().reshape(-1)
    speech_denoised = denoise_audio(speech, TARGET_SAMPLE_RATE)

    # Use a private temp directory: fixed file names in the working directory
    # would be overwritten by concurrent requests.
    with tempfile.TemporaryDirectory() as tmp_dir:
        raw_path = os.path.join(tmp_dir, "in_speech.wav")
        normalized_path = os.path.join(tmp_dir, "out_sound.wav")

        sf.write(raw_path, speech_denoised, TARGET_SAMPLE_RATE)
        sound = AudioSegment.from_wav(raw_path)
        normalized_sound = match_target_amplitude(sound, TARGET_DBFS)
        # `export` hands back an open file object; close it so the handle is
        # not leaked and the directory can be removed.
        normalized_sound.export(normalized_path, format="wav").close()
        sample_rate_out, audio_out = wavfile.read(normalized_path)

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if sample_rate_out != TARGET_SAMPLE_RATE:
        raise RuntimeError(
            f"Unexpected output sample rate {sample_rate_out}; "
            f"expected {TARGET_SAMPLE_RATE}"
        )
    return TARGET_SAMPLE_RATE, audio_out.reshape(-1).astype(np.int16)


def master_fn(slider_value, input_text):
    """Synthesize ``input_text`` sentence by sentence and join the audio.

    The text is split on ``"."``; fragments that are empty or whitespace-only
    are skipped.

    Args:
        slider_value: Index of the reference speaker in ``dataset``.
        input_text: Text to synthesize; may contain several sentences.

    Returns:
        ``(sample_rate, samples)`` with all sentence chunks concatenated.

    Raises:
        gr.Error: If the text contains nothing to synthesize.
    """
    print(f"speaker_id = {slider_value}")

    sentences = [s for s in (input_text or "").split(".") if s.strip()]
    if not sentences:
        # Previously this fell through to np.concatenate([]) and crashed.
        raise gr.Error("Please enter some text to synthesize.")

    sampling_rate_response = TARGET_SAMPLE_RATE
    all_speech = []
    for sentence in sentences:
        sampling_rate_response, audio_chunk_response = tts_fn(slider_value, sentence)
        all_speech.append(audio_chunk_response)

    return sampling_rate_response, np.concatenate(all_speech)


slider = gr.Slider(
    minimum=0,
    maximum=(len(dataset) - 1),
    value=600,
    step=1,
    label="Select a speaker(Good examples : 600, 604, 910, 7, 13)",
)

# Create the text input component
text_input = gr.Textbox(
    label="Enter some text",
    placeholder="Type something here...",
)

demo = gr.Interface(
    fn=master_fn,
    inputs=[slider, text_input],  # List of inputs
    outputs="audio",
)

if __name__ == "__main__":
    demo.launch()