File size: 5,195 Bytes
2b8a3f4 4440622 2b8a3f4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
import torch
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
from transformers import SpeechT5HifiGan
from datasets import load_dataset
from tqdm import tqdm
import soundfile as sf
import librosa
dataset = load_dataset('pourmand1376/asr-farsi-youtube-chunked-10-seconds', split = "test")
import librosa
from datasets import load_dataset, Audio
def resample_audio(example):
# Resample to 16 kHz
y_resampled = librosa.resample(example["audio"]["array"], orig_sr=example["audio"]["sampling_rate"], target_sr=16000)
# Update the example with the resampled audio and new sample rate
example["audio"]["array"] = y_resampled
example["audio"]["sampling_rate"] = 16000
return example
dataset = dataset.select(range(1000))
dataset = dataset.map(resample_audio)
import torch
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
from transformers import SpeechT5HifiGan
from datasets import load_dataset
from tqdm import tqdm
import soundfile as sf
import librosa
def set_seed(seed):
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
set_seed(1)
# Load model directly
from transformers import AutoProcessor, AutoModelForTextToSpectrogram
processor = AutoProcessor.from_pretrained("Alidr79/speecht5_v3_youtube")
model = AutoModelForTextToSpectrogram.from_pretrained("Alidr79/speecht5_v3_youtube")
from speechbrain.inference.classifiers import EncoderClassifier
import os
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
source=spk_model_name,
run_opts={"device": device},
savedir=os.path.join("/tmp", spk_model_name),
)
def create_speaker_embedding(waveform):
with torch.no_grad():
speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
return speaker_embeddings
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
from PersianG2p import Persian_g2p_converter
from scipy.io import wavfile
import soundfile as sf
PersianG2Pconverter = Persian_g2p_converter(use_large = True)
import noisereduce as nr
def denoise_audio(audio, sr):
# Perform noise reduction
denoised_audio = nr.reduce_noise(y=audio, sr=sr)
return denoised_audio
import noisereduce as nr
from pydub import AudioSegment
def match_target_amplitude(sound, target_dBFS):
change_in_dBFS = target_dBFS - sound.dBFS
return sound.apply_gain(change_in_dBFS)
import librosa
def tts_fn(slider_value, input_text):
audio_embedding = dataset[slider_value]['audio']['array']
sample_rate_embedding = dataset[slider_value]['audio']['sampling_rate']
if sample_rate_embedding != 16000:
audio_embedding = librosa.resample(audio_embedding, orig_sr=sample_rate_embedding, target_sr=16_000)
with torch.no_grad():
speaker_embedding = create_speaker_embedding(audio_embedding)
speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
phonemes = PersianG2Pconverter.transliterate(input_text, tidy = False, secret = True)
# text = "</s>"
# for i in phonemes.replace(' .', '').split(" "):
# text += i + " <pad> "
text = phonemes
print("sentence:", input_text)
print("sentence phonemes:", text)
with torch.no_grad():
inputs = processor(text = text, return_tensors="pt")
with torch.no_grad():
spectrogram = model.generate_speech(inputs["input_ids"], speaker_embedding, minlenratio = 2, maxlenratio = 4, threshold = 0.5)
with torch.no_grad():
speech = vocoder(spectrogram)
speech = speech.numpy().reshape(-1)
speech_denoised = denoise_audio(speech, 16000)
sf.write("in_speech.wav", speech_denoised, 16000)
sound = AudioSegment.from_wav("in_speech.wav", "wav")
normalized_sound = match_target_amplitude(sound, -20.0)
normalized_sound.export("out_sound.wav", format="wav")
sample_rate_out, audio_out = wavfile.read("out_sound.wav")
assert sample_rate_out == 16_000
return 16000, (audio_out.reshape(-1)).astype(np.int16)
def master_fn(slider_value, input_text):
if "." not in input_text:
input_text += '.'
all_speech = []
for sentence in input_text.split("."):
sampling_rate_response, audio_chunk_response = tts_fn(slider_value, sentence)
all_speech.append(audio_chunk_response)
audio_response = np.concatenate(all_speech)
return sampling_rate_response, audio_response
import gradio as gr
slider = gr.Slider(
minimum=0,
maximum=(len(dataset)-1),
value=600,
step=1,
label="Select a speaker(Good examples : 600, 604, 910, 7, 13)"
)
# Create the text input component
text_input = gr.Textbox(
label="Enter some text",
placeholder="Type something here..."
)
demo = gr.Interface(
fn = master_fn,
inputs=[slider, text_input], # List of inputs
outputs = "audio"
)
demo.launch() |