import gradio as gr
from huggingface_hub import hf_hub_download

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
import os
import pickle
import numpy as np
import torch
import torch.nn.functional as F
from collections import OrderedDict
from onmt_modules.misc import sequence_mask
from model_autopst import Generator_2 as Predictor
from hparams_autopst import hparams
from model_sea import Generator
from hparams_sea import hparams as sea_hparams

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the predictor in eval mode and load its weights from the Hugging Face Hub.
P = Predictor(hparams).eval().to(device)
# The map_location callable returns each storage unchanged, so the checkpoint
# tensors are not relocated while loading; load_state_dict then copies them
# into the parameters that already live on `device`.
# NOTE(review): torch.load unpickles the checkpoint - only load trusted files.
checkpoint = torch.load(
    hf_hub_download(repo_id="jonathanjordan21/AutoPST", filename='580000-P.ckpt'),
    map_location=lambda storage, loc: storage,
)
P.load_state_dict(checkpoint['model'], strict=True)
print('Loaded predictor .....................................................')

# Test metadata, indexed below as dict_test[speaker][utterance] -> (cep, spk_emb).
# Opened with a context manager so the file handle is closed right after loading.
with open('./assets/test_vctk.meta', 'rb') as meta_file:
    dict_test = pickle.load(meta_file)

# Converted spectrograms keyed by "<source>_<target>_<utterance>".
spect_vc = OrderedDict()

# (source speaker, target speaker, utterance id) triples to convert at start-up.
uttrs = [('p231', 'p270', '001'),
         ('p270', 'p231', '001'),
         ('p231', 'p245', '003001'),
         ('p245', 'p231', '003001'),
         ('p239', 'p270', '024002'),
         ('p270', 'p239', '024002')]

for uttr in uttrs:
    src_spk, tgt_spk, uttr_id = uttr

    # Source-speaker features with a leading batch dimension of 1.
    cep_real, spk_emb = dict_test[src_spk][uttr_id]
    cep_real_A = torch.from_numpy(cep_real).unsqueeze(0).to(device)
    len_real_A = torch.tensor(cep_real_A.size(1)).unsqueeze(0).to(device)
    real_mask_A = sequence_mask(len_real_A, cep_real_A.size(1)).float()

    # Target-speaker embedding taken from the same utterance id.
    _, spk_emb = dict_test[tgt_spk][uttr_id]
    spk_emb_B = torch.from_numpy(spk_emb).unsqueeze(0).to(device)

    with torch.no_grad():
        # Only the first 14 entries along dim 1 (after the transpose) are fed
        # to the predictor.
        spect_output, len_spect = P.infer_onmt(cep_real_A.transpose(2,1)[:,:14,:],
                                               real_mask_A,
                                               len_real_A,
                                               spk_emb_B)

    # Keep the first len_spect[0] steps of batch item 0.
    uttr_tgt = spect_output[:len_spect[0],0,:].cpu().numpy()

    spect_vc[f'{src_spk}_{tgt_spk}_{uttr_id}'] = uttr_tgt

# spectrogram to waveform
# Feel free to use other vocoders
# This cell requires some preparation to work, please see the corresponding part in AutoVC
import torch
import functools
import librosa
import pickle
import os
from synthesis import build_model
from synthesis import wavegen

# Vocoder that turns predicted spectrograms into waveforms; weights come from
# the Hugging Face Hub and are deserialized onto the CPU.
model = build_model().to(device)
checkpoint = torch.load(
    hf_hub_download(repo_id="jonathanjordan21/AutoPST", filename="checkpoint_step001000000_ema.pth"),
    map_location=torch.device('cpu'),
)
model.load_state_dict(checkpoint["state_dict"])

# sea_checkpoint = torch.load(hf_hub_download(repo_id="jonathanjordan21/AutoPST", filename='sea.ckpt'), map_location=lambda storage, loc: storage)
# gen =Generator(sea_hparams)
# gen.load_state_dict(sea_checkpoint['model'], strict=True)

# for name, sp in spect_vc.items():
#     print(name)
#     waveform = wavegen(model, c=sp)
#     librosa.output.write_wav('./assets/'+name+'.wav', waveform, sr=16000)

# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]

#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})

#     messages.append({"role": "user", "content": message})

#     response = ""

#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         token = message.choices[0].delta.content

#         response += token
#         yield response

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )

import os
import pickle
import numpy as np
import soundfile as sf
from scipy import signal
from scipy.signal import get_window
from librosa.filters import mel
from numpy.random import RandomState


def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cut-off frequency, in the same unit as ``fs``.
        fs: Sampling rate of the signal the filter will be applied to.
        order: Filter order passed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` numerator/denominator coefficient arrays.
    """
    nyq = 0.5 * fs
    # scipy expects the cut-off as a fraction of the Nyquist frequency.
    normal_cutoff = cutoff / nyq
    b, a = signal.butter(order, normal_cutoff, btype='high', analog=False)
    return b, a


def pySTFT(x, fft_length=1024, hop_length=256):
    """Return the magnitude short-time Fourier transform of ``x``.

    The signal is reflect-padded by ``fft_length // 2`` samples, cut into
    overlapping frames of ``fft_length`` samples taken every ``hop_length``
    samples, multiplied by a periodic Hann window and transformed with a
    real FFT.

    Args:
        x: Input samples; the callers in this file pass a 1-D mono signal.
        fft_length: Frame length and FFT size.
        hop_length: Step between consecutive frames, in samples.

    Returns:
        Magnitudes with the FFT-bin axis first and the frame axis last, i.e.
        ``(fft_length // 2 + 1, n_frames)`` for 1-D input.
    """
    x = np.pad(x, int(fft_length//2), mode='reflect')

    noverlap = fft_length - hop_length
    # Build a zero-copy view whose rows are the overlapping frames.
    shape = x.shape[:-1]+((x.shape[-1]-noverlap)//hop_length, fft_length)
    strides = x.strides[:-1]+(hop_length*x.strides[-1], x.strides[-1])
    result = np.lib.stride_tricks.as_strided(x, shape=shape,
                                             strides=strides)

    fft_window = get_window('hann', fft_length, fftbins=True)
    # The multiplication allocates a new array, so the strided view is never
    # written to.
    result = np.fft.rfft(fft_window * result, n=fft_length).T

    return np.abs(result)


def create_sp(cep_real, spk_emb):
    """Run the predictor ``P`` on one utterance and return its output.

    Args:
        cep_real: Per-frame input features as a 2-D array (frames first); a
            batch dimension of 1 is added here.
        spk_emb: Target speaker embedding as a 1-D array.

    Returns:
        NumPy array holding the first ``len_spect[0]`` steps of the predicted
        output for the single batch item.
    """
    # torch.from_numpy keeps the NumPy dtype, and create_mel / np.zeros yield
    # float64, while P is built with torch's default parameter dtype. Cast to
    # float32 so both kinds of caller work.
    # NOTE(review): assumes P's weights are float32 - confirm.
    cep_real = np.ascontiguousarray(cep_real, dtype=np.float32)
    spk_emb = np.ascontiguousarray(spk_emb, dtype=np.float32)

    cep_real_A = torch.from_numpy(cep_real).unsqueeze(0).to(device)
    len_real_A = torch.tensor(cep_real_A.size(1)).unsqueeze(0).to(device)
    real_mask_A = sequence_mask(len_real_A, cep_real_A.size(1)).float()

    spk_emb_B = torch.from_numpy(spk_emb).unsqueeze(0).to(device)

    with torch.no_grad():
        # Only the first 14 entries along dim 1 (after the transpose) are fed
        # to the predictor.
        spect_output, len_spect = P.infer_onmt(cep_real_A.transpose(2,1)[:,:14,:],
                                               real_mask_A,
                                               len_real_A,
                                               spk_emb_B)

    uttr_tgt = spect_output[:len_spect[0],0,:].cpu().numpy()
    return uttr_tgt


@functools.lru_cache(maxsize=1)
def _mel_constants():
    """Build the input-independent values used by ``create_mel`` exactly once.

    Returns:
        ``(mel_basis, min_level, b, a, mfcc_mean, mfcc_std, dctmx)``. The
        arrays are shared between calls and must be treated as read-only.
    """
    mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T
    # Amplitude floor equal to 10 ** (-100 / 20), i.e. -100 dB.
    min_level = np.exp(-100 / 20 * np.log(10))
    b, a = butter_highpass(30, 16000, order=5)

    # Context manager so the handle is closed right after loading.
    with open('assets/mfcc_stats.pkl', 'rb') as stats_file:
        mfcc_mean, mfcc_std, dctmx = pickle.load(stats_file)

    return mel_basis, min_level, b, a, mfcc_mean, mfcc_std, dctmx


def create_mel(x):
    """Compute the mel spectrogram and normalized mel cepstrum of a signal.

    Args:
        x: 1-D audio samples; the filter and mel basis are built for 16 kHz.

    Returns:
        ``(S, cc_norm)``: ``S`` is the mel spectrogram scaled with
        ``(dB + 80) / 100`` and clipped to ``[0, 1]``, shaped
        ``(n_frames, 80)``; ``cc_norm`` is the unclipped ``S`` projected with
        ``dctmx`` and standardized with the stored mean and std.
    """
    mel_basis, min_level, b, a, mfcc_mean, mfcc_std, dctmx = _mel_constants()

    # Append one tiny sample when the length is a multiple of 256 (the STFT
    # hop). NOTE(review): presumably keeps the frame count aligned with the
    # data preparation pipeline - confirm.
    if x.shape[0] % 256 == 0:
        x = np.concatenate((x, np.array([1e-06])), axis=0)

    # Zero-phase high-pass filtering, then scale by 0.96 before the STFT.
    y = signal.filtfilt(b, a, x)
    D = pySTFT(y * 0.96).T
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel))

    # mel sp
    S = (D_db + 80) / 100

    # mel cep (computed from the unclipped spectrogram)
    cc_tmp = S.dot(dctmx)
    cc_norm = (cc_tmp - mfcc_mean) / mfcc_std
    S = np.clip(S, 0, 1)

    # teacher code
    # cc_torch = torch.from_numpy(cc_norm[:,0:20].astype(np.float32)).unsqueeze(0).to(device)
    # with torch.no_grad():
    #     codes = gen.encode(cc_torch, torch.ones_like(cc_torch[:,:,0])).squeeze(0)

    return S, cc_norm


def transcribe(audio, spk):
    """Convert the input audio using the selected one-hot speaker slot.

    Despite its name this is the Gradio handler for conversion: it extracts
    cepstral features from the clip, runs the predictor with a one-hot
    speaker vector and synthesizes a waveform with the vocoder.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by ``gr.Audio``;
            samples may be integer PCM and may have a channel axis.
        spk: 1-based speaker index (1..82) from the slider.

    Returns:
        ``(16000, waveform)`` for the Gradio audio output.

    Raises:
        gr.Error: If no audio was provided or the clip is empty.
    """
    if audio is None:
        raise gr.Error("Please record or upload an audio clip first.")

    sr, y = audio

    # librosa.resample only accepts floating-point audio, but gr.Audio hands
    # over integer PCM, so convert before resampling.
    y = np.asarray(y, dtype=np.float32)
    if y.ndim > 1:
        # (samples, channels) -> mono by averaging the channels.
        y = y.mean(axis=1)
    if y.size == 0:
        raise gr.Error("The provided audio clip is empty.")

    y = librosa.resample(y, orig_sr=sr, target_sr=16000)
    y = y.astype(np.float32)

    # Peak-normalize; skip for all-zero input to avoid dividing by zero.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # One-hot speaker vector; the slider value is 1-based.
    spk_emb = np.zeros((82,), dtype=np.float32)
    spk_emb[int(spk)-1] = 1

    mel_sp, mel_cep = create_mel(y)
    sp = create_sp(mel_cep, spk_emb)
    waveform = wavegen(model, c=sp)
    return 16000, waveform
    # return transcriber({"sampling_rate": sr, "raw": y})["text"]


demo = gr.Interface(
    transcribe,
    [
        gr.Audio(),
        # The slider value selects which entry of the one-hot speaker vector is set.
        gr.Slider(1, 82, value=21, label="Count", step=1, info="Choose between 1 and 82")
    ],
    "audio",
)

if __name__ == "__main__":
    demo.launch()