Spaces:
Build error
Build error
File size: 3,204 Bytes
f72cf0a eaabb20 f72cf0a eaabb20 f72cf0a 3626f45 f72cf0a 39dcf8d f72cf0a 37fd30f d1aeb88 37fd30f 9d4d12d 37fd30f 3626f45 cd4cc40 3626f45 eaabb20 9ac1698 eaabb20 cd4cc40 eaabb20 f72cf0a 8be4633 d1aeb88 f72cf0a 39dcf8d f72cf0a eaabb20 f72cf0a fedc6c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import onnxruntime
import numpy as np
import pyworld as pw
import librosa
import soundfile as sf
def resize2d(source, target_len):
    """Resample a 1-D contour to ``target_len`` points by linear interpolation.

    Values below 0.001 are treated as missing: they are replaced by NaN
    before interpolating, and every NaN in the interpolated result is turned
    back into 0 by ``np.nan_to_num``.

    The caller's array is never modified; masking happens on a float copy,
    which also allows integer input.

    Args:
        source: 1-D array-like of values to resample.
        target_len: Number of samples in the returned array.

    Returns:
        A float ``np.ndarray`` with ``target_len`` elements.
    """
    # Work on a float copy: NaN cannot be stored in an integer array, and
    # masking in place would silently overwrite the caller's data.
    values = np.array(source, dtype=float)
    values[values < 0.001] = np.nan
    positions = np.linspace(0, len(values) - 1, num=target_len, endpoint=True)
    target = np.interp(positions, np.arange(0, len(values)), values)
    return np.nan_to_num(target)
def _calculate_f0(input: np.ndarray, length, sr, f0min, f0max,
                  use_continuous_f0: bool = True,
                  use_log_f0: bool = True) -> np.ndarray:
    """Estimate an F0 contour with pyworld (DIO followed by StoneMask).

    Args:
        input: Audio samples; converted to ``float`` before analysis.
        length: Divisor used to derive the analysis frame period (the total
            duration is split into ``length`` equal steps).
        sr: Sample rate of ``input`` in Hz.
        f0min: Lower F0 search bound, passed to DIO as ``f0_floor``.
        f0max: Upper F0 search bound, passed to DIO as ``f0_ceil``.
        use_continuous_f0: Accepted for interface compatibility; this
            function does not read it.
        use_log_f0: When true, non-zero F0 values are replaced by their
            natural logarithm; zeros are left as zeros.

    Returns:
        The F0 contour flattened to one dimension.
    """
    signal = input.astype(float)

    # Frame period in milliseconds: duration in seconds / length * 1000.
    frame_period = len(signal) / sr / length * 1000

    coarse_f0, time_axis = pw.dio(
        signal,
        fs=sr,
        f0_floor=f0min,
        f0_ceil=f0max,
        frame_period=frame_period,
    )
    # StoneMask refines the coarse DIO estimate on the same time axis.
    f0 = pw.stonemask(signal, coarse_f0, time_axis, sr)

    if use_log_f0:
        # Only take the log where F0 is non-zero so zeros stay exactly zero.
        nonzero = f0 != 0
        f0[nonzero] = np.log(f0[nonzero])

    return f0.reshape(-1)
def get_text(wav, sr, transform=1.0):
    """Build the acoustic-model input features from a waveform.

    The waveform is down-mixed to mono if it has more than one dimension,
    resampled to 16 kHz for the ``hubert.onnx`` model, and the resulting
    ``embed`` output is concatenated with the (log) F0 contour, which is
    appended twice along the last axis.

    Args:
        wav: Audio samples. A multi-dimensional array is transposed with
            ``transpose(1, 0)`` before down-mixing.
        sr: Sample rate of ``wav`` in Hz.
        transform: Multiplicative pitch factor; ``log(transform)`` is added to
            every non-zero entry of the log-F0 contour.

    Returns:
        A tuple ``(features, f0)`` where ``features`` is a ``float32`` array
        of the hubert units with two extra F0 channels on axis 2, and ``f0``
        is the shifted contour with one value per unit frame.
    """
    if len(wav.shape) > 1:
        wav = librosa.to_mono(wav.transpose(1, 0))

    if sr != 16000:
        # Sample rates must be passed by keyword: librosa >= 0.10 rejects the
        # positional form, and older releases accept the keywords as well.
        wav16 = librosa.resample(wav, orig_sr=sr, target_sr=16000)
    else:
        wav16 = wav

    # The model input is the waveform with two leading singleton axes.
    source = {"source": np.expand_dims(np.expand_dims(wav16, 0), 0)}
    hubertsession = onnxruntime.InferenceSession(
        "hubert.onnx", providers=['CUDAExecutionProvider'])
    units = np.array(hubertsession.run(['embed'], source)[0])
    n_frames = units.shape[1]

    # F0 is extracted from the audio at its original sample rate and then
    # resized so there is exactly one value per unit frame.
    f0 = _calculate_f0(wav, n_frames, sr,
                       f0min=librosa.note_to_hz('C2'),
                       f0max=librosa.note_to_hz('C7'))
    f0 = resize2d(f0, n_frames)

    # Shift pitch in the log domain; zero entries are left untouched.
    nonzero = f0 != 0
    f0[nonzero] = f0[nonzero] + np.log(transform)

    expf0 = np.expand_dims(f0, (0, 2))
    output = np.concatenate((units, expf0, expf0), axis=2)
    return output.astype(np.float32), f0
def getkey(key):
    """Return the frequency ratio ``2 ** (key / 12)`` for a shift of ``key``.

    With twelve steps per doubling, ``key=12`` gives 2.0, ``key=0`` gives 1.0
    and ``key=-12`` gives 0.5.
    """
    octaves = key / 12.0
    ratio = np.power(2, octaves)
    return ratio
def infer(f, r, speaker, key, reqf0=False):
    """Run voice conversion on an uploaded or recorded audio file.

    Args:
        f: Path (or file object) of the uploaded audio, or ``None``.
        r: Fallback audio source used only when ``f`` is ``None``.
        speaker: Speaker label; characters from index 7 onwards must parse as
            the integer speaker id.
        key: Pitch shift passed through ``getkey`` to obtain the F0 factor.
        reqf0: When true, the F0 contour of the synthesised audio is also
            computed. It is not part of the return value at present.

    Returns:
        ``(message, result)``. On success ``message`` is ``'success'`` and
        ``result`` is ``(22050, samples)``; when no audio is supplied, the
        sample rate is below 16000 Hz, or the clip is longer than 120 s,
        ``message`` explains the problem and ``result`` is ``None``.
    """
    speaker = int(speaker[7:])

    if f is not None:
        file = f
    elif r is not None:
        file = r
    else:
        return "请上传音频", None

    audio, sr = librosa.load(file, sr=None)
    if sr < 16000:
        return "采样率过低,请上传至少拥有16000Hz采样率的音频", None
    duration = audio.shape[0] / sr
    print(audio, sr, duration)
    if duration > 120:
        return "请上传小于2min的音频", None

    x, sourcef0 = get_text(audio, sr, getkey(key))
    x_lengths = [np.size(x, 1)]
    print(x_lengths[0], sr, speaker, key)
    sid = [speaker]
    ort_inputs = {
        'x': x,
        'x_lengths': x_lengths,
        'sid': sid,
        "noise_scale": [0.667],
        "length_scale": [1.0],
        "noise_scale_w": [0.8],
    }
    infersession = onnxruntime.InferenceSession(
        "onnxmodel334.onnx", providers=['CUDAExecutionProvider'])
    ort_output = infersession.run(['audio'], ort_inputs)

    output_sr = 22050
    generated = ort_output[0][0][0]

    genf0 = np.array([])
    if reqf0:
        # Analyse the synthesised samples directly. The previous code re-read
        # them from an output file `o` that is never written (nor defined),
        # so requesting F0 raised NameError.
        genf0 = _calculate_f0(np.asarray(generated), x_lengths[0], output_sr,
                              f0min=librosa.note_to_hz('C2'),
                              f0max=librosa.note_to_hz('C7'))
        genf0 = resize2d(genf0, x_lengths[0])

    # NOTE(review): sourcef0 and genf0 are computed but not returned; the
    # return shape is kept as-is so existing callers keep working.
    return 'success', (output_sr, generated)