pengdaqian committed on
Commit 2e2adc3 · 1 Parent(s): 3f4fdab
Files changed (4)
  1. .gitignore +1 -0
  2. app.py +43 -16
  3. requirements.txt +0 -1
  4. whisper/inference.py +3 -2
.gitignore ADDED
@@ -0,0 +1 @@
+.idea/
app.py CHANGED
@@ -1,20 +1,24 @@
+import os
+
+os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
+
 from music.search import get_random_spit, get_albums
 from vits.models import SynthesizerInfer
+import whisper.inference
 from omegaconf import OmegaConf
 import torchcrepe
 import torch
 import io
-import os
 import gradio as gr
 import librosa
 import numpy as np
 import soundfile
 import random
-from audio2numpy import open_audio
 from spleeter.separator import Separator
 from spleeter.audio.adapter import AudioAdapter
 from pydub import AudioSegment
 import scipy.io.wavfile
+import uuid
 
 import logging
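The new TF_FORCE_GPU_ALLOW_GROWTH setting is read by TensorFlow (which Spleeter runs on) at import time, which is why it is set before any other import: it makes TensorFlow allocate GPU memory on demand instead of reserving the whole card up front, leaving room for the PyTorch synthesizer and the Whisper encoder. A rough in-code equivalent, assuming the TF 2.x API:

import tensorflow as tf

# Grow GPU allocations on demand rather than grabbing all free memory;
# this must run before the first op touches the GPU.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)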
 
@@ -84,11 +88,13 @@ model.eval()
 model.to(device)
 separator = Separator('spleeter:2stems')
 audio_loader = AudioAdapter.default()
+whisper_model = whisper.inference.load_model(os.path.join("whisper_pretrain", "medium.pt"))
 
 
 def svc_change(argswave, argsspk):
     argsppg = "svc_tmp.ppg.npy"
-    os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")
+    whisper.inference.pred_ppg(whisper_model, argswave, argsppg)
+    # os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")
 
     spk = np.load(argsspk)
     spk = torch.FloatTensor(spk)
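This hunk replaces the per-request os.system call, which spawned a fresh interpreter and reloaded the Whisper checkpoint on every conversion, with a module-level model loaded once and invoked in-process. A minimal sketch of the pattern; extract_ppg is a hypothetical wrapper, while load_model and pred_ppg are the real functions from whisper/inference.py further down:

import os
import numpy as np
import whisper.inference

# Load the Whisper encoder once at startup instead of once per request.
whisper_model = whisper.inference.load_model(
    os.path.join("whisper_pretrain", "medium.pt"))

def extract_ppg(wav_path: str, ppg_path: str) -> np.ndarray:
    # In-process call: no child interpreter, no repeated checkpoint load.
    whisper.inference.pred_ppg(whisper_model, wav_path, ppg_path)
    return np.load(ppg_path)  # [length, dim=1024] PPG features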
@@ -120,16 +126,16 @@ def svc_change(argswave, argsspk):
     out_audio = []
     has_audio = False
 
-    while (out_index + out_chunk < all_frame):
+    while out_index + out_chunk < all_frame:
         has_audio = True
-        if (out_index == 0):  # start frame
+        if out_index == 0:  # start frame
             cut_s = out_index
             cut_s_48k = 0
         else:
             cut_s = out_index - hop_frame
             cut_s_48k = hop_frame * hop_size
 
-        if (out_index + out_chunk + hop_frame > all_frame):  # end frame
+        if out_index + out_chunk + hop_frame > all_frame:  # end frame
             cut_e = out_index + out_chunk
             cut_e_48k = 0
         else:
@@ -148,8 +154,8 @@ def svc_change(argswave, argsspk):
         out_audio.extend(sub_out)
         out_index = out_index + out_chunk
 
-    if (out_index < all_frame):
-        if (has_audio):
+    if out_index < all_frame:
+        if has_audio:
             cut_s = out_index - hop_frame
             cut_s_48k = hop_frame * hop_size
         else:
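The loop around these hunks converts the PPG sequence in fixed-size chunks, padding each chunk by hop_frame frames on either side and trimming the matching samples from the synthesized 48 kHz audio so the chunks splice without boundary clicks. The index arithmetic, isolated below; the default values and the non-terminal cut_e branch are assumptions for illustration (the real values are defined elsewhere in app.py):

def window_bounds(out_index, all_frame, out_chunk=2500, hop_frame=8, hop_size=320):
    # Left edge: the first chunk has no left padding to trim.
    if out_index == 0:
        cut_s, cut_s_48k = 0, 0
    else:
        cut_s, cut_s_48k = out_index - hop_frame, hop_frame * hop_size
    # Right edge: skip the right padding when the padded window would overrun.
    if out_index + out_chunk + hop_frame > all_frame:
        cut_e, cut_e_48k = out_index + out_chunk, 0
    else:  # assumed mirror of the left edge
        cut_e, cut_e_48k = out_index + out_chunk + hop_frame, -hop_frame * hop_size
    return cut_s, cut_e, cut_s_48k, cut_e_48k

print(window_bounds(0, 10000))     # first chunk: no left trim
print(window_bounds(2500, 10000))  # interior chunk: trim both edges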
@@ -177,23 +183,40 @@ def np_to_audio_segment(fp_arr):
     return sound
 
 
+def get_dtype_max_value(dtype):
+    if np.issubdtype(dtype, np.integer):
+        info = np.iinfo(dtype)
+        return info.max
+    elif np.issubdtype(dtype, np.floating):
+        info = np.finfo(dtype)
+        return info.max
+    else:
+        raise ValueError("Unsupported dtype")
+
+
 def svc_main(sid, input_audio):
     if input_audio is None:
         return "You need to upload an audio", None
     sampling_rate, audio = input_audio
-    input_audio_tmp_file = 'origin.wav'
+    uuid_value = uuid.uuid4()
+    uuid_string = str(uuid_value)
+    input_audio_tmp_file = f'{uuid_string}.wav'
+    tmpfile_path = '/tmp'
+
     #
     # prediction = separator.separate(audio)
     # vocals, accompaniment = prediction["vocals"], prediction["accompaniment"]
     soundfile.write(input_audio_tmp_file, audio, sampling_rate, format="wav")
-    separator.separate_to_file(input_audio_tmp_file, '')
+    if not os.path.exists(tmpfile_path):
+        os.makedirs(tmpfile_path)
+    separator.separate_to_file(input_audio_tmp_file, tmpfile_path)
 
-    vocals_filepath = os.path.join(os.path.splitext(input_audio_tmp_file)[0], 'vocals.wav')
-    accompaniment_filepath = os.path.join(os.path.splitext(input_audio_tmp_file)[0], 'accompaniment.wav')
+    curr_tmp_path = os.path.join(tmpfile_path, os.path.splitext(input_audio_tmp_file)[0])
+    vocals_filepath = os.path.join(curr_tmp_path, 'vocals.wav')
+    accompaniment_filepath = os.path.join(curr_tmp_path, 'accompaniment.wav')
 
     vocals, sampling_rate = soundfile.read(vocals_filepath)
 
-    vocals = (vocals / np.iinfo(vocals.dtype).max).astype(np.float32)
     if len(vocals.shape) > 1:
         vocals = librosa.to_mono(vocals.transpose(1, 0))
     if sampling_rate != 16000:
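get_dtype_max_value replaces the dropped unconditional np.iinfo(vocals.dtype).max normalization, which raises ValueError once soundfile.read returns floating-point data (its default). The helper is not visibly called anywhere in this diff; a sketch of how it could be used to normalize integer PCM while leaving float input alone:

import numpy as np

def get_dtype_max_value(dtype):  # same helper as in the hunk above
    if np.issubdtype(dtype, np.integer):
        return np.iinfo(dtype).max
    if np.issubdtype(dtype, np.floating):
        return np.finfo(dtype).max
    raise ValueError("Unsupported dtype")

def to_float32(audio: np.ndarray) -> np.ndarray:
    # Integer PCM is scaled into [-1, 1]; float data is already on that
    # scale, and dividing it by finfo(...).max would squash it to ~0.
    if np.issubdtype(audio.dtype, np.integer):
        return (audio / get_dtype_max_value(audio.dtype)).astype(np.float32)
    return audio.astype(np.float32)

print(to_float32(np.array([0, 16384, -32768], dtype=np.int16)))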
@@ -204,7 +227,7 @@ def svc_main(sid, input_audio):
     soundfile.write(wav_path, vocals, 16000, format="wav")
 
     out_vocals = svc_change(wav_path, f"configs/singers/singer00{sid}.npy")
-    out_vocals_filepath = os.path.join(os.path.splitext(input_audio_tmp_file)[0], 'out_vocals.wav')
+    out_vocals_filepath = os.path.join(curr_tmp_path, 'out_vocals.wav')
     soundfile.write(out_vocals_filepath, out_vocals, 48000, format="wav")
 
     sound1 = AudioSegment.from_file(out_vocals_filepath)
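With these changes every intermediate file lives under /tmp/<uuid>/, so concurrent Gradio requests no longer overwrite each other's fixed origin.wav. A sketch of the resulting layout (make_request_paths is a hypothetical helper):

import os
import uuid

def make_request_paths(tmp_root='/tmp'):
    # One unique working directory per request; spleeter writes stems into
    # <destination>/<input-stem>/, which is what curr_tmp_path points at.
    stem = str(uuid.uuid4())
    wav_in = f'{stem}.wav'               # the uploaded mix
    work_dir = os.path.join(tmp_root, stem)
    return wav_in, work_dir, os.path.join(work_dir, 'vocals.wav')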
@@ -212,7 +235,11 @@ def svc_main(sid, input_audio):
 
     played_togther = sound1.overlay(sound2)
 
-    return "Success", (48000, played_togther)
+    result_path = os.path.join(curr_tmp_path, 'out_song.wav')
+    played_togther.export(result_path, format="wav")
+    result, sampling_rate = soundfile.read(result_path)
+
+    return "Success", (sampling_rate, result)
 
 
 def auto_search(name):
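The old return handed the pydub AudioSegment straight to Gradio, whose audio output expects a (sample_rate, numpy array) tuple, hence the new export-then-reread step. A hypothetical variant doing the same roundtrip in memory instead of via out_song.wav:

import io
import soundfile
from pydub import AudioSegment

def segment_to_gradio(seg: AudioSegment):
    # Export the mixed segment to an in-memory wav, then decode it back
    # into the (sample_rate, data) tuple shape that Gradio expects.
    buf = io.BytesIO()
    seg.export(buf, format="wav")
    buf.seek(0)
    data, sr = soundfile.read(buf)
    return sr, data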
@@ -221,7 +248,7 @@ def auto_search(name):
     album = random.choice(albums)
     save_path = get_random_spit(album)
     fp = save_path
-    signal, sampling_rate = open_audio(fp)
+    signal, sampling_rate = soundfile.read(fp)
     return sampling_rate, signal
 
 
 
 
requirements.txt CHANGED
@@ -14,5 +14,4 @@ tqdm
 librosa
 pydub
 musicdl
-audio2numpy
 spleeter
whisper/inference.py CHANGED
@@ -21,7 +21,7 @@ def pred_ppg(whisper: Whisper, wavPath, ppgPath):
     audln = audio.shape[0]
     ppg_a = []
     idx_s = 0
-    while (idx_s + 25 * 16000 < audln):
+    while idx_s + 25 * 16000 < audln:
         short = audio[idx_s:idx_s + 25 * 16000]
         idx_s = idx_s + 25 * 16000
         ppgln = 25 * 16000 // 320
@@ -31,7 +31,7 @@ def pred_ppg(whisper: Whisper, wavPath, ppgPath):
         ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
         ppg = ppg[:ppgln,]  # [length, dim=1024]
         ppg_a.extend(ppg)
-    if (idx_s < audln):
+    if idx_s < audln:
         short = audio[idx_s:audln]
         ppgln = (audln - idx_s) // 320
         # short = pad_or_trim(short)
@@ -48,6 +48,7 @@ if __name__ == "__main__":
     parser.description = 'please enter embed parameter ...'
     parser.add_argument("-w", "--wav", help="wav", dest="wav")
     parser.add_argument("-p", "--ppg", help="ppg", dest="ppg")
+
     args = parser.parse_args()
     print(args.wav)
     print(args.ppg)
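For reference, the frame bookkeeping in pred_ppg: Whisper's encoder emits one feature vector per 320 input samples (50 Hz at 16 kHz), so each full 25-second window contributes 25 * 16000 // 320 = 1250 frames, and the ppg[:ppgln,] slice drops frames that only cover padding. A small sketch of the resulting frame count (total_ppg_frames is my helper, mirroring the loop above):

SR = 16000          # pred_ppg input sample rate
WINDOW = 25 * SR    # samples per full window
HOP = 320           # input samples per encoder frame

def total_ppg_frames(num_samples: int) -> int:
    # Full windows yield WINDOW // HOP = 1250 frames each; the strict '<'
    # in the loop sends an exact-multiple final window to the tail branch,
    # which produces the same count.
    full, tail = divmod(num_samples, WINDOW)
    return full * (WINDOW // HOP) + tail // HOP

print(total_ppg_frames(60 * SR))  # one minute of audio -> 3000 frames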