Spaces:
Running
on
A10G
Running
on
A10G
Adorable-Qin
committed on
Commit
•
2d4dca5
1
Parent(s):
7c4481d
Change F0 extractor to Crepe
Browse files
ckpts/svc/vocalist_l1_contentvec+whisper/args.json
CHANGED
@@ -140,7 +140,7 @@
|
|
140 |
"pin_memory": true,
|
141 |
"pitch_bin": 256,
|
142 |
"pitch_dir": "pitches",
|
143 |
-
"pitch_extractor": "parselmouth"
|
144 |
"pitch_max": 1100.0,
|
145 |
"pitch_min": 50.0,
|
146 |
"processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
|
|
|
140 |
"pin_memory": true,
|
141 |
"pitch_bin": 256,
|
142 |
"pitch_dir": "pitches",
|
143 |
+
"pitch_extractor": "crepe", // "parselmouth"
|
144 |
"pitch_max": 1100.0,
|
145 |
"pitch_min": 50.0,
|
146 |
"processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
|
utils/f0.py
CHANGED
@@ -207,7 +207,7 @@ def get_f0_features_using_harvest(audio, mel_len, fs, hop_length, f0_min, f0_max
|
|
207 |
return f0
|
208 |
|
209 |
|
210 |
-
def
|
211 |
audio, mel_len, fs, hop_length, hop_length_new, f0_min, f0_max, threshold=0.3
|
212 |
):
|
213 |
"""Using torchcrepe to extract the f0 feature.
|
@@ -259,6 +259,25 @@ def get_f0_features_using_crepe(
|
|
259 |
f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
|
260 |
return f0
|
261 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
|
263 |
def get_f0(audio, cfg):
|
264 |
if cfg.pitch_extractor == "dio":
|
@@ -267,6 +286,8 @@ def get_f0(audio, cfg):
|
|
267 |
f0 = get_f0_features_using_pyin(audio, cfg)
|
268 |
elif cfg.pitch_extractor == "parselmouth":
|
269 |
f0, _ = get_f0_features_using_parselmouth(audio, cfg)
|
|
|
|
|
270 |
# elif cfg.data.f0_extractor == 'cwt': # todo
|
271 |
|
272 |
return f0
|
|
|
207 |
return f0
|
208 |
|
209 |
|
210 |
+
def get_f0_features_using_crepe_legacy(
|
211 |
audio, mel_len, fs, hop_length, hop_length_new, f0_min, f0_max, threshold=0.3
|
212 |
):
|
213 |
"""Using torchcrepe to extract the f0 feature.
|
|
|
259 |
f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
|
260 |
return f0
|
261 |
|
262 |
+
def get_f0_features_using_crepe(audio, cfg):
|
263 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
264 |
+
audio_torch = torch.FloatTensor(audio).unsqueeze(0).to(device)
|
265 |
+
|
266 |
+
crepe_pitch, pd = torchcrepe.predict(audio_torch, cfg.sample_rate, cfg.hop_size, fmin=cfg.f0_min, fmax=cfg.f0_max, return_periodicity=True)
|
267 |
+
|
268 |
+
threshold = 0.3
|
269 |
+
|
270 |
+
# Filter, de-silence, set up threshold for unvoiced part
|
271 |
+
pd = torchcrepe.filter.median(pd, 3)
|
272 |
+
pd = torchcrepe.threshold.Silence(-60.0)(pd, audio_torch, cfg.sample_rate, 256)
|
273 |
+
crepe_pitch = torchcrepe.threshold.At(threshold)(crepe_pitch, pd)
|
274 |
+
crepe_pitch = torchcrepe.filter.mean(crepe_pitch, 3)
|
275 |
+
|
276 |
+
# Convert unvoiced part to 0hz
|
277 |
+
crepe_pitch = torch.where(torch.isnan(crepe_pitch), torch.full_like(crepe_pitch, 0), crepe_pitch)
|
278 |
+
|
279 |
+
return crepe_pitch[0].cpu().numpy()
|
280 |
+
|
281 |
|
282 |
def get_f0(audio, cfg):
|
283 |
if cfg.pitch_extractor == "dio":
|
|
|
286 |
f0 = get_f0_features_using_pyin(audio, cfg)
|
287 |
elif cfg.pitch_extractor == "parselmouth":
|
288 |
f0, _ = get_f0_features_using_parselmouth(audio, cfg)
|
289 |
+
elif cfg.pitch_extractor == "crepe":
|
290 |
+
f0 = get_f0_features_using_crepe(audio, cfg)
|
291 |
# elif cfg.data.f0_extractor == 'cwt': # todo
|
292 |
|
293 |
return f0
|