File size: 2,241 Bytes
7694c84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# %%
import os
import torch
import librosa
import numpy as np
import torch.nn.functional as F

from tqdm import tqdm
from utils.audio import MelSpectrogram
from utils import write_lines_to_file

# %% CONFIG

# Folder that holds the corpus recordings.
wavs_path = 'G:/data/arabic-speech-corpus/wav_new'

# Gather the path of every directory entry whose path ends in '.wav'
# (order is whatever os.scandir yields).
waves = []
with os.scandir(wavs_path) as dir_entries:
    for entry in dir_entries:
        if entry.path.endswith('.wav'):
            waves.append(entry.path)
print(f"{len(waves)} wave files found at {wavs_path}")

# Mel-spectrogram transform; its sample_rate / win_length / hop_length
# attributes are reused below for loading audio and for pitch estimation.
mel_trf = MelSpectrogram()

# %% extract pitch (f0) values

# Maps wav file name (basename) -> 1-D tensor of per-frame f0 values,
# with unvoiced frames stored as 0 and the length matched to the mel frames.
pitch_dict = {}

# Loop-invariant values: pyin search range and the checkpoint location.
pitch_fmin = librosa.note_to_hz('C2')
pitch_fmax = librosa.note_to_hz('C7')
pitch_dict_path = './data/pitch_dict.pt'

# torch.save does not create missing parent folders; make sure the target
# directory exists before any (slow) pitch estimation is done.
os.makedirs(os.path.dirname(pitch_dict_path), exist_ok=True)

for i, wav_path in tqdm(enumerate(waves), total=len(waves)):
    wav_name = os.path.basename(wav_path)
    if wav_name in pitch_dict:
        # Already processed: skip before paying for decoding / resampling.
        continue

    wav, sr = librosa.load(wav_path, sr=mel_trf.sample_rate)

    # The mel spectrogram is only needed for its number of frames.
    mel_spec = mel_trf(torch.tensor(wav)[None])[0] # [mel_bands, T]

    # estimate pitch with the same window / hop as the mel transform
    pitch_mel, voiced_flag, voiced_probs = librosa.pyin(
        wav, sr=mel_trf.sample_rate,
        fmin=pitch_fmin,
        fmax=pitch_fmax,
        frame_length=mel_trf.win_length,
        hop_length=mel_trf.hop_length)

    # pyin reports unvoiced frames as nan; store them as zero instead
    pitch_mel = np.where(np.isnan(pitch_mel), 0., pitch_mel)
    pitch_mel = torch.from_numpy(pitch_mel)

    # Right-pad with zeros up to the mel length. NOTE(review): if pyin yields
    # more frames than the mel transform the pad amount is negative, which
    # F.pad treats as cropping in its default constant mode -- confirm.
    n_pad = mel_spec.size(1) - pitch_mel.size(0)
    pitch_mel = F.pad(pitch_mel, (0, n_pad))

    pitch_dict[wav_name] = pitch_mel

    if i % 10 == 0: # save intermediate dict
        torch.save(pitch_dict, pitch_dict_path)

torch.save(pitch_dict, pitch_dict_path)


# %% calculate pitch mean and std

pitch_dict = torch.load('./data/pitch_dict.pt')

# Corpus-wide statistics over voiced frames, merged one file at a time.
# The running state is (count, mean, sum of squared deviations); keeping the
# squared-deviation sum instead of a variance avoids mixing population and
# sample variances and never divides by zero for short files.
rmean = 0
rvar = 0
ndata = 0
rm2 = 0.0  # running sum of squared deviations from rmean

for pitch_mel in pitch_dict.values():
    pitch_mel = np.asarray(pitch_mel, dtype=np.float64)
    pitch_mel = np.where(np.isnan(pitch_mel), 0.0, pitch_mel)

    # Keep only frames above 1: zero entries mark unvoiced / padded frames.
    pitch_mel_ = pitch_mel[pitch_mel > 1]
    p_len = len(pitch_mel_)
    if p_len == 0:
        # No voiced frame in this file: np.mean of an empty array is nan and
        # would turn every later running statistic into nan.
        continue

    p_mean = np.mean(pitch_mel_)
    p_m2 = np.sum((pitch_mel_ - p_mean) ** 2)

    # Pairwise merge of (ndata, rmean, rm2) with (p_len, p_mean, p_m2).
    delta = p_mean - rmean
    n_total = ndata + p_len
    rm2 += p_m2 + delta ** 2 * ndata * p_len / n_total
    rmean = rmean + delta * p_len / n_total
    ndata = n_total

# Sample variance (n - 1 denominator), matching the pooled-variance formula
# this cell is based on; left at 0 when fewer than two frames were seen.
if ndata > 1:
    rvar = rm2 / (ndata - 1)

mean, std = rmean, np.sqrt(rvar)
print('mean ', mean)
print('std ', std)

write_lines_to_file(path='./data/mean_std.txt', 
                    lines=[f"mean: {mean}", 
                           f"std: {std}"])