File size: 2,164 Bytes
b60a7b6
 
 
81d87a8
 
b60a7b6
 
81d87a8
 
3dfc859
b60a7b6
5fb3738
77d5702
5fb3738
81d87a8
 
3dfc859
81d87a8
3dfc859
 
 
 
 
 
 
 
 
81d87a8
 
 
 
 
 
 
 
77d5702
b3b61c9
 
 
 
 
 
 
 
3dfc859
b60a7b6
 
 
 
 
 
 
 
 
b3b61c9
 
 
5fb3738
b3b61c9
5fb3738
b3b61c9
 
 
 
b60a7b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81d87a8
b60a7b6
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""
Utility functions that convert incoming audio data into mel spectrograms the model can process.
"""
import os
import librosa
import torch
import torchaudio
from scipy.io import wavfile 
import wget
import requests

# Sample rate that audio is resampled to when the caller does not pass one.
DEFAULT_SAMPLE_RATE = 48_000
# Default clip length; it is multiplied by the target sample rate to obtain
# the fixed number of samples each waveform is cut or padded to.
DEFAULT_WAVE_LENGTH = 3

def process_from_url(url, timeout=30):
    """Download an audio file from ``url`` and convert it to a spectrogram.

    The response body is written to a uniquely named temporary ``.wav`` file,
    which is passed to ``process_from_filename`` and then removed, even when
    processing raises.

    Args:
        url: Address of the audio file to fetch.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            (seconds), so an unresponsive server cannot block forever.

    Returns:
        The result of ``process_from_filename`` for the downloaded file.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    import tempfile  # local import: only this function needs it

    response = requests.get(url, timeout=timeout)
    # Fail early on 4xx/5xx instead of saving an error page as audio data.
    response.raise_for_status()

    # A unique temporary file replaces the fixed 'temp.wav' in the working
    # directory, so simultaneous calls cannot overwrite each other's download.
    fd, temp_path = tempfile.mkstemp(suffix='.wav')
    try:
        with os.fdopen(fd, 'wb') as file:
            file.write(response.content)
        # The file is closed before loading so it can be reopened by name.
        return process_from_filename(temp_path)
    finally:
        # Always clean up, including when loading or processing fails.
        os.remove(temp_path)


def process_from_filename(filename, target_sample_rate=DEFAULT_SAMPLE_RATE, wav_length=DEFAULT_WAVE_LENGTH):
    """Load an audio file and return its spectrogram.

    Args:
        filename: Path handed to ``torchaudio.load``.
        target_sample_rate: Sample rate the waveform is converted to.
        wav_length: Clip length forwarded to ``process_raw_wav``.

    Returns:
        The output of ``_wav_to_spec`` for the normalised waveform.
    """
    waveform, source_rate = torchaudio.load(filename)
    prepared = process_raw_wav(waveform, source_rate, target_sample_rate, wav_length)
    return _wav_to_spec(prepared, target_sample_rate)

def process_raw_wav(wav, sample_rate, target_sample_rate, wav_length):
    """Normalise a raw waveform to a fixed rate, channel count and length.

    The waveform is resampled to ``target_sample_rate``, mixed down to a
    single channel, then truncated or zero-padded to exactly
    ``wav_length * target_sample_rate`` samples.

    Args:
        wav: Waveform tensor; the helpers treat dimension 0 as channels and
            dimension 1 as samples.
        sample_rate: Sample rate of ``wav`` as loaded.
        target_sample_rate: Sample rate to convert to.
        wav_length: Desired clip length; multiplied by
            ``target_sample_rate`` to get the sample count. May be fractional.

    Returns:
        The processed waveform tensor.
    """
    # The sample count must be an integer: a fractional wav_length (e.g. 2.5)
    # would otherwise produce a float, which is not a valid slice bound in
    # _cut nor a valid pad width in _pad.
    num_samples = int(round(wav_length * target_sample_rate))

    wav = _resample(wav, sample_rate, target_sample_rate)
    wav = _mix_down(wav)
    wav = _cut(wav, num_samples)
    wav = _pad(wav, num_samples)

    return wav

def _wav_to_spec(wav, target_sample_rate):
    """Apply a ``torchaudio`` ``MelSpectrogram`` transform to ``wav``.

    The transform is built with ``target_sample_rate`` and fixed settings:
    ``n_fft=2048``, ``hop_length=512`` and ``n_mels=128``.
    """
    transform_settings = {
        'sample_rate': target_sample_rate,
        'n_fft': 2048,
        'hop_length': 512,
        'n_mels': 128,
    }
    to_mel = torchaudio.transforms.MelSpectrogram(**transform_settings)
    return to_mel(wav)

def _resample(wav, sample_rate, target_sample_rate):
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
        wav = resampler(wav)
    
    return wav

def _mix_down(wav):
    if wav.shape[0] > 1:
        wav = torch.mean(wav, dim=0, keepdim=True)
    
    return wav

def _cut(wav, num_samples):
    if wav.shape[1] > num_samples:
        wav = wav[:, :num_samples]
    
    return wav

def _pad(wav, num_samples):
    if wav.shape[1] < num_samples:
        missing_samples = num_samples - wav.shape[1]
        pad = (0, missing_samples)
        wav = torch.nn.functional.pad(wav, pad)
    
    return wav