Upload 14 files
Browse files- hparams.py +29 -0
- multiresunet_model.py +161 -0
- preprocess_data.py +115 -0
- pretrained_models/bass_hf.h5 +3 -0
- pretrained_models/bass_lf.h5 +3 -0
- pretrained_models/drums_hf.h5 +3 -0
- pretrained_models/drums_lf.h5 +3 -0
- pretrained_models/other_hf.h5 +3 -0
- pretrained_models/other_lf.h5 +3 -0
- pretrained_models/vocals_hf.h5 +3 -0
- pretrained_models/vocals_lf.h5 +3 -0
- separate.py +79 -0
- train.py +42 -0
- utils.py +21 -0
hparams.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import librosa
|
2 |
+
|
3 |
+
### Audio Hyperparameters ###

# Sample rate handed to librosa.load by preprocess_data.py and separate.py.
sr = 44100

# Constant-Q settings for the low-frequency band. The four entries are passed,
# in this order, to preprocess_data.forward_transform / backward_transform as
# (min_f, max_f, bpo, gamma).
lf_params = dict(
    min_f=librosa.note_to_hz('c0'),
    max_f=4100,
    bins_per_octave=24,
    gamma=20,
)

# Constant-Q settings for the high-frequency band; its lower edge (4100) is the
# upper edge of the low-frequency band above.
hf_params = dict(
    min_f=4100,
    max_f=16350,
    bins_per_octave=96,
    gamma=0,
)

### Network Hyperparameters ###

# train.py builds the model input shape as
# (frequency_bins, chunk_size, n_channels).
n_channels = 1
chunk_size = 512        # also the frame/hop length used by preprocess_data.make_chunks
frequency_bins = 192    # NOTE(review): presumably the bin count of both CQT bands - confirm
batch_size = 32         # batch size for model.fit in train.py
learning_rate = 1e-4    # Adam learning rate in train.py
epochs = 35
inference_batch_size = 4    # batch size for model.predict in separate.py
|
multiresunet_model.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tensorflow as tf
|
2 |
+
|
3 |
+
def Downsampling(x, filters, kernel_size = (5,5), padding = 'same', stride = 2, multires = False):
    '''
    Downsampling Block

    Strided convolution followed by batch normalization and LeakyReLU(0.2).
    With multires enabled, three parallel strided convolutions are applied to
    the same input (kernel_size with filters//2 filters, 3x3 with filters//4
    and 7x7 with filters//4) and concatenated before normalization, so the
    output has exactly `filters` channels only when filters is divisible by 4.

    Arguments:
        x : input layer (tf.keras.layer)
        filters : number of filters (int)
        kernel_size : kernel dimensions (tuple or int), default (5,5)
        padding : padding type for convolution (string), default same
        stride : stride for convolution (tuple or int), default 2
        multires : use the parallel multi-resolution convolutions (boolean), default False

    Returns:
        output : output layer (tf.keras.layer)
    '''
    # Plain truthiness test: the previous `== False` / `== True` pair left
    # `conv` undefined for any value that compared equal to neither (e.g. None).
    if multires:
        conv_main = tf.keras.layers.Conv2D(kernel_size = kernel_size, filters = filters//2, strides = stride, padding = padding, data_format = "channels_last")(x)
        conv3 = tf.keras.layers.Conv2D(kernel_size = (3,3), filters = filters//4, strides = stride, padding = padding, data_format = "channels_last")(x)
        conv7 = tf.keras.layers.Conv2D(kernel_size = (7,7), filters = filters//4, strides = stride, padding = padding, data_format = "channels_last")(x)
        conv = tf.keras.layers.Concatenate()([conv_main, conv3, conv7])
    else:
        conv = tf.keras.layers.Conv2D(kernel_size = kernel_size, filters = filters, strides = stride, padding = padding, data_format = "channels_last")(x)

    bn = tf.keras.layers.BatchNormalization()(conv)
    output = tf.keras.layers.LeakyReLU(0.2)(bn)

    return output
|
28 |
+
|
29 |
+
def Upsampling(x , y, filters, res_filts, kernel_size = (5,5), padding = 'same', stride = 2, dropout = 'False', resblock = True, se_block = False):
    '''
    Upsampling Block

    Transposed convolution -> ReLU -> batch normalization, optionally followed
    by Dropout(0.5), concatenation with a skip connection and an SE block.

    Arguments:
        x : input layer (tf.keras.layer)
        y : residual (skip) connection layer (tf.keras.layer), or None for no skip
        filters : number of filters of the transposed convolution (int)
        res_filts : number of filters used by the ResBlock applied to y (int)
        kernel_size : kernel dimensions (tuple or int), default (5,5)
        padding : padding type for convolution (string), default same
        stride : stride for convolution (tuple or int), default 2
        dropout : enable dropout; accepts the boolean True as well as the
                  legacy string 'True' used by existing callers, default 'False'
        resblock : pass y through ResBlock before concatenating (boolean), default True
        se_block : apply SE_Block to the output (boolean), default False

    Returns:
        output : output layer (tf.keras.layer)
    '''

    conv = tf.keras.layers.Conv2DTranspose(kernel_size = kernel_size, filters = filters, strides = stride, padding = padding, data_format = "channels_last")(x)
    act = tf.keras.layers.ReLU()(conv)
    output = tf.keras.layers.BatchNormalization()(act)

    # The flag was historically a string; a real boolean True used to be
    # ignored silently even though the docstring advertised a boolean.
    if dropout is True or dropout == 'True':
        output = tf.keras.layers.Dropout(0.5)(output)

    if y is not None:
        if resblock is True:
            y = ResBlock(y, depth = 2, filters = res_filts)
        output = tf.keras.layers.Concatenate()([y, output])

    if se_block is True:
        output = SE_Block(output, r = 16)

    return output
|
58 |
+
|
59 |
+
def ResBlock(x, filters, depth = 2, kernel_size = (5,5), padding = 'same', method = 'concat', se_block = False):
    '''
    ResNet Block

    Stacks `depth` groups of Conv2D -> ReLU -> BatchNormalization (at least
    one group is always built) and merges the result with the block input.

    Arguments:
        x : input layer (tf.keras.layer)
        filters : number of filters (int)
        depth : number of layers in ResBlock
        kernel_size : kernel dimensions (tuple or int), default (5,5)
        padding : padding type for convolution (string), default same
        method : how the input is merged with the convolution branch,
                 'add' or 'concat', default concat
        se_block : apply SE_Block to the output (boolean), default False

    Returns:
        output : output layer (tf.keras.layer)

    Raises:
        ValueError : if method is neither 'add' nor 'concat'
    '''
    # Validate before any layer is created; an unknown method used to surface
    # as an UnboundLocalError only after the convolutions had been built.
    if method not in ('add', 'concat'):
        raise ValueError("method must be 'add' or 'concat', got " + repr(method))

    conv = tf.keras.layers.Conv2D(kernel_size = kernel_size, filters = filters, padding = padding, data_format = "channels_last")(x)
    conv = tf.keras.layers.ReLU()(conv)
    conv = tf.keras.layers.BatchNormalization()(conv)
    for _ in range(depth - 1):
        conv = tf.keras.layers.Conv2D(kernel_size = kernel_size, filters = filters, padding = padding, data_format = "channels_last")(conv)
        conv = tf.keras.layers.ReLU()(conv)
        conv = tf.keras.layers.BatchNormalization()(conv)

    if method == 'add':
        output = tf.keras.layers.Add()([x, conv])
    else:
        output = tf.keras.layers.Concatenate()([x, conv])

    output = tf.keras.layers.ReLU()(output)

    if se_block is True:
        output = SE_Block(output, r = 16)

    return output
|
93 |
+
|
94 |
+
def SE_Block(x, r = 16):

    '''
    Squeeze and Excitation Block
    Assumes channel_last format

    Global-average-pools the input, passes the pooled vector through a
    bottleneck Dense -> ReLU -> Dense -> sigmoid, and multiplies the input by
    the resulting per-channel weights.

    Arguments:
        x : input layer (tf.keras.layer)
        r : reduction ratio for first FC layer

    Returns:
        output : output layer (tf.keras.layer)
    '''
    filters = x.shape[-1]
    # Keep at least one bottleneck unit: with fewer than r channels the plain
    # ratio truncates to 0 and the first Dense layer would have no units.
    bottleneck_units = max(1, int(filters/r))

    pool = tf.keras.layers.GlobalAveragePooling2D(data_format='channels_last')(x)
    fc1 = tf.keras.layers.Dense(bottleneck_units)(pool)
    fc1 = tf.keras.layers.ReLU()(fc1)
    fc2 = tf.keras.layers.Dense(filters)(fc1)
    fc2 = tf.keras.layers.Activation('sigmoid')(fc2)
    # Reshape to (1, 1, channels) so the weights broadcast over both spatial axes.
    output = tf.keras.layers.Reshape([1,1,filters])(fc2)

    output = tf.keras.layers.Multiply()([x,output])

    return output
|
118 |
+
|
119 |
+
def Steminator(input_shape = (256,128,1), kernel_size = (5,5), feature_maps = 8, multires = True, resblock = True, se_block = True):

    '''
    MultiResUnet Network Builder - Steminator

    Six Downsampling stages followed by six Upsampling stages; the first five
    decoder stages receive the matching encoder output as skip connection.
    A 1x1 ReLU convolution produces a single-channel mask which is multiplied
    with the network input to give the output.

    Arguments:
        input_shape : input shape (tuple)
        kernel_size : kernel dimensions (tuple or int), default (5,5);
                      accepted but not used by this builder
        feature_maps : number of initial filters (int)
        multires : use multi-res Unet (boolean), default True
        resblock : use resblock residual connections (boolean), default True
        se_block : accepted but not used by this builder; every Upsampling
                   stage is built without an SE block

    Returns:
        model : tf.keras Neural net model (tf.keras.Model)
    '''

    cqt_input = tf.keras.Input(shape=input_shape)

    # Encoder: filter count doubles at every stage.
    encoder_outputs = []
    stage = cqt_input
    for multiplier in (2, 4, 8, 16, 32, 64):
        stage = Downsampling(stage, filters = feature_maps*multiplier, multires = multires)
        encoder_outputs.append(stage)

    # Decoder: skips are the encoder outputs in reverse order, starting from
    # the second deepest; the final stage has no skip connection.
    skips = encoder_outputs[-2::-1] + [None]
    filter_multipliers = (32, 16, 8, 4, 2, 1)
    res_multipliers = (1, 2, 4, 8, 16, 32)
    # Dropout is enabled on the three deepest decoder stages only.
    dropout_flags = ('True', 'True', 'True', 'False', 'False', 'False')

    for skip, f_mult, r_mult, drop in zip(skips, filter_multipliers, res_multipliers, dropout_flags):
        stage = Upsampling(stage, skip, filters = feature_maps*f_mult, res_filts = feature_maps*r_mult, dropout = drop, resblock = resblock, se_block = False)

    mask = tf.keras.layers.Conv2D(kernel_size = (1,1), filters = 1, activation='relu', padding = 'same', data_format="channels_last")(stage) #original network kernel_size = (1,1)

    outputs = tf.keras.layers.Multiply()([cqt_input,mask])

    model = tf.keras.Model(inputs = cqt_input, outputs = outputs, name='Steminator')

    #model.summary()

    return model
|
preprocess_data.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import glob
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import librosa
|
6 |
+
from essentia.standard import (NSGConstantQ,
|
7 |
+
NSGIConstantQ)
|
8 |
+
|
9 |
+
import hparams
|
10 |
+
import utils
|
11 |
+
|
12 |
+
def parse_files(path, source):
    '''
    Collect the .wav files of one source from the Dev subset of a dataset tree.

    Arguments:
        path : dataset root directory, with or without a trailing separator (string)
        source : 'mixture' to search <path>/Mixtures/Dev/*/mixture.wav, any
                 other name to search <path>/Sources/Dev/*/<source>.wav

    Returns:
        paths : sorted list of matching file paths (empty if nothing matches)
    '''
    import os  # local import: this module does not import os at file level

    subset = 'Mixtures' if source == 'mixture' else 'Sources'
    # os.path.join keeps the pattern valid when `path` lacks a trailing slash;
    # plain concatenation used to glue the root onto the subset folder name.
    pattern = os.path.join(path, subset, 'Dev', '*', str(source) + '.wav')
    return sorted(glob.glob(pattern))
|
21 |
+
|
22 |
+
def forward_transform(y, min_f, max_f, bpo, gamma):
    '''
    Apply essentia's NSGConstantQ (forward constant-Q transform) to a signal.

    Arguments:
        y : input signal; its .size is passed as the transform's inputSize
        min_f : value for the minFrequency parameter
        max_f : value for the maxFrequency parameter
        bpo : value for the binsPerOctave parameter
        gamma : value for the gamma parameter

    Returns:
        constantq, dcchannel, nfchannel : the three outputs of NSGConstantQ
    '''
    transform = NSGConstantQ(
        # The backward transform needs the same signal size to invert.
        inputSize = y.size,
        minFrequency = min_f,
        maxFrequency = max_f,
        binsPerOctave = bpo,
        # Minimum number of FFT bins per CQ channel.
        minimumWindow = 4,
        gamma = gamma,
    )

    constantq, dcchannel, nfchannel = transform(y)
    return constantq, dcchannel, nfchannel
|
40 |
+
|
41 |
+
def backward_transform(c, dc, nf, orig_size, min_f, max_f, bpo, gamma):
    '''
    Apply essentia's NSGIConstantQ (inverse constant-Q transform).

    Arguments:
        c : constant-Q coefficients, as returned first by forward_transform
        dc : DC channel, as returned second by forward_transform
        nf : Nyquist-frequency channel, as returned third by forward_transform
        orig_size : size of the original signal, passed as inputSize
        min_f : value for the minFrequency parameter
        max_f : value for the maxFrequency parameter
        bpo : value for the binsPerOctave parameter
        gamma : value for the gamma parameter

    Returns:
        y : output of NSGIConstantQ (the reconstructed signal)
    '''
    inverse = NSGIConstantQ(
        # Must equal the size of the signal that was forward-transformed.
        inputSize = orig_size,
        minFrequency = min_f,
        maxFrequency = max_f,
        binsPerOctave = bpo,
        # Minimum number of FFT bins per CQ channel.
        minimumWindow = 4,
        gamma = gamma,
    )

    return inverse(c, dc, nf)
|
59 |
+
|
60 |
+
|
61 |
+
def make_chunks(c):
    '''
    Cut a constant-Q transform into fixed-width magnitude chunks.

    The magnitude is cast to float16, zero-padded along the last axis to a
    multiple of hparams.chunk_size and split into non-overlapping frames of
    that width.

    Arguments:
        c : transform coefficients; assumed 2-D with time on the last axis
            (the transpose below requires a 3-D framed array)

    Returns:
        cqt_input : array of shape (n_chunks, c.shape[0], chunk_size, 1)
    '''
    chunk = hparams.chunk_size

    cqt = np.abs(c).astype(np.float16)
    cqt = np.asfortranarray(cqt)

    # Round the frame count up to a whole number of chunks.
    padded_length = chunk*np.ceil(cqt.shape[-1]/chunk).astype(int)

    # size / frame_length / hop_length are given by keyword: recent librosa
    # releases made them keyword-only, and older releases accept keywords too.
    padded_cqt = librosa.util.fix_length(cqt, size = padded_length)
    framed_cqt = librosa.util.frame(padded_cqt, frame_length = chunk, hop_length = chunk)

    # Framing appends the chunk index as the last axis; move it to the front.
    samples = np.transpose(framed_cqt,(2,0,1))
    cqt_input = np.expand_dims(samples,-1)
    return cqt_input
|
69 |
+
|
70 |
+
if __name__ == '__main__':
    # Command-line entry point: load every matching track, compute the low-
    # and high-frequency constant-Q magnitudes, chunk them and save one .npy
    # file per band into the output directory.
    args = argparse.ArgumentParser()

    args.add_argument('Path',metavar='path',type=str,help='Path to DSD100')
    args.add_argument('Source',metavar='source',type=str,help='Desired source to preprocess for separation. Use mixture to preprocess the mixtures')
    args.add_argument('Output_path',metavar='output_path',type=str,help='Output path for the pikled spectrograms')
    # Optional cap for quick experiments. It replaces a hard-coded
    # `if i == 1: break` that silently limited every run to two tracks.
    args.add_argument('--max_files',type=int,default=None,help='Process at most this many tracks (default: all)')

    args = args.parse_args()
    path = args.Path
    source = args.Source
    outpath = args.Output_path

    # Guard against empty strings before normalising the trailing slash.
    if path and not path.endswith('/'):
        path = path + '/'
    if outpath and not outpath.endswith('/'):
        outpath = outpath + '/'

    files = parse_files(path, source)
    if args.max_files is not None:
        files = files[:args.max_files]
    if not files:
        # Fail with a readable message instead of an IndexError further down.
        raise SystemExit('No ' + str(source) + '.wav files found under ' + path)

    mag_lf_array = []
    mag_hf_array = []

    lf = hparams.lf_params
    hf = hparams.hf_params

    for filename in files:
        print(filename)
        # sr is passed by keyword: it is keyword-only in recent librosa releases.
        y, sr = librosa.load(filename, sr = hparams.sr, mono = True)
        C_lf,_,_ = forward_transform(y, lf['min_f'], lf['max_f'], lf['bins_per_octave'], lf['gamma'])
        C_hf,_,_ = forward_transform(y, hf['min_f'], hf['max_f'], hf['bins_per_octave'], hf['gamma'])
        mag_lf_array.append(make_chunks(C_lf))
        mag_hf_array.append(make_chunks(C_hf))

    mag_lf = utils.list_to_array(mag_lf_array)
    mag_hf = utils.list_to_array(mag_hf_array)

    filename_lf = source + '_lf.npy'
    filename_hf = source + '_hf.npy'
    utils.pickle(mag_lf, outpath, filename_lf)
    utils.pickle(mag_hf, outpath, filename_hf)
|
113 |
+
|
114 |
+
|
115 |
+
|
pretrained_models/bass_hf.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1ee25f613e06405327650d09c32a218e8d72ff1a657492acc12729d4d73e27f7
|
3 |
+
size 133864672
|
pretrained_models/bass_lf.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b81709b811e7d605b2f1c8107075e3bd6e3cb3bfce4e51922d26edc06fda8844
|
3 |
+
size 133865952
|
pretrained_models/drums_hf.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:da3a8d08e4b4f6783fcc8601fad48e8fea3ee3077cc8ec91b34b34d2946b547b
|
3 |
+
size 133864672
|
pretrained_models/drums_lf.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:27378ddab7e14033eff4f4de3baf5be3d7da12c0fd02ce90cf127a02df4d6b17
|
3 |
+
size 133864672
|
pretrained_models/other_hf.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:41a29798ce7487d180b8afb59e1197336efb8467f86d76f531de97e81c913bef
|
3 |
+
size 133864672
|
pretrained_models/other_lf.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1ae2ac9c3c26c2f9544e7f991ca01a2daa6c61e4350d9594a3eb16785092a699
|
3 |
+
size 133864672
|
pretrained_models/vocals_hf.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:76232de963eba108bf4a1a41f97c7f9a361b29dc5609f902c0d4b90beb0c32e0
|
3 |
+
size 133864672
|
pretrained_models/vocals_lf.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd29e527a5508b5f0e6d618261ab79f16d3c839d8ff642068d40095ab0364dd4
|
3 |
+
size 133862488
|
separate.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
import hparams
|
4 |
+
import utils
|
5 |
+
import multiresunet_model
|
6 |
+
import preprocess_data
|
7 |
+
|
8 |
+
import tensorflow as tf
|
9 |
+
import numpy as np
|
10 |
+
import librosa
|
11 |
+
import torchaudio
|
12 |
+
import torch
|
13 |
+
|
14 |
+
if __name__ == '__main__':
    # Command-line entry point: separate one source from an audio track with
    # the pretrained low- and high-frequency models and write it as a .wav.
    import os  # local import: os is not imported at the top of this script

    args = argparse.ArgumentParser()

    args.add_argument('Path',metavar='path',type=str,help='Path to audio track to be separated')
    args.add_argument('Source',metavar='source',type=str,help='Desired source to separate')
    args.add_argument('Model_path', metavar='path_to_model',type=str,help='Path to saved models')
    args.add_argument('Output_path', metavar='output_path',type=str,help='Output path for separated audio')

    ### Parse args ###
    args = args.parse_args()
    path_to_audio = args.Path
    source = args.Source
    path_to_model = args.Model_path
    # os.path.join makes the directories work with or without a trailing
    # slash; bare concatenation fused the folder name with the file name.
    output_path = os.path.join(args.Output_path, source + '.wav')

    ### Load models ###
    model_lf = tf.keras.models.load_model(os.path.join(path_to_model, source + '_lf.h5'))
    model_hf = tf.keras.models.load_model(os.path.join(path_to_model, source + '_hf.h5'))

    ### Load audio track ###
    # sr is passed by keyword: it is keyword-only in recent librosa releases.
    y, sr = librosa.load(path_to_audio, sr = hparams.sr, mono = True)

    ### Perform CQT transform on the audio ###
    lf = hparams.lf_params
    hf = hparams.hf_params
    C_lf,dc_lf,nf_lf = preprocess_data.forward_transform(y, lf['min_f'], lf['max_f'], lf['bins_per_octave'], lf['gamma'])
    C_hf,dc_hf,nf_hf = preprocess_data.forward_transform(y, hf['min_f'], hf['max_f'], hf['bins_per_octave'], hf['gamma'])

    # The DC and Nyquist channels are zeroed before being fed to the inverse
    # transform; only the constant-Q coefficients are reconstructed.
    dc_lf[:] = 0
    dc_hf[:] = 0
    nf_lf[:] = 0
    nf_hf[:] = 0

    phase_lf = np.angle(C_lf)
    phase_hf = np.angle(C_hf)

    ### Batch Input ###
    c_lf = preprocess_data.make_chunks(C_lf)
    c_hf = preprocess_data.make_chunks(C_hf)

    ### Separate LF and HF ###
    c_lf = model_lf.predict(c_lf,batch_size = hparams.inference_batch_size)
    c_hf = model_hf.predict(c_hf,batch_size = hparams.inference_batch_size)

    ### Reshape Model Output ###
    # Stitch the chunks back together along time and drop the padding that
    # make_chunks appended, so the magnitude lines up with the mixture phase.
    mag_lf = np.hstack(c_lf[:,:,:,0])[:,:phase_lf.shape[-1]]
    mag_hf = np.hstack(c_hf[:,:,:,0])[:,:phase_hf.shape[-1]]
    # np.exp replaces `np.math.e ** (...)`: `np.math` was only an alias of the
    # stdlib math module and is no longer available in newer NumPy releases.
    c_lf = mag_lf * np.exp(1j*phase_lf)
    c_hf = mag_hf * np.exp(1j*phase_hf)

    ### Inverse CQT transform using the mixture phase information ###
    y_lf_hat = preprocess_data.backward_transform(c_lf,dc_lf,nf_lf,y.shape[0], lf['min_f'], lf['max_f'], lf['bins_per_octave'], lf['gamma'])
    y_hf_hat = preprocess_data.backward_transform(c_hf,dc_hf,nf_hf,y.shape[0], hf['min_f'], hf['max_f'], hf['bins_per_octave'], hf['gamma'])
    y_hat = y_lf_hat + y_hf_hat

    print(mag_lf.shape)
    print(mag_hf.shape)

    print(y_lf_hat.shape)
    print(y_hf_hat.shape)

    # torchaudio.save is given a 2-D tensor, so add a leading axis of size 1.
    torchaudio.save(output_path, torch.from_numpy(np.expand_dims(y_hat,0)), hparams.sr)
|
75 |
+
|
76 |
+
|
77 |
+
|
78 |
+
|
79 |
+
|
train.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
import hparams
|
4 |
+
import utils
|
5 |
+
import multiresunet_model
|
6 |
+
|
7 |
+
import tensorflow as tf
|
8 |
+
import numpy as np
|
9 |
+
|
10 |
+
if __name__ == '__main__':
    # Command-line entry point: train one Steminator model (one source, one
    # frequency band) on the arrays written by preprocess_data.py.
    import os  # local import: os is not imported at the top of this script

    args = argparse.ArgumentParser()

    args.add_argument('Path',metavar='path',type=str,help='Path to DSD100 pickled spectrograms. See preprocess_data.py for more details')
    args.add_argument('Source',metavar='source',type=str,help='Desired source to separate')
    args.add_argument('Spectrum',metavar='spectrum',type=str,help='Low (lf) or High (hf) frequencies training')
    args.add_argument('Outpath',metavar='model_out_path',type=str,help='Path to save the model to')

    ### Parse Args ###
    args = args.parse_args()
    path = args.Path
    source = args.Source
    spectrum = args.Spectrum
    output_path = args.Outpath

    ### Load Data ###
    # os.path.join makes the directories work with or without a trailing slash.
    x = np.load(os.path.join(path, 'mixture_' + spectrum + '.npy'))
    y = np.load(os.path.join(path, source + '_' + spectrum + '.npy'))

    ### Construct model ###
    model = multiresunet_model.Steminator((hparams.frequency_bins,hparams.chunk_size,hparams.n_channels))
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    optimizer = tf.keras.optimizers.Adam(learning_rate = hparams.learning_rate)
    model.compile(optimizer, loss='mean_absolute_error')

    ### Training ###
    model.fit(x,y,epochs = hparams.epochs, batch_size = hparams.batch_size)

    ### Save model ###
    model.save(os.path.join(output_path, source + '_' + spectrum + '.h5'))
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
|
utils.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import os
|
3 |
+
|
4 |
+
def create_dir(path):
    '''
    Create a directory (and any missing parents) if it does not exist yet.

    Best effort: a failure is reported on stdout and not raised.

    Arguments:
        path : directory to create (string)

    Returns:
        None
    '''
    try:
        # exist_ok avoids the check-then-create race of testing
        # os.path.exists first and creating afterwards.
        os.makedirs(path, exist_ok=True)
    except OSError:
        print('Could not create directory:' + path)
|
10 |
+
|
11 |
+
def list_to_array(m):
    '''
    Concatenate a list of arrays along their first axis.

    Arguments:
        m : non-empty sequence of arrays whose shapes agree on all but the
            first axis

    Returns:
        M : the concatenated array

    Raises:
        ValueError : if m is empty
    '''
    if len(m) == 0:
        raise ValueError('list_to_array needs at least one array to concatenate')

    # A single call copies every element once; growing the result pairwise
    # re-copied the accumulated array on every iteration.
    return np.concatenate(m, axis = 0)
|
17 |
+
|
18 |
+
def pickle(array, path, filename):
    '''
    Save an array with np.save into a directory, creating the directory first.

    Despite the name, the data is written through np.save, not the pickle module.

    Arguments:
        array : array to save
        path : output directory, with or without a trailing separator (string)
        filename : file name inside path (string)

    Returns:
        0
    '''
    create_dir(path)
    # os.path.join keeps the file inside `path` even without a trailing
    # slash; bare concatenation wrote next to the directory just created.
    np.save(os.path.join(path, filename), array)
    return 0
|