modify app
Files changed:
- app.py (+8 -6)
- inference.py (+96 -98)
- modules/common_audioeffects.py (+1537 -0)
- modules/common_miscellaneous.py (+219 -0)
- modules/data_normalization.py (+342 -0)
- modules/fx_utils.py (+308 -0)
- modules/normalization_imager.py (+123 -0)
- modules/utils_data_normalization.py (+992 -0)
app.py
CHANGED
@@ -64,8 +64,8 @@ def process_audio_with_youtube(input_audio, input_youtube_url, reference_audio,
         return process_audio(input_audio, reference_audio)
 
 def process_audio(input_audio, reference_audio):
-    output_audio, predicted_params, sr = mastering_transfer.process_audio(
-        input_audio, reference_audio
+    output_audio, predicted_params, sr, normalized_input = mastering_transfer.process_audio(
+        input_audio, reference_audio
     )
 
     param_output = mastering_transfer.get_param_output_string(predicted_params)
@@ -88,7 +88,7 @@ def process_audio(input_audio, reference_audio):
     # Denormalize the audio to int16
     output_audio = denormalize_audio(output_audio, dtype=np.int16)
 
-    return (sr, output_audio), param_output
+    return (sr, output_audio), param_output, (sr, normalized_input)
 
 def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
     if ito_reference_audio is None:
@@ -182,13 +182,15 @@
             process_button = gr.Button("Process Mastering Style Transfer")
 
             with gr.Row():
-                output_audio = gr.Audio(label="Output Audio", type='numpy')
+                with gr.Column():
+                    output_audio = gr.Audio(label="Output Audio", type='numpy')
+                    normalized_input = gr.Audio(label="Normalized Input Audio", type='numpy')
                 param_output = gr.Textbox(label="Predicted Parameters", lines=5)
 
             process_button.click(
                 process_audio,
                 inputs=[input_audio, reference_audio],
-                outputs=[output_audio, param_output]
+                outputs=[output_audio, param_output, normalized_input]
             )
 
         with gr.Tab("YouTube Audio"):
@@ -252,7 +254,7 @@
 
             ito_button.click(
                 perform_ito,
-                inputs=[input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
+                inputs=[normalized_input, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
                 outputs=[ito_output_audio, ito_param_output, ito_step_slider, ito_log, ito_loss_plot, all_results]
             ).then(
                 update_ito_output,
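Note (app.py): the contract in `.click(fn, inputs, outputs)` is positional, so each value returned by the callback fills the output component at the same index; that is why `process_audio` now returns three values for the three outputs. A minimal self-contained sketch of the same wiring, with a hypothetical passthrough callback in place of the app's model code:

import gradio as gr

def passthrough(audio):
    # 'numpy'-typed gr.Audio components exchange (sample_rate, samples) tuples;
    # returning three values fills the three outputs positionally.
    sr, data = audio
    return (sr, data), "no parameters", (sr, data)

with gr.Blocks() as demo:
    inp = gr.Audio(label="Input", type='numpy')
    btn = gr.Button("Process")
    with gr.Column():
        out = gr.Audio(label="Output Audio", type='numpy')
        norm = gr.Audio(label="Normalized Input Audio", type='numpy')
    params = gr.Textbox(label="Predicted Parameters", lines=5)
    btn.click(passthrough, inputs=[inp], outputs=[out, params, norm])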
inference.py
CHANGED
@@ -30,6 +30,11 @@ class MasteringStyleTransfer:
         self.effects_encoder = self.load_effects_encoder()
         self.mastering_converter = self.load_mastering_converter()
 
+        self.fx_normalizer = Audio_Effects_Normalizer(precomputed_feature_path=args.fx_norm_feature_path, \
+                                                        STEMS=['mixture'], \
+                                                        EFFECTS=['eq', 'imager', 'loudness'], \
+                                                        audio_extension=args.audio_extension)
+
     def load_effects_encoder(self):
         effects_encoder = Effects_Encoder(self.args.cfg_enc)
         reload_weights(effects_encoder, self.args.encoder_path, self.device)
@@ -60,68 +65,6 @@ class MasteringStyleTransfer:
         predicted_params = self.mastering_converter.get_last_predicted_params()
         return output_audio, predicted_params
 
-    # def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
-    #     fit_embedding = torch.nn.Parameter(initial_reference_feature)
-    #     optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
-
-    #     af_loss = AudioFeatureLoss(
-    #         weights=ito_config['af_weights'],
-    #         sample_rate=ito_config['sample_rate'],
-    #         stem_separation=False,
-    #         use_clap=False
-    #     )
-
-    #     min_loss = float('inf')
-    #     min_loss_step = 0
-    #     min_loss_output = None
-    #     min_loss_params = None
-    #     min_loss_embedding = None
-
-    #     loss_history = []
-    #     divergence_counter = 0
-    #     ito_log = []
-
-    #     for step in range(ito_config['num_steps']):
-    #         optimizer.zero_grad()
-
-    #         output_audio = self.mastering_converter(input_tensor, fit_embedding)
-    #         current_params = self.mastering_converter.get_last_predicted_params()
-
-    #         losses = af_loss(output_audio, reference_tensor)
-    #         total_loss = sum(losses.values())
-
-    #         loss_history.append(total_loss.item())
-
-    #         if total_loss < min_loss:
-    #             min_loss = total_loss.item()
-    #             min_loss_step = step
-    #             min_loss_output = output_audio.detach()
-    #             min_loss_params = current_params
-    #             min_loss_embedding = fit_embedding.detach().clone()
-
-    #         # Check for divergence
-    #         if len(loss_history) > 10 and total_loss > loss_history[-11]:
-    #             divergence_counter += 1
-    #         else:
-    #             divergence_counter = 0
-
-    #         # Log top 5 parameter differences
-    #         if step == 0:
-    #             initial_params = current_params
-    #         top_5_diff = self.get_top_n_diff_string(initial_params, current_params, top_n=5)
-    #         log_entry = f"Step {step + 1}\n  Loss: {total_loss.item():.4f}\n{top_5_diff}\n"
-
-    #         if divergence_counter >= 10:
-    #             print(f"Optimization stopped early due to divergence at step {step}")
-    #             break
-
-    #         total_loss.backward()
-    #         optimizer.step()
-
-    #         yield log_entry, output_audio.detach(), current_params, step + 1, total_loss.item()
-
-    #     return min_loss_output, min_loss_params, min_loss_embedding, min_loss_step + 1
-
     def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
         fit_embedding = torch.nn.Parameter(initial_reference_feature)
         optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
@@ -167,11 +110,9 @@ class MasteringStyleTransfer:
         total_loss.backward()
         optimizer.step()
 
-            # yield all_results[-1]
-
         return all_results, min_loss_step
 
-    def preprocess_audio(self, audio, target_sample_rate=44100):
+    def preprocess_audio(self, audio, target_sample_rate=44100, is_input=False):
         sample_rate, data = audio
 
         # Normalize audio to -1 to 1 range
@@ -195,62 +136,119 @@ class MasteringStyleTransfer:
         else:
             raise ValueError(f"Unsupported audio shape: {data.shape}")
 
-        # Convert to torch tensor
-        data_tensor = torch.FloatTensor(data).unsqueeze(0)
-
         # Resample if necessary
         if sample_rate != target_sample_rate:
-
+            data = julius.resample_frac(torch.from_numpy(data), sample_rate, target_sample_rate).numpy()
+
+        # Apply fx normalization for input audio during mastering style transfer
+        if is_input:
+            data = self.fx_normalizer.normalize_audio(data, 'mixture')
+
+        # Convert to torch tensor
+        data_tensor = torch.FloatTensor(data).unsqueeze(0)
 
         return data_tensor.to(self.device)
 
-    def process_audio(self, input_audio, reference_audio
-        input_tensor = self.preprocess_audio(input_audio, self.args.sample_rate)
+    def process_audio(self, input_audio, reference_audio):
+        input_tensor = self.preprocess_audio(input_audio, self.args.sample_rate, is_input=True)
         reference_tensor = self.preprocess_audio(reference_audio, self.args.sample_rate)
-        ito_reference_tensor = self.preprocess_audio(ito_reference_audio, self.args.sample_rate)
 
         reference_feature = self.get_reference_embedding(reference_tensor)
 
         output_audio, predicted_params = self.mastering_style_transfer(input_tensor, reference_feature)
 
-        return output_audio, predicted_params, self.args.sample_rate
-
-    def print_predicted_params(self, predicted_params):
-        if predicted_params is None:
-            print("No predicted parameters available.")
-            return
-
-        print("Predicted Parameters:")
-        for fx_name, fx_params in predicted_params.items():
-            print(f"\n{fx_name.upper()}:")
-            if isinstance(fx_params, dict):
-                for param_name, param_value in fx_params.items():
-                    if isinstance(param_value, torch.Tensor):
-                        param_value = param_value.detach().cpu().numpy()
-                    print(f"  {param_name}: {param_value}")
-            elif isinstance(fx_params, torch.Tensor):
-                param_value = fx_params.detach().cpu().numpy()
-                print(f"  {param_value}")
-            else:
-                print(f"  {fx_params}")
+        return output_audio, predicted_params, self.args.sample_rate, input_tensor
 
     def get_param_output_string(self, params):
         if params is None:
             return "No parameters available"
 
+        param_mapper = {
+            'EQ': {
+                'low_shelf_gain_db': ('Low Shelf Gain', 'dB', -20, 20),
+                'low_shelf_cutoff_freq': ('Low Shelf Cutoff', 'Hz', 20, 2000),
+                'low_shelf_q_factor': ('Low Shelf Q', '', 0.1, 5.0),
+                'band0_gain_db': ('Low-Mid Band Gain', 'dB', -20, 20),
+                'band0_cutoff_freq': ('Low-Mid Band Frequency', 'Hz', 80, 2000),
+                'band0_q_factor': ('Low-Mid Band Q', '', 0.1, 5.0),
+                'band1_gain_db': ('Mid Band Gain', 'dB', -20, 20),
+                'band1_cutoff_freq': ('Mid Band Frequency', 'Hz', 2000, 8000),
+                'band1_q_factor': ('Mid Band Q', '', 0.1, 5.0),
+                'band2_gain_db': ('High-Mid Band Gain', 'dB', -20, 20),
+                'band2_cutoff_freq': ('High-Mid Band Frequency', 'Hz', 8000, 12000),
+                'band2_q_factor': ('High-Mid Band Q', '', 0.1, 5.0),
+                'band3_gain_db': ('High Band Gain', 'dB', -20, 20),
+                'band3_cutoff_freq': ('High Band Frequency', 'Hz', 12000, 20000),  # Assuming sample_rate is 44100
+                'band3_q_factor': ('High Band Q', '', 0.1, 5.0),
+                'high_shelf_gain_db': ('High Shelf Gain', 'dB', -20, 20),
+                'high_shelf_cutoff_freq': ('High Shelf Cutoff', 'Hz', 4000, 20000),  # Assuming sample_rate is 44100
+                'high_shelf_q_factor': ('High Shelf Q', '', 0.1, 5.0),
+            },
+            'DISTORTION': {
+                'drive_db': ('Drive', 'dB', 0, 8),
+                'parallel_weight_factor': ('Dry/Wet Mix', '%', 0, 100),
+            },
+            'MULTIBAND_COMP': {
+                'low_cutoff': ('Low/Mid Crossover', 'Hz', 20, 1000),
+                'high_cutoff': ('Mid/High Crossover', 'Hz', 1000, 20000),
+                'parallel_weight_factor': ('Dry/Wet Mix', '%', 0, 100),
+                'low_shelf_comp_thresh': ('Low Band Comp Threshold', 'dB', -60, 0),
+                'low_shelf_comp_ratio': ('Low Band Comp Ratio', ':1', 1, 20),
+                'low_shelf_exp_thresh': ('Low Band Exp Threshold', 'dB', -60, 0),
+                'low_shelf_exp_ratio': ('Low Band Exp Ratio', ':1', 1, 20),
+                'low_shelf_at': ('Low Band Attack Time', 'ms', 5, 100),
+                'low_shelf_rt': ('Low Band Release Time', 'ms', 5, 100),
+                'mid_band_comp_thresh': ('Mid Band Comp Threshold', 'dB', -60, 0),
+                'mid_band_comp_ratio': ('Mid Band Comp Ratio', ':1', 1, 20),
+                'mid_band_exp_thresh': ('Mid Band Exp Threshold', 'dB', -60, 0),
+                'mid_band_exp_ratio': ('Mid Band Exp Ratio', ':1', 1, 20),
+                'mid_band_at': ('Mid Band Attack Time', 'ms', 5, 100),
+                'mid_band_rt': ('Mid Band Release Time', 'ms', 5, 100),
+                'high_shelf_comp_thresh': ('High Band Comp Threshold', 'dB', -60, 0),
+                'high_shelf_comp_ratio': ('High Band Comp Ratio', ':1', 1, 20),
+                'high_shelf_exp_thresh': ('High Band Exp Threshold', 'dB', -60, 0),
+                'high_shelf_exp_ratio': ('High Band Exp Ratio', ':1', 1, 20),
+                'high_shelf_at': ('High Band Attack Time', 'ms', 5, 100),
+                'high_shelf_rt': ('High Band Release Time', 'ms', 5, 100),
+            },
+            'GAIN': {
+                'gain_db': ('Output Gain', 'dB', -24, 24),
+            },
+            'IMAGER': {
+                'width': ('Stereo Width', '', 0, 1),
+            },
+            'LIMITER': {
+                'threshold': ('Threshold', 'dB', -60, 0),
+                'at': ('Attack Time', 'ms', 5, 100),
+                'rt': ('Release Time', 'ms', 5, 100),
+            },
+        }
+
         output = []
         for fx_name, fx_params in params.items():
-            output.append(f"{fx_name
+            output.append(f"{fx_name}:")
             if isinstance(fx_params, dict):
                 for param_name, param_value in fx_params.items():
                     if isinstance(param_value, torch.Tensor):
                         param_value = param_value.item()
-
-
-
+
+                    if fx_name in param_mapper and param_name in param_mapper[fx_name]:
+                        friendly_name, unit, min_val, max_val = param_mapper[fx_name][param_name]
+                        if fx_name == 'IMAGER' and param_name == 'width':
+                            # Convert width to a more intuitive scale
+                            width_percentage = param_value * 200
+                            output.append(f"  {friendly_name}: {width_percentage:.2f}% (Range: 0-200%)")
+                        else:
+                            output.append(f"  {friendly_name}: {param_value:.2f} {unit} (Range: {min_val}-{max_val})")
+                    else:
+                        output.append(f"  {param_name}: {param_value:.2f}")
             else:
-
-
+                if fx_name == 'IMAGER':
+                    width_percentage = fx_params.item() * 200
+                    output.append(f"  Stereo Width: {width_percentage:.2f}% (Range: 0-200%)")
+                else:
+                    output.append(f"  {fx_params.item():.2f}")
+
         return "\n".join(output)
 
     def get_top_n_diff_string(self, initial_params, ito_params, top_n=5):
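Note (inference.py): for intuition, the new param_mapper path turns raw parameter tensors into labeled, unit-annotated lines. A small sketch of the same formatting rule on made-up values (toy dict and a trimmed mapper, not real model output; requires only torch):

import torch

params = {'GAIN': {'gain_db': torch.tensor(-3.2)}, 'IMAGER': torch.tensor(0.6)}
mapper = {'GAIN': {'gain_db': ('Output Gain', 'dB', -24, 24)}}

lines = []
for fx_name, fx_params in params.items():
    lines.append(f"{fx_name}:")
    if isinstance(fx_params, dict):
        for name, value in fx_params.items():
            friendly, unit, lo, hi = mapper[fx_name][name]
            lines.append(f"  {friendly}: {value.item():.2f} {unit} (Range: {lo}-{hi})")
    else:
        # bare IMAGER tensors are reported on a 0-200% width scale, as in the diff above
        lines.append(f"  Stereo Width: {fx_params.item() * 200:.2f}% (Range: 0-200%)")
print("\n".join(lines))
# GAIN:
#   Output Gain: -3.20 dB (Range: -24-24)
# IMAGER:
#   Stereo Width: 120.00% (Range: 0-200%)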
modules/common_audioeffects.py
ADDED
@@ -0,0 +1,1537 @@
"""
Audio effects for data augmentation.

Several audio effects can be combined into an augmentation chain.

Important note: We assume that the parallelization during training is done using
multi-processing and not multi-threading. Hence, we do not need the
`@sox.sox_context()` decorators as discussed in this
[thread](https://github.com/pseeth/soxbindings/issues/4).

AI Music Technology Group, Sony Group Corporation
AI Speech and Sound Group, Sony Europe


This implementation originally belongs to Sony Group Corporation,
which was introduced in the work "Automatic music mixing with deep learning and out-of-domain data".
Original repo link: https://github.com/sony/FxNorm-automix
This work modifies a few implementations from the original repo to suit the task.
"""

from itertools import permutations
import logging
import numpy as np
import pymixconsole as pymc
from pymixconsole.parameter import Parameter
from pymixconsole.parameter_list import ParameterList
from pymixconsole.processor import Processor
from random import shuffle
from scipy.signal import oaconvolve
import soxbindings as sox
from typing import List, Optional, Tuple, Union
from numba import jit

# prevent pysox from logging warnings regarding non-optimal timestretch factors
logging.getLogger('sox').setLevel(logging.ERROR)


# Monkey-Patch `Processor` for convenience
# (a) Allow `None` as blocksize if processor can work on variable-length audio
def new_init(self, name, parameters, block_size, sample_rate, dtype='float32'):
    """
    Initialize processor.

    Args:
        self: Reference to object
        name (str): Name of processor.
        parameters (parameter_list): Parameters for this processor.
        block_size (int): Size of blocks for blockwise processing.
            Can also be `None` if full audio can be processed at once.
        sample_rate (int): Sample rate of input audio. Use `None` if effect is independent of this value.
        dtype (str): data type of samples
    """
    self.name = name
    self.parameters = parameters
    self.block_size = block_size
    self.sample_rate = sample_rate
    self.dtype = dtype


# (b) make code simpler
def new_update(self, parameter_name):
    """
    Update processor after randomization of parameters.

    Args:
        self: Reference to object.
        parameter_name (str): Parameter whose value has changed.
    """
    pass


# (c) representation for nice print
def new_repr(self):
    """
    Create human-readable representation.

    Args:
        self: Reference to object.

    Returns:
        string representation of object.
    """
    return f'Processor(name={self.name!r}, parameters={self.parameters!r})'


Processor.__init__ = new_init
Processor.__repr__ = new_repr
Processor.update = new_update


class AugmentationChain:
    """Basic audio Fx chain which is used for data augmentation."""

    def __init__(self,
                 fxs: Optional[List[Tuple[Union[Processor, 'AugmentationChain'], float, bool]]] = [],
                 shuffle: Optional[bool] = False,
                 parallel: Optional[bool] = False,
                 parallel_weight_factor=None,
                 randomize_param_value=True):
        """
        Create augmentation chain from the list `fxs`.

        Args:
            fxs (list of tuples): First tuple element is an instance of `pymc.processor` or `AugmentationChain` that
                we want to use for data augmentation. Second element gives the probability that the effect is applied.
                Third element defines whether the processed signal is normalized by the RMS of the input.
            shuffle (bool): If `True` then the order of the Fx is changed whenever the chain is applied.
        """
        self.fxs = fxs
        self.shuffle = shuffle
        self.parallel = parallel
        self.parallel_weight_factor = parallel_weight_factor
        self.randomize_param_value = randomize_param_value

    def apply_processor(self, x, processor: Processor, rms_normalize):
        """
        Pass audio in `x` through `processor` and output the respective processed audio.

        Args:
            x (Numpy array): Input audio of shape `n_samples` x `n_channels`.
            processor (Processor): Audio effect that we want to apply.
            rms_normalize (bool): If `True`, the processed signal is normalized by the RMS of the signal.

        Returns:
            Numpy array: Processed audio of shape `n_samples` x `n_channels` (same size as `x`)
        """
        n_samples_input = x.shape[0]

        if processor.block_size is None:
            y = processor.process(x)
        else:
            # make sure that n_samples is a multiple of `processor.block_size`
            if x.shape[0] % processor.block_size != 0:
                n_pad = processor.block_size - x.shape[0] % processor.block_size
                x = np.pad(x, ((0, n_pad), (0, 0)), mode='reflect')

            y = np.zeros_like(x)
            for idx in range(0, x.shape[0], processor.block_size):
                y[idx:idx+processor.block_size, :] = processor.process(x[idx:idx+processor.block_size, :])

        if rms_normalize:
            # normalize output energy such that it is the same as the input energy
            scale = np.sqrt(np.mean(np.square(x)) / np.maximum(1e-7, np.mean(np.square(y))))
            y *= scale

        # return audio of same length as x
        return y[:n_samples_input, :]

    def apply_same_processor(self, x_list, processor: Processor, rms_normalize):
        for i in range(len(x_list)):
            x_list[i] = self.apply_processor(x_list[i], processor, rms_normalize)

        return x_list

    def __call__(self, x_list):
        """
        Apply the same augmentation chain to audio tracks in list `x_list`.

        Args:
            x_list (list of Numpy array): List of audio samples of shape `n_samples` x `n_channels`.

        Returns:
            y_list (list of Numpy array): List of processed audio of the same shape as `x_list` where the same effects have been applied.
        """
        # randomly shuffle effect order if `self.shuffle` is True
        if self.shuffle:
            shuffle(self.fxs)

        # apply effects with probabilities given in `self.fxs`
        y_list = x_list.copy()
        for fx, p, rms_normalize in self.fxs:
            if np.random.rand() < p:
                if isinstance(fx, Processor):
                    # randomize all effect parameters (also calls `update()` for each processor)
                    if self.randomize_param_value:
                        fx.randomize()
                    else:
                        fx.update(None)

                    # apply processor
                    y_list = self.apply_same_processor(y_list, fx, rms_normalize)
                else:
                    y_list = fx(y_list)

        if self.parallel:
            # weighting factor of input signal in the range (0.0 ~ 0.5)
            weight_in = self.parallel_weight_factor if self.parallel_weight_factor else np.random.rand() / 2.
            for i in range(len(y_list)):
                y_list[i] = weight_in*x_list[i] + (1-weight_in)*y_list[i]

        return y_list

    def __repr__(self):
        """
        Human-readable representation.

        Returns:
            string representation of object.
        """
        return f'AugmentationChain(fxs={self.fxs!r}, shuffle={self.shuffle!r})'

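Note: a usage sketch for the chain (not part of the commit). Each `fxs` entry is `(effect, probability_of_application, rms_normalize)`; the sketch assumes pymixconsole is installed and uses the `Distortion` and `Equaliser` processors defined further down in this file.

import numpy as np

chain = AugmentationChain(
    fxs=[
        (Distortion(sample_rate=44100), 0.5, True),                 # applied with p=0.5, RMS-matched to input
        (Equaliser(n_channels=2, sample_rate=44100), 1.0, False),   # always applied
    ],
    shuffle=True,  # randomize effect order on every call
)

x = (0.1 * np.random.randn(44100, 2)).astype(np.float32)
y_list = chain([x])  # one randomized chain instance applied to every track in the list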
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% DISTORTION %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
def hard_clip(x, threshold_dB, drive):
    """
    Hard clip distortion.

    Args:
        x: input audio
        threshold_dB: threshold
        drive: drive

    Returns:
        (Numpy array): distorted audio
    """
    drive_linear = np.power(10., drive / 20.).astype(np.float32)
    threshold_linear = 10. ** (threshold_dB / 20.)
    return np.clip(x * drive_linear, -threshold_linear, threshold_linear)


def overdrive(x, drive, colour, sample_rate):
    """
    Overdrive distortion.

    Args:
        x: input audio
        drive: Controls the amount of distortion (dB).
        colour: Controls the amount of even harmonic content in the output (dB).
        sample_rate: sampling rate

    Returns:
        (Numpy array): distorted audio
    """
    scale = np.max(np.abs(x))
    if scale > 0.9:
        clips = True
        x = x * (0.9 / scale)
    else:
        clips = False

    tfm = sox.Transformer()
    tfm.overdrive(gain_db=drive, colour=colour)
    y = tfm.build_array(input_array=x, sample_rate_in=sample_rate).astype(np.float32)

    if clips:
        y *= scale / 0.9  # rescale output to original scale
    return y


def hyperbolic_tangent(x, drive):
    """
    Hyperbolic tangent distortion.

    Args:
        x: input audio
        drive: drive

    Returns:
        (Numpy array): distorted audio
    """
    drive_linear = np.power(10., drive / 20.).astype(np.float32)
    return np.tanh(2. * x * drive_linear)


def soft_sine(x, drive):
    """
    Soft sine distortion.

    Args:
        x: input audio
        drive: drive

    Returns:
        (Numpy array): distorted audio
    """
    drive_linear = np.power(10., drive / 20.).astype(np.float32)
    y = np.clip(x * drive_linear, -np.pi/4.0, np.pi/4.0)
    return np.sin(2. * y)


def bit_crusher(x, bits):
    """
    Bit crusher distortion.

    Args:
        x: input audio
        bits: bits

    Returns:
        (Numpy array): distorted audio
    """
    return np.rint(x * (2 ** bits)) / (2 ** bits)


class Distortion(Processor):
    """
    Distortion processor.

    Processor parameters:
        mode (str): Currently supports the following five modes: hard_clip, overdrive, soft_sine, tanh, bit_crusher.
            Each mode uses a different subset of the parameters below.
        threshold (float): threshold
        drive (float): drive
        colour (float): colour
        bits (int): bits
    """

    def __init__(self, sample_rate, name='Distortion', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): sample rate.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name, None, block_size=None, sample_rate=sample_rate)
        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('mode', 'hard_clip', 'string',
                                          options=['hard_clip',
                                                   'overdrive',
                                                   'soft_sine',
                                                   'tanh',
                                                   'bit_crusher']))
            self.parameters.add(Parameter('threshold', 0.0, 'float',
                                          units='dB', maximum=0.0, minimum=-20.0))
            self.parameters.add(Parameter('drive', 0.0, 'float',
                                          units='dB', maximum=20.0, minimum=0.0))
            self.parameters.add(Parameter('colour', 20.0, 'float',
                                          maximum=100.0, minimum=0.0))
            self.parameters.add(Parameter('bits', 12, 'int',
                                          maximum=12, minimum=8))

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): distorted audio of size `n_samples x n_channels`.
        """
        if self.parameters.mode.value == 'hard_clip':
            y = hard_clip(x, self.parameters.threshold.value, self.parameters.drive.value)
        elif self.parameters.mode.value == 'overdrive':
            y = overdrive(x, self.parameters.drive.value,
                          self.parameters.colour.value, self.sample_rate)
        elif self.parameters.mode.value == 'soft_sine':
            y = soft_sine(x, self.parameters.drive.value)
        elif self.parameters.mode.value == 'tanh':
            y = hyperbolic_tangent(x, self.parameters.drive.value)
        elif self.parameters.mode.value == 'bit_crusher':
            y = bit_crusher(x, self.parameters.bits.value)

        # If the output has low amplitude (some distortion settings can "crush" down the amplitude),
        # then it is normalized to the input's amplitude.
        x_max = np.max(np.abs(x)) + 1e-8
        o_max = np.max(np.abs(y)) + 1e-8
        if x_max > o_max:
            y = y*(x_max/o_max)

        return y

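Note: a quick numeric check of the waveshapers above (values in the comments are rounded; nothing here is part of the commit):

import numpy as np

x = np.linspace(-1.0, 1.0, 5, dtype=np.float32)
print(hard_clip(x, threshold_dB=-6.0, drive=0.0))
# [-0.501 -0.5    0.     0.5    0.501]  -> clipped at 10**(-6/20) ~= 0.501
print(hyperbolic_tangent(x, drive=0.0))
# [-0.964 -0.762  0.     0.762  0.964]  -> smooth tanh(2x) saturation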
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% EQUALISER %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
class Equaliser(Processor):
    """
    Five band parametric equaliser (two shelves and three central bands).

    All gains are set in dB values and range from `MIN_GAIN` dB to `MAX_GAIN` dB.
    This processor is implemented as cascade of five biquad IIR filters
    that are implemented using the infamous cookbook formulae from RBJ.

    Processor parameters:
        low_shelf_gain (float), low_shelf_freq (float)
        first_band_gain (float), first_band_freq (float), first_band_q (float)
        second_band_gain (float), second_band_freq (float), second_band_q (float)
        third_band_gain (float), third_band_freq (float), third_band_q (float)

    original from https://github.com/csteinmetz1/pymixconsole/blob/master/pymixconsole/processors/equaliser.py
    """

    def __init__(self, n_channels,
                 sample_rate,
                 gain_range=(-15.0, 15.0),
                 q_range=(0.1, 2.0),
                 bands=['low_shelf', 'first_band', 'second_band', 'third_band', 'high_shelf'],
                 hard_clip=False,
                 name='Equaliser', parameters=None):
        """
        Initialize processor.

        Args:
            n_channels (int): Number of audio channels.
            sample_rate (int): Sample rate of audio.
            gain_range (tuple of floats): minimum and maximum gain that can be used.
            q_range (tuple of floats): minimum and maximum q value.
            hard_clip (bool): Whether we clip to [-1, 1.] after processing.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        self.n_channels = n_channels

        MIN_GAIN, MAX_GAIN = gain_range
        MIN_Q, MAX_Q = q_range

        if not parameters:
            self.parameters = ParameterList()
            # low shelf parameters -------
            self.parameters.add(Parameter('low_shelf_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
            self.parameters.add(Parameter('low_shelf_freq', 80.0, 'float', minimum=30.0, maximum=200.0))
            # first band parameters ------
            self.parameters.add(Parameter('first_band_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
            self.parameters.add(Parameter('first_band_freq', 400.0, 'float', minimum=200.0, maximum=1000.0))
            self.parameters.add(Parameter('first_band_q', 0.7, 'float', minimum=MIN_Q, maximum=MAX_Q))
            # second band parameters -----
            self.parameters.add(Parameter('second_band_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
            self.parameters.add(Parameter('second_band_freq', 2000.0, 'float', minimum=1000.0, maximum=3000.0))
            self.parameters.add(Parameter('second_band_q', 0.7, 'float', minimum=MIN_Q, maximum=MAX_Q))
            # third band parameters ------
            self.parameters.add(Parameter('third_band_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
            self.parameters.add(Parameter('third_band_freq', 4000.0, 'float', minimum=3000.0, maximum=8000.0))
            self.parameters.add(Parameter('third_band_q', 0.7, 'float', minimum=MIN_Q, maximum=MAX_Q))
            # high shelf parameters ------
            self.parameters.add(Parameter('high_shelf_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
            self.parameters.add(Parameter('high_shelf_freq', 8000.0, 'float', minimum=5000.0, maximum=10000.0))

        self.bands = bands
        self.filters = self.setup_filters()
        self.hard_clip = hard_clip

    def setup_filters(self):
        """
        Create IIR filters.

        Returns:
            IIR filters
        """
        filters = {}

        for band in self.bands:

            G = getattr(self.parameters, band + '_gain').value
            fc = getattr(self.parameters, band + '_freq').value
            rate = self.sample_rate

            if band in ['low_shelf', 'high_shelf']:
                Q = 0.707
                filter_type = band
            else:
                Q = getattr(self.parameters, band + '_q').value
                filter_type = 'peaking'

            filters[band] = pymc.components.iirfilter.IIRfilter(G, Q, fc, rate, filter_type, n_channels=self.n_channels)

        return filters

    def update_filter(self, band):
        """
        Update filters.

        Args:
            band (str): Band that should be updated.
        """
        self.filters[band].G = getattr(self.parameters, band + '_gain').value
        self.filters[band].fc = getattr(self.parameters, band + '_freq').value
        self.filters[band].rate = self.sample_rate

        if band in ['first_band', 'second_band', 'third_band']:
            self.filters[band].Q = getattr(self.parameters, band + '_q').value

    def update(self, parameter_name=None):
        """
        Update processor after randomization of parameters.

        Args:
            parameter_name (str): Parameter whose value has changed.
        """
        if parameter_name is not None:
            bands = ['_'.join(parameter_name.split('_')[:2])]
        else:
            bands = self.bands

        for band in bands:
            self.update_filter(band)

        for _band, iirfilter in self.filters.items():
            iirfilter.reset_state()

    def reset_state(self):
        """Reset state."""
        for _band, iirfilter in self.filters.items():
            iirfilter.reset_state()

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): equalized audio of size `n_samples x n_channels`.
        """
        for _band, iirfilter in self.filters.items():
            iirfilter.reset_state()
            x = iirfilter.apply_filter(x)

        if self.hard_clip:
            x = np.clip(x, -1.0, 1.0)

        # make sure that we have float32 as IIR filtering returns float64
        x = x.astype(np.float32)

        # make sure that we have two dimensions (if `n_channels == 1`)
        if x.ndim == 1:
            x = x[:, np.newaxis]

        return x

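Note: a usage sketch for the equaliser (assumes pymixconsole's `IIRfilter` is available and that `Parameter.value` is directly assignable, which is how `update_filter` reads it back):

import numpy as np

eq = Equaliser(n_channels=2, sample_rate=44100)
eq.parameters.low_shelf_gain.value = 6.0   # +6 dB low shelf (assumed assignable)
eq.update('low_shelf_gain')                # rebuilds only the affected band, then resets filter state
y = eq.process((0.1 * np.random.randn(44100, 2)).astype(np.float32))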
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% COMPRESSOR %%%%%%%%%%%%%%%%%%%%
@jit(nopython=True)
def compressor_process(x, threshold, attack_time, release_time, ratio, makeup_gain, sample_rate, yL_prev):
    """
    Apply compressor.

    Args:
        x (Numpy array): audio data.
        threshold: threshold in dB.
        attack_time: attack_time in ms.
        release_time: release_time in ms.
        ratio: ratio.
        makeup_gain: makeup_gain.
        sample_rate: sample rate.
        yL_prev: internal state of the envelope gain.

    Returns:
        compressed audio.
    """
    M = x.shape[0]
    x_g = np.zeros(M)
    x_l = np.zeros(M)
    y_g = np.zeros(M)
    y_l = np.zeros(M)
    c = np.zeros(M)
    yL_prev = 0.

    alpha_attack = np.exp(-1/(0.001 * sample_rate * attack_time))
    alpha_release = np.exp(-1/(0.001 * sample_rate * release_time))

    for i in np.arange(M):
        if np.abs(x[i]) < 0.000001:
            x_g[i] = -120.0
        else:
            x_g[i] = 20 * np.log10(np.abs(x[i]))

        if ratio > 1:
            if x_g[i] >= threshold:
                y_g[i] = threshold + (x_g[i] - threshold) / ratio
            else:
                y_g[i] = x_g[i]
        elif ratio < 1:
            if x_g[i] <= threshold:
                y_g[i] = threshold + (x_g[i] - threshold) / (1/ratio)
            else:
                y_g[i] = x_g[i]

        x_l[i] = x_g[i] - y_g[i]

        if x_l[i] > yL_prev:
            y_l[i] = alpha_attack * yL_prev + (1 - alpha_attack) * x_l[i]
        else:
            y_l[i] = alpha_release * yL_prev + (1 - alpha_release) * x_l[i]

        c[i] = np.power(10.0, (makeup_gain - y_l[i]) / 20.0)
        yL_prev = y_l[i]

    y = x * c

    return y, yL_prev


class Compressor(Processor):
    """
    Single band stereo dynamic range compressor.

    Processor parameters:
        threshold (float)
        attack_time (float)
        release_time (float)
        ratio (float)
        makeup_gain (float)
    """

    def __init__(self, sample_rate, name='Compressor', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): Sample rate of input audio.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('threshold', -20.0, 'float', units='dB', minimum=-80.0, maximum=-5.0))
            self.parameters.add(Parameter('attack_time', 2.0, 'float', units='ms', minimum=1., maximum=20.0))
            self.parameters.add(Parameter('release_time', 100.0, 'float', units='ms', minimum=50.0, maximum=500.0))
            self.parameters.add(Parameter('ratio', 4.0, 'float', minimum=4., maximum=40.0))
            # we remove the makeup_gain parameter inside the Compressor

        # store internal state (for block-wise processing)
        self.yL_prev = None

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): compressed audio of size `n_samples x n_channels`.
        """
        if self.yL_prev is None:
            self.yL_prev = [0.] * x.shape[1]

        if not self.parameters.threshold.value == 0.0 or not self.parameters.ratio.value == 1.0:
            y = np.zeros_like(x)

            for ch in range(x.shape[1]):
                y[:, ch], self.yL_prev[ch] = compressor_process(x[:, ch],
                                                                self.parameters.threshold.value,
                                                                self.parameters.attack_time.value,
                                                                self.parameters.release_time.value,
                                                                self.parameters.ratio.value,
                                                                0.0,  # makeup_gain = 0
                                                                self.sample_rate,
                                                                self.yL_prev[ch])
        else:
            y = x

        return y

    def update(self, parameter_name=None):
        """
        Update processor after randomization of parameters.

        Args:
            parameter_name (str): Parameter whose value has changed.
        """
        self.yL_prev = None

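Note: a static-curve sanity check for `compressor_process` (requires numba; the expected value follows directly from the gain computer above):

import numpy as np

# With threshold=-20 dB and ratio=4, a steady -8 dB input settles at
# threshold + (input - threshold) / ratio = -20 + 12/4 = -17 dB.
x = np.full(44100, 10.0 ** (-8.0 / 20.0))  # one second at -8 dBFS
y, _ = compressor_process(x, -20.0, 2.0, 100.0, 4.0, 0.0, 44100, 0.0)
print(20 * np.log10(np.abs(y[-1])))        # ~ -17.0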
# %%%%%%%%%%%%%%%%%%%%%%%%%% CONVOLUTIONAL REVERB %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
class ConvolutionalReverb(Processor):
    """
    Convolutional Reverb.

    Processor parameters:
        wet_dry (float): Wet/dry ratio.
        decay (float): Applies a fade out to the impulse response.
        pre_delay (float): Value in ms. Shifts the IR in time.
            A positive value produces a traditional delay between the dry signal and the wet.
            A negative delay is, in reality, zero delay, but effectively trims off the start of the IR,
            so the reverb response begins at a point further in.
    """

    def __init__(self, impulse_responses, sample_rate, name='ConvolutionalReverb', parameters=None):
        """
        Initialize processor.

        Args:
            impulse_responses (list): List with impulse responses created by `common_dataprocessing.create_dataset`
            sample_rate (int): Sample rate that we should assume (used for fade-out computation)
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.

        Raises:
            ValueError: if no impulse responses are provided.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if impulse_responses is None:
            raise ValueError('List of impulse responses must be provided for ConvolutionalReverb processor.')
        self.impulse_responses = impulse_responses

        if not parameters:
            self.parameters = ParameterList()
            self.max_ir_num = len(max(impulse_responses, key=len))
            self.parameters.add(Parameter('index', 0, 'int', minimum=0, maximum=len(impulse_responses)))
            self.parameters.add(Parameter('index_ir', 0, 'int', minimum=0, maximum=self.max_ir_num))
            self.parameters.add(Parameter('wet', 1.0, 'float', minimum=1.0, maximum=1.0))
            self.parameters.add(Parameter('dry', 0.0, 'float', minimum=0.0, maximum=0.0))
            self.parameters.add(Parameter('decay', 1.0, 'float', minimum=1.0, maximum=1.0))
            self.parameters.add(Parameter('pre_delay', 0, 'int', units='ms', minimum=0, maximum=0))

    def update(self, parameter_name=None):
        """
        Update processor after randomization of parameters.

        Args:
            parameter_name (str): Parameter whose value has changed.
        """
        # we sample IR with a uniform random distribution according to RT60 values
        chosen_ir_duration = self.impulse_responses[self.parameters.index.value]
        chosen_ir_idx = self.parameters.index_ir.value % len(chosen_ir_duration)
        self.h = np.copy(chosen_ir_duration[chosen_ir_idx]['impulse_response']())

        # fade out the impulse based on the decay setting (starting from peak value)
        if self.parameters.decay.value < 1.:
            idx_peak = np.argmax(np.max(np.abs(self.h), axis=1), axis=0)
            fstart = np.minimum(self.h.shape[0],
                                idx_peak + int(self.parameters.decay.value * (self.h.shape[0] - idx_peak)))
            fstop = np.minimum(self.h.shape[0], fstart + int(0.020*self.sample_rate))  # constant 20 ms fade out
            flen = fstop - fstart

            fade = np.arange(1, flen+1, dtype=self.dtype)/flen
            fade = np.power(0.1, fade * 5)
            self.h[fstart:fstop, :] *= fade[:, np.newaxis]
            self.h = self.h[:fstop]

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): reverbed audio of size `n_samples x n_channels`.
        """
        # reshape IR to the correct size
        n_channels = x.shape[1]
        if self.h.shape[1] == 1 and n_channels > 1:
            self.h = np.hstack([self.h] * n_channels)  # repeat mono IR for multi-channel input
        if self.h.shape[1] > 1 and n_channels == 1:
            self.h = self.h[:, np.random.randint(self.h.shape[1]), np.newaxis]  # randomly choose one IR channel

        if self.parameters.wet.value == 0.0:
            return x
        else:
            # perform convolution to get wet signal
            y = oaconvolve(x, self.h, mode='full', axes=0)

            # cut out wet signal (compensating for the delay that the IR is introducing + predelay)
            idx = np.argmax(np.max(np.abs(self.h), axis=1), axis=0)
            idx += int(0.001 * np.abs(self.parameters.pre_delay.value) * self.sample_rate)

            idx = np.clip(idx, 0, self.h.shape[0]-1)

            y = y[idx:idx+x.shape[0], :]

            # return weighted sum of dry and wet signal
            return self.parameters.dry.value * x + self.parameters.wet.value * y

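Note: the IR container format is implicit in `update()`: `impulse_responses[index][index_ir]['impulse_response']` must be a zero-argument callable returning an `n_samples x n_channels` array. A hedged sketch with a hand-built unit impulse (so the wet path is near-identity):

import numpy as np

ir = np.zeros((4410, 2), dtype=np.float32)
ir[0, :] = 1.0  # unit impulse -> convolution passes audio through
reverb = ConvolutionalReverb([[{'impulse_response': lambda: ir}]], sample_rate=44100)
reverb.update()  # draws self.h from the container
y = reverb.process((0.1 * np.random.randn(44100, 2)).astype(np.float32))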
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%% HAAS EFFECT %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
def haas_process(x, delay, feedback, wet_channel):
    """
    Add Haas effect to audio.

    Args:
        x (Numpy array): input audio.
        delay: Delay that we apply to one of the channels (in samples).
        feedback: Feedback value.
        wet_channel: Which channel we process (`left` or `right`).

    Returns:
        (Numpy array): Audio with Haas effect.
    """
    y = np.copy(x)
    if wet_channel == 'left':
        y[:, 0] += feedback * np.roll(x[:, 0], delay)
    elif wet_channel == 'right':
        y[:, 1] += feedback * np.roll(x[:, 1], delay)

    return y


class Haas(Processor):
    """
    Haas Effect Processor.

    Randomly selects one channel and applies a short delay to it.

    Processor parameters:
        delay (int)
        feedback (float)
        wet_channel (string)
    """

    def __init__(self, sample_rate, delay_range=(-0.040, 0.040), name='Haas', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): Sample rate of input audio.
            delay_range (tuple of floats): minimum/maximum delay for Haas effect.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('delay', int(delay_range[1] * sample_rate), 'int', units='samples',
                                          minimum=int(delay_range[0] * sample_rate),
                                          maximum=int(delay_range[1] * sample_rate)))
            self.parameters.add(Parameter('feedback', 0.35, 'float', minimum=0.33, maximum=0.66))
            self.parameters.add(Parameter('wet_channel', 'left', 'string', options=['left', 'right']))

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): audio with Haas effect of size `n_samples x n_channels`.
        """
        assert x.shape[1] == 1 or x.shape[1] == 2, 'Haas effect only works with monaural or stereo audio.'

        if x.shape[1] < 2:
            x = np.repeat(x, 2, axis=1)

        y = haas_process(x, self.parameters.delay.value,
                         self.parameters.feedback.value, self.parameters.wet_channel.value)

        return y

    def update(self, parameter_name=None):
        """
        Update processor after randomization of parameters.

        Args:
            parameter_name (str): Parameter whose value has changed.
        """
        self.reset_state()

    def reset_state(self):
        """Reset state."""
        self.read_idx = 0
        self.write_idx = self.parameters.delay.value
        self.buffer = np.zeros((65536, 2))

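
# Illustrative usage sketch (not part of the original repo): stereoize a mono
# signal with the Haas processor above. Assumes Parameter values can be assigned
# directly via `.value`, as done elsewhere in this module when randomizing.
def _haas_usage_sketch():
    sr = 44100
    mono = 0.1 * np.random.randn(sr, 1).astype(np.float32)  # n_samples x 1
    haas = Haas(sample_rate=sr)
    haas.parameters.delay.value = int(0.02 * sr)             # 20 ms wet delay
    haas.parameters.feedback.value = 0.5
    haas.parameters.wet_channel.value = 'right'
    stereo = haas.process(mono)                              # mono is repeated to stereo first
    assert stereo.shape == (sr, 2)
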
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% PANNER %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
class Panner(Processor):
    """
    Simple stereo panner.

    If input is mono, output is stereo.
    Originally adapted from https://github.com/csteinmetz1/pymixconsole/blob/master/pymixconsole/processors/panner.py
    """

    def __init__(self, name='Panner', parameters=None):
        """
        Initialize processor.

        Args:
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        # default processor class constructor
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=None)

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('pan', 0.5, 'float', minimum=0., maximum=1.))
            self.parameters.add(Parameter('pan_law', '-4.5dB', 'string',
                                          options=['-4.5dB', 'linear', 'constant_power']))

        # set up the coefficients based on default params
        self.update()

    def _calculate_pan_coefficients(self):
        """
        Calculate panning coefficients from the chosen pan law.

        Based on the set pan law, determine the gain values
        to apply to the left and right channel to achieve the panning effect.
        This operates on the assumption that the input channel is mono.
        The output data will be stereo at the moment, but could be expanded
        to a higher channel count format.
        The panning value is in the range [0, 1], where
        0 means the signal is panned completely to the left, and
        1 means the signal is panned completely to the right.

        Raises:
            ValueError: `self.parameters.pan_law` is not supported.
        """
        self.gains = np.zeros(2, dtype=self.dtype)

        # first scale the linear [0, 1] to [0, pi/2]
        theta = self.parameters.pan.value * (np.pi/2)

        if self.parameters.pan_law.value == 'linear':
            self.gains[0] = ((np.pi/2) - theta) * (2/np.pi)
            self.gains[1] = theta * (2/np.pi)
        elif self.parameters.pan_law.value == 'constant_power':
            self.gains[0] = np.cos(theta)
            self.gains[1] = np.sin(theta)
        elif self.parameters.pan_law.value == '-4.5dB':
            self.gains[0] = np.sqrt(((np.pi/2) - theta) * (2/np.pi) * np.cos(theta))
            self.gains[1] = np.sqrt(theta * (2/np.pi) * np.sin(theta))
        else:
            raise ValueError(f'Invalid pan_law {self.parameters.pan_law.value}.')

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): panned audio of size `n_samples x n_channels`.
        """
        assert x.shape[1] == 1 or x.shape[1] == 2, 'Panner only works with monaural or stereo audio.'

        if x.shape[1] < 2:
            x = np.repeat(x, 2, axis=1)

        return x * self.gains

    def update(self, parameter_name=None):
        """
        Update processor after randomization of parameters.

        Args:
            parameter_name (str): Parameter whose value has changed.
        """
        self._calculate_pan_coefficients()

    def reset_state(self):
        """Reset state."""
        self._output_buffer = np.empty([self.block_size, 2])
        self.update()

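
# Worked check (a sketch, not from the original repo): per-channel gains of the
# three pan laws at center pan (`pan = 0.5`, i.e. theta = pi/4). The -4.5dB law
# is the geometric mean of the linear and constant-power laws, hence its name.
def _pan_law_sketch():
    theta = 0.5 * (np.pi / 2)
    laws = {
        'linear': (((np.pi/2) - theta) * (2/np.pi), theta * (2/np.pi)),         # 0.500 -> -6.0 dB
        'constant_power': (np.cos(theta), np.sin(theta)),                       # 0.707 -> -3.0 dB
        '-4.5dB': (np.sqrt(((np.pi/2) - theta) * (2/np.pi) * np.cos(theta)),
                   np.sqrt(theta * (2/np.pi) * np.sin(theta))),                 # 0.595 -> -4.5 dB
    }
    for name, (g_l, g_r) in laws.items():
        print(f'{name}: L={g_l:.3f} R={g_r:.3f} ({20*np.log10(g_l):.1f} dB)')
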
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% STEREO IMAGER %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
class MidSideImager(Processor):
    def __init__(self, name='IMAGER', parameters=None):
        super().__init__(name, parameters=parameters, block_size=None, sample_rate=None)

        if not parameters:
            self.parameters = ParameterList()
            # values in 0.0~1.0 make the signal more centered, while values in 1.0~2.0 make it wider
            self.parameters.add(Parameter("bal", 0.0, "float", processor=self, minimum=0.0, maximum=2.0))

    def process(self, data):
        """
        Process audio of shape [signal length, 2].

        Note: the stereo imager has no effect on a mono signal (left == right),
        since its side channel is zero. To apply the stereo imager to a mono
        signal, first stereoize it with the Haas effect.
        """
        # to mid-side channels
        mid, side = self.lr_to_ms(data[:, 0], data[:, 1])
        # compute mid-side energies
        mid_e, side_e = np.sum(mid**2), np.sum(side**2)
        total_e = mid_e + side_e
        # maximum side gain that keeps the total energy bounded
        max_side_multiplier = np.sqrt(total_e / (side_e + 1e-3))
        # compute current multiply factor
        cur_bal = round(getattr(self.parameters, "bal").value, 3)
        side_gain = cur_bal if cur_bal <= 1. else max_side_multiplier * (cur_bal - 1)
        # apply the weighting factor to the side channel
        new_side = side * side_gain
        new_side_e = side_e * (side_gain ** 2)
        # rescale the mid channel so that the total energy is preserved
        left_mid_e = total_e - new_side_e
        mid_gain = np.sqrt(left_mid_e / (mid_e + 1e-3))
        new_mid = mid * mid_gain
        # convert back to left-right channels
        left, right = self.ms_to_lr(new_mid, new_side)
        imaged = np.stack([left, right], 1)

        return imaged

    # left-right channeled signal to mid-side signal
    def lr_to_ms(self, left, right):
        mid = left + right
        side = left - right
        return mid, side

    # mid-side channeled signal to left-right signal
    def ms_to_lr(self, mid, side):
        left = (mid + side) / 2
        right = (mid - side) / 2
        return left, right

    def update(self, parameter_name=None):
        return parameter_name

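
# Sanity-check sketch (not from the original repo): the imager rescales mid and
# side so that the total mid+side energy is preserved; whatever energy the new
# side gains, the mid gives up (up to the 1e-3 guard in the denominators).
def _imager_energy_sketch():
    rng = np.random.default_rng(0)
    left = rng.standard_normal(44100)
    right = 0.5 * rng.standard_normal(44100)

    mid, side = left + right, left - right
    mid_e, side_e = np.sum(mid**2), np.sum(side**2)
    total_e = mid_e + side_e

    side_gain = 0.5                                   # e.g. bal = 0.5 (narrower image)
    new_side_e = side_e * side_gain**2
    mid_gain = np.sqrt((total_e - new_side_e) / (mid_e + 1e-3))

    new_total_e = mid_e * mid_gain**2 + new_side_e
    assert np.isclose(new_total_e, total_e, rtol=1e-3)
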
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% GAIN %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
class Gain(Processor):
    """
    Gain Processor.

    Applies gain in dB and can also randomly invert the polarity.

    Processor parameters:
        gain (float): Gain that should be applied (dB scale).
        invert (bool): If True, then we also invert the waveform.
    """

    def __init__(self, name='Gain', parameters=None):
        """
        Initialize processor.

        Args:
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name, parameters=parameters, block_size=None, sample_rate=None)

        if not parameters:
            self.parameters = ParameterList()
            # self.parameters.add(Parameter('gain', 1.0, 'float', units='dB', minimum=-12.0, maximum=6.0))
            self.parameters.add(Parameter('gain', 1.0, 'float', units='dB', minimum=-6.0, maximum=9.0))
            self.parameters.add(Parameter('invert', False, 'bool'))

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): gain-augmented audio of size `n_samples x n_channels`.
        """
        gain = 10 ** (self.parameters.gain.value / 20.)
        if self.parameters.invert.value:
            gain = -gain
        return gain * x

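
# Reference points (a sketch) for the 10**(dB/20) amplitude mapping used by Gain:
# -6 dB ~ 0.501, 0 dB = 1.0, +6 dB ~ 1.995, +9 dB ~ 2.818.
def _gain_db_sketch():
    for db_value in (-6.0, 0.0, 6.0, 9.0):
        print(db_value, 10 ** (db_value / 20.))
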
# %%%%%%%%%%%%%%%%%%%%%%% SIMPLE CHANNEL SWAP %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
class SwapChannels(Processor):
    """
    Swap channels in multi-channel audio.

    Processor parameters:
        index (int): Selects the permutation that we are using.
            Please note that "no permutation" is one of the permutations in `self.permutations` at index `0`.
    """

    def __init__(self, n_channels, name='SwapChannels', parameters=None):
        """
        Initialize processor.

        Args:
            n_channels (int): Number of channels in audio that we want to process.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=None)

        self.permutations = tuple(permutations(range(n_channels), n_channels))

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('index', 0, 'int', minimum=0, maximum=len(self.permutations)))

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): channel-swapped audio of size `n_samples x n_channels`.
        """
        return x[:, self.permutations[self.parameters.index.value]]

# %%%%%%%%%%%%%%%%%%%%%%% Monauralize %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
class Monauralize(Processor):
    """
    Monauralizes audio (i.e., removes spatial information).

    Processor parameters:
        seed_channel (int): Channel that we use for overwriting the others.
    """

    def __init__(self, n_channels, name='Monauralize', parameters=None):
        """
        Initialize processor.

        Args:
            n_channels (int): Number of channels in audio that we want to process.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=None)

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('seed_channel', 0, 'int', minimum=0, maximum=n_channels))

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): monauralized audio of size `n_samples x n_channels`.
        """
        return np.tile(x[:, [self.parameters.seed_channel.value]], (1, x.shape[1]))

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% PITCH SHIFT %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
class PitchShift(Processor):
    """
    Simple pitch shifter using SoX and soxbindings (https://github.com/pseeth/soxbindings).

    Processor parameters:
        steps (float): Pitch shift as positive/negative semitones.
        quick (bool): If True, this effect will run faster but with lower sound quality.
    """

    def __init__(self, sample_rate, fix_length=True, name='PitchShift', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): Sample rate of input audio.
            fix_length (bool): If True, then output has same length as input.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('steps', 0.0, 'float', minimum=-6., maximum=6.))
            self.parameters.add(Parameter('quick', False, 'bool'))

        self.fix_length = fix_length
        self.clips = False

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): pitch-shifted audio of size `n_samples x n_channels`.
        """
        if self.parameters.steps.value == 0.0:
            y = x
        else:
            # attenuate beforehand so that the SoX processing does not clip
            scale = np.max(np.abs(x))
            if scale > 0.9:
                clips = True
                x = x * (0.9 / scale)
            else:
                clips = False

            tfm = sox.Transformer()
            tfm.pitch(self.parameters.steps.value, quick=bool(self.parameters.quick.value))
            y = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate).astype(np.float32)

            if clips:
                y *= scale / 0.9  # rescale output to original scale

        if self.fix_length:
            n_samples_input = x.shape[0]
            n_samples_output = y.shape[0]
            if n_samples_input < n_samples_output:
                idx1 = (n_samples_output - n_samples_input) // 2
                idx2 = idx1 + n_samples_input
                y = y[idx1:idx2]
            elif n_samples_input > n_samples_output:
                n_pad = n_samples_input - n_samples_output
                y = np.pad(y, ((n_pad//2, n_pad - n_pad//2), (0, 0)))

        return y

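
# The SoX-based processors (PitchShift above, TimeStretch and PlaybackSpeed
# below) share the same length-fixing idiom: center-crop when SoX returns extra
# samples, center-pad with zeros when it returns fewer. A standalone sketch:
def _fix_length_center(y, n_target):
    n = y.shape[0]
    if n > n_target:                       # output too long: center-crop
        idx1 = (n - n_target) // 2
        return y[idx1:idx1 + n_target]
    elif n < n_target:                     # output too short: center-pad
        n_pad = n_target - n
        return np.pad(y, ((n_pad // 2, n_pad - n_pad // 2), (0, 0)))
    return y
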
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% TIME STRETCH %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
class TimeStretch(Processor):
    """
    Simple time stretcher using SoX and soxbindings (https://github.com/pseeth/soxbindings).

    Processor parameters:
        factor (float): Time stretch factor.
        quick (bool): If True, this effect will run faster but with lower sound quality.
        stretch_type (str): Algorithm used for stretching (`tempo` or `stretch`).
        audio_type (str): Sets which time segments are most optimal when finding
            the best overlapping points for time stretching.
    """

    def __init__(self, sample_rate, fix_length=True, name='TimeStretch', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): Sample rate of input audio.
            fix_length (bool): If True, then output has same length as input.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('factor', 1.0, 'float', minimum=1/1.33, maximum=1.33))
            self.parameters.add(Parameter('quick', False, 'bool'))
            self.parameters.add(Parameter('stretch_type', 'tempo', 'string', options=['tempo', 'stretch']))
            self.parameters.add(Parameter('audio_type', 'l', 'string', options=['m', 's', 'l']))

        self.fix_length = fix_length

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): time-stretched audio of size `n_samples x n_channels`.
        """
        if self.parameters.factor.value == 1.0:
            y = x
        else:
            scale = np.max(np.abs(x))
            if scale > 0.9:
                clips = True
                x = x * (0.9 / scale)
            else:
                clips = False

            tfm = sox.Transformer()
            if self.parameters.stretch_type.value == 'stretch':
                tfm.stretch(self.parameters.factor.value)
            elif self.parameters.stretch_type.value == 'tempo':
                tfm.tempo(self.parameters.factor.value,
                          audio_type=self.parameters.audio_type.value,
                          quick=bool(self.parameters.quick.value))
            y = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate).astype(np.float32)

            if clips:
                y *= scale / 0.9  # rescale output to original scale

        if self.fix_length:
            n_samples_input = x.shape[0]
            n_samples_output = y.shape[0]
            if n_samples_input < n_samples_output:
                idx1 = (n_samples_output - n_samples_input) // 2
                idx2 = idx1 + n_samples_input
                y = y[idx1:idx2]
            elif n_samples_input > n_samples_output:
                n_pad = n_samples_input - n_samples_output
                y = np.pad(y, ((n_pad//2, n_pad - n_pad//2), (0, 0)))

        return y

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% PLAYBACK SPEED %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
class PlaybackSpeed(Processor):
    """
    Simple playback speed effect using SoX and soxbindings (https://github.com/pseeth/soxbindings).

    Processor parameters:
        factor (float): Playback speed factor.
    """

    def __init__(self, sample_rate, fix_length=True, name='PlaybackSpeed', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): Sample rate of input audio.
            fix_length (bool): If True, then output has same length as input.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('factor', 1.0, 'float', minimum=1./1.33, maximum=1.33))

        self.fix_length = fix_length

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): resampled audio of size `n_samples x n_channels`.
        """
        if self.parameters.factor.value == 1.0:
            y = x
        else:
            scale = np.max(np.abs(x))
            if scale > 0.9:
                clips = True
                x = x * (0.9 / scale)
            else:
                clips = False

            tfm = sox.Transformer()
            tfm.speed(self.parameters.factor.value)
            y = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate).astype(np.float32)

            if clips:
                y *= scale / 0.9  # rescale output to original scale

        if self.fix_length:
            n_samples_input = x.shape[0]
            n_samples_output = y.shape[0]
            if n_samples_input < n_samples_output:
                idx1 = (n_samples_output - n_samples_input) // 2
                idx2 = idx1 + n_samples_input
                y = y[idx1:idx2]
            elif n_samples_input > n_samples_output:
                n_pad = n_samples_input - n_samples_output
                y = np.pad(y, ((n_pad//2, n_pad - n_pad//2), (0, 0)))

        return y

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% BEND %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
class Bend(Processor):
    """
    Simple bend effect using SoX and soxbindings (https://github.com/pseeth/soxbindings).

    Processor parameters:
        n_bends (int): Number of segments or intervals to pitch shift.
    """

    def __init__(self, sample_rate, pitch_range=(-600, 600), fix_length=True, name='Bend', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): Sample rate of input audio.
            pitch_range (tuple of ints): min and max pitch bending ranges in cents.
            fix_length (bool): If True, then output has same length as input.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('n_bends', 2, 'int', minimum=2, maximum=10))
        self.pitch_range_min, self.pitch_range_max = pitch_range

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): pitch-bent audio of size `n_samples x n_channels`.
        """
        n_bends = self.parameters.n_bends.value
        max_length = x.shape[0] / self.sample_rate

        # generate random non-overlapping segments
        delta = 1. / self.sample_rate
        boundaries = np.sort(delta + np.random.rand(n_bends-1) * (max_length - delta))

        start, end = np.zeros(n_bends), np.zeros(n_bends)
        start[0] = delta
        for i, b in enumerate(boundaries):
            end[i] = b
            start[i+1] = b
        end[-1] = max_length

        # randomly sample pitch-shifts in cents
        cents = np.random.randint(self.pitch_range_min, self.pitch_range_max+1, n_bends)

        # remove segment if cent value is zero or start == end (as SoX does not allow such values)
        idx_keep = np.logical_and(cents != 0, start != end)
        n_bends, start, end, cents = sum(idx_keep), start[idx_keep], end[idx_keep], cents[idx_keep]

        # attenuate beforehand so that the SoX processing does not clip
        scale = np.max(np.abs(x))
        if scale > 0.9:
            clips = True
            x = x * (0.9 / scale)
        else:
            clips = False

        tfm = sox.Transformer()
        tfm.bend(n_bends=int(n_bends), start_times=list(start), end_times=list(end), cents=list(cents))
        y = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate).astype(np.float32)

        if clips:
            y *= scale / 0.9  # rescale output to original scale

        return y


# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ALGORITHMIC REVERB %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
class AlgorithmicReverb(Processor):
    def __init__(self, name="algoreverb", parameters=None, sample_rate=44100, **kwargs):

        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate, **kwargs)

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter("room_size", 0.5, "float", minimum=0.05, maximum=0.85))
            self.parameters.add(Parameter("damping", 0.1, "float", minimum=0.0, maximum=1.0))
            self.parameters.add(Parameter("dry_mix", 0.9, "float", minimum=0.0, maximum=1.0))
            self.parameters.add(Parameter("wet_mix", 0.1, "float", minimum=0.0, maximum=1.0))
            self.parameters.add(Parameter("width", 0.7, "float", minimum=0.0, maximum=1.0))

        # Tuning
        self.stereospread = 23
        self.scalegain = 0.2

    def process(self, data):

        if data.ndim >= 2:
            dataL = data[:, 0]
            if data.shape[1] == 2:
                dataR = data[:, 1]
            else:
                dataR = data[:, 0]
        else:
            dataL = data
            dataR = data

        output = np.zeros((data.shape[0], 2))

        xL, xR = self.process_filters(dataL.copy(), dataR.copy())

        # stereo-width crossmix of the wet channels
        wet1_g = self.parameters.wet_mix.value * ((self.parameters.width.value/2) + 0.5)
        wet2_g = self.parameters.wet_mix.value * ((1-self.parameters.width.value)/2)
        dry_g = self.parameters.dry_mix.value

        output[:, 0] = (wet1_g * xL) + (wet2_g * xR) + (dry_g * dataL)
        output[:, 1] = (wet1_g * xR) + (wet2_g * xL) + (dry_g * dataR)

        return output

    def process_filters(self, dataL, dataR):

        # sum of eight parallel comb filters per channel
        xL = self.combL1.process(dataL.copy() * self.scalegain)
        xL += self.combL2.process(dataL.copy() * self.scalegain)
        xL += self.combL3.process(dataL.copy() * self.scalegain)
        xL += self.combL4.process(dataL.copy() * self.scalegain)
        xL += self.combL5.process(dataL.copy() * self.scalegain)
        xL += self.combL6.process(dataL.copy() * self.scalegain)
        xL += self.combL7.process(dataL.copy() * self.scalegain)
        xL += self.combL8.process(dataL.copy() * self.scalegain)

        xR = self.combR1.process(dataR.copy() * self.scalegain)
        xR += self.combR2.process(dataR.copy() * self.scalegain)
        xR += self.combR3.process(dataR.copy() * self.scalegain)
        xR += self.combR4.process(dataR.copy() * self.scalegain)
        xR += self.combR5.process(dataR.copy() * self.scalegain)
        xR += self.combR6.process(dataR.copy() * self.scalegain)
        xR += self.combR7.process(dataR.copy() * self.scalegain)
        xR += self.combR8.process(dataR.copy() * self.scalegain)

        # four allpass filters in series per channel
        yL1 = self.allpassL1.process(xL)
        yL2 = self.allpassL2.process(yL1)
        yL3 = self.allpassL3.process(yL2)
        yL4 = self.allpassL4.process(yL3)

        yR1 = self.allpassR1.process(xR)
        yR2 = self.allpassR2.process(yR1)
        yR3 = self.allpassR3.process(yR2)
        yR4 = self.allpassR4.process(yR3)

        return yL4, yR4

    def update(self, parameter_name):

        rs = self.parameters.room_size.value
        dp = self.parameters.damping.value
        ss = self.stereospread

        # initialize allpass and feedback comb-filters
        # (with coefficients optimized for fs=44.1kHz)
        self.allpassL1 = pymc.components.allpass.Allpass(556, rs, self.block_size)
        self.allpassR1 = pymc.components.allpass.Allpass(556+ss, rs, self.block_size)
        self.allpassL2 = pymc.components.allpass.Allpass(441, rs, self.block_size)
        self.allpassR2 = pymc.components.allpass.Allpass(441+ss, rs, self.block_size)
        self.allpassL3 = pymc.components.allpass.Allpass(341, rs, self.block_size)
        self.allpassR3 = pymc.components.allpass.Allpass(341+ss, rs, self.block_size)
        self.allpassL4 = pymc.components.allpass.Allpass(225, rs, self.block_size)
        self.allpassR4 = pymc.components.allpass.Allpass(225+ss, rs, self.block_size)

        self.combL1 = pymc.components.comb.Comb(1116, dp, rs, self.block_size)
        self.combR1 = pymc.components.comb.Comb(1116+ss, dp, rs, self.block_size)
        self.combL2 = pymc.components.comb.Comb(1188, dp, rs, self.block_size)
        self.combR2 = pymc.components.comb.Comb(1188+ss, dp, rs, self.block_size)
        self.combL3 = pymc.components.comb.Comb(1277, dp, rs, self.block_size)
        self.combR3 = pymc.components.comb.Comb(1277+ss, dp, rs, self.block_size)
        self.combL4 = pymc.components.comb.Comb(1356, dp, rs, self.block_size)
        self.combR4 = pymc.components.comb.Comb(1356+ss, dp, rs, self.block_size)
        self.combL5 = pymc.components.comb.Comb(1422, dp, rs, self.block_size)
        self.combR5 = pymc.components.comb.Comb(1422+ss, dp, rs, self.block_size)
        self.combL6 = pymc.components.comb.Comb(1491, dp, rs, self.block_size)
        self.combR6 = pymc.components.comb.Comb(1491+ss, dp, rs, self.block_size)
        self.combL7 = pymc.components.comb.Comb(1557, dp, rs, self.block_size)
        self.combR7 = pymc.components.comb.Comb(1557+ss, dp, rs, self.block_size)
        self.combL8 = pymc.components.comb.Comb(1617, dp, rs, self.block_size)
        self.combR8 = pymc.components.comb.Comb(1617+ss, dp, rs, self.block_size)

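
# Sketch (not from the original repo) of the wet crossmix in
# AlgorithmicReverb.process: at width=0 both outputs get the equal-weight mono
# wet sum; at width=1 each output hears only its own wet channel (wet_mix=1 here).
def _reverb_width_sketch():
    for width in (0.0, 0.5, 1.0):
        wet1_g = (width / 2) + 0.5
        wet2_g = (1 - width) / 2
        print(width, wet1_g, wet2_g)   # 0.0 -> 0.5/0.5, 1.0 -> 1.0/0.0
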
modules/common_miscellaneous.py
ADDED
@@ -0,0 +1,219 @@
"""
Common miscellaneous functions.

AI Music Technology Group, Sony Group Corporation
AI Speech and Sound Group, Sony Europe

This implementation originally belongs to Sony Group Corporation
and was introduced in the work "Automatic music mixing with deep learning and out-of-domain data".
Original repo link: https://github.com/sony/FxNorm-automix
"""
import functools
import os
import psutil
import sys
import numpy as np
import librosa
import torch
import math


def uprint(s):
    """
    Unbuffered print to stdout.

    We also flush stderr to have the log-file in sync.

    Args:
        s: string to print
    """
    print(s)
    sys.stdout.flush()
    sys.stderr.flush()


def recursive_getattr(obj, attr):
    """
    Run `getattr` recursively (e.g., for `fc1.weight`).

    Args:
        obj: object
        attr: attribute to get

    Returns:
        object
    """
    for a in attr.split('.'):
        obj = getattr(obj, a)
    return obj


def compute_stft(samples, hop_length, fft_size, stft_window):
    """
    Compute the STFT of `samples` applying a window of size `fft_size`, shifted for each frame by `hop_length`.

    Args:
        samples: num samples x channels
        hop_length: window shift in samples
        fft_size: FFT size which is also the window size
        stft_window: STFT analysis window

    Returns:
        stft: frames x channels x freqbins
    """
    n_channels = samples.shape[1]
    n_frames = 1 + int((samples.shape[0] - fft_size) / hop_length)
    stft = np.empty((n_frames, n_channels, fft_size//2 + 1), dtype=np.complex64)

    # convert into f_contiguous (such that [:, n] slicing is c_contiguous)
    samples = np.asfortranarray(samples)

    for n in range(n_channels):
        # compute STFT (output has size `n_frames x N_BINS`)
        stft[:, n, :] = librosa.stft(samples[:, n],
                                     n_fft=fft_size,
                                     hop_length=hop_length,
                                     window=stft_window,
                                     center=False).transpose()
    return stft


def compute_istft(stft, hop_length, stft_window):
    """
    Compute the inverse STFT of `stft`.

    Args:
        stft: frames x channels x freqbins
        hop_length: window shift in samples
        stft_window: STFT synthesis window

    Returns:
        samples: num samples x channels
    """
    for n in range(stft.shape[1]):
        s = librosa.istft(stft[:, n, :].transpose(),
                          hop_length=hop_length, window=stft_window, center=False)
        if n == 0:
            samples = s
        else:
            samples = np.column_stack((samples, s))

    # ensure that we have a 2d array (monaural files are just loaded as vectors)
    if samples.ndim == 1:
        samples = samples[:, np.newaxis]

    return samples


def get_size(obj):
    """
    Recursively find size of objects (in bytes).

    Args:
        obj: object

    Returns:
        size of object
    """
    size = sys.getsizeof(obj)

    if isinstance(obj, dict):
        size += sum([get_size(v) for v in obj.values()])
        size += sum([get_size(k) for k in obj.keys()])
    elif isinstance(obj, functools.partial):
        size += sum([get_size(v) for v in obj.keywords.values()])
        size += sum([get_size(k) for k in obj.keywords.keys()])
    elif isinstance(obj, list):
        size += sum([get_size(i) for i in obj])
    elif isinstance(obj, tuple):
        size += sum([get_size(i) for i in obj])
    return size


def get_process_memory():
    """
    Return memory consumption in GBytes.

    Returns:
        memory used by the process
    """
    return psutil.Process(os.getpid()).memory_info()[0] / (2 ** 30)


def check_complete_convolution(input_size, kernel_size, stride=1,
                               padding=0, dilation=1, note=''):
    """
    Check whether the convolution is complete.

    Prints `True` if no time steps are left over in a Conv1d.

    Args:
        input_size: size of input
        kernel_size: size of kernel
        stride: stride
        padding: padding
        dilation: dilation
        note: string for additional notes
    """
    is_complete = ((input_size + 2*padding - dilation * (kernel_size - 1) - 1)
                   / stride + 1).is_integer()
    uprint(f'{note} {is_complete}')


def pad_to_shape(x: torch.Tensor, y: int) -> torch.Tensor:
    """
    Right-pad or right-trim the first argument's last dimension to have the same size as the second argument.

    Args:
        x: Tensor to be padded.
        y: Size to pad/trim x's last dimension to.

    Returns:
        `x` padded to match `y`'s dimension.
    """
    inp_len = y
    output_len = x.shape[-1]
    return torch.nn.functional.pad(x, [0, inp_len - output_len])


def valid_length(input_size, kernel_size, stride=1, padding=0, dilation=1):
    """
    Return the nearest valid upper length to use with the model so that there are no time steps left over in a Conv1d.

    For all layers, (input_size - kernel_size) % stride == 0 must hold.
    Here "valid" means that no leftover frame is neglected and discarded.

    Args:
        input_size: size of input
        kernel_size: size of kernel
        stride: stride
        padding: padding
        dilation: dilation

    Returns:
        valid length for convolution
    """
    length = math.ceil((input_size + 2*padding - dilation * (kernel_size - 1) - 1)/stride) + 1
    length = (length - 1) * stride - 2*padding + dilation * (kernel_size - 1) + 1

    return int(length)


def td_length_from_fd(fd_length: int, fft_size: int, fft_hop: int) -> int:
    """
    Return the length in time domain, given the length in frequency domain.

    Return the necessary length in the time domain of a signal to be transformed into
    a signal of length `fd_length` in time-frequency domain with the given STFT
    parameters `fft_size` and `fft_hop`. No padding is assumed.

    Args:
        fd_length: length in frequency domain
        fft_size: size of FFT
        fft_hop: hop length

    Returns:
        length in time domain
    """
    return (fd_length - 1) * fft_hop + fft_size
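
# Round-trip sketch (not from the original repo): with a sqrt-Hann analysis/
# synthesis window pair at 4x overlap, compute_stft followed by compute_istft
# reproduces the signal up to edge effects. The window choice mirrors the one
# used in data_normalization.py below.
def _stft_roundtrip_sketch():
    fft_size, hop = 4096, 1024
    window = np.sqrt(np.hanning(fft_size + 1)[:-1])
    x = np.random.randn(10 * fft_size, 2).astype(np.float32)

    X = compute_stft(x, hop, fft_size, window)     # frames x channels x freqbins
    y = compute_istft(X, hop, window)

    n = min(x.shape[0], y.shape[0])
    err = np.max(np.abs(x[fft_size:n - fft_size] - y[fft_size:n - fft_size]))
    print('max interior reconstruction error:', err)   # expected to be tiny
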
modules/data_normalization.py
ADDED
@@ -0,0 +1,342 @@
"""
Implementation of the 'audio effects chain normalization'.
"""
import numpy as np
import scipy
import soundfile as sf
import pyloudnorm as pyln  # aliased as pyln to match the pyln.* calls below

from glob import glob
import os
import sys
currentdir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(currentdir)
from utils_data_normalization import *
from normalization_imager import *


'''
Audio Effects Chain Normalization
process: normalizes input stems according to given precomputed features
'''
class Audio_Effects_Normalizer:
    def __init__(self, precomputed_feature_path=None,
                 STEMS=['drums', 'bass', 'other', 'vocals'],
                 EFFECTS=['eq', 'compression', 'imager', 'loudness'],
                 audio_extension='wav'):
        self.STEMS = STEMS  # stems to be normalized
        self.EFFECTS = EFFECTS  # effects to be normalized, order matters
        self.audio_extension = audio_extension
        self.precomputed_feature_path = precomputed_feature_path

        # Audio settings
        self.SR = 44100
        self.SUBTYPE = 'PCM_16'

        # General settings
        self.FFT_SIZE = 2**16
        self.HOP_LENGTH = self.FFT_SIZE//4

        # Loudness
        self.NTAPS = 1001
        self.LUFS = -30
        self.MIN_DB = -40  # min amplitude to apply EQ matching

        # Compressor
        self.COMP_USE_EXPANDER = False
        self.COMP_PEAK_NORM = -10.0
        self.COMP_TRUE_PEAK = False
        self.COMP_PERCENTILE = 75  # features_mean (v1) was done with 25
        self.COMP_MIN_TH = -40
        self.COMP_MAX_RATIO = 20
        comp_settings = {key: {} for key in self.STEMS}
        for key in comp_settings:
            if key == 'vocals':
                comp_settings[key]['attack'] = 7.5
                comp_settings[key]['release'] = 400.0
                comp_settings[key]['ratio'] = 4
                comp_settings[key]['n_mels'] = 128
            elif key == 'drums':
                comp_settings[key]['attack'] = 10.0
                comp_settings[key]['release'] = 180.0
                comp_settings[key]['ratio'] = 6
                comp_settings[key]['n_mels'] = 128
            elif key == 'bass':
                comp_settings[key]['attack'] = 10.0
                comp_settings[key]['release'] = 500.0
                comp_settings[key]['ratio'] = 5
                comp_settings[key]['n_mels'] = 16
            elif key == 'other' or key == 'mixture':
                comp_settings[key]['attack'] = 15.0
                comp_settings[key]['release'] = 666.0
                comp_settings[key]['ratio'] = 4
                comp_settings[key]['n_mels'] = 128
        self.comp_settings = comp_settings

        if precomputed_feature_path is not None and os.path.isfile(precomputed_feature_path):
            # load pre-computed audio effects features
            features_mean = np.load(precomputed_feature_path, allow_pickle='TRUE')[()]
            self.features_mean = self.smooth_feature(features_mean)

    # compute audio effects' mean feature values
    def compute_mean(self, base_dir_path, save_feat=True, single_file=False):

        audio_path_dict = {}
        for cur_stem in self.STEMS:
            # if single_file=True, base_dir_path is the target file path
            audio_path_dict[cur_stem] = [base_dir_path] if single_file \
                else glob(os.path.join(base_dir_path, "**", f"{cur_stem}.{self.audio_extension}"), recursive=True)

        features_dict = {}
        features_mean = {}
        for effect in self.EFFECTS:
            features_dict[effect] = {key: [] for key in self.STEMS}
            features_mean[effect] = {key: [] for key in self.STEMS}

        stems_names = self.STEMS.copy()
        for effect in self.EFFECTS:
            print(f'{effect} ...')
            j = 0
            for key in self.STEMS:
                print(f'{key} ...')
                i = []
                for i_, p_ in enumerate(audio_path_dict[key]):
                    i.append(i_)
                i = np.asarray(i) + j
                j += len(i)

                features_ = []
                for cur_i, cur_audio_path in enumerate(audio_path_dict[key]):
                    print(f'getting {effect} features for {key} - stem {cur_i} of {len(audio_path_dict[key])-1} {cur_audio_path}')
                    features_.append(self.get_norm_feature(cur_audio_path, cur_i, effect, key))

                features_dict[effect][key] = features_

                print(effect, key, len(features_dict[effect][key]))
                s = np.asarray(features_dict[effect][key])
                s = np.mean(s, axis=0)
                features_mean[effect][key] = s

                if effect == 'eq':
                    assert len(s) == 1+self.FFT_SIZE//2, len(s)
                elif effect == 'compression':
                    assert len(s) == 2, len(s)
                elif effect == 'panning':
                    assert len(s) == 1+self.FFT_SIZE//2, len(s)
                elif effect == 'loudness':
                    assert len(s) == 1, len(s)

                if effect == 'eq':
                    if key in ['other', 'vocals', 'mixture']:
                        f = 401
                    else:
                        f = 151
                    features_mean[effect][key] = scipy.signal.savgol_filter(features_mean[effect][key],
                                                                            f, 1, mode='mirror')
                elif effect == 'panning':
                    features_mean[effect][key] = scipy.signal.savgol_filter(features_mean[effect][key],
                                                                            501, 1, mode='mirror')
        if save_feat:
            np.save(self.precomputed_feature_path, features_mean)
        self.features_mean = self.smooth_feature(features_mean)
        print('---feature mean computation completed---')

        return self.features_mean

    def get_norm_feature(self, path, i, effect, stem):

        if isinstance(path, str):
            audio, fs = sf.read(path)
            assert fs == self.SR
        else:
            audio = path
            fs = self.SR
        all_zeros = not np.any(audio)

        if not all_zeros:

            audio = np.pad(audio, ((self.FFT_SIZE, self.FFT_SIZE), (0, 0)), mode='constant')

            max_db = amp_to_db(np.max(np.abs(audio)))

            if max_db > self.MIN_DB:

                if effect == 'loudness':
                    meter = pyln.Meter(self.SR)
                    loudness = meter.integrated_loudness(audio)
                    return [loudness]

                elif effect == 'eq':
                    audio = lufs_normalize(audio, self.SR, self.LUFS, log=False)
                    audio_spec = compute_stft(audio,
                                              self.HOP_LENGTH,
                                              self.FFT_SIZE,
                                              np.sqrt(np.hanning(self.FFT_SIZE+1)[:-1]))
                    audio_spec = np.abs(audio_spec)
                    audio_spec_avg = np.mean(audio_spec, axis=(0, 1))
                    return audio_spec_avg

                elif effect == 'panning':
                    phi = get_SPS(audio,
                                  n_fft=self.FFT_SIZE,
                                  hop_length=self.HOP_LENGTH,
                                  smooth=False,
                                  frames=False)
                    return phi[1]

                elif effect == 'compression':
                    x = pyln.normalize.peak(audio, self.COMP_PEAK_NORM)
                    peak_std = get_mean_peak(x,
                                             sr=self.SR,
                                             true_peak=self.COMP_TRUE_PEAK,
                                             percentile=self.COMP_PERCENTILE,
                                             n_mels=self.comp_settings[stem]['n_mels'])

                    return peak_std

                elif effect == 'imager':
                    mid, side = lr_to_ms(audio[:, 0], audio[:, 1])
                    return print_balance(mid, side, verbose=False)

            else:
                print(f'{path} is silence...')
                return None

        else:

            print(f'{path} is only zeros...')
            return None

    # normalize current audio input with the order of designed audio FX
    def normalize_audio(self, audio, src):
        assert src in self.STEMS

        normalized_audio = audio
        for cur_effect in self.EFFECTS:
            normalized_audio = self.normalize_audio_per_effect(normalized_audio, src=src, effect=cur_effect)

        return normalized_audio

    # normalize current audio input with the current targeted audio FX
    def normalize_audio_per_effect(self, audio, src, effect):
        audio = audio.astype(dtype=np.float32)
        audio_track = np.pad(audio, ((self.FFT_SIZE, self.FFT_SIZE), (0, 0)), mode='constant')

        assert len(audio_track.shape) == 2  # always expects two dimensions

        if audio_track.shape[1] == 1:  # convert mono to stereo with repeated channels
            audio_track = np.repeat(audio_track, 2, axis=-1)

        output_audio = audio_track.copy()

        max_db = amp_to_db(np.max(np.abs(output_audio)))
        if max_db > self.MIN_DB:

            if effect == 'eq':
                # normalize each channel
                for ch in range(audio_track.shape[1]):
                    audio_eq_matched = get_eq_matching(output_audio[:, ch],
                                                       self.features_mean[effect][src],
                                                       sr=self.SR,
                                                       n_fft=self.FFT_SIZE,
                                                       hop_length=self.HOP_LENGTH,
                                                       min_db=self.MIN_DB,
                                                       ntaps=self.NTAPS,
                                                       lufs=self.LUFS)
                    np.copyto(output_audio[:, ch], audio_eq_matched)

            elif effect == 'compression':
                assert len(self.features_mean[effect][src]) == 2
                # normalize each channel
                for ch in range(audio_track.shape[1]):
                    try:
                        audio_comp_matched = get_comp_matching(output_audio[:, ch],
                                                               self.features_mean[effect][src][0],
                                                               self.features_mean[effect][src][1],
                                                               self.comp_settings[src]['ratio'],
                                                               self.comp_settings[src]['attack'],
                                                               self.comp_settings[src]['release'],
                                                               sr=self.SR,
                                                               min_db=self.MIN_DB,
                                                               min_th=self.COMP_MIN_TH,
                                                               comp_peak_norm=self.COMP_PEAK_NORM,
                                                               max_ratio=self.COMP_MAX_RATIO,
                                                               n_mels=self.comp_settings[src]['n_mels'],
                                                               true_peak=self.COMP_TRUE_PEAK,
                                                               percentile=self.COMP_PERCENTILE,
                                                               expander=self.COMP_USE_EXPANDER)

                        np.copyto(output_audio[:, ch], audio_comp_matched[:, 0])
                    except Exception:
                        # skip compression matching for this track if it fails
                        break

            elif effect == 'loudness':
                output_audio = lufs_normalize(output_audio, self.SR, self.features_mean[effect][src], log=False)

            elif effect == 'imager':
                # threshold for applying the Haas effect
                mono_threshold = 0.99 if src == 'bass' else 0.975
                audio_imager_matched = normalize_imager(output_audio,
                                                        target_side_mid_bal=self.features_mean[effect][src][0],
                                                        mono_threshold=mono_threshold,
                                                        sr=self.SR)

                np.copyto(output_audio, audio_imager_matched)

        output_audio = output_audio[self.FFT_SIZE:self.FFT_SIZE+audio.shape[0]]

        return output_audio

    def smooth_feature(self, feature_dict_):

        for effect in self.EFFECTS:
            for key in self.STEMS:
                if effect == 'eq':
                    if key in ['other', 'vocals', 'mixture']:
                        f = 401
                    else:
                        f = 151
                    feature_dict_[effect][key] = scipy.signal.savgol_filter(feature_dict_[effect][key],
                                                                            f, 1, mode='mirror')
                elif effect == 'panning':
                    feature_dict_[effect][key] = scipy.signal.savgol_filter(feature_dict_[effect][key],
                                                                            501, 1, mode='mirror')
        return feature_dict_

    # compute "normalization" based on a single sample
    def feature_matching(self, src_aud_path, ref_aud_path):
        # compute mean features from the reference audio
        mean_feature = self.compute_mean(ref_aud_path, save_feat=False, single_file=True)
        print(mean_feature)

        src_aud, sr = sf.read(src_aud_path)
        normalized_audio = self.normalize_audio(src_aud, 'mixture')

        return normalized_audio


def lufs_normalize(x, sr, lufs, log=True):

    # measure the loudness first
    meter = pyln.Meter(sr)  # create BS.1770 meter
    loudness = meter.integrated_loudness(x + 1e-10)
    if log:
        print("original loudness: ", loudness, " max value: ", np.max(np.abs(x)))

    loudness_normalized_audio = pyln.normalize.loudness(x, loudness, lufs)

    maxabs_amp = np.maximum(1.0, 1e-6 + np.max(np.abs(loudness_normalized_audio)))
    loudness_normalized_audio /= maxabs_amp

    loudness = meter.integrated_loudness(loudness_normalized_audio)
    if log:
        print("new loudness: ", loudness, " max value: ", np.max(np.abs(loudness_normalized_audio)))

    return loudness_normalized_audio
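
# Minimal usage sketch (file names here are hypothetical placeholders, not from
# the original repo): normalize a 44.1 kHz stereo mixture toward precomputed
# features. The features file must contain entries for the 'mixture' stem.
def _normalizer_usage_sketch():
    normalizer = Audio_Effects_Normalizer(precomputed_feature_path='features_mean.npy',
                                          STEMS=['mixture'],
                                          EFFECTS=['eq', 'compression', 'imager', 'loudness'])
    audio, sr = sf.read('input_mix.wav')
    normalized = normalizer.normalize_audio(audio, src='mixture')
    sf.write('normalized_mix.wav', normalized, sr)
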
modules/fx_utils.py
ADDED
@@ -0,0 +1,308 @@
1 |
+
import warnings
|
2 |
+
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import scipy
|
6 |
+
import math
|
7 |
+
import librosa
|
8 |
+
import librosa.display
|
9 |
+
import fnmatch
|
10 |
+
import os
|
11 |
+
from functools import partial
|
12 |
+
import pyloudnorm
|
13 |
+
from scipy.signal import lfilter
|
14 |
+
from sklearn.metrics import mean_absolute_error, mean_squared_error
|
15 |
+
from sklearn.metrics.pairwise import paired_distances
|
16 |
+
|
17 |
+
|
18 |
+
import matplotlib.pyplot as plt
|
19 |
+
|
20 |
+
def db(x):
|
21 |
+
"""Computes the decible energy of a signal"""
|
22 |
+
return 20*np.log10(np.sqrt(np.mean(np.square(x))))
|
23 |
+
|
24 |
+
def melspectrogram(y, mirror_pad=False):
|
25 |
+
"""Compute melspectrogram feature extraction
|
26 |
+
|
27 |
+
Keyword arguments:
|
28 |
+
signal -- input audio as a signal in a numpy object
|
29 |
+
inputnorm -- normalization of output
|
30 |
+
mirror_pad -- pre and post-pend mirror signals
|
31 |
+
|
32 |
+
Returns freq x time
|
33 |
+
|
34 |
+
|
35 |
+
Assumes the input sampling rate is 22050Hz
|
36 |
+
"""
|
37 |
+
|
38 |
+
# Extract mel.
|
39 |
+
fftsize = 1024
|
40 |
+
window = 1024
|
41 |
+
hop = 512
|
42 |
+
melBin = 128
|
43 |
+
sr = 22050
|
44 |
+
|
45 |
+
# mirror pad signal
|
46 |
+
# first embedding centered on time 0
|
47 |
+
# last embedding centered on end of signal
|
48 |
+
if mirror_pad:
|
49 |
+
y = np.insert(y, 0, y[0:int(half_frame_length_sec * sr)][::-1])
|
50 |
+
y = np.insert(y, len(y), y[-int(half_frame_length_sec * sr):][::-1])
|
51 |
+
|
52 |
+
S = librosa.core.stft(y,n_fft=fftsize,hop_length=hop,win_length=window)
|
53 |
+
X = np.abs(S)
|
54 |
+
mel_basis = librosa.filters.mel(sr,n_fft=fftsize,n_mels=melBin)
|
55 |
+
mel_S = np.dot(mel_basis,X)
|
56 |
+
|
57 |
+
# value log compression
|
58 |
+
mel_S = np.log10(1+10*mel_S)
|
59 |
+
mel_S = mel_S.astype(np.float32)
|
60 |
+
|
61 |
+
|
62 |
+
return mel_S
|
63 |
+
|
64 |
+
|
65 |
+
def getFilesPath(directory, extension):
|
66 |
+
|
67 |
+
n_path=[]
|
68 |
+
for path, subdirs, files in os.walk(directory):
|
69 |
+
for name in files:
|
70 |
+
if fnmatch.fnmatch(name, extension):
|
71 |
+
n_path.append(os.path.join(path,name))
|
72 |
+
n_path.sort()
|
73 |
+
|
74 |
+
return n_path
|
75 |
+
|
76 |
+
|
77 |
+
|
78 |
+
def getRandomTrim(x, length, pad=0, start=None):
|
79 |
+
|
80 |
+
length = length+pad
|
81 |
+
if x.shape[0] <= length:
|
82 |
+
x_ = x
|
83 |
+
while(x.shape[0] <= length):
|
84 |
+
x_ = np.concatenate((x_,x_))
|
85 |
+
else:
|
86 |
+
if start is None:
|
87 |
+
start = np.random.randint(0, x.shape[0]-length, size=None)
|
88 |
+
end = length+start
|
89 |
+
if end > x.shape[0]:
|
90 |
+
x_ = x[start:]
|
91 |
+
x_ = np.concatenate((x_, x[:length-x.shape[0]]))
|
92 |
+
else:
|
93 |
+
x_ = x[start:length+start]
|
94 |
+
|
95 |
+
return x_[:length]
|
96 |
+
|
97 |
+
def fadeIn(x, length=128):
|
98 |
+
|
99 |
+
w = scipy.signal.hann(length*2, sym=True)
|
100 |
+
w1 = w[0:length]
|
101 |
+
ones = np.ones(int(x.shape[0]-length))
|
102 |
+
w = np.append(w1, ones)
|
103 |
+
|
104 |
+
return x*w
|
105 |
+
|
106 |
+
def fadeOut(x, length=128):
|
107 |
+
|
108 |
+
w = scipy.signal.hann(length*2, sym=True)
|
109 |
+
w2 = w[length:length*2]
|
110 |
+
ones = np.ones(int(x.shape[0]-length))
|
111 |
+
w = np.append(ones, w2)
|
112 |
+
|
113 |
+
return x*w
|
114 |
+
|
115 |
+
|
116 |
+
def plotTimeFreq(audio, sr, n_fft=512, hop_length=128, ylabels=None):

    n = len(audio)
    # plt.figure(figsize=(14, 4*n))
    colors = list(plt.cm.viridis(np.linspace(0, 1, n)))

    X = []
    X_db = []
    maxs = np.zeros((n,))
    mins = np.zeros((n,))
    maxs_t = np.zeros((n,))
    for i, x in enumerate(audio):

        if x.ndim == 2 and x.shape[-1] == 2:
            x = librosa.core.to_mono(x.T)
        X_ = librosa.stft(x, n_fft=n_fft, hop_length=hop_length)
        X_db_ = librosa.amplitude_to_db(abs(X_))
        X.append(X_)
        X_db.append(X_db_)
        maxs[i] = np.max(X_db_)
        mins[i] = np.min(X_db_)
        maxs_t[i] = np.max(np.abs(x))
    vmax = np.max(maxs)
    vmin = np.min(mins)
    tmax = np.max(maxs_t)
    for i, x in enumerate(audio):

        if x.ndim == 2 and x.shape[-1] == 2:
            x = librosa.core.to_mono(x.T)

        plt.subplot(n, 2, 2 * i + 1)
        librosa.display.waveplot(x, sr=sr, color=colors[i])
        if ylabels:
            plt.ylabel(ylabels[i])

        plt.ylim(-tmax, tmax)
        plt.subplot(n, 2, 2 * i + 2)
        librosa.display.specshow(X_db[i], sr=sr, x_axis='time', y_axis='log',
                                 hop_length=hop_length, cmap='GnBu', vmax=vmax, vmin=vmin)
        # plt.colorbar(format='%+2.0f dB')

def slicing(x, win_length, hop_length, center=True, windowing=False, pad=0):
    # Pad the time series so that frames are centered
    if center:
        # x = np.pad(x, int((win_length-hop_length+pad) // 2), mode='constant')
        x = np.pad(x, ((int((win_length - hop_length + pad) // 2), int((win_length + hop_length + pad) // 2)),), mode='constant')

    # Window the time series.
    y_frames = librosa.util.frame(x, frame_length=win_length, hop_length=hop_length)
    if windowing:
        window = scipy.signal.hann(win_length, sym=False)
    else:
        window = 1.0
    f = []
    for i in range(len(y_frames.T)):
        f.append(y_frames.T[i] * window)
    return np.float32(np.asarray(f))


def overlap(x, x_len, win_length, hop_length, windowing=True, rate=1):
    x = x.reshape(x.shape[0], x.shape[1]).T
    if windowing:
        window = scipy.signal.hann(win_length, sym=False)
        rate = rate * hop_length / win_length
    else:
        window = 1
        rate = 1
    n_frames = x_len / hop_length
    expected_signal_len = int(win_length + hop_length * (n_frames))
    y = np.zeros(expected_signal_len)
    for i in range(int(n_frames)):
        sample = i * hop_length
        w = x[:, i]
        y[sample:(sample + win_length)] = y[sample:(sample + win_length)] + w * window
    y = y[int(win_length // 2):-int(win_length // 2)]
    return np.float32(y * rate)

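# Illustrative round-trip sketch (not part of the original file): slicing()
# cuts a signal into (optionally windowed) frames, and overlap() overlap-adds
# the frames back into a signal of the original length.
_x = np.random.randn(44100).astype(np.float32)
_frames = slicing(_x, win_length=1024, hop_length=512, windowing=True)
print(_frames.shape)                                       # (n_frames, 1024)
_y = overlap(_frames, len(_x), 1024, 512, windowing=True)
print(_y.shape)                                            # (44100,)
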
def highpassFiltering(x_list, f0, sr):

    b1, a1 = scipy.signal.butter(4, f0 / (sr / 2), 'highpass')
    x_f = []
    for x in x_list:
        x_f_ = scipy.signal.filtfilt(b1, a1, x).copy(order='F')
        x_f.append(x_f_)
    return x_f


def lineartodB(x):
    return 20 * np.log10(x)


def dBtoLinear(x):
    return np.power(10, x / 20)

def lufs_normalize(x, sr, lufs, log=True):

    # measure the loudness first
    meter = pyloudnorm.Meter(sr)  # create BS.1770 meter
    loudness = meter.integrated_loudness(x + 1e-10)
    if log:
        print("original loudness: ", loudness, " max value: ", np.max(np.abs(x)))

    loudness_normalized_audio = pyloudnorm.normalize.loudness(x, loudness, lufs)

    maxabs_amp = np.maximum(1.0, 1e-6 + np.max(np.abs(loudness_normalized_audio)))
    loudness_normalized_audio /= maxabs_amp

    loudness = meter.integrated_loudness(loudness_normalized_audio)
    if log:
        print("new loudness: ", loudness, " max value: ", np.max(np.abs(loudness_normalized_audio)))

    return loudness_normalized_audio

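# Illustrative usage sketch (not part of the original file): normalize a quiet
# five-second tone to -14 LUFS; the helper also rescales to avoid clipping.
_sr = 44100
_t = np.arange(5 * _sr) / _sr
_tone = 0.1 * np.sin(2 * np.pi * 440.0 * _t)
_norm = lufs_normalize(_tone, _sr, -14.0, log=False)
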
import soxbindings as sox


def lufs_normalize_compand(x, sr, lufs):

    tfm = sox.Transformer()
    tfm.compand(attack_time=0.001,
                decay_time=0.01,
                soft_knee_db=1.0,
                tf_points=[(-70, -70), (-0.1, -20), (0, 0)])

    x = tfm.build_array(input_array=x, sample_rate_in=sr).astype(np.float32)

    # measure the loudness first
    meter = pyloudnorm.Meter(sr)  # create BS.1770 meter
    loudness = meter.integrated_loudness(x)
    print("original loudness: ", loudness, " max value: ", np.max(np.abs(x)))

    loudness_normalized_audio = pyloudnorm.normalize.loudness(x, loudness, lufs)

    maxabs_amp = np.maximum(1.0, 1e-6 + np.max(np.abs(loudness_normalized_audio)))
    loudness_normalized_audio /= maxabs_amp

    loudness = meter.integrated_loudness(loudness_normalized_audio)
    print("new loudness: ", loudness, " max value: ", np.max(np.abs(loudness_normalized_audio)))

    return loudness_normalized_audio

def getDistances(x, y):

    distances = {}
    distances['mae'] = mean_absolute_error(x, y)
    distances['mse'] = mean_squared_error(x, y)
    distances['euclidean'] = np.mean(paired_distances(x, y, metric='euclidean'))
    distances['manhattan'] = np.mean(paired_distances(x, y, metric='manhattan'))
    distances['cosine'] = np.mean(paired_distances(x, y, metric='cosine'))

    for key in distances:
        distances[key] = round(distances[key], 5)

    return distances


def getMFCC(x, sr, mels=128, mfcc=13, mean_norm=False):

    melspec = librosa.feature.melspectrogram(y=x, sr=sr, S=None,
                                             n_fft=1024, hop_length=256,
                                             n_mels=mels, power=2.0)
    melspec_dB = librosa.power_to_db(melspec, ref=np.max)
    mfcc = librosa.feature.mfcc(S=melspec_dB, sr=sr, n_mfcc=mfcc)
    if mean_norm:
        mfcc -= (np.mean(mfcc, axis=0))
    return mfcc


def getMSE_MFCC(y_true, y_pred, sr, mels=128, mfcc=13, mean_norm=False):

    # match the overall level before comparing timbre
    ratio = np.mean(np.abs(y_true)) / np.mean(np.abs(y_pred))
    y_pred = ratio * y_pred

    y_mfcc = getMFCC(y_true, sr, mels=mels, mfcc=mfcc, mean_norm=mean_norm)
    z_mfcc = getMFCC(y_pred, sr, mels=mels, mfcc=mfcc, mean_norm=mean_norm)

    return getDistances(y_mfcc[:, :], z_mfcc[:, :])
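
# Illustrative check (not part of the original file): comparing a signal with
# a gain-scaled copy of itself should give near-zero MFCC distances, since
# getMSE_MFCC() compensates the overall level before extracting features.
_sr = 22050
_ref = np.random.randn(2 * _sr).astype(np.float32)
print(getMSE_MFCC(_ref, 0.5 * _ref, _sr))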
modules/normalization_imager.py
ADDED
@@ -0,0 +1,123 @@
"""
Implementation of the normalization process for stereo-imaging and panning effects
"""
import numpy as np
import sys
import os

currentdir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(currentdir)
from common_audioeffects import AugmentationChain, Haas


'''
### normalization algorithm for stereo imaging and panning effects ###
process:
    1. take a 2-channel audio input
    2. apply a Haas effect if the input is almost mono
    3. normalize the mid-side channels towards a precomputed target feature value
    4. normalize the left-right channels to a 50-50 balance
    5. normalize the mid-side channels again
'''
def normalize_imager(data,
                     target_side_mid_bal=0.9,
                     mono_threshold=0.95,
                     sr=44100,
                     eps=1e-04,
                     verbose=False):

    # to mid-side channels
    mid, side = lr_to_ms(data[:, 0], data[:, 1])

    if verbose:
        print_balance(data[:, 0], data[:, 1])
        print_balance(mid, side)
        print()

    # apply mid-side weights according to energy
    mid_e, side_e = np.sum(mid**2), np.sum(side**2)
    total_e = mid_e + side_e
    # apply Haas effect to an almost-mono signal
    if mid_e / total_e > mono_threshold:
        aug_chain = AugmentationChain(fxs=[(Haas(sample_rate=sr), 1, True)])
        data = aug_chain([data])[0]
        mid, side = lr_to_ms(data[:, 0], data[:, 1])

        if verbose:
            print_balance(data[:, 0], data[:, 1])
            print_balance(mid, side)
            print()

    # normalize mid-side channels (stereo imaging)
    new_mid, new_side = process_balance(mid, side, tgt_e1_bal=target_side_mid_bal, eps=eps)
    left, right = ms_to_lr(new_mid, new_side)
    imaged = np.stack([left, right], 1)

    if verbose:
        print_balance(new_mid, new_side)
        print_balance(left, right)
        print()

    # normalize panning so the left-right balance is 50-50
    left, right = process_balance(left, right, tgt_e1_bal=0.5, eps=eps)
    mid, side = lr_to_ms(left, right)

    if verbose:
        print_balance(mid, side)
        print_balance(left, right)
        print()

    # normalize mid-side channels again (stereo imaging)
    new_mid, new_side = process_balance(mid, side, tgt_e1_bal=target_side_mid_bal, eps=eps)
    left, right = ms_to_lr(new_mid, new_side)
    imaged = np.stack([left, right], 1)

    if verbose:
        print_balance(new_mid, new_side)
        print_balance(left, right)
        print()

    return imaged

# balance the energies of two input signals according to a given balance
# tgt_e1_bal range = [0.0, 1.0]
# tgt_e2_bal = 1.0 - tgt_e1_bal
def process_balance(data_1, data_2, tgt_e1_bal=0.5, eps=1e-04):

    e_1, e_2 = np.sum(data_1**2), np.sum(data_2**2)
    total_e = e_1 + e_2

    tgt_1_gain = np.sqrt(tgt_e1_bal * total_e / (e_1 + eps))

    new_data_1 = data_1 * tgt_1_gain
    new_e_1 = e_1 * (tgt_1_gain ** 2)
    left_e_1 = total_e - new_e_1
    tgt_2_gain = np.sqrt(left_e_1 / (e_2 + 1e-3))
    new_data_2 = data_2 * tgt_2_gain

    return new_data_1, new_data_2

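# Illustrative numeric check (not part of the original file): rebalance two
# equal-energy signals so the first carries 90% of the total energy; the
# combined energy is preserved up to the eps terms in the gain computation.
_a = np.ones(1000) * 0.5
_b = np.ones(1000) * 0.5
_a2, _b2 = process_balance(_a, _b, tgt_e1_bal=0.9)
print(np.sum(_a2**2) / (np.sum(_a2**2) + np.sum(_b2**2)))   # ~0.9
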
# left-right signal to mid-side signal
def lr_to_ms(left, right):
    mid = left + right
    side = left - right
    return mid, side


# mid-side signal to left-right signal
def ms_to_lr(mid, side):
    left = (mid + side) / 2
    right = (mid - side) / 2
    return left, right


# print the energy balance of 2 inputs
def print_balance(data_1, data_2, verbose=True):
    e_1, e_2 = np.sum(data_1**2), np.sum(data_2**2)
    total_e = e_1 + e_2
    if verbose:
        print(total_e, e_1 / total_e, e_2 / total_e)
    return e_1 / total_e, e_2 / total_e
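
# Illustrative usage sketch (not part of the original file): lr_to_ms()
# followed by ms_to_lr() is an exact round trip, and normalize_imager()
# rebalances the mid/side energy of a (samples, 2) stereo array towards the
# given target.
_x = np.random.randn(44100, 2)
_m, _s = lr_to_ms(_x[:, 0], _x[:, 1])
_l, _r = ms_to_lr(_m, _s)
print(np.allclose(_l, _x[:, 0]), np.allclose(_r, _x[:, 1]))   # True True
_out = normalize_imager(_x, target_side_mid_bal=0.9)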
modules/utils_data_normalization.py
ADDED
@@ -0,0 +1,992 @@
import os
import sys
import time
import numpy as np
import scipy
import librosa
import pyloudnorm as pyln

sys.setrecursionlimit(int(1e6))

import sklearn

currentdir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(currentdir)
from common_miscellaneous import compute_stft, compute_istft
from common_audioeffects import Panner, Compressor, AugmentationChain, ConvolutionalReverb, Equaliser, AlgorithmicReverb
import fx_utils

import soundfile as sf
import aubio

import warnings

import torch
import torchaudio.functional as F

# Functions

def print_dict(dict_):
    for i in dict_:
        print(i)
        for j in dict_[i]:
            print('\t', j)


def amp_to_db(x):
    return 20 * np.log10(x + 1e-30)


def db_to_amp(x):
    return 10**(x / 20)


def get_running_stats(x, features, N=20):
    mean = []
    std = []
    for i in range(len(features)):
        mean_, std_ = running_mean_std(x[:, i], N)
        mean.append(mean_)
        std.append(std_)
    mean = np.asarray(mean)
    std = np.asarray(std)

    return mean, std


def running_mean_std(x, N):

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        cumsum = np.cumsum(np.insert(x, 0, 0))
        cumsum2 = np.cumsum(np.insert(x**2, 0, 0))
        mean = (cumsum[N:] - cumsum[:-N]) / float(N)

        std = np.sqrt(((cumsum2[N:] - cumsum2[:-N]) / N) - (mean * mean))

    return mean, std

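# Illustrative check (not part of the original file): the cumulative-sum trick
# above matches a brute-force window average on the first window.
_v = np.random.randn(200)
_m, _s = running_mean_std(_v, 20)
print(np.allclose(_m[0], np.mean(_v[:20])))   # True
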
def get_eq_matching(audio_t, ref_spec, sr=44100, n_fft=65536, hop_length=16384,
                    min_db=-50, ntaps=101, lufs=-30):

    audio_t = np.copy(audio_t)
    max_db = amp_to_db(np.max(np.abs(audio_t)))
    if max_db > min_db:

        audio_t = fx_utils.lufs_normalize(audio_t, sr, lufs, log=False)
        audio_D = compute_stft(np.expand_dims(audio_t, 1),
                               hop_length,
                               n_fft,
                               np.sqrt(np.hanning(n_fft + 1)[:-1]))
        audio_D = np.abs(audio_D)
        audio_D_avg = np.mean(audio_D, axis=0)[0]

        m = ref_spec.shape[0]

        Ts = 1.0 / sr  # sampling interval
        n = m  # length of the signal
        kk = np.arange(n)
        T = n / sr
        frq = kk / T  # two-sided frequency range
        frq /= 2

        diff_eq = amp_to_db(ref_spec) - amp_to_db(audio_D_avg)
        diff_eq = db_to_amp(diff_eq)
        diff_eq = np.sqrt(diff_eq)

        diff_filter = scipy.signal.firwin2(ntaps,
                                           frq / np.max(frq),
                                           diff_eq,
                                           nfreqs=None, window='hamming',
                                           nyq=None, antisymmetric=False)

        output = scipy.signal.filtfilt(diff_filter, 1, audio_t,
                                       axis=-1, padtype='odd', padlen=None,
                                       method='pad', irlen=None)

    else:
        output = audio_t

    return output

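# Illustrative sketch (not part of the original file): the core of the EQ
# matching above is an FIR filter (scipy.signal.firwin2) whose magnitude
# response is the square root of the reference/input spectral difference;
# filtfilt applies it forwards and backwards, so the filter acts twice and
# the full difference is realized with zero phase.
_frq = np.linspace(0, 1, 64)                            # normalized freq grid
_desired = np.sqrt(db_to_amp(np.linspace(6, -6, 64)))   # sqrt of a 6 dB tilt
_fir = scipy.signal.firwin2(101, _frq, _desired)
_eq = scipy.signal.filtfilt(_fir, 1, np.random.randn(44100))
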
def get_eq_matching_gpu(audio_t, ref_spec, sr=44100, n_fft=65536, hop_length=16384,
                        min_db=-50, ntaps=101, lufs=-30):

    audio_t = np.copy(audio_t)
    max_db = amp_to_db(np.max(np.abs(audio_t)))
    if max_db > min_db:

        start_time = time.time()

        audio_t = fx_utils.lufs_normalize(audio_t, sr, lufs, log=False)
        audio_D = compute_stft(audio_t,
                               hop_length,
                               n_fft,
                               np.sqrt(np.hanning(n_fft + 1)[:-1]))
        audio_D = np.abs(audio_D)
        audio_D_avg = np.mean(audio_D, axis=0)[0]

        m = ref_spec.shape[0]

        Ts = 1.0 / sr  # sampling interval
        n = m  # length of the signal
        kk = np.arange(n)
        T = n / sr
        frq = kk / T  # two-sided frequency range
        frq /= 2

        diff_eq_l = amp_to_db(ref_spec) - amp_to_db(audio_D_avg)
        diff_eq_l = db_to_amp(diff_eq_l)
        diff_eq_l = np.sqrt(diff_eq_l)
        diff_eq_r = amp_to_db(ref_spec) - amp_to_db(audio_D_avg)
        diff_eq_r = db_to_amp(diff_eq_r)
        diff_eq_r = np.sqrt(diff_eq_r)

        diff_filter_l = scipy.signal.firwin2(ntaps,
                                             frq / np.max(frq),
                                             diff_eq_l,
                                             nfreqs=None, window='hamming',
                                             nyq=None, antisymmetric=False)
        diff_filter_r = scipy.signal.firwin2(ntaps,
                                             frq / np.max(frq),
                                             diff_eq_r,
                                             nfreqs=None, window='hamming',
                                             nyq=None, antisymmetric=False)
        diff_filter = np.stack((diff_filter_l, diff_filter_r), axis=0)

        print(f"\t\tall previous: {time.time()-start_time}")

        start_time = time.time()

        audio_t = torch.from_numpy(audio_t.transpose()).float().cuda()
        diff_filter = torch.from_numpy(diff_filter).float().cuda()
        denom_coef = torch.ones(diff_filter.size()).cuda()
        print(f'input to gpu - audio shape: {audio_t.shape}')
        audio_t = F.filtfilt(waveform=audio_t, a_coeffs=denom_coef, b_coeffs=diff_filter, clamp=False)
        audio_t = audio_t.transpose(1, 0)
        print(audio_t.shape)
        print('filtered')
        print(f"\t\tgpu filtfilt: {time.time()-start_time}")
        print(torch.mean(audio_t))
        output = audio_t.detach()
        print(f"\t\t1gpu filtfilt: {time.time()-start_time}")
        output = audio_t.cpu()
        print(f"\t\t2gpu filtfilt: {time.time()-start_time}")
        output = audio_t.detach().cpu().numpy()
        print(f"\t\t3gpu filtfilt: {time.time()-start_time}")

    else:
        output = audio_t

    return output

def get_SPS(x, n_fft=2048, hop_length=1024, smooth=False, frames=False):

    x = np.copy(x)
    eps = 1e-20

    audio_D = compute_stft(x,
                           hop_length,
                           n_fft,
                           np.sqrt(np.hanning(n_fft + 1)[:-1]))

    audio_D_l = np.abs(audio_D[:, 0, :] + eps)
    audio_D_r = np.abs(audio_D[:, 1, :] + eps)

    phi = 2 * (np.abs(audio_D_l * np.conj(audio_D_r))) / (np.abs(audio_D_l)**2 + np.abs(audio_D_r)**2)

    phi_l = np.abs(audio_D_l * np.conj(audio_D_r)) / (np.abs(audio_D_l)**2)
    phi_r = np.abs(audio_D_r * np.conj(audio_D_l)) / (np.abs(audio_D_r)**2)
    delta = phi_l - phi_r
    delta_ = np.sign(delta)
    SPS = (1 - phi) * delta_

    phi_mean = np.mean(phi, axis=0)
    if smooth:
        phi_mean = scipy.signal.savgol_filter(phi_mean, 501, 1, mode='mirror')

    SPS_mean = np.mean(SPS, axis=0)
    if smooth:
        SPS_mean = scipy.signal.savgol_filter(SPS_mean, 501, 1, mode='mirror')

    return SPS_mean, phi_mean, SPS, phi

def get_mean_side(sps, freqs=[50, 2500], sr=44100, n_fft=2048):

    sign = np.sign(sps + 1e-10)

    idx1 = freqs[0]
    idx2 = freqs[1]

    f1 = int(np.floor(idx1 * n_fft / sr))
    f2 = int(np.floor(idx2 * n_fft / sr))

    sign_mean = np.mean(sign[f1:f2]) / np.abs(np.mean(sign[f1:f2]))

    return sign_mean

def get_panning_param_values(phi, side):

    p = np.zeros_like(phi)

    g = (np.clip(phi + 1e-30, 0, 1)) / 2

    for i, g_ in enumerate(g):

        if side > 0:
            p[i] = 1 - g_

        elif side < 0:
            p[i] = g_

        else:
            p[i] = 0.5

    g_l = 1 - p
    g_r = p

    return p, [g_l, g_r]

def get_panning_matching(audio, ref_phi,
                         sr=44100, n_fft=2048, hop_length=1024,
                         min_db_f=-10, max_freq_pan=16000, frames=True):

    eps = 1e-20
    window = np.sqrt(np.hanning(n_fft + 1)[:-1])
    audio = np.copy(audio)
    audio_t = np.pad(audio, ((n_fft, n_fft), (0, 0)), mode='constant')

    sps_mean_, phi_mean_, _, _ = get_SPS(audio_t, n_fft=n_fft, hop_length=hop_length, smooth=True)

    side = get_mean_side(sps_mean_, sr=sr, n_fft=n_fft)

    if side > 0:
        alpha = 0.7
    else:
        alpha = 0.3

    processor = Panner()
    processor.parameters.pan.value = alpha
    processor.parameters.pan_law.value = 'linear'
    processor.update()
    audio_t_ = processor.process(audio_t)

    sps_mean_, phi_mean, sps_frames, phi_frames = get_SPS(audio_t_, n_fft=n_fft,
                                                          hop_length=hop_length,
                                                          smooth=True, frames=frames)

    if frames:

        p_i_ = []
        g_i_ = []
        p_ref = []
        g_ref = []
        for i in range(len(sps_frames)):
            sps_ = sps_frames[i]
            phi_ = phi_frames[i]
            p_, g_ = get_panning_param_values(phi_, side)
            p_i_.append(p_)
            g_i_.append(g_)
            p_, g_ = get_panning_param_values(ref_phi, side)
            p_ref.append(p_)
            g_ref.append(g_)
        ratio = (np.asarray(g_ref) / (np.asarray(g_i_) + eps))
        g_l = ratio[:, 0, :]
        g_r = ratio[:, 1, :]

    else:
        p, g = get_panning_param_values(ref_phi, side)
        p_i, g_i = get_panning_param_values(phi_mean, side)
        ratio = (np.asarray(g) / np.asarray(g_i))
        g_l = ratio[0]
        g_r = ratio[1]

    audio_new_D = compute_stft(audio_t_,
                               hop_length,
                               n_fft,
                               window)

    audio_new_D_mono = audio_new_D.copy()
    audio_new_D_mono = audio_new_D_mono[:, 0, :] + audio_new_D_mono[:, 1, :]
    audio_new_D_mono = np.abs(audio_new_D_mono)

    audio_new_D_phase = np.angle(audio_new_D)
    audio_new_D = np.abs(audio_new_D)

    audio_new_D_l = audio_new_D[:, 0, :]
    audio_new_D_r = audio_new_D[:, 1, :]

    if frames:
        for i, frame in enumerate(audio_new_D_mono):
            max_db = amp_to_db(np.max(np.abs(frame)))
            if max_db < min_db_f:
                g_r[i] = np.ones_like(frame)
                g_l[i] = np.ones_like(frame)

    idx1 = max_freq_pan
    f1 = int(np.floor(idx1 * n_fft / sr))
    ones = np.ones_like(g_l)
    g_l[f1:] = ones[f1:]
    g_r[f1:] = ones[f1:]

    audio_new_D_l = audio_new_D_l * g_l
    audio_new_D_r = audio_new_D_r * g_r

    audio_new_D_l = np.expand_dims(audio_new_D_l, 0)
    audio_new_D_r = np.expand_dims(audio_new_D_r, 0)

    audio_new_D_ = np.concatenate((audio_new_D_l, audio_new_D_r))

    audio_new_D_ = np.moveaxis(audio_new_D_, 0, 1)

    audio_new_D_ = audio_new_D_ * (np.cos(audio_new_D_phase) + np.sin(audio_new_D_phase) * 1j)

    audio_new_t = compute_istft(audio_new_D_,
                                hop_length,
                                window)

    audio_new_t = audio_new_t[n_fft:n_fft + audio.shape[0]]

    return audio_new_t

def get_mean_peak(audio, sr=44100, true_peak=False, n_mels=128, percentile=75):

    # Returns the mean peak value in dB after peaks below the given percentile
    # are removed. Input should be in the shape samples x channel.

    audio_ = audio
    window_size = 2**10  # FFT size
    hop_size = window_size

    peak = []
    std = []
    for ch in range(audio_.shape[-1]):
        x = np.ascontiguousarray(audio_[:, ch])

        sr_ch = sr
        win_ch = window_size
        hop_ch = hop_size
        if true_peak:
            # oversample for true-peak measurement; kept in per-channel
            # variables so the rates do not compound across channels
            x = librosa.resample(x, sr_ch, 4 * sr_ch)
            sr_ch = 4 * sr_ch
            win_ch = 4 * win_ch
            hop_ch = 4 * hop_ch

        onset_func = aubio.onset('hfc', buf_size=win_ch, hop_size=hop_ch, samplerate=sr_ch)

        frames = np.float32(librosa.util.frame(x, frame_length=win_ch, hop_length=hop_ch))

        onset_times = []
        for frame in frames.T:

            if onset_func(frame):

                onset_time = onset_func.get_last()
                onset_times.append(onset_time)

        samples = []
        if onset_times:
            for i, p in enumerate(onset_times[:-1]):
                samples.append(onset_times[i] + np.argmax(np.abs(x[onset_times[i]:onset_times[i + 1]])))
            samples.append(onset_times[-1] + np.argmax(np.abs(x[onset_times[-1]:])))

        p_value = []
        for p in samples:
            p_ = amp_to_db(np.abs(x[p]))
            p_value.append(p_)
        p_value_ = []
        for p in p_value:
            if p > np.percentile(p_value, percentile):
                p_value_.append(p)
        if p_value_:
            peak.append(np.mean(p_value_))
            std.append(np.std(p_value_))
        elif p_value:
            peak.append(np.mean(p_value))
            std.append(np.std(p_value))
        else:
            return None
    return [np.mean(peak), np.mean(std)]

def compress(processor, audio, sr, th, ratio, attack, release):

    eps = 1e-20
    x = audio

    processor.parameters.threshold.value = th
    processor.parameters.ratio.value = ratio
    processor.parameters.attack_time.value = attack
    processor.parameters.release_time.value = release
    processor.update()
    output = processor.process(x)

    if np.max(np.abs(output)) >= 1.0:
        output = np.clip(output, -1.0, 1.0)

    return output

def get_comp_matching(audio,
                      ref_peak, ref_std,
                      ratio, attack, release, sr=44100,
                      min_db=-50, comp_peak_norm=-10.0,
                      min_th=-40, max_ratio=20, n_mels=128,
                      true_peak=False, percentile=75, expander=True):

    x = audio.copy()

    if x.ndim < 2:
        x = np.expand_dims(x, 1)

    max_db = amp_to_db(np.max(np.abs(x)))
    if max_db > min_db:

        x = pyln.normalize.peak(x, comp_peak_norm)

        peak, std = get_mean_peak(x, sr,
                                  n_mels=n_mels,
                                  true_peak=true_peak,
                                  percentile=percentile)

        if peak > (ref_peak - ref_std) and peak < (ref_peak + ref_std):
            return x

        # downward compression
        elif peak > (ref_peak - ref_std):
            processor = Compressor(sample_rate=sr)
            ratios = np.linspace(ratio, max_ratio, max_ratio - ratio + 1)
            ths = np.linspace(-1 - 9, min_th, 2 * np.abs(min_th) - 1 - 18)
            for rt in ratios:
                for th in ths:
                    y = compress(processor, x, sr, th, rt, attack, release)
                    peak, std = get_mean_peak(y, sr,
                                              n_mels=n_mels,
                                              true_peak=true_peak,
                                              percentile=percentile)
                    if peak < (ref_peak + ref_std):
                        break
                else:
                    continue
                break

            return y

        # upward expansion
        elif peak < (ref_peak + ref_std):

            if expander:
                processor = Compressor(sample_rate=sr)
                ratios = np.linspace(ratio, max_ratio, max_ratio - ratio + 1)
                ths = np.linspace(-1, min_th, 2 * np.abs(min_th) - 1)[::-1]

                for rt in ratios:
                    for th in ths:
                        y = compress(processor, x, sr, th, 1 / rt, attack, release)
                        peak, std = get_mean_peak(y, sr,
                                                  n_mels=n_mels,
                                                  true_peak=true_peak,
                                                  percentile=percentile)
                        if peak > (ref_peak - ref_std):
                            break
                    else:
                        continue
                    break

                return y

            else:
                return x
    else:
        return x

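# Illustrative sketch (not part of the original file): the nested search above
# uses Python's for/else so that a break in the inner loop also stops the
# outer loop, while an exhausted inner loop continues the search.
for _rt in (2, 4, 8):
    for _th in (-10, -20, -30):
        if _rt == 4 and _th == -20:   # stand-in for the peak-range test
            break
    else:
        continue                      # inner loop finished: keep searching
    break                             # inner loop broke: stop outer loop too
print(_rt, _th)                       # 4 -20
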
# REVERB


def get_reverb_send(audio, eq_parameters, rv_parameters, impulse_responses=None,
                    eq_prob=1.0, rv_prob=1.0, parallel=True, shuffle=False, sr=44100, bands=['low_shelf', 'high_shelf']):

    x = audio.copy()

    if x.ndim < 2:
        x = np.expand_dims(x, 1)

    channels = x.shape[-1]
    eq_gain = eq_parameters.low_shelf_gain.value

    eq = Equaliser(n_channels=channels,
                   sample_rate=sr,
                   gain_range=(eq_gain, eq_gain),
                   bands=bands,
                   hard_clip=False,
                   name='Equaliser', parameters=eq_parameters)
    eq.randomize()

    if impulse_responses:

        reverb = ConvolutionalReverb(impulse_responses=impulse_responses,
                                     sample_rate=sr,
                                     parameters=rv_parameters)

    else:

        reverb = AlgorithmicReverb(sample_rate=sr,
                                   parameters=rv_parameters)

    reverb.randomize()

    # NOTE: the eq stage is gated by rv_prob and the reverb stage by eq_prob;
    # with both probabilities at their default of 1.0 this has no effect.
    fxchain = AugmentationChain([
        (eq, rv_prob, False),
        (reverb, eq_prob, False)
    ],
        shuffle=shuffle, parallel=parallel)

    output = fxchain(x)

    return output


# FUNCTIONS TO COMPUTE FEATURES

def compute_loudness_features(args_):

    audio_out_ = args_[0]
    audio_tar_ = args_[1]
    idx = args_[2]
    sr = args_[3]

    loudness_ = {key: [] for key in ['d_lufs', 'd_peak', ]}

    peak_tar = np.max(np.abs(audio_tar_))
    peak_tar_db = 20.0 * np.log10(peak_tar)

    peak_out = np.max(np.abs(audio_out_))
    peak_out_db = 20.0 * np.log10(peak_out)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        meter = pyln.Meter(sr)  # create BS.1770 meter
        loudness_tar = meter.integrated_loudness(audio_tar_)
        loudness_out = meter.integrated_loudness(audio_out_)

    loudness_['d_lufs'].append(sklearn.metrics.mean_absolute_percentage_error([loudness_tar], [loudness_out]))
    loudness_['d_peak'].append(sklearn.metrics.mean_absolute_percentage_error([peak_tar_db], [peak_out_db]))

    return loudness_

def compute_spectral_features(args_):

    audio_out_ = args_[0]
    audio_tar_ = args_[1]
    idx = args_[2]
    sr = args_[3]
    fft_size = args_[4]
    hop_length = args_[5]
    channels = args_[6]

    audio_out_ = pyln.normalize.peak(audio_out_, -1.0)
    audio_tar_ = pyln.normalize.peak(audio_tar_, -1.0)

    spec_out_ = compute_stft(audio_out_,
                             hop_length,
                             fft_size,
                             np.sqrt(np.hanning(fft_size + 1)[:-1]))
    spec_out_ = np.transpose(spec_out_, axes=[1, -1, 0])
    spec_out_ = np.abs(spec_out_)

    spec_tar_ = compute_stft(audio_tar_,
                             hop_length,
                             fft_size,
                             np.sqrt(np.hanning(fft_size + 1)[:-1]))
    spec_tar_ = np.transpose(spec_tar_, axes=[1, -1, 0])
    spec_tar_ = np.abs(spec_tar_)

    spectral_ = {key: [] for key in ['centroid_mean',
                                     'bandwidth_mean',
                                     'contrast_l_mean',
                                     'contrast_m_mean',
                                     'contrast_h_mean',
                                     'rolloff_mean',
                                     'flatness_mean',
                                     'mape_mean',
                                     ]}

    centroid_mean_ = []
    centroid_std_ = []
    bandwidth_mean_ = []
    bandwidth_std_ = []
    contrast_l_mean_ = []
    contrast_l_std_ = []
    contrast_m_mean_ = []
    contrast_m_std_ = []
    contrast_h_mean_ = []
    contrast_h_std_ = []
    rolloff_mean_ = []
    rolloff_std_ = []
    flatness_mean_ = []

    for ch in range(channels):
        tar = spec_tar_[ch]
        out = spec_out_[ch]

        tar_sc = librosa.feature.spectral_centroid(y=None, sr=sr, S=tar,
                                                   n_fft=fft_size, hop_length=hop_length)

        out_sc = librosa.feature.spectral_centroid(y=None, sr=sr, S=out,
                                                   n_fft=fft_size, hop_length=hop_length)

        tar_bw = librosa.feature.spectral_bandwidth(y=None, sr=sr, S=tar,
                                                    n_fft=fft_size, hop_length=hop_length,
                                                    centroid=tar_sc, norm=True, p=2)

        out_bw = librosa.feature.spectral_bandwidth(y=None, sr=sr, S=out,
                                                    n_fft=fft_size, hop_length=hop_length,
                                                    centroid=out_sc, norm=True, p=2)
        # l = 0-250 Hz, m = bands 1-2-3 = 250-2000 Hz, h = 2000 Hz - sr/2
        tar_ct = librosa.feature.spectral_contrast(y=None, sr=sr, S=tar,
                                                   n_fft=fft_size, hop_length=hop_length,
                                                   fmin=250.0, n_bands=4, quantile=0.02, linear=False)

        out_ct = librosa.feature.spectral_contrast(y=None, sr=sr, S=out,
                                                   n_fft=fft_size, hop_length=hop_length,
                                                   fmin=250.0, n_bands=4, quantile=0.02, linear=False)

        tar_ro = librosa.feature.spectral_rolloff(y=None, sr=sr, S=tar,
                                                  n_fft=fft_size, hop_length=hop_length,
                                                  roll_percent=0.85)

        out_ro = librosa.feature.spectral_rolloff(y=None, sr=sr, S=out,
                                                  n_fft=fft_size, hop_length=hop_length,
                                                  roll_percent=0.85)

        tar_ft = librosa.feature.spectral_flatness(y=None, S=tar,
                                                   n_fft=fft_size, hop_length=hop_length,
                                                   amin=1e-10, power=2.0)

        out_ft = librosa.feature.spectral_flatness(y=None, S=out,
                                                   n_fft=fft_size, hop_length=hop_length,
                                                   amin=1e-10, power=2.0)

        eps = 1e-0
        N = 40
        mean_sc_tar, std_sc_tar = get_running_stats(tar_sc.T + eps, [0], N=N)
        mean_sc_out, std_sc_out = get_running_stats(out_sc.T + eps, [0], N=N)

        assert np.isnan(mean_sc_tar).any() == False, f'NAN values mean_sc_tar {idx}'
        assert np.isnan(mean_sc_out).any() == False, f'NAN values mean_sc_out {idx}'

        mean_bw_tar, std_bw_tar = get_running_stats(tar_bw.T + eps, [0], N=N)
        mean_bw_out, std_bw_out = get_running_stats(out_bw.T + eps, [0], N=N)

        assert np.isnan(mean_bw_tar).any() == False, f'NAN values tar mean bw {idx}'
        assert np.isnan(mean_bw_out).any() == False, f'NAN values out mean bw {idx}'

        mean_ct_tar, std_ct_tar = get_running_stats(tar_ct.T, list(range(tar_ct.shape[0])), N=N)
        mean_ct_out, std_ct_out = get_running_stats(out_ct.T, list(range(out_ct.shape[0])), N=N)

        assert np.isnan(mean_ct_tar).any() == False, f'NAN values tar mean ct {idx}'
        assert np.isnan(mean_ct_out).any() == False, f'NAN values out mean ct {idx}'

        mean_ro_tar, std_ro_tar = get_running_stats(tar_ro.T + eps, [0], N=N)
        mean_ro_out, std_ro_out = get_running_stats(out_ro.T + eps, [0], N=N)

        assert np.isnan(mean_ro_tar).any() == False, f'NAN values tar mean ro {idx}'
        assert np.isnan(mean_ro_out).any() == False, f'NAN values out mean ro {idx}'

        mean_ft_tar, std_ft_tar = get_running_stats(tar_ft.T, [0], N=800)  # flatness uses a much longer window (N=800)
        mean_ft_out, std_ft_out = get_running_stats(out_ft.T, [0], N=800)

        mape_mean_sc = sklearn.metrics.mean_absolute_percentage_error(mean_sc_tar[0], mean_sc_out[0])

        mape_mean_bw = sklearn.metrics.mean_absolute_percentage_error(mean_bw_tar[0], mean_bw_out[0])

        mape_mean_ct_l = sklearn.metrics.mean_absolute_percentage_error(mean_ct_tar[0], mean_ct_out[0])

        mape_mean_ct_m = sklearn.metrics.mean_absolute_percentage_error(np.mean(mean_ct_tar[1:4], axis=0),
                                                                        np.mean(mean_ct_out[1:4], axis=0))

        mape_mean_ct_h = sklearn.metrics.mean_absolute_percentage_error(mean_ct_tar[-1], mean_ct_out[-1])

        mape_mean_ro = sklearn.metrics.mean_absolute_percentage_error(mean_ro_tar[0], mean_ro_out[0])

        mape_mean_ft = sklearn.metrics.mean_absolute_percentage_error(mean_ft_tar[0], mean_ft_out[0])

        centroid_mean_.append(mape_mean_sc)
        bandwidth_mean_.append(mape_mean_bw)
        contrast_l_mean_.append(mape_mean_ct_l)
        contrast_m_mean_.append(mape_mean_ct_m)
        contrast_h_mean_.append(mape_mean_ct_h)
        rolloff_mean_.append(mape_mean_ro)
        flatness_mean_.append(mape_mean_ft)

    spectral_['centroid_mean'].append(np.mean(centroid_mean_))
    spectral_['bandwidth_mean'].append(np.mean(bandwidth_mean_))
    spectral_['contrast_l_mean'].append(np.mean(contrast_l_mean_))
    spectral_['contrast_m_mean'].append(np.mean(contrast_m_mean_))
    spectral_['contrast_h_mean'].append(np.mean(contrast_h_mean_))
    spectral_['rolloff_mean'].append(np.mean(rolloff_mean_))
    spectral_['flatness_mean'].append(np.mean(flatness_mean_))

    spectral_['mape_mean'].append(np.mean([np.mean(centroid_mean_),
                                           np.mean(bandwidth_mean_),
                                           np.mean(contrast_l_mean_),
                                           np.mean(contrast_m_mean_),
                                           np.mean(contrast_h_mean_),
                                           np.mean(rolloff_mean_),
                                           np.mean(flatness_mean_),
                                           ]))

    return spectral_

# PANNING

def get_panning_rms_frame(sps_frame, freqs=[0, 22050], sr=44100, n_fft=2048):

    idx1 = freqs[0]
    idx2 = freqs[1]

    f1 = int(np.floor(idx1 * n_fft / sr))
    f2 = int(np.floor(idx2 * n_fft / sr))

    p_rms = np.sqrt((1 / (f2 - f1)) * np.sum(sps_frame[f1:f2]**2))

    return p_rms


def get_panning_rms(sps, freqs=[[0, 22050]], sr=44100, n_fft=2048):

    p_rms = []
    for frame in sps:
        p_rms_ = []
        for f in freqs:
            rms = get_panning_rms_frame(frame, freqs=f, sr=sr, n_fft=n_fft)
            p_rms_.append(rms)
        p_rms.append(p_rms_)

    return np.asarray(p_rms)


def compute_panning_features(args_):

    audio_out_ = args_[0]
    audio_tar_ = args_[1]
    idx = args_[2]
    sr = args_[3]
    fft_size = args_[4]
    hop_length = args_[5]

    audio_out_ = pyln.normalize.peak(audio_out_, -1.0)
    audio_tar_ = pyln.normalize.peak(audio_tar_, -1.0)

    panning_ = {}

    freqs = [[0, sr // 2], [0, 250], [250, 2500], [2500, sr // 2]]

    _, _, sps_frames_tar, _ = get_SPS(audio_tar_, n_fft=fft_size,
                                      hop_length=hop_length,
                                      smooth=True, frames=True)

    _, _, sps_frames_out, _ = get_SPS(audio_out_, n_fft=fft_size,
                                      hop_length=hop_length,
                                      smooth=True, frames=True)

    p_rms_tar = get_panning_rms(sps_frames_tar,
                                freqs=freqs,
                                sr=sr,
                                n_fft=fft_size)

    p_rms_out = get_panning_rms(sps_frames_out,
                                freqs=freqs,
                                sr=sr,
                                n_fft=fft_size)

    # to avoid numerical instability, delete frames with zero rms from the target
    if np.min(p_rms_tar) == 0.0:
        id_zeros = np.where(p_rms_tar.T[0] == 0)
        p_rms_tar_ = []
        p_rms_out_ = []
        for i in range(len(freqs)):
            temp_tar = np.delete(p_rms_tar.T[i], id_zeros)
            temp_out = np.delete(p_rms_out.T[i], id_zeros)
            p_rms_tar_.append(temp_tar)
            p_rms_out_.append(temp_out)
        p_rms_tar_ = np.asarray(p_rms_tar_)
        p_rms_tar = p_rms_tar_.T
        p_rms_out_ = np.asarray(p_rms_out_)
        p_rms_out = p_rms_out_.T

    N = 40

    mean_tar, std_tar = get_running_stats(p_rms_tar, freqs, N=N)
    mean_out, std_out = get_running_stats(p_rms_out, freqs, N=N)

    panning_['P_t_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_tar[0], mean_out[0])]
    panning_['P_l_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_tar[1], mean_out[1])]
    panning_['P_m_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_tar[2], mean_out[2])]
    panning_['P_h_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_tar[3], mean_out[3])]

    panning_['mape_mean'] = [np.mean([panning_['P_t_mean'],
                                      panning_['P_l_mean'],
                                      panning_['P_m_mean'],
                                      panning_['P_h_mean'],
                                      ])]

    return panning_

# DYNAMIC

def get_rms_dynamic_crest(x, frame_length, hop_length):

    rms = []
    dynamic_spread = []
    crest = []
    for ch in range(x.shape[-1]):
        frames = librosa.util.frame(x[:, ch], frame_length=frame_length, hop_length=hop_length)
        rms_ = []
        dynamic_spread_ = []
        crest_ = []
        for i in frames.T:
            x_rms = amp_to_db(np.sqrt(np.sum(i**2) / frame_length))
            x_d = np.sum(amp_to_db(np.abs(i)) - x_rms) / frame_length
            x_c = amp_to_db(np.max(np.abs(i))) / x_rms

            rms_.append(x_rms)
            dynamic_spread_.append(x_d)
            crest_.append(x_c)
        rms.append(rms_)
        dynamic_spread.append(dynamic_spread_)
        crest.append(crest_)

    rms = np.asarray(rms)
    dynamic_spread = np.asarray(dynamic_spread)
    crest = np.asarray(crest)

    rms = np.mean(rms, axis=0)
    dynamic_spread = np.mean(dynamic_spread, axis=0)
    crest = np.mean(crest, axis=0)

    rms = np.expand_dims(rms, axis=0)
    dynamic_spread = np.expand_dims(dynamic_spread, axis=0)
    crest = np.expand_dims(crest, axis=0)

    return rms, dynamic_spread, crest


def lowpassFiltering(x, f0, sr):

    b1, a1 = scipy.signal.butter(4, f0 / (sr / 2), 'lowpass')
    x_f = []
    for ch in range(x.shape[-1]):
        x_f_ = scipy.signal.filtfilt(b1, a1, x[:, ch]).copy(order='F')
        x_f.append(x_f_)
    return np.asarray(x_f).T


def get_low_freq_weighting(x, sr, n_fft, hop_length, f0=1000):

    x_low = lowpassFiltering(x, f0, sr)

    X_low = compute_stft(x_low,
                         hop_length,
                         n_fft,
                         np.sqrt(np.hanning(n_fft + 1)[:-1]))
    X_low = np.transpose(X_low, axes=[1, -1, 0])
    X_low = np.abs(X_low)

    X = compute_stft(x,
                     hop_length,
                     n_fft,
                     np.sqrt(np.hanning(n_fft + 1)[:-1]))
    X = np.transpose(X, axes=[1, -1, 0])
    X = np.abs(X)

    eps = 1e-5
    ratio = (X_low) / (X + eps)
    ratio = np.sum(ratio, axis=1)
    ratio = np.mean(ratio, axis=0)

    return np.expand_dims(ratio, axis=0)


def compute_dynamic_features(args_):

    audio_out_ = args_[0]
    audio_tar_ = args_[1]
    idx = args_[2]
    sr = args_[3]
    fft_size = args_[4]
    hop_length = args_[5]

    audio_out_ = pyln.normalize.peak(audio_out_, -1.0)
    audio_tar_ = pyln.normalize.peak(audio_tar_, -1.0)

    dynamic_ = {}

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)

        rms_tar, dyn_tar, crest_tar = get_rms_dynamic_crest(audio_tar_, fft_size, hop_length)
        rms_out, dyn_out, crest_out = get_rms_dynamic_crest(audio_out_, fft_size, hop_length)

        low_ratio_tar = get_low_freq_weighting(audio_tar_, sr, fft_size, hop_length, f0=1000)
        low_ratio_out = get_low_freq_weighting(audio_out_, sr, fft_size, hop_length, f0=1000)

        N = 40

        eps = 1e-10

        rms_tar = (-1 * rms_tar) + 1.0
        rms_out = (-1 * rms_out) + 1.0
        dyn_tar = (-1 * dyn_tar) + 1.0
        dyn_out = (-1 * dyn_out) + 1.0

        mean_rms_tar, std_rms_tar = get_running_stats(rms_tar.T, [0], N=N)
        mean_rms_out, std_rms_out = get_running_stats(rms_out.T, [0], N=N)

        mean_dyn_tar, std_dyn_tar = get_running_stats(dyn_tar.T, [0], N=N)
        mean_dyn_out, std_dyn_out = get_running_stats(dyn_out.T, [0], N=N)

        mean_crest_tar, std_crest_tar = get_running_stats(crest_tar.T, [0], N=N)
        mean_crest_out, std_crest_out = get_running_stats(crest_out.T, [0], N=N)

        mean_low_ratio_tar, std_low_ratio_tar = get_running_stats(low_ratio_tar.T, [0], N=N)
        mean_low_ratio_out, std_low_ratio_out = get_running_stats(low_ratio_out.T, [0], N=N)

    dynamic_['rms_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_rms_tar, mean_rms_out)]
    dynamic_['dyn_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_dyn_tar, mean_dyn_out)]
    dynamic_['crest_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_crest_tar, mean_crest_out)]

    dynamic_['l_ratio_mean_mape'] = [sklearn.metrics.mean_absolute_percentage_error(mean_low_ratio_tar, mean_low_ratio_out)]
    dynamic_['l_ratio_mean_l2'] = [sklearn.metrics.mean_squared_error(mean_low_ratio_tar, mean_low_ratio_out)]

    dynamic_['mape_mean'] = [np.mean([dynamic_['rms_mean'],
                                      dynamic_['dyn_mean'],
                                      dynamic_['crest_mean'],
                                      ])]

    return dynamic_
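
# Illustrative usage sketch (not part of the original file): each compute_*
# helper takes an argument list (output, target, index, sr[, fft, hop, ch])
# and returns a dict of MAPE-style distances between the two signals.
_sr, _fft, _hop = 44100, 4096, 1024
_tar = np.random.randn(5 * _sr, 2)
_out = 0.8 * _tar
print(compute_loudness_features([_out, _tar, 0, _sr]))
print(compute_dynamic_features([_out, _tar, 0, _sr, _fft, _hop]))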