modify app

Files changed:
- app.py  (+4, -5)
- inference.py  (+5, -72)
app.py  (CHANGED)

@@ -42,8 +42,8 @@ def loudness_normalize(audio, sample_rate, target_loudness=-12.0):
     return loudness_normalized_audio
 
 def process_audio(input_audio, reference_audio):
-    output_audio, predicted_params,
-        input_audio, reference_audio, reference_audio
+    output_audio, predicted_params, sr = mastering_transfer.process_audio(
+        input_audio, reference_audio, reference_audio
     )
 
     param_output = mastering_transfer.get_param_output_string(predicted_params)
@@ -54,6 +54,8 @@ def process_audio(input_audio, reference_audio):
 
     # # Normalize output audio
     # output_audio = loudness_normalize(output_audio, sr)
+    print(output_audio.shape)
+    print(f"sr: {sr}")
 
     # Denormalize the audio to int16
     output_audio = denormalize_audio(output_audio, dtype=np.int16)
@@ -66,9 +68,6 @@ def process_audio(input_audio, reference_audio):
     # Ensure the audio is in the correct shape (samples, channels)
     if output_audio.shape[1] > output_audio.shape[0]:
         output_audio = output_audio.transpose(1,0)
-
-    print(output_audio.shape)
-    print(param_output)
 
     return (sr, output_audio), param_output
 
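The unchanged lines around these hunks perform two post-processing steps before the audio is returned to the Gradio interface: denormalize the float output to int16 and make sure the array is shaped (samples, channels). The snippet below is a small runnable illustration of those two steps; denormalize_audio itself is not shown in the diff, so the int16 scaling used here is only an assumed equivalent, not the project's implementation.

# Illustration of the post-processing in app.py's process_audio():
# float audio in [-1, 1] -> int16, then transpose to (samples, channels).
import numpy as np

def denormalize_to_int16(audio: np.ndarray) -> np.ndarray:
    # Assumed behaviour of denormalize_audio(..., dtype=np.int16).
    return (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)

output_audio = np.random.uniform(-1, 1, size=(2, 44100))  # (channels, samples)
output_audio = denormalize_to_int16(output_audio)

# Ensure the audio is in the correct shape (samples, channels), as in the diff.
if output_audio.shape[1] > output_audio.shape[0]:
    output_audio = output_audio.transpose(1, 0)

print(output_audio.shape, output_audio.dtype)  # (44100, 2) int16
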
inference.py  (CHANGED)

@@ -108,7 +108,7 @@ class MasteringStyleTransfer:
             # Log top 5 parameter differences
             if step == 0:
                 initial_params = current_params
-            top_5_diff = self.
+            top_5_diff = self.get_top_n_diff_string(initial_params, current_params, top_n=5)
             log_entry = f"Step {step + 1}, Loss: {total_loss.item():.4f}\n{top_5_diff}\n"
 
             if divergence_counter >= 10:
@@ -122,17 +122,6 @@ class MasteringStyleTransfer:
 
         return min_loss_output, min_loss_params, min_loss_embedding, min_loss_step + 1
 
-    def get_top_5_diff_string(self, initial_params, current_params):
-        diff_dict = {}
-        for key in initial_params.keys():
-            diff = abs(current_params[key] - initial_params[key])
-            diff_dict[key] = diff
-
-        sorted_diff = sorted(diff_dict.items(), key=lambda x: x[1], reverse=True)
-        top_5_diff = sorted_diff[:5]
-
-        return "\n".join([f"{key}: {value:.4f}" for key, value in top_5_diff])
-
     def preprocess_audio(self, audio, target_sample_rate=44100):
         sample_rate, data = audio
 
@@ -166,7 +155,7 @@ class MasteringStyleTransfer:
 
         return data_tensor.to(self.device)
 
-    def process_audio(self, input_audio, reference_audio, ito_reference_audio
+    def process_audio(self, input_audio, reference_audio, ito_reference_audio):
         input_tensor = self.preprocess_audio(input_audio, self.args.sample_rate)
         reference_tensor = self.preprocess_audio(reference_audio, self.args.sample_rate)
         ito_reference_tensor = self.preprocess_audio(ito_reference_audio, self.args.sample_rate)
@@ -175,24 +164,7 @@ class MasteringStyleTransfer:
 
         output_audio, predicted_params = self.mastering_style_transfer(input_tensor, reference_feature)
 
-
-            ito_log = []
-            for i in range(self.args.max_iter_ito):
-                loss, ito_predicted_params = self.ito_step(input_audio, ito_reference_audio, predicted_params)
-                if log_ito:
-                    top_10_diff = self.get_top_10_diff(predicted_params, ito_predicted_params)
-                    log_entry = f"Iteration {i+1}, Loss: {loss:.4f}\nTop 10 parameter differences:\n{top_10_diff}\n"
-                    ito_log.append(log_entry)
-                predicted_params = ito_predicted_params
-
-            ito_output_audio = self.converter.convert(input_audio, predicted_params)
-            ito_log = "\n".join(ito_log) if log_ito else None
-        else:
-            ito_output_audio = None
-            ito_predicted_params = None
-            ito_log = None
-
-        return output_audio, predicted_params, ito_output_audio, ito_predicted_params, ito_log, self.args.sample_rate
+        return output_audio, predicted_params, self.args.sample_rate
 
     def print_param_difference(self, initial_params, ito_params):
         all_diffs = []
@@ -278,7 +250,7 @@ class MasteringStyleTransfer:
 
         return "\n".join(output)
 
-    def
+    def get_top_n_diff_string(self, initial_params, ito_params, top_n=5):
         if initial_params is None or ito_params is None:
             return "Cannot compare parameters"
 
@@ -299,7 +271,7 @@ class MasteringStyleTransfer:
                 normalized_diff = abs(ito_value - initial_value)
                 all_diffs.append((fx_name, 'width', initial_value.item(), ito_value.item(), normalized_diff.item()))
 
-        top_diffs = sorted(all_diffs, key=lambda x: x[4], reverse=True)[:
+        top_diffs = sorted(all_diffs, key=lambda x: x[4], reverse=True)[:top_n]
 
        output = ["Top 10 parameter differences (sorted by normalized difference):"]
        for fx_name, param_name, initial_value, ito_value, normalized_diff in top_diffs:
@@ -322,42 +294,3 @@ def reload_weights(model, ckpt_path, device):
        new_state_dict[name] = v
    model.load_state_dict(new_state_dict, strict=False)
 
-
-if __name__ == "__main__":
-    basis_path = '/data2/tony/Mastering_Style_Transfer/results/dasp_tcn_tuneenc_daspman_loudnessnorm/ckpt/1000/'
-
-    parser = argparse.ArgumentParser(description="Mastering Style Transfer")
-    parser.add_argument("--input_path", type=str, required=True, help="Path to input audio file")
-    parser.add_argument("--reference_path", type=str, required=True, help="Path to reference audio file")
-    parser.add_argument("--ito_reference_path", type=str, required=True, help="Path to ITO reference audio file")
-    parser.add_argument("--model_path", type=str, default=f"{basis_path}dasp_tcn_tuneenc_daspman_loudnessnorm_mastering_converter_1000.pt", help="Path to mastering converter model")
-    parser.add_argument("--encoder_path", type=str, default=f"{basis_path}dasp_tcn_tuneenc_daspman_loudnessnorm_effects_encoder_1000.pt", help="Path to effects encoder model")
-    parser.add_argument("--perform_ito", action="store_true", help="Whether to perform ITO")
-    parser.add_argument("--optimizer", type=str, default="RAdam", help="Optimizer for ITO")
-    parser.add_argument("--learning_rate", type=float, default=0.001, help="Learning rate for ITO")
-    parser.add_argument("--num_steps", type=int, default=100, help="Number of optimization steps for ITO")
-    parser.add_argument("--af_weights", nargs='+', type=float, default=[0.1, 0.001, 1.0, 1.0, 0.1], help="Weights for AudioFeatureLoss")
-    parser.add_argument("--sample_rate", type=int, default=44100, help="Sample rate for AudioFeatureLoss")
-    parser.add_argument("--path_to_config", type=str, default='/home/tony/mastering_transfer/networks/configs.yaml', help="Path to network architecture configuration file")
-
-    args = parser.parse_args()
-
-    # load network configurations
-    with open(args.path_to_config, 'r') as f:
-        configs = yaml.full_load(f)
-    args.cfg_converter = configs['TCN']['param_mapping']
-    args.cfg_enc = configs['Effects_Encoder']['default']
-
-    ito_config = {
-        'optimizer': args.optimizer,
-        'learning_rate': args.learning_rate,
-        'num_steps': args.num_steps,
-        'af_weights': args.af_weights,
-        'sample_rate': args.sample_rate
-    }
-
-    mastering_style_transfer = MasteringStyleTransfer(args)
-    output_audio, predicted_params, ito_output_audio, ito_predicted_params, optimized_reference_feature, sr, ito_steps = mastering_style_transfer.process_audio(
-        args.input_path, args.reference_path, args.ito_reference_path, ito_config, args.perform_ito
-    )
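The new get_top_n_diff_string only appears in fragments above (its def line and the [:top_n] slice). As a rough illustration of the same ranking idea, the deleted get_top_5_diff_string can be generalized to an arbitrary top_n as sketched below. The real method in inference.py walks nested per-effect parameter tensors and calls .item(), so treat this flat-dict version purely as a sketch of the logic, not the project's code.

# Sketch only: flat-dict version of "report the top-n largest parameter changes",
# generalized from the deleted get_top_5_diff_string.
def top_n_diff_string(initial_params, current_params, top_n=5):
    if initial_params is None or current_params is None:
        return "Cannot compare parameters"
    # Absolute change per parameter, then keep the top_n largest.
    diffs = {k: abs(current_params[k] - initial_params[k]) for k in initial_params}
    top = sorted(diffs.items(), key=lambda kv: kv[1], reverse=True)[:top_n]
    return "\n".join(f"{key}: {value:.4f}" for key, value in top)

# Example with plain floats:
# print(top_n_diff_string({"gain": 0.2, "ratio": 4.0}, {"gain": 0.5, "ratio": 4.1}, top_n=2))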