jhtonyKoo committed on
Commit
c13752e
·
1 Parent(s): 0ea0beb

modify app

Browse files
Files changed (2) hide show
  1. app.py +4 -5
  2. inference.py +5 -72
app.py CHANGED
@@ -42,8 +42,8 @@ def loudness_normalize(audio, sample_rate, target_loudness=-12.0):
42
  return loudness_normalized_audio
43
 
44
  def process_audio(input_audio, reference_audio):
45
- output_audio, predicted_params, _, _, _, sr = mastering_transfer.process_audio(
46
- input_audio, reference_audio, reference_audio, {}, False
47
  )
48
 
49
  param_output = mastering_transfer.get_param_output_string(predicted_params)
@@ -54,6 +54,8 @@ def process_audio(input_audio, reference_audio):
54
 
55
  # # Normalize output audio
56
  # output_audio = loudness_normalize(output_audio, sr)
 
 
57
 
58
  # Denormalize the audio to int16
59
  output_audio = denormalize_audio(output_audio, dtype=np.int16)
@@ -66,9 +68,6 @@ def process_audio(input_audio, reference_audio):
66
  # Ensure the audio is in the correct shape (samples, channels)
67
  if output_audio.shape[1] > output_audio.shape[0]:
68
  output_audio = output_audio.transpose(1,0)
69
-
70
- print(output_audio.shape)
71
- print(param_output)
72
 
73
  return (sr, output_audio), param_output
74
 
 
42
  return loudness_normalized_audio
43
 
44
  def process_audio(input_audio, reference_audio):
45
+ output_audio, predicted_params, sr = mastering_transfer.process_audio(
46
+ input_audio, reference_audio, reference_audio
47
  )
48
 
49
  param_output = mastering_transfer.get_param_output_string(predicted_params)
 
54
 
55
  # # Normalize output audio
56
  # output_audio = loudness_normalize(output_audio, sr)
57
+ print(output_audio.shape)
58
+ print(f"sr: {sr}")
59
 
60
  # Denormalize the audio to int16
61
  output_audio = denormalize_audio(output_audio, dtype=np.int16)
 
68
  # Ensure the audio is in the correct shape (samples, channels)
69
  if output_audio.shape[1] > output_audio.shape[0]:
70
  output_audio = output_audio.transpose(1,0)
 
 
 
71
 
72
  return (sr, output_audio), param_output
73
 
inference.py CHANGED
@@ -108,7 +108,7 @@ class MasteringStyleTransfer:
108
  # Log top 5 parameter differences
109
  if step == 0:
110
  initial_params = current_params
111
- top_5_diff = self.get_top_5_diff_string(initial_params, current_params)
112
  log_entry = f"Step {step + 1}, Loss: {total_loss.item():.4f}\n{top_5_diff}\n"
113
 
114
  if divergence_counter >= 10:
@@ -122,17 +122,6 @@ class MasteringStyleTransfer:
122
 
123
  return min_loss_output, min_loss_params, min_loss_embedding, min_loss_step + 1
124
 
125
- def get_top_5_diff_string(self, initial_params, current_params):
126
- diff_dict = {}
127
- for key in initial_params.keys():
128
- diff = abs(current_params[key] - initial_params[key])
129
- diff_dict[key] = diff
130
-
131
- sorted_diff = sorted(diff_dict.items(), key=lambda x: x[1], reverse=True)
132
- top_5_diff = sorted_diff[:5]
133
-
134
- return "\n".join([f"{key}: {value:.4f}" for key, value in top_5_diff])
135
-
136
  def preprocess_audio(self, audio, target_sample_rate=44100):
137
  sample_rate, data = audio
138
 
@@ -166,7 +155,7 @@ class MasteringStyleTransfer:
166
 
167
  return data_tensor.to(self.device)
168
 
169
- def process_audio(self, input_audio, reference_audio, ito_reference_audio, params, perform_ito, log_ito=False):
170
  input_tensor = self.preprocess_audio(input_audio, self.args.sample_rate)
171
  reference_tensor = self.preprocess_audio(reference_audio, self.args.sample_rate)
172
  ito_reference_tensor = self.preprocess_audio(ito_reference_audio, self.args.sample_rate)
@@ -175,24 +164,7 @@ class MasteringStyleTransfer:
175
 
176
  output_audio, predicted_params = self.mastering_style_transfer(input_tensor, reference_feature)
177
 
178
- if perform_ito:
179
- ito_log = []
180
- for i in range(self.args.max_iter_ito):
181
- loss, ito_predicted_params = self.ito_step(input_audio, ito_reference_audio, predicted_params)
182
- if log_ito:
183
- top_10_diff = self.get_top_10_diff(predicted_params, ito_predicted_params)
184
- log_entry = f"Iteration {i+1}, Loss: {loss:.4f}\nTop 10 parameter differences:\n{top_10_diff}\n"
185
- ito_log.append(log_entry)
186
- predicted_params = ito_predicted_params
187
-
188
- ito_output_audio = self.converter.convert(input_audio, predicted_params)
189
- ito_log = "\n".join(ito_log) if log_ito else None
190
- else:
191
- ito_output_audio = None
192
- ito_predicted_params = None
193
- ito_log = None
194
-
195
- return output_audio, predicted_params, ito_output_audio, ito_predicted_params, ito_log, self.args.sample_rate
196
 
197
  def print_param_difference(self, initial_params, ito_params):
198
  all_diffs = []
@@ -278,7 +250,7 @@ class MasteringStyleTransfer:
278
 
279
  return "\n".join(output)
280
 
281
- def get_top_10_diff_string(self, initial_params, ito_params):
282
  if initial_params is None or ito_params is None:
283
  return "Cannot compare parameters"
284
 
@@ -299,7 +271,7 @@ class MasteringStyleTransfer:
299
  normalized_diff = abs(ito_value - initial_value)
300
  all_diffs.append((fx_name, 'width', initial_value.item(), ito_value.item(), normalized_diff.item()))
301
 
302
- top_diffs = sorted(all_diffs, key=lambda x: x[4], reverse=True)[:10]
303
 
304
  output = ["Top 10 parameter differences (sorted by normalized difference):"]
305
  for fx_name, param_name, initial_value, ito_value, normalized_diff in top_diffs:
@@ -322,42 +294,3 @@ def reload_weights(model, ckpt_path, device):
322
  new_state_dict[name] = v
323
  model.load_state_dict(new_state_dict, strict=False)
324
 
325
-
326
- if __name__ == "__main__":
327
- basis_path = '/data2/tony/Mastering_Style_Transfer/results/dasp_tcn_tuneenc_daspman_loudnessnorm/ckpt/1000/'
328
-
329
- parser = argparse.ArgumentParser(description="Mastering Style Transfer")
330
- parser.add_argument("--input_path", type=str, required=True, help="Path to input audio file")
331
- parser.add_argument("--reference_path", type=str, required=True, help="Path to reference audio file")
332
- parser.add_argument("--ito_reference_path", type=str, required=True, help="Path to ITO reference audio file")
333
- parser.add_argument("--model_path", type=str, default=f"{basis_path}dasp_tcn_tuneenc_daspman_loudnessnorm_mastering_converter_1000.pt", help="Path to mastering converter model")
334
- parser.add_argument("--encoder_path", type=str, default=f"{basis_path}dasp_tcn_tuneenc_daspman_loudnessnorm_effects_encoder_1000.pt", help="Path to effects encoder model")
335
- parser.add_argument("--perform_ito", action="store_true", help="Whether to perform ITO")
336
- parser.add_argument("--optimizer", type=str, default="RAdam", help="Optimizer for ITO")
337
- parser.add_argument("--learning_rate", type=float, default=0.001, help="Learning rate for ITO")
338
- parser.add_argument("--num_steps", type=int, default=100, help="Number of optimization steps for ITO")
339
- parser.add_argument("--af_weights", nargs='+', type=float, default=[0.1, 0.001, 1.0, 1.0, 0.1], help="Weights for AudioFeatureLoss")
340
- parser.add_argument("--sample_rate", type=int, default=44100, help="Sample rate for AudioFeatureLoss")
341
- parser.add_argument("--path_to_config", type=str, default='/home/tony/mastering_transfer/networks/configs.yaml', help="Path to network architecture configuration file")
342
-
343
- args = parser.parse_args()
344
-
345
- # load network configurations
346
- with open(args.path_to_config, 'r') as f:
347
- configs = yaml.full_load(f)
348
- args.cfg_converter = configs['TCN']['param_mapping']
349
- args.cfg_enc = configs['Effects_Encoder']['default']
350
-
351
- ito_config = {
352
- 'optimizer': args.optimizer,
353
- 'learning_rate': args.learning_rate,
354
- 'num_steps': args.num_steps,
355
- 'af_weights': args.af_weights,
356
- 'sample_rate': args.sample_rate
357
- }
358
-
359
- mastering_style_transfer = MasteringStyleTransfer(args)
360
- output_audio, predicted_params, ito_output_audio, ito_predicted_params, optimized_reference_feature, sr, ito_steps = mastering_style_transfer.process_audio(
361
- args.input_path, args.reference_path, args.ito_reference_path, ito_config, args.perform_ito
362
- )
363
-
 
108
  # Log top 5 parameter differences
109
  if step == 0:
110
  initial_params = current_params
111
+ top_5_diff = self.get_top_n_diff_string(initial_params, current_params, top_n=5)
112
  log_entry = f"Step {step + 1}, Loss: {total_loss.item():.4f}\n{top_5_diff}\n"
113
 
114
  if divergence_counter >= 10:
 
122
 
123
  return min_loss_output, min_loss_params, min_loss_embedding, min_loss_step + 1
124
 
 
 
 
 
 
 
 
 
 
 
 
125
  def preprocess_audio(self, audio, target_sample_rate=44100):
126
  sample_rate, data = audio
127
 
 
155
 
156
  return data_tensor.to(self.device)
157
 
158
+ def process_audio(self, input_audio, reference_audio, ito_reference_audio):
159
  input_tensor = self.preprocess_audio(input_audio, self.args.sample_rate)
160
  reference_tensor = self.preprocess_audio(reference_audio, self.args.sample_rate)
161
  ito_reference_tensor = self.preprocess_audio(ito_reference_audio, self.args.sample_rate)
 
164
 
165
  output_audio, predicted_params = self.mastering_style_transfer(input_tensor, reference_feature)
166
 
167
+ return output_audio, predicted_params, self.args.sample_rate
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
  def print_param_difference(self, initial_params, ito_params):
170
  all_diffs = []
 
250
 
251
  return "\n".join(output)
252
 
253
+ def get_top_n_diff_string(self, initial_params, ito_params, top_n=5):
254
  if initial_params is None or ito_params is None:
255
  return "Cannot compare parameters"
256
 
 
271
  normalized_diff = abs(ito_value - initial_value)
272
  all_diffs.append((fx_name, 'width', initial_value.item(), ito_value.item(), normalized_diff.item()))
273
 
274
+ top_diffs = sorted(all_diffs, key=lambda x: x[4], reverse=True)[:top_n]
275
 
276
  output = ["Top 10 parameter differences (sorted by normalized difference):"]
277
  for fx_name, param_name, initial_value, ito_value, normalized_diff in top_diffs:
 
294
  new_state_dict[name] = v
295
  model.load_state_dict(new_state_dict, strict=False)
296