Spaces:

jhtonyKoo
/

ITO-Master

Running

App Files Files Community

jhtonyKoo committed on Oct 16, 2024

Commit

d09ad44

1 Parent(s): c9034be

modify app

Browse files

Files changed (2) hide show

app.py +5 -5
modules/loss.py +62 -0

app.py CHANGED Viewed

@@ -158,21 +158,21 @@ with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown("Interactive demo of Inference Time Optimization (ITO) for Music Mastering Style Transfer. \
                     The mastering style transfer is performed by a differentiable audio processing model, and the predicted parameters are shown as the output. \
-                    Perform mastering style transfer with an input source audio and a reference mastering style audio. On top of this result, you can perform ITO to optimize the reference embedding z~ref~ $z_{ref}$ to further gain control over the output mastering style.")
         gr.Image("ito_snow.png", width=300)
     gr.Markdown("## Step 1: Mastering Style Transfer")
     with gr.Tab("Upload Audio"):
         with gr.Row():
-            input_audio = gr.Audio(label="Source Audio (x~in~ $x_{in}$)")
-            reference_audio = gr.Audio(label="Reference Style Audio (x~ref~ $x_{ref}$)")
         process_button = gr.Button("Process Mastering Style Transfer")
         with gr.Row():
             with gr.Column():
-                output_audio = gr.Audio(label="Output Audio (y')", type='numpy')
                 normalized_input = gr.Audio(label="Normalized Source Audio", type='numpy')
             param_output = gr.Textbox(label="Predicted Parameters", lines=5)
@@ -213,7 +213,7 @@ with gr.Blocks() as demo:
     gr.Markdown("## Step 2: Inference Time Optimization (ITO)")
     with gr.Row():
-        ito_reference_audio = gr.Audio(label="ITO Reference Style Audio (optional)")
         with gr.Column():
             num_steps = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of Steps")
             optimizer = gr.Dropdown(["Adam", "RAdam", "SGD"], value="RAdam", label="Optimizer")

     with gr.Row():
         gr.Markdown("Interactive demo of Inference Time Optimization (ITO) for Music Mastering Style Transfer. \
                     The mastering style transfer is performed by a differentiable audio processing model, and the predicted parameters are shown as the output. \
+                    Perform mastering style transfer with an input source audio and a reference mastering style audio. On top of this result, you can perform ITO to optimize the reference embedding $z_{ref}$ to further gain control over the output mastering style.")
         gr.Image("ito_snow.png", width=300)
     gr.Markdown("## Step 1: Mastering Style Transfer")
     with gr.Tab("Upload Audio"):
         with gr.Row():
+            input_audio = gr.Audio(label="Source Audio $x_{in}$")
+            reference_audio = gr.Audio(label="Reference Style Audio $x_{ref}$")
         process_button = gr.Button("Process Mastering Style Transfer")
         with gr.Row():
             with gr.Column():
+                output_audio = gr.Audio(label="Output Audio y'", type='numpy')
                 normalized_input = gr.Audio(label="Normalized Source Audio", type='numpy')
             param_output = gr.Textbox(label="Predicted Parameters", lines=5)
     gr.Markdown("## Step 2: Inference Time Optimization (ITO)")
     with gr.Row():
+        ito_reference_audio = gr.Audio(label="ITO Reference Style Audio $x'_{ref}$ (optional)")
         with gr.Column():
             num_steps = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of Steps")
             optimizer = gr.Dropdown(["Adam", "RAdam", "SGD"], value="RAdam", label="Optimizer")

modules/loss.py CHANGED Viewed

@@ -176,6 +176,68 @@ class Loss:
         )
 """

         )
+import laion_clap
+import torchaudio
+# CLAP feature loss
+class CLAPFeatureLoss(nn.Module):
+    """Distance between the CLAP audio embeddings of two waveforms.
+
+    Both signals are reduced to mono, resampled to ``target_sample_rate`` when
+    needed, quantized to the int16 grid and embedded with a pretrained
+    LAION-CLAP model; the loss is the chosen distance between the embeddings.
+
+    Args:
+        distance_fn: ``'mse'``, ``'l1'`` or ``'cosine'``.
+
+    Raises:
+        ValueError: if ``distance_fn`` is not one of the supported names.
+    """
+
+    def __init__(self, distance_fn='mse'):
+        super().__init__()
+        # Validate first so a bad argument fails before the checkpoint is
+        # downloaded and loaded.
+        if distance_fn == 'mse':
+            compute_distance = F.mse_loss
+        elif distance_fn == 'l1':
+            compute_distance = F.l1_loss
+        elif distance_fn == 'cosine':
+            compute_distance = lambda x, y: 1 - F.cosine_similarity(x, y).mean()
+        else:
+            raise ValueError(f"Unsupported distance function: {distance_fn}")
+        self.distance_fn = distance_fn
+        self.compute_distance = compute_distance
+        self.target_sample_rate = 48000  # CLAP expects 48kHz audio
+        self.model = laion_clap.CLAP_Module(enable_fusion=False)
+        self.model.load_ckpt()  # download the default pretrained checkpoint
+        # Resamplers keyed by source sample rate. Kept in a plain dict (not a
+        # ModuleDict) so the module's state_dict is unchanged.
+        self._resamplers = {}
+
+    def forward(self, input_audio, target_audio, sample_rate):
+        """Return the embedding distance between ``input_audio`` and ``target_audio``.
+
+        Args:
+            input_audio: waveform tensor, (N, T) or (N, C, T).
+            target_audio: waveform tensor, (N, T) or (N, C, T).
+            sample_rate: sample rate shared by both inputs.
+        """
+        # Ensure input is in the correct shape (N, C, T)
+        if input_audio.dim() == 2:
+            input_audio = input_audio.unsqueeze(1)
+        if target_audio.dim() == 2:
+            target_audio = target_audio.unsqueeze(1)
+        # Convert to mono if stereo
+        if input_audio.shape[1] > 1:
+            input_audio = input_audio.mean(dim=1, keepdim=True)
+        if target_audio.shape[1] > 1:
+            target_audio = target_audio.mean(dim=1, keepdim=True)
+        # Resample if necessary
+        if sample_rate != self.target_sample_rate:
+            input_audio = self.resample(input_audio, sample_rate)
+            target_audio = self.resample(target_audio, sample_rate)
+        # Quantize audio data (also drops the channel dimension)
+        input_audio = self.quantize(input_audio)
+        target_audio = self.quantize(target_audio)
+        # Get CLAP embeddings
+        input_embed = self.model.get_audio_embedding_from_data(x=input_audio, use_tensor=True)
+        target_embed = self.model.get_audio_embedding_from_data(x=target_audio, use_tensor=True)
+        # Compute loss using the specified distance function
+        return self.compute_distance(input_embed, target_embed)
+
+    def quantize(self, audio):
+        """Clamp to [-1, 1] and snap to the int16 grid, keeping gradients.
+
+        The int16 round-trip is not differentiable, so a straight-through
+        estimator is used: the forward value is the quantized signal while the
+        backward pass treats quantization as the identity.
+        """
+        audio = audio.squeeze(1)  # Remove channel dimension
+        audio = torch.clamp(audio, -1.0, 1.0)
+        quantized = (audio * 32767.0).to(torch.int16).to(torch.float32) / 32767.0
+        # The detached residual carries the quantization offset without
+        # cutting the autograd path back to ``audio``.
+        return audio + (quantized - audio).detach()
+
+    def resample(self, audio, sample_rate):
+        """Resample ``audio`` from ``sample_rate`` to ``target_sample_rate``."""
+        resampler = self._resamplers.get(sample_rate)
+        if resampler is None:
+            # Build the resampling kernel once per source rate instead of on
+            # every forward call.
+            resampler = torchaudio.transforms.Resample(
+                orig_freq=sample_rate, new_freq=self.target_sample_rate
+            )
+            self._resamplers[sample_rate] = resampler
+        return resampler.to(audio.device)(audio)
 """