Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
import soundfile as sf | |
import numpy as np | |
import yaml | |
from inference import MasteringStyleTransfer | |
from utils import download_youtube_audio | |
from config import args | |
mastering_transfer = MasteringStyleTransfer(args) | |
def denormalize_audio(audio, dtype=np.int16):
    """
    Convert audio in the nominal range [-1, 1] to the requested sample format.

    Args:
        audio: Array-like of audio samples (anything ``np.asarray`` accepts).
        dtype: Target sample type. Any NumPy dtype specifier equivalent to
            ``np.int16`` or ``np.float32`` is accepted (e.g. ``"int16"``,
            ``np.dtype("float32")``).

    Returns:
        A NumPy array with the same shape as ``audio``:
        * int16: samples are clipped to [-1, 1] and scaled by 32767.
        * float32: samples are cast as-is (no clipping is applied).

    Raises:
        ValueError: If ``dtype`` is not int16 or float32.
    """
    try:
        # Normalizing through np.dtype lets strings and dtype instances match
        # the same branches as the NumPy scalar types.
        target = np.dtype(dtype)
    except (TypeError, ValueError) as err:
        raise ValueError("Unsupported dtype. Use np.int16 or np.float32.") from err

    samples = np.asarray(audio)

    if target == np.dtype(np.int16):
        # Clip first so out-of-range samples saturate instead of wrapping
        # around when cast to int16.
        clipped = np.clip(samples, -1, 1)
        return (clipped * 32767).astype(np.int16)
    if target == np.dtype(np.float32):
        return samples.astype(np.float32)
    raise ValueError("Unsupported dtype. Use np.int16 or np.float32.")
def _to_gradio_audio(audio):
    """
    Convert a model output into an int16 array laid out as (samples, channels).

    Args:
        audio: A ``torch.Tensor`` or array-like of samples in [-1, 1].

    Returns:
        An int16 NumPy array. 1-D input becomes ``(samples, 1)``; 2-D input
        whose first axis is shorter than its second is transposed so the
        longer axis (samples) comes first.
    """
    if isinstance(audio, torch.Tensor):
        # detach() is required before numpy() if the tensor still tracks
        # gradients; it is a no-op otherwise.
        audio = audio.detach().cpu().numpy()

    audio = denormalize_audio(audio, dtype=np.int16)

    if audio.ndim > 2:
        # Drop singleton axes (e.g. a leading batch axis of size 1).
        audio = audio.squeeze()

    if audio.ndim == 1:
        audio = audio.reshape(-1, 1)
    elif audio.ndim == 2 and audio.shape[0] < audio.shape[1]:
        # squeeze() keeps the axis order, so a channels-first array would
        # still be (channels, samples) here. Real audio always has more
        # samples than channels, so the shorter leading axis is treated as
        # channels and moved last.
        audio = np.ascontiguousarray(audio.T)
    return audio


def process_audio(input_audio, reference_audio):
    """
    Run mastering style transfer (without ITO) and format the result for Gradio.

    Args:
        input_audio: Audio to be mastered, as delivered by the input component.
        reference_audio: Audio whose mastering style is transferred. It is
            also passed as the ITO reference, which is unused here because
            the ITO flag is ``False``.

    Returns:
        ``((sample_rate, audio), param_output)`` where ``audio`` is an int16
        array shaped (samples, channels) and ``param_output`` is the string
        produced by ``get_param_output_string`` for the predicted parameters.
    """
    # The empty dict is presumably the ITO configuration (unused when ITO is
    # disabled) - TODO confirm against MasteringStyleTransfer.process_audio.
    output_audio, predicted_params, _, _, _, sr = mastering_transfer.process_audio(
        input_audio, reference_audio, reference_audio, {}, False
    )
    param_output = mastering_transfer.get_param_output_string(predicted_params)
    output_audio = _to_gradio_audio(output_audio)

    print(output_audio.shape)
    print(param_output)
    return (sr, output_audio), param_output


def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
    """
    Run inference-time optimization (ITO) and stream each intermediate result.

    Args:
        input_audio: Audio to be mastered.
        reference_audio: Reference used to compute the initial embedding.
        ito_reference_audio: Reference used as the optimization target;
            falls back to ``reference_audio`` when ``None``.
        num_steps: Number of optimization steps.
        optimizer: Optimizer name forwarded in the ITO config.
        learning_rate: Learning rate forwarded in the ITO config.
        af_weights: Audio-feature loss weights forwarded in the ITO config.

    Yields:
        ``((sample_rate, audio), param_output, step, log)`` for every item
        produced by ``inference_time_optimization``. ``audio`` is an int16
        array shaped (samples, channels) and ``log`` is the concatenation of
        all log entries received so far.
    """
    if ito_reference_audio is None:
        ito_reference_audio = reference_audio

    ito_config = {
        'optimizer': optimizer,
        'learning_rate': learning_rate,
        'num_steps': num_steps,
        'af_weights': af_weights,
        'sample_rate': args.sample_rate
    }

    input_tensor = mastering_transfer.preprocess_audio(input_audio, args.sample_rate)
    reference_tensor = mastering_transfer.preprocess_audio(reference_audio, args.sample_rate)
    ito_reference_tensor = mastering_transfer.preprocess_audio(ito_reference_audio, args.sample_rate)

    initial_reference_feature = mastering_transfer.get_reference_embedding(reference_tensor)

    ito_log = ""
    for log_entry, current_output, current_params, step in mastering_transfer.inference_time_optimization(
        input_tensor, ito_reference_tensor, ito_config, initial_reference_feature
    ):
        # The full log so far is re-sent each step so the textbox always
        # shows the complete history.
        ito_log += log_entry
        ito_param_output = mastering_transfer.get_param_output_string(current_params)
        current_output = _to_gradio_audio(current_output)
        yield (args.sample_rate, current_output), ito_param_output, step, ito_log
# Gradio UI: a single tab with one-shot mastering style transfer on top and
# inference-time optimization (ITO) controls below it.
with gr.Blocks() as demo:
    gr.Markdown("# Mastering Style Transfer Demo")

    with gr.Tab("Upload Audio"):
        with gr.Row():
            input_audio = gr.Audio(label="Input Audio")
            reference_audio = gr.Audio(label="Reference Audio")

        process_button = gr.Button("Process Mastering Style Transfer")

        with gr.Row():
            # gr.Audio accepts "numpy" or "filepath" as its type; "numpy" is
            # the one matching the (sample_rate, ndarray) tuple returned by
            # process_audio.
            output_audio = gr.Audio(label="Output Audio", type="numpy")
            param_output = gr.Textbox(label="Predicted Parameters", lines=10)

        process_button.click(
            process_audio,
            inputs=[input_audio, reference_audio],
            outputs=[output_audio, param_output]
        )

        gr.Markdown("## Inference Time Optimization (ITO)")

        with gr.Row():
            with gr.Column(scale=2):
                ito_reference_audio = gr.Audio(label="ITO Reference Audio (optional)")
                num_steps = gr.Slider(minimum=1, maximum=1000, value=100, step=1, label="Number of Steps")
                optimizer = gr.Dropdown(["Adam", "RAdam", "SGD"], value="RAdam", label="Optimizer")
                learning_rate = gr.Slider(minimum=0.0001, maximum=0.1, value=0.001, step=0.0001, label="Learning Rate")
                af_weights = gr.Textbox(label="AudioFeatureLoss Weights (comma-separated)", value="0.1,0.001,1.0,1.0,0.1")
                ito_button = gr.Button("Perform ITO")
                ito_output_audio = gr.Audio(label="ITO Output Audio")
                ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10)
                ito_steps_taken = gr.Number(label="ITO Steps Taken")
            with gr.Column(scale=1):
                ito_log = gr.Textbox(label="ITO Log", lines=30)

        def run_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
            """
            Parse the comma-separated weight string and stream ITO results.

            Yields the same tuples as ``perform_ito``. A ``ValueError`` from
            ``float`` propagates if any weight is not a valid number.
            """
            parsed_weights = [float(w.strip()) for w in af_weights.split(',')]
            # This must be a generator function (not a function returning a
            # generator) for Gradio to stream each step to the outputs.
            yield from perform_ito(
                input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, parsed_weights
            )

        ito_button.click(
            run_ito,
            inputs=[input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
            outputs=[ito_output_audio, ito_param_output, ito_steps_taken, ito_log]
        )

# Generator handlers are served through the queue. Older Gradio releases
# require enabling it explicitly; on newer ones this call is harmless.
demo.queue()
demo.launch()
# import gradio as gr | |
# import torch | |
# import soundfile as sf | |
# import numpy as np | |
# import yaml | |
# from inference import MasteringStyleTransfer | |
# from utils import download_youtube_audio | |
# from config import args | |
# mastering_transfer = MasteringStyleTransfer(args) | |
# def process_audio(input_audio, reference_audio, perform_ito, ito_reference_audio=None): | |
# # Process the audio files | |
# output_audio, predicted_params, ito_output_audio, ito_predicted_params, ito_log, sr = mastering_transfer.process_audio( | |
# input_audio, reference_audio, ito_reference_audio if ito_reference_audio else reference_audio, {}, perform_ito | |
# ) | |
# # Generate parameter output strings | |
# param_output = mastering_transfer.get_param_output_string(predicted_params) | |
# ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed" | |
# # Generate top 10 differences if ITO was performed | |
# top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed" | |
# return "output_mastered.wav", "ito_output_mastered.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff, ito_log | |
# def process_with_ito(input_audio, reference_audio, perform_ito, use_same_reference, ito_reference_audio): | |
# ito_ref = reference_audio if use_same_reference else ito_reference_audio | |
# return process_audio(input_audio, reference_audio, perform_ito, ito_ref) | |
# def process_youtube_with_ito(input_url, reference_url, perform_ito, use_same_reference, ito_reference_url): | |
# input_audio = download_youtube_audio(input_url) | |
# reference_audio = download_youtube_audio(reference_url) | |
# ito_ref = reference_audio if use_same_reference else download_youtube_audio(ito_reference_url) | |
# output_audio, predicted_params, ito_output_audio, ito_predicted_params, ito_log, sr = mastering_transfer.process_audio( | |
# input_audio, reference_audio, ito_ref, {}, perform_ito, log_ito=True | |
# ) | |
# param_output = mastering_transfer.get_param_output_string(predicted_params) | |
# ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed" | |
# top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed" | |
# return "output_mastered_yt.wav", "ito_output_mastered_yt.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff, ito_log | |
# with gr.Blocks() as demo: | |
# gr.Markdown("# Mastering Style Transfer Demo") | |
# with gr.Tab("Upload Audio"): | |
# input_audio = gr.Audio(label="Input Audio") | |
# reference_audio = gr.Audio(label="Reference Audio") | |
# perform_ito = gr.Checkbox(label="Perform ITO") | |
# with gr.Column(visible=False) as ito_options: | |
# use_same_reference = gr.Checkbox(label="Use same reference audio for ITO", value=True) | |
# ito_reference_audio = gr.Audio(label="ITO Reference Audio", visible=False) | |
# def update_ito_options(perform_ito): | |
# return gr.Column.update(visible=perform_ito) | |
# def update_ito_reference(use_same): | |
# return gr.Audio.update(visible=not use_same) | |
# perform_ito.change(fn=update_ito_options, inputs=perform_ito, outputs=ito_options) | |
# use_same_reference.change(fn=update_ito_reference, inputs=use_same_reference, outputs=ito_reference_audio) | |
# submit_button = gr.Button("Process") | |
# output_audio = gr.Audio(label="Output Audio") | |
# ito_output_audio = gr.Audio(label="ITO Output Audio") | |
# param_output = gr.Textbox(label="Predicted Parameters", lines=10) | |
# ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10) | |
# top_10_diff = gr.Textbox(label="Top 10 Parameter Differences", lines=10) | |
# ito_log = gr.Textbox(label="ITO Log", lines=20) | |
# submit_button.click( | |
# process_with_ito, | |
# inputs=[input_audio, reference_audio, perform_ito, use_same_reference, ito_reference_audio], | |
# outputs=[output_audio, ito_output_audio, param_output, ito_param_output, top_10_diff, ito_log] | |
# ) | |
# with gr.Tab("YouTube URLs"): | |
# input_url = gr.Textbox(label="Input YouTube URL") | |
# reference_url = gr.Textbox(label="Reference YouTube URL") | |
# perform_ito_yt = gr.Checkbox(label="Perform ITO") | |
# with gr.Column(visible=False) as ito_options_yt: | |
# use_same_reference_yt = gr.Checkbox(label="Use same reference audio for ITO", value=True) | |
# ito_reference_url = gr.Textbox(label="ITO Reference YouTube URL", visible=False) | |
# def update_ito_options_yt(perform_ito): | |
# return gr.Column.update(visible=perform_ito) | |
# def update_ito_reference_yt(use_same): | |
# return gr.Textbox.update(visible=not use_same) | |
# perform_ito_yt.change(fn=update_ito_options_yt, inputs=perform_ito_yt, outputs=ito_options_yt) | |
# use_same_reference_yt.change(fn=update_ito_reference_yt, inputs=use_same_reference_yt, outputs=ito_reference_url) | |
# submit_button_yt = gr.Button("Process") | |
# output_audio_yt = gr.Audio(label="Output Audio") | |
# ito_output_audio_yt = gr.Audio(label="ITO Output Audio") | |
# param_output_yt = gr.Textbox(label="Predicted Parameters", lines=10) | |
# ito_param_output_yt = gr.Textbox(label="ITO Predicted Parameters", lines=10) | |
# top_10_diff_yt = gr.Textbox(label="Top 10 Parameter Differences", lines=10) | |
# ito_log_yt = gr.Textbox(label="ITO Log", lines=20) | |
# submit_button_yt.click( | |
# process_youtube_with_ito, | |
# inputs=[input_url, reference_url, perform_ito_yt, use_same_reference_yt, ito_reference_url], | |
# outputs=[output_audio_yt, ito_output_audio_yt, param_output_yt, ito_param_output_yt, top_10_diff_yt, ito_log_yt] | |
# ) | |
# demo.launch() |