# Hugging Face Space status: Running on Zero
import gradio as gr | |
import spaces | |
import torchaudio | |
from audiocraft.models import MusicGen | |
from audiocraft.data.audio import audio_write | |
import tempfile | |
import os | |
import logging | |
import torch | |
from pydub import AudioSegment | |
import io | |
import random | |
#logging.basicConfig(level=logging.DEBUG) | |
# Check if CUDA is available | |
# Tensors are placed on the GPU when torch reports CUDA, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Placeholder Utility Functions | |
#def peak_normalize(y, target_peak=0.97): | |
# return target_peak * (y / np.max(np.abs(y))) | |
# | |
#def rms_normalize(y, target_rms=0.05): | |
# return y * (target_rms / np.sqrt(np.mean(y**2))) | |
def preprocess_audio(waveform):
    """Return ``waveform`` reshaped for generation and placed on ``device``.

    The tensor is copied to the CPU, stripped of all singleton axes,
    round-tripped through NumPy, given one new leading axis and finally moved
    to the module-level ``device``. No normalisation is applied; this is the
    hook where such processing would go.
    """
    host_tensor = waveform.cpu()  # NumPy conversion requires a CPU tensor
    samples = host_tensor.squeeze().numpy()
    rebuilt = torch.from_numpy(samples)
    return rebuilt.unsqueeze(0).to(device)
def generate_drum_sample():
    """Generate one 10-second unconditional clip with the micro jungle model.

    Returns:
        The string ``'jungle.wav'``: the stem handed to ``audio_write`` plus
        the ``.wav`` extension.
    """
    stem = 'jungle'

    drum_model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
    drum_model.set_generation_params(duration=10)

    # Ask for a single sample and drop the leading axis when it has size 1.
    batch = drum_model.generate_unconditional(1)
    clip = batch.squeeze(0)

    audio_write(
        stem,
        clip.cpu(),
        drum_model.sample_rate,
        strategy="loudness",
        loudness_compressor=True,
    )
    return f'{stem}.wav'
def continue_drum_sample(existing_audio_path):
    """Extend an audio file with a continuation from the micro jungle model.

    The last ``prompt_duration`` (2) seconds of the file are used as the
    prompt for a ``output_duration`` (10) second generation. The generated
    audio already begins with the prompt (``continue_music`` in this file
    relies on the same behaviour), so it replaces the prompt portion of the
    existing audio rather than being appended after it; appending would play
    the last two seconds twice.

    Args:
        existing_audio_path: Path of the audio file to continue.

    Returns:
        Path of a newly written WAV file holding the existing audio followed
        by the continuation, saved at the sample rate of the input file.

    Raises:
        ValueError: If no path is given, or the audio is shorter than the
            prompt duration.
    """
    if not existing_audio_path:
        raise ValueError("No existing audio was provided to continue from.")

    existing_audio, sr = torchaudio.load(existing_audio_path)

    # Fixed durations, in seconds.
    prompt_duration = 2
    output_duration = 10

    # Slice the prompt from the end of the existing audio.
    num_samples = int(prompt_duration * sr)
    if existing_audio.shape[1] < num_samples:
        raise ValueError("The existing audio is too short for the specified prompt duration.")
    start_sample = existing_audio.shape[1] - num_samples
    prompt_waveform = existing_audio[..., start_sample:].to(device)

    model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
    model.set_generation_params(duration=output_duration)

    output = model.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
    output = output.detach().cpu()  # concatenation and saving happen on the CPU
    if output.dim() == 3:  # [batch_size, channels, samples]
        output = output.squeeze(0)
    if output.dim() == 1:  # mono without a channel axis -> [1, samples]
        output = output.unsqueeze(0)

    # The file is saved at the input's rate, so bring the generated audio to
    # that rate first; otherwise it would play at the wrong speed.
    if model.sample_rate != sr:
        output = torchaudio.functional.resample(output, model.sample_rate, sr)

    # torch.cat needs matching channel counts (e.g. stereo upload, mono model):
    # downmix the generated audio to mono, then repeat it across the channels.
    channel_count = existing_audio.shape[0]
    if output.shape[0] != channel_count:
        output = output.mean(dim=0, keepdim=True).expand(channel_count, -1)

    # Keep everything before the prompt; the output supplies the prompt itself
    # plus the newly generated material.
    combined_audio = torch.cat((existing_audio[..., :start_sample], output), dim=1)

    combined_file_path = f'./continued_jungle_{random.randint(1000, 9999)}.wav'
    torchaudio.save(combined_file_path, combined_audio, sr)
    return combined_file_path
def generate_music(wav_filename, prompt_duration, musicgen_model, output_duration):
    """Continue the opening of an audio file with a selected MusicGen model.

    Args:
        wav_filename: Path of the audio file whose beginning is the prompt.
        prompt_duration: Seconds taken from the start of the file.
        musicgen_model: Model label; the text before the first space is the
            name passed to ``MusicGen.get_pretrained`` (the UI labels look
            like ``"<repo id> (<size>)"``).
        output_duration: Value passed as the generation ``duration``.

    Returns:
        The string ``'continued_music.wav'``.
    """
    source_audio, source_rate = torchaudio.load(wav_filename)
    source_audio = source_audio.to(device)

    checkpoint = musicgen_model.split(" ")[0]
    model_continue = MusicGen.get_pretrained(checkpoint)

    generation_settings = {
        "use_sampling": True,
        "top_k": 250,
        "top_p": 0.0,
        "temperature": 1.0,
        "duration": output_duration,
        "cfg_coef": 3,
    }
    model_continue.set_generation_params(**generation_settings)

    # The prompt is the first `prompt_duration` seconds of the file.
    prompt_end = int(prompt_duration * source_rate)
    prompt_waveform = preprocess_audio(source_audio[..., :prompt_end])

    generated = model_continue.generate_continuation(
        prompt_waveform,
        prompt_sample_rate=source_rate,
        progress=True,
    ).cpu()

    # Collapse singleton axes when more than two dimensions come back.
    if generated.dim() > 2:
        generated = generated.squeeze()

    stem = 'continued_music'
    audio_write(
        stem,
        generated,
        model_continue.sample_rate,
        strategy="loudness",
        loudness_compressor=True,
    )
    return f'{stem}.wav'
def continue_music(input_audio_path, prompt_duration, musicgen_model, output_duration):
    """Continue an audio file from its end with a selected MusicGen model.

    The last ``prompt_duration`` seconds of the input are used as the prompt.
    The generated audio replaces that prompt portion, and the result is
    exported as an MP3.

    Args:
        input_audio_path: Path of the audio file to continue. Any container
            the pydub/ffmpeg backend can detect is accepted (the UI feeds
            this function the ``.wav`` produced by ``generate_music``).
        prompt_duration: Seconds taken from the end of the input as prompt.
        musicgen_model: Model label; the text before the first space is the
            name passed to ``MusicGen.get_pretrained``.
        output_duration: Value passed as the generation ``duration``.

    Returns:
        File name of the exported ``combined_audio_<n>.mp3``.

    Raises:
        ValueError: If no path is given, or ``prompt_duration`` exceeds the
            length of the input audio.
    """
    if not input_audio_path:
        raise ValueError("No input audio was provided to continue from.")

    model_continue = MusicGen.get_pretrained(musicgen_model.split(" ")[0])
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=output_duration,
        cfg_coef=3
    )

    # from_file lets the backend detect the format instead of forcing MP3.
    original_audio = AudioSegment.from_file(input_audio_path)

    # pydub slices in milliseconds.
    total_ms = original_audio.duration_seconds * 1000
    prompt_ms = prompt_duration * 1000
    if total_ms < prompt_ms:
        raise ValueError("The prompt_duration is longer than the current audio length.")
    start_time = total_ms - prompt_ms
    prompt_audio = original_audio[start_time:]

    # Export the prompt into an in-memory buffer so no temp-file handle is
    # left open, then read it back as a tensor together with its own rate.
    prompt_buffer = io.BytesIO()
    prompt_audio.export(prompt_buffer, format="wav")
    prompt_buffer.seek(0)
    prompt_waveform, prompt_sr = torchaudio.load(prompt_buffer)
    prompt_waveform = preprocess_audio(prompt_waveform.to(device))

    output = model_continue.generate_continuation(
        prompt_waveform, prompt_sample_rate=prompt_sr, progress=True
    )
    output = output.cpu()
    if output.dim() > 2:
        output = output.squeeze()

    # Write the intermediate WAV into a private scratch directory: concurrent
    # requests cannot collide and the file is removed even if loading fails.
    # audio_write appends the extension itself and returns the final path.
    with tempfile.TemporaryDirectory() as scratch_dir:
        generated_path = audio_write(
            os.path.join(scratch_dir, "continuation"),
            output,
            model_continue.sample_rate,
            strategy="loudness",
            loudness_compressor=True,
        )
        generated_audio_segment = AudioSegment.from_wav(str(generated_path))

    # Replace the prompt portion with the generated audio, which starts with
    # the prompt itself.
    combined_audio = original_audio[:start_time] + generated_audio_segment

    combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
    # export() hands back the open output file; close it so the data is flushed.
    combined_audio.export(combined_audio_filename, format="mp3").close()
    return combined_audio_filename
# Define the expandable sections
# Markdown shown in the "more info" accordion: the micro-musicgen model.
musicgen_micro_blurb = """
## musicgen_micro
musicgen micro is an experimental series of models by aaron abebe. they are incredibly fast, and extra insane. this one does goated jungle drums. we're very excited about these.
[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> aaron's github](https://github.com/aaronabebe/)
[<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musicgen-micro on huggingface](https://huggingface.co/pharoAIsanders420/micro-musicgen-jungle)
"""
# Markdown shown in the "more info" accordion: MusicGen and continuations.
musicgen_blurb = """
## musicgen
musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.
[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> audiocraft github](https://github.com/facebookresearch/audiocraft)
visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.
see also https://youtube.com/@thecollabagepatch
"""
# Markdown shown in the "more info" accordion: credits for the fine-tunes.
finetunes_blurb = """
## fine-tuned models
the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.
[<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)
[<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)
"""
# Define the fine-tunes blurb for each model
# NOTE(review): the "π" after each Twitter link looks like a mis-decoded
# emoji; it is kept as found - confirm the intended character.
fine_tunes_info = """
## thepatch/vanya_ai_dnb_0.1
thepatch/vanya_ai_dnb_0.1 was trained by vanya. [vanya's Twitter](https://twitter.com/@veryVANYA) π - it treats almost all input audio as the beginning of a buildup to a dnb drop (can do downtempo well)
## thepatch/bleeps-medium
thepatch/bleeps-medium was trained by kevin and lyra [lyra's Twitter](https://twitter.com/@_lyraaaa_) π - it is a medium model. it's more melodic and ambient sometimes than vanya's, but there's a 50/50 chance it gets real heavy with the edm vibes. It can be amazing at turning your chords into pads, and is a good percussionist.
## thepatch/budots_remix
thepatch/budots_remix was trained by MJ BERSABEph. budots is a dope niche genre from the philippines apparently. this one will often do fascinating, demonic, kinds of vocal chopping. warning: it tends to speed up and slow down tempo, which makes it hard to use in a daw.
## thepatch/hoenn_lofi
thepatch/hoenn_lofi is a large fine-tune by hoenn. [hoenn's Twitter](https://twitter.com/@eschatolocation) π - this model is a large boi, and it shows. even tho it is trained to do lo-fi, its ability to run with your melodies and not ruin them is unparalleled among the fine-tunes so far.
## thepatch/PhonkV2
thepatch/PhonkV2 was trained by MJ BERSABEph. there are multiple versions in the discord.
## foureyednymph/musicgen-sza-sos-small
foureyednymph/musicgen-sza-sos-small was just trained by foureyednymph. We're all about to find out if it does continuations well.
"""
# Create the Gradio interface
# NOTE(review): the nesting of the containers below is reconstructed from the
# statement order - confirm it matches the intended page layout.

# Dropdown labels; the text before the first space is the model name that the
# generation functions pass to MusicGen.get_pretrained.
_MODEL_CHOICES = [
    "thepatch/vanya_ai_dnb_0.1 (small)",
    "thepatch/budots_remix (small)",
    "thepatch/PhonkV2 (small)",
    "thepatch/bleeps-medium (medium)",
    "thepatch/hoenn_lofi (large)",
    "foureyednymph/musicgen-sza-sos-small (small)"
]

with gr.Blocks() as iface:
    # Page header and description.
    gr.Markdown("# the-micro-slot-machine")
    gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
    gr.Markdown("this is an even weirder slot machine than the other one. on the left, you get to generate some state of the art lo-fi jungle drums at incredible speed thanks to aaron's new class of model, and if you want you can have it continue its own output. Then, you can either press the generate_music button to use the first 5 seconds as a prompt, or you can re-upload the audio into the continue_music section to have a fine-tune continue from the end of the jungle drum output, however long and insane it is. think of this as a very weird relay race and you're winning.")

    # Collapsible background information.
    with gr.Accordion("more info", open=False):
        for blurb in (musicgen_micro_blurb, musicgen_blurb, finetunes_blurb):
            gr.Markdown(blurb)

    with gr.Accordion("fine-tunes info", open=False):
        gr.Markdown(fine_tunes_info)

    with gr.Row():
        # Drum generation controls.
        with gr.Column():
            generate_button = gr.Button("Generate Drum Sample")
            drum_audio = gr.Audio(
                label="Generated Drum Sample",
                type="filepath",
                interactive=True,
                show_download_button=True,
            )
            continue_drum_sample_button = gr.Button("Continue Drum Sample")

        # Fine-tune continuation controls.
        with gr.Column():
            prompt_duration = gr.Dropdown(
                label="Prompt Duration (seconds)",
                choices=list(range(1, 11)),
                value=5,
            )
            output_duration = gr.Slider(
                label="Output Duration (seconds)",
                minimum=10,
                maximum=30,
                step=1,
                value=20,
            )
            musicgen_model = gr.Dropdown(
                label="MusicGen Model",
                choices=_MODEL_CHOICES,
                value=_MODEL_CHOICES[0],
            )
            generate_music_button = gr.Button("Generate Music")
            output_audio = gr.Audio(label="Generated Music", type="filepath")
            continue_button = gr.Button("Continue Generating Music")
            continue_output_audio = gr.Audio(label="Continued Music Output", type="filepath")

    # Connecting the components
    generation_inputs = [prompt_duration, musicgen_model, output_duration]

    generate_button.click(generate_drum_sample, outputs=[drum_audio])
    continue_drum_sample_button.click(
        continue_drum_sample,
        inputs=[drum_audio],
        outputs=[drum_audio],
    )
    generate_music_button.click(
        generate_music,
        inputs=[drum_audio, *generation_inputs],
        outputs=[output_audio],
    )
    continue_button.click(
        continue_music,
        inputs=[output_audio, *generation_inputs],
        outputs=continue_output_audio,
    )

iface.launch()