File size: 14,060 Bytes
0e5a9e8
c22018b
0e5a9e8
 
 
 
 
 
 
 
 
 
c22018b
0e5a9e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4980832
0e5a9e8
 
 
 
 
 
 
 
 
 
 
 
4980832
0e5a9e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce1516d
0e5a9e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce1516d
0e5a9e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6070bf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f20dbe8
 
 
6070bf8
 
0e5a9e8
 
528c312
0e5a9e8
 
 
6070bf8
ea6fe3e
 
 
6070bf8
ea6fe3e
 
6070bf8
0e5a9e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f20dbe8
0e5a9e8
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
import gradio as gr
import spaces
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import tempfile
import os
import logging
import torch
from pydub import AudioSegment
import io
import random


#logging.basicConfig(level=logging.DEBUG)

# Choose the compute device once at import time: the GPU when CUDA is
# available, otherwise the CPU. The functions below move tensors here
# with `.to(device)`.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Placeholder Utility Functions
#def peak_normalize(y, target_peak=0.97):
#    return target_peak * (y / np.max(np.abs(y)))
#
#def rms_normalize(y, target_rms=0.05):
#    return y * (target_rms / np.sqrt(np.mean(y**2)))

def preprocess_audio(waveform):
    """Round-trip a waveform through NumPy and return it on ``device``.

    The tensor is copied to the CPU, stripped of every size-1 dimension,
    converted to a NumPy array, turned back into a tensor, given a new
    leading dimension of size 1 and finally moved to the module-level
    ``device``. No normalisation is applied here.

    Args:
        waveform: Audio tensor. Presumably ``[channels, samples]`` as
            returned by ``torchaudio.load`` -- confirm against callers.

    Returns:
        A tensor with the same sample values and one extra leading
        dimension, located on ``device``.
    """
    # .numpy() only works on CPU tensors, hence the explicit .cpu() first.
    samples_np = waveform.cpu().squeeze().numpy()
    restored = torch.from_numpy(samples_np)
    return restored.unsqueeze(0).to(device)

@spaces.GPU(duration=10)
def generate_drum_sample():
    """Generate one unconditional drum clip and write it to ``jungle.wav``.

    ``MusicGen.get_pretrained`` is called on every invocation with the
    ``pharoAIsanders420/micro-musicgen-jungle`` checkpoint, and the
    generation duration is set to 10 seconds.

    Returns:
        The relative file name ``'jungle.wav'``. The name is fixed, so each
        call writes to the same path.
    """
    drum_model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
    drum_model.set_generation_params(duration=10)

    # A batch of one sample is requested; drop the leading batch axis.
    clip = drum_model.generate_unconditional(1).squeeze(0)

    stem = 'jungle'
    # audio_write receives the stem only; the returned name below assumes it
    # adds the ".wav" suffix itself.
    audio_write(
        stem,
        clip.cpu(),
        drum_model.sample_rate,
        strategy="loudness",
        loudness_compressor=True,
    )

    return f'{stem}.wav'

@spaces.GPU(duration=10)
def continue_drum_sample(existing_audio_path):
    """Extend a drum clip by letting the jungle model continue from its tail.

    The last 2 seconds of the clip are used as the prompt for
    ``generate_continuation`` with a generation duration of 10 seconds. The
    model output already starts with the prompt, so the prompt portion is
    dropped from the existing clip before the two are joined; otherwise the
    final 2 seconds would be heard twice at the seam.

    Args:
        existing_audio_path: Path of the audio file to extend.

    Returns:
        Path of a newly written ``./continued_jungle_<4 digits>.wav`` file
        containing the existing clip followed by the continuation.

    Raises:
        ValueError: If the clip is shorter than the 2-second prompt.
    """
    existing_audio, sr = torchaudio.load(existing_audio_path)
    existing_audio = existing_audio.to(device)

    # Fixed durations, in seconds.
    prompt_duration = 2
    output_duration = 10

    # The prompt is the trailing `prompt_duration` seconds of the clip.
    num_samples = int(prompt_duration * sr)
    if existing_audio.shape[1] < num_samples:
        raise ValueError("The existing audio is too short for the specified prompt duration.")

    start_sample = existing_audio.shape[1] - num_samples
    prompt_waveform = existing_audio[..., start_sample:]

    # The checkpoint is loaded on every call.
    model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
    model.set_generation_params(duration=output_duration)

    output = model.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
    output = output.to(device)  # same device as existing_audio for torch.cat

    # Normalise the output to [channels, samples].
    if output.dim() == 3:
        output = output.squeeze(0)
    if output.dim() == 1:
        output = output.unsqueeze(0)

    # Keep only the audio that precedes the prompt: `output` re-renders the
    # prompt itself and then continues from it.
    # NOTE(review): `output` is at model.sample_rate while the file is saved
    # at `sr`; this assumes the two match, which holds for clips written by
    # generate_drum_sample -- confirm for user-uploaded audio.
    head = existing_audio[..., :start_sample]
    combined_audio = torch.cat((head, output), dim=1).cpu()

    combined_file_path = f'./continued_jungle_{random.randint(1000, 9999)}.wav'
    torchaudio.save(combined_file_path, combined_audio, sr)

    return combined_file_path

# Request a 120-second GPU allocation for this handler.
@spaces.GPU(duration=120)
def generate_music(wav_filename, prompt_duration, musicgen_model, output_duration):
    """Continue the beginning of an audio file with a MusicGen fine-tune.

    Args:
        wav_filename: Path of the audio file whose first ``prompt_duration``
            seconds are used as the prompt.
        prompt_duration: Prompt length in seconds, measured from the start
            of the file.
        musicgen_model: Model label of the form ``"<repo_id> (<size>)"``;
            only the text before the first space is passed to
            ``MusicGen.get_pretrained``.
        output_duration: Value passed as ``duration`` to
            ``set_generation_params``.

    Returns:
        The relative file name ``'continued_music.wav'``. The name is fixed,
        so each call writes to the same path.
    """
    song, sr = torchaudio.load(wav_filename)
    song = song.to(device)

    # Strip the " (size)" suffix that the UI appends to the repo id.
    model_name = musicgen_model.split(" ")[0]
    model_continue = MusicGen.get_pretrained(model_name)

    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=output_duration,
        cfg_coef=3
    )

    # Slicing past the end is harmless: a short file simply yields a
    # shorter prompt.
    prompt_waveform = song[..., :int(prompt_duration * sr)]
    prompt_waveform = preprocess_audio(prompt_waveform)

    output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
    output = output.cpu()

    # audio_write is handed at most a 2-D tensor; drop size-1 axes otherwise.
    if output.dim() > 2:
        output = output.squeeze()

    filename_without_extension = 'continued_music'
    # audio_write receives the stem only; the returned name assumes it adds
    # the ".wav" suffix itself.
    audio_write(filename_without_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)

    return f'{filename_without_extension}.wav'

@spaces.GPU(duration=120)
def continue_music(input_audio_path, prompt_duration, musicgen_model, output_duration):
    """Continue an audio file from its end with a MusicGen fine-tune.

    The last ``prompt_duration`` seconds of the input are used as the
    prompt. The model output (which starts with the prompt) replaces that
    tail, and the result is exported as an mp3.

    Args:
        input_audio_path: Path of the audio file to continue.
        prompt_duration: Prompt length in seconds, measured from the end of
            the file.
        musicgen_model: Model label of the form ``"<repo_id> (<size>)"``;
            only the text before the first space is passed to
            ``MusicGen.get_pretrained``.
        output_duration: Value passed as ``duration`` to
            ``set_generation_params``.

    Returns:
        The relative file name ``combined_audio_<n>.mp3`` of the exported
        result.

    Raises:
        ValueError: If the input is shorter than ``prompt_duration``.
    """
    # Let the decoder detect the container instead of forcing mp3: the file
    # handed over by the UI is usually a wav.
    current_audio = AudioSegment.from_file(input_audio_path)

    # Validate before loading the model so bad requests fail cheaply.
    prompt_ms = prompt_duration * 1000
    total_ms = current_audio.duration_seconds * 1000
    if total_ms < prompt_ms:
        raise ValueError("The prompt_duration is longer than the current audio length.")

    start_time = total_ms - prompt_ms
    prompt_audio = current_audio[start_time:]

    # Export into an in-memory buffer; exporting without a target would
    # create a temporary file whose handle is never closed.
    prompt_buffer = io.BytesIO()
    prompt_audio.export(prompt_buffer, format="wav")
    prompt_buffer.seek(0)
    prompt_waveform, prompt_sr = torchaudio.load(prompt_buffer)
    prompt_waveform = preprocess_audio(prompt_waveform.to(device))

    model_continue = MusicGen.get_pretrained(musicgen_model.split(" ")[0])
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=output_duration,
        cfg_coef=3
    )

    output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=prompt_sr, progress=True)
    output = output.cpu()

    # audio_write is handed at most a 2-D tensor; drop size-1 axes otherwise.
    if output.dim() > 2:
        output = output.squeeze()

    # audio_write appends its own ".wav" to the name it is given, so the
    # file on disk ends in ".wav.wav".
    filename_with_extension = 'continue_0.wav'
    correct_filename_extension = f'{filename_with_extension}.wav'

    try:
        audio_write(filename_with_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)
        generated_audio_segment = AudioSegment.from_wav(correct_filename_extension)

        # Replace the prompt portion with the generated audio, which already
        # begins with the prompt.
        current_audio = current_audio[:start_time] + generated_audio_segment

        combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
        # export() returns the open output handle; close it explicitly.
        current_audio.export(combined_audio_filename, format="mp3").close()
    finally:
        # Remove the intermediate wav even when a step above failed.
        if os.path.exists(correct_filename_extension):
            os.remove(correct_filename_extension)

    return combined_audio_filename

# Markdown blurbs rendered inside the "more info" accordion of the UI.
# Markdown (with inline HTML <img> icons) describing the micro-musicgen
# jungle model used by the drum-sample handlers.
musicgen_micro_blurb = """

## musicgen_micro

musicgen micro is an experimental series of models by aaron abebe. they are incredibly fast, and extra insane. this one does goated jungle drums. we're very excited about these.

[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> aaron's github](https://github.com/aaronabebe/)

[<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musicgen-micro on huggingface](https://huggingface.co/pharoAIsanders420/micro-musicgen-jungle)

"""

# Markdown describing MusicGen itself and what a "continuation" is.
musicgen_blurb = """

## musicgen

musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.

[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> audiocraft github](https://github.com/facebookresearch/audiocraft)

visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.

see also https://youtube.com/@thecollabagepatch 

"""

# Markdown crediting the community that provides the fine-tuned models.
finetunes_blurb = """

## fine-tuned models

the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.

[<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)

[<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)

"""

# Markdown with one section per fine-tuned model offered in the model
# dropdown; rendered inside the "fine-tunes info" accordion.
# The link emoji is written as a real UTF-8 character so the UI does not
# show mis-decoded bytes.
fine_tunes_info = """

## thepatch/vanya_ai_dnb_0.1

thepatch/vanya_ai_dnb_0.1 was trained by vanya. [vanya's Twitter](https://twitter.com/@veryVANYA) 🔗 - it treats almost all input audio as the beginning of a buildup to a dnb drop (can do downtempo well)



## thepatch/bleeps-medium

thepatch/bleeps-medium was trained by kevin and lyra [lyra's Twitter](https://twitter.com/@_lyraaaa_) 🔗 - it is a medium model. it's more melodic and ambient sometimes than vanya's, but there's a 50/50 chance it gets real heavy with the edm vibes. It can be amazing at turning your chords into pads, and is a good percussionist.



## thepatch/budots_remix

thepatch/budots_remix was trained by MJ BERSABEph. budots is a dope niche genre from the philippines apparently. this one will often do fascinating, demonic, kinds of vocal chopping. warning: it tends to speed up and slow down tempo, which makes it hard to use in a daw.



## thepatch/hoenn_lofi

thepatch/hoenn_lofi is a large fine-tune by hoenn. [hoenn's Twitter](https://twitter.com/@eschatolocation) 🔗 - this model is a large boi, and it shows. even tho it is trained to do lo-fi, its ability to run with your melodies and not ruin them is unparalleled among the fine-tunes so far.



## thepatch/PhonkV2

thepatch/PhonkV2 was trained by MJ BERSABEph. there are multiple versions in the discord.



## foureyednymph/musicgen-sza-sos-small

foureyednymph/musicgen-sza-sos-small was just trained by foureyednymph. We're all about to find out if it does continuations well.

"""

# Build the Gradio interface: a header, two collapsible info sections, and
# a two-column layout (drum generation on the left, fine-tune continuation
# on the right). The event wiring at the bottom connects buttons to the
# handler functions defined above.
with gr.Blocks() as iface:
    gr.Markdown("# the-micro-slot-machine")
    gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
    gr.Markdown("this is an even weirder slot machine than the other one. on the left, you get to generate some state of the art lo-fi jungle drums at incredible speed thanks to aaron's new class of model, and if you want you can have it continue its own output. Then, you can either press the generate_music button to use the first 5 seconds as a prompt, or you can re-upload the audio into the continue_music section to have a fine-tune continue from the end of the jungle drum output, however long and insane it is. think of this as a very weird relay race and you're winning.")

    # Collapsed by default: background on the models and the community.
    with gr.Accordion("more info", open=False):
        gr.Markdown(musicgen_micro_blurb)
        gr.Markdown(musicgen_blurb)
        gr.Markdown(finetunes_blurb)

    # Collapsed by default: per-model notes for the dropdown choices below.
    with gr.Accordion("fine-tunes info", open=False):
        gr.Markdown(fine_tunes_info)

    with gr.Row():
        # Left column: drum sample generation and self-continuation.
        with gr.Column():
            generate_button = gr.Button("Generate Drum Sample")
            # type="filepath": handlers receive and return file paths.
            drum_audio = gr.Audio(label="Generated Drum Sample", type="filepath", interactive=True, show_download_button=True)
            continue_drum_sample_button = gr.Button("Continue Drum Sample")
            

        # Right column: settings and outputs for the fine-tuned models.
        with gr.Column():
            prompt_duration = gr.Dropdown(label="Prompt Duration (seconds)", choices=list(range(1, 11)), value=5)
            output_duration = gr.Slider(label="Output Duration (seconds)", minimum=10, maximum=30, step=1, value=20)
            # Each choice is "<repo_id> (<size>)"; the handlers keep only the
            # text before the first space when loading the model.
            musicgen_model = gr.Dropdown(label="MusicGen Model", choices=[
                "thepatch/vanya_ai_dnb_0.1 (small)",
                "thepatch/budots_remix (small)",
                "thepatch/PhonkV2 (small)",
                "thepatch/bleeps-medium (medium)",
                "thepatch/hoenn_lofi (large)",
                "foureyednymph/musicgen-sza-sos-small (small)"
            ], value="thepatch/vanya_ai_dnb_0.1 (small)")
            generate_music_button = gr.Button("Generate Music")
            output_audio = gr.Audio(label="Generated Music", type="filepath")
            continue_button = gr.Button("Continue Generating Music")
            continue_output_audio = gr.Audio(label="Continued Music Output", type="filepath")

    # Connecting the components
    # The drum clip feeds itself when continued (same component as input and
    # output) and is also the prompt source for generate_music.
    generate_button.click(generate_drum_sample, outputs=[drum_audio])
    continue_drum_sample_button.click(continue_drum_sample, inputs=[drum_audio], outputs=[drum_audio])
    generate_music_button.click(generate_music, inputs=[drum_audio, prompt_duration, musicgen_model, output_duration], outputs=[output_audio])
    # continue_music takes the generate_music result as its input audio.
    continue_button.click(continue_music, inputs=[output_audio, prompt_duration, musicgen_model, output_duration], outputs=continue_output_audio)

# Start the app; this runs at import time of this module.
iface.launch()