Hjgugugjhuhjggg committed on
Commit
d59877f
·
verified ·
1 Parent(s): ccf19d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -41
app.py CHANGED
@@ -1,42 +1,57 @@
1
- import gradio as gr
2
- import random
3
  import torch
4
- from audiocraft.models import MusicGen
5
- from audiocraft.data.audio import audio_write
6
- import spaces
7
-
8
# Load the small MusicGen checkpoint once at import time so every request
# reuses the same model instance.
model = MusicGen.get_pretrained("small")


@spaces.GPU()
def infer(duration, descriptions):
    """Generate music for a comma-separated list of text descriptions.

    Args:
        duration: Length of the generated clip in seconds.
        descriptions: One or more prompts separated by ", ". One clip is
            generated and written to disk per prompt.

    Returns:
        Path (as a string) of the WAV file generated for the first prompt.

    Raises:
        gr.Error: If ``descriptions`` is longer than 8192 characters.
    """
    if len(descriptions) > 8192:
        # The output component is gr.Audio(type="filepath"): returning the
        # message would be interpreted as a file path. gr.Error surfaces the
        # text to the user instead.
        raise gr.Error("Error: La descripción no puede exceder los 8192 caracteres.")

    # A fresh random seed per call so repeated prompts give different music.
    seed = random.randint(0, 10000)
    torch.manual_seed(seed)

    model.set_generation_params(duration=duration, temperature=0.5)
    wav = model.generate(descriptions.split(", "))

    output_files = []
    for idx, one_wav in enumerate(wav):
        # audio_write() expects a stem WITHOUT extension and appends ".wav"
        # itself; it returns the path it actually wrote, which is the only
        # reliable value to hand back to Gradio.
        written_path = audio_write(
            f"output_{idx}",
            one_wav.cpu(),
            model.sample_rate,
            strategy="loudness",
        )
        output_files.append(str(written_path))

    # The UI has a single audio player, so only the first clip is returned.
    return output_files[0]
28
-
29
# Gradio front end: a duration slider and a prompt box feed `infer`, and the
# resulting file path is played back in an audio widget.
with gr.Blocks() as demo:
    gr.Markdown("# Generador de Música con MusicGen")

    duration_input = gr.Slider(
        label="Duración de la canción (segundos)",
        minimum=1,
        maximum=600,
        value=8,
    )
    description_input = gr.Textbox(
        label="Descripción de la música",
        placeholder="Ejemplo: happy rock, energetic EDM",
    )
    generate_button = gr.Button("Generar Música")
    output_audio = gr.Audio(type="filepath", label="Escuchar Música")

    # Wire the button: (duration, description) -> infer -> audio player.
    generate_button.click(
        infer,
        [duration_input, description_input],
        output_audio,
    )

demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
+ import gradio as gr
3
+ from transformers import AutoProcessor, MusicgenForConditionalGeneration
4
+ from TTS.api import TTS
5
+ import librosa
6
+ import numpy as np
7
+
8
# Checkpoint identifiers, named once so the processor and the model are
# guaranteed to be loaded from the same MusicGen release.
MUSICGEN_CHECKPOINT = "facebook/musicgen-small"
XTTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# All three objects are created at import time and shared by every request.
processor = AutoProcessor.from_pretrained(MUSICGEN_CHECKPOINT)
model_musicgen = MusicgenForConditionalGeneration.from_pretrained(MUSICGEN_CHECKPOINT)
model_tts = TTS(XTTS_MODEL_NAME).to("cpu")
11
+
12
# MusicGen can produce at most ~30 s of audio per call (1503 decoder tokens);
# asking for more does not yield usable output.
MUSICGEN_MAX_NEW_TOKENS = 1503


def _pad_to_length(track, length):
    """Return *track* zero-padded at the end so it holds *length* samples."""
    missing = length - len(track)
    if missing <= 0:
        return track
    return np.concatenate([track, np.zeros(missing, dtype=track.dtype)])


def generate_music(text_prompt, audio_prompt=None, tts_text=None):
    """Generate music from a prompt, optionally overlaid with a cloned voice.

    Args:
        text_prompt: Description of the music to generate.
        audio_prompt: Optional path to a reference recording of the voice to
            clone.
        tts_text: Optional text to be spoken with the cloned voice. The voice
            is only added when both this and ``audio_prompt`` are non-empty.

    Returns:
        A ``(sample_rate, samples)`` tuple in the form expected by
        ``gr.Audio(type="numpy")``; ``samples`` is a 1-D float array.
    """
    # Take the rates from the model's audio codec instead of hard-coding
    # them: the token budget depends on the frame rate, and the player needs
    # the true sampling rate or the music plays at the wrong speed.
    codec_config = model_musicgen.config.audio_encoder
    sample_rate = codec_config.sampling_rate
    frame_rate = codec_config.frame_rate

    max_duration_sec = 600
    min_duration_sec = 30
    max_new_tokens = min(int(max_duration_sec * frame_rate), MUSICGEN_MAX_NEW_TOKENS)
    min_new_tokens = min(int(min_duration_sec * frame_rate), max_new_tokens)

    inputs = processor(text=text_prompt, padding=True, return_tensors="pt")
    audio_values = model_musicgen.generate(
        **inputs.to("cpu"),
        max_new_tokens=max_new_tokens,
        min_new_tokens=min_new_tokens,
    )
    # generate() returns (batch, channels, samples); keep the single mono
    # waveform so Gradio does not read the channel axis as the time axis.
    music = audio_values[0, 0].cpu().numpy()

    # An untouched Gradio textbox yields "" rather than None, so test for
    # real content instead of `is not None`.
    wants_voice = bool(audio_prompt) and bool(tts_text and tts_text.strip())
    if not wants_voice:
        return (sample_rate, music)

    # XTTS clones the voice itself when given `speaker_wav`; no separate
    # voice-conversion pass is needed. tts() returns a list of float samples.
    voice = np.asarray(
        model_tts.tts(text=tts_text, speaker_wav=audio_prompt, language="es"),
        dtype=np.float32,
    )

    # Bring the speech to the music's sampling rate before mixing.
    # librosa.resample returns just the resampled array.
    voice_rate = model_tts.synthesizer.output_sample_rate
    if voice_rate != sample_rate:
        voice = librosa.resample(voice, orig_sr=voice_rate, target_sr=sample_rate)

    # Pad the shorter track with silence so both can be summed sample-wise.
    mix_length = max(len(music), len(voice))
    combined_audio = _pad_to_length(music, mix_length) + _pad_to_length(voice, mix_length)
    return (sample_rate, combined_audio)
44
+
45
# Gradio front end for generate_music. The voice sample and the text to speak
# are optional: generate_music skips the voice overlay when either is empty.
# The former `optional=True` arguments were dropped: Gradio 3 ignores the
# flag with a deprecation warning and Gradio 4 rejects it as an unexpected
# keyword, so removing it is safe on both.
# NOTE(review): `source="microphone"` is the Gradio 3 spelling; Gradio 4
# expects `sources=["microphone"]` - confirm against the pinned version.
iface = gr.Interface(
    fn=generate_music,
    inputs=[
        gr.Textbox(label="Descripción de la música"),
        gr.Audio(source="microphone", type="filepath", label="Audio de voz (Opcional)"),
        gr.Textbox(label="Texto para clonar la voz (Opcional)"),
    ],
    outputs=gr.Audio(label="Música generada", type="numpy"),
    title="Generador de Música con MusicGen y XTTS",
    description="Introduce una descripción de la música que deseas generar y opcionalmente un audio de voz para clonar con texto.",
)

iface.launch()