Hjgugugjhuhjggg
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -5,60 +5,70 @@ from TTS.api import TTS
|
|
5 |
import librosa
|
6 |
import numpy as np
|
7 |
import os
|
8 |
-
import spaces
|
9 |
|
|
|
10 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
11 |
|
|
|
12 |
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
|
13 |
model_musicgen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
|
|
|
|
|
14 |
model_tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False).to("cpu")
|
15 |
|
16 |
-
@spaces.GPU()
|
17 |
def generate_music(text_prompt, audio_file=None, tts_text=None):
|
|
|
18 |
inputs = processor(text=text_prompt, padding=True, return_tensors="pt")
|
|
|
|
|
19 |
max_duration_sec = 600
|
20 |
min_duration_sec = 30
|
21 |
max_new_tokens = int(max_duration_sec * model_musicgen.config.min_length)
|
22 |
min_new_tokens = int(min_duration_sec * model_musicgen.config.min_length)
|
23 |
|
24 |
-
|
|
|
25 |
music = audio_values.cpu().numpy()[0]
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
31 |
|
32 |
-
|
33 |
-
|
|
|
34 |
|
35 |
-
|
36 |
music_np = librosa.util.buf_to_float(music, n_bytes=2)
|
37 |
|
38 |
-
#
|
39 |
if len(music_np) > len(cloned_audio):
|
40 |
cloned_audio = np.pad(cloned_audio, (0, len(music_np) - len(cloned_audio)))
|
41 |
else:
|
42 |
music_np = np.pad(music_np, (0, len(cloned_audio) - len(music_np)))
|
43 |
|
|
|
44 |
combined_audio = music_np + cloned_audio
|
45 |
-
|
46 |
-
#
|
47 |
combined_audio = combined_audio / np.max(np.abs(combined_audio))
|
48 |
combined_audio_int16 = (combined_audio * 32767).astype(np.int16)
|
49 |
|
50 |
return (44100, combined_audio_int16)
|
51 |
except Exception as e:
|
52 |
-
print(f"Error
|
53 |
return (44100, music)
|
54 |
|
55 |
return (44100, music)
|
56 |
|
|
|
57 |
iface = gr.Interface(
|
58 |
fn=generate_music,
|
59 |
inputs=[
|
60 |
gr.Textbox(label="Descripción de la música"),
|
61 |
-
gr.Audio(type="filepath", label="Subir audio de voz"),
|
62 |
gr.Textbox(label="Texto para clonar la voz (Opcional)"),
|
63 |
],
|
64 |
outputs=gr.Audio(label="Música generada", type="numpy"),
|
@@ -66,4 +76,5 @@ iface = gr.Interface(
|
|
66 |
description="Introduce una descripción de la música que deseas generar y opcionalmente un audio de voz para clonar con texto.",
|
67 |
)
|
68 |
|
|
|
69 |
iface.launch()
|
|
|
5 |
import librosa
|
6 |
import numpy as np
|
7 |
import os
|
|
|
8 |
|
9 |
+
# Establecer variable de entorno para Coqui TTS
|
10 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
11 |
|
12 |
+
# Cargar el procesador y el modelo de MusicGen
|
13 |
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
|
14 |
model_musicgen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
|
15 |
+
|
16 |
+
# Cargar el modelo de TTS
|
17 |
model_tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False).to("cpu")
|
18 |
|
|
|
19 |
def generate_music(text_prompt, audio_file=None, tts_text=None):
|
20 |
+
# Procesar la descripción de la música
|
21 |
inputs = processor(text=text_prompt, padding=True, return_tensors="pt")
|
22 |
+
|
23 |
+
# Parámetros de duración máxima y mínima para la música generada
|
24 |
max_duration_sec = 600
|
25 |
min_duration_sec = 30
|
26 |
max_new_tokens = int(max_duration_sec * model_musicgen.config.min_length)
|
27 |
min_new_tokens = int(min_duration_sec * model_musicgen.config.min_length)
|
28 |
|
29 |
+
# Generar música
|
30 |
+
audio_values = model_musicgen.generate(**inputs)
|
31 |
music = audio_values.cpu().numpy()[0]
|
32 |
|
33 |
+
# Si se proporciona un archivo de audio y texto para clonar la voz
|
34 |
+
if audio_file and tts_text:
|
35 |
+
try:
|
36 |
+
# Generar la voz clonada usando TTS
|
37 |
+
cloned_audio = model_tts.tts_with_vc(tts_text, speaker_wav=audio_file, language="es")
|
38 |
|
39 |
+
# Asegurarse de que la tasa de muestreo sea la correcta (44.1kHz)
|
40 |
+
if model_tts.synthesizer.output_sample_rate != 44100:
|
41 |
+
cloned_audio, _ = librosa.resample(cloned_audio, model_tts.synthesizer.output_sample_rate, 44100)
|
42 |
|
43 |
+
# Convertir música generada en un array de floats
|
44 |
music_np = librosa.util.buf_to_float(music, n_bytes=2)
|
45 |
|
46 |
+
# Ajustar las longitudes de los arrays (rellenar el más corto)
|
47 |
if len(music_np) > len(cloned_audio):
|
48 |
cloned_audio = np.pad(cloned_audio, (0, len(music_np) - len(cloned_audio)))
|
49 |
else:
|
50 |
music_np = np.pad(music_np, (0, len(cloned_audio) - len(music_np)))
|
51 |
|
52 |
+
# Combinar la música y el audio clonado
|
53 |
combined_audio = music_np + cloned_audio
|
54 |
+
|
55 |
+
# Normalizar y convertir a formato int16
|
56 |
combined_audio = combined_audio / np.max(np.abs(combined_audio))
|
57 |
combined_audio_int16 = (combined_audio * 32767).astype(np.int16)
|
58 |
|
59 |
return (44100, combined_audio_int16)
|
60 |
except Exception as e:
|
61 |
+
print(f"Error combinando audio: {e}")
|
62 |
return (44100, music)
|
63 |
|
64 |
return (44100, music)
|
65 |
|
66 |
+
# Crear la interfaz Gradio
|
67 |
iface = gr.Interface(
|
68 |
fn=generate_music,
|
69 |
inputs=[
|
70 |
gr.Textbox(label="Descripción de la música"),
|
71 |
+
gr.Audio(type="filepath", label="Subir audio de voz (Opcional)"),
|
72 |
gr.Textbox(label="Texto para clonar la voz (Opcional)"),
|
73 |
],
|
74 |
outputs=gr.Audio(label="Música generada", type="numpy"),
|
|
|
76 |
description="Introduce una descripción de la música que deseas generar y opcionalmente un audio de voz para clonar con texto.",
|
77 |
)
|
78 |
|
79 |
+
# Lanzar la interfaz
|
80 |
iface.launch()
|