|
import torch |
|
import gradio as gr |
|
from transformers import AutoProcessor, MusicgenForConditionalGeneration |
|
from TTS.api import TTS |
|
import librosa |
|
import numpy as np |
|
import os |
|
import spaces |
|
|
|
# Must be set before the XTTS model is instantiated below; presumably this
# pre-accepts Coqui's terms-of-service prompt so loading stays
# non-interactive -- confirm against the Coqui TTS model manager.
os.environ["COQUI_TOS_AGREED"] = "1"

# The text processor and the generation model are loaded from the same
# MusicGen checkpoint.
_MUSICGEN_CHECKPOINT = "facebook/musicgen-small"
processor = AutoProcessor.from_pretrained(_MUSICGEN_CHECKPOINT)
model_musicgen = MusicgenForConditionalGeneration.from_pretrained(_MUSICGEN_CHECKPOINT)

# Multilingual XTTS v2 voice model, explicitly placed on the CPU.
_XTTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
model_tts = TTS(model_name=_XTTS_MODEL_NAME, progress_bar=False).to("cpu")
|
|
|
def _mix_voice_over_music(music, voice):
    """Overlay ``voice`` on ``music`` and return a peak-normalised int16 mix.

    Both inputs are flattened to mono float32. The shorter signal is treated
    as if it were zero-padded at the end to the length of the longer one.
    """
    music = np.asarray(music, dtype=np.float32).reshape(-1)
    voice = np.asarray(voice, dtype=np.float32).reshape(-1)

    mixed = np.zeros(max(len(music), len(voice)), dtype=np.float32)
    mixed[:len(music)] += music
    mixed[:len(voice)] += voice

    # Guard against an empty or all-silent mix: dividing by a zero peak would
    # fill the buffer with NaN/inf before the int16 conversion.
    peak = float(np.max(np.abs(mixed))) if mixed.size else 0.0
    if peak > 0.0:
        mixed /= peak
    return (mixed * 32767).astype(np.int16)


@spaces.GPU()
def generate_music(text_prompt, audio_file=None, tts_text=None):
    """Generate music from a text prompt, optionally with a cloned voice on top.

    Args:
        text_prompt: Description of the music, passed to the MusicGen processor.
        audio_file: Optional path to a reference voice recording. Used as the
            ``speaker_wav`` for voice cloning.
        tts_text: Optional text to be spoken with the cloned voice.

    Returns:
        A ``(sample_rate, samples)`` tuple. ``samples`` is the mono float
        waveform produced by MusicGen when no voice is requested (or mixing
        fails), otherwise a peak-normalised int16 mix of music and voice.
        ``sample_rate`` is read from the MusicGen audio encoder config.

    Notes:
        Generation length is left at the model's default. An earlier revision
        computed ``max_new_tokens``/``min_new_tokens`` from
        ``config.min_length`` but never passed them to ``generate``; those
        unused values were dropped.
    """
    inputs = processor(text=text_prompt, padding=True, return_tensors="pt")
    # Follow whatever device the model currently lives on instead of
    # hard-coding "cpu".
    audio_values = model_musicgen.generate(**inputs.to(model_musicgen.device))

    # generate() yields (batch, channels, samples); keep the first item's first
    # channel so the result is a 1-D waveform.
    music = audio_values[0, 0].cpu().numpy()
    # The audio must be labelled with the rate the model actually produces,
    # otherwise playback speed and pitch are wrong.
    music_sr = model_musicgen.config.audio_encoder.sampling_rate

    if not (audio_file and tts_text):
        return (music_sr, music)

    # Spoken text is synthesised in Spanish with the uploaded reference voice.
    cloned_audio = np.asarray(
        model_tts.tts_with_vc(tts_text, speaker_wav=audio_file, language="es"),
        dtype=np.float32,
    )

    # NOTE(review): assumes tts_with_vc output is at
    # synthesizer.output_sample_rate -- confirm; the voice-conversion stage
    # may emit a different rate.
    voice_sr = model_tts.synthesizer.output_sample_rate
    if voice_sr != music_sr:
        # resample() returns a single array and takes the rates by keyword.
        cloned_audio = librosa.resample(cloned_audio, orig_sr=voice_sr, target_sr=music_sr)

    try:
        return (music_sr, _mix_voice_over_music(music, cloned_audio))
    except Exception as e:
        # Best-effort: if mixing fails, still hand back the music on its own.
        print(f"Error combining audio: {e}")
        return (music_sr, music)
|
|
|
# Input widgets, listed in the positional order of generate_music's
# parameters: text prompt, voice reference file, text to speak.
_prompt_box = gr.Textbox(label="Descripción de la música")
_voice_upload = gr.Audio(type="filepath", label="Subir audio de voz")
_speech_box = gr.Textbox(label="Texto para clonar la voz (Opcional)")

# Output widget configured with type="numpy" to match the
# (sample_rate, samples) tuple returned by generate_music.
_result_player = gr.Audio(label="Música generada", type="numpy")

iface = gr.Interface(
    fn=generate_music,
    inputs=[_prompt_box, _voice_upload, _speech_box],
    outputs=_result_player,
    title="Generador de Música con MusicGen y XTTS",
    description="Introduce una descripción de la música que deseas generar y opcionalmente un audio de voz para clonar con texto.",
)

# Start the web UI as soon as the module is executed.
iface.launch()
|
|