music / app.py
Hjgugugjhuhjggg's picture
Update app.py
d59877f verified
raw
history blame
2.66 kB
import torch
import gradio as gr
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from TTS.api import TTS
import librosa
import numpy as np
# Model identifiers are named once so the processor and the model can never
# drift apart onto different checkpoints.
MUSICGEN_CHECKPOINT = "facebook/musicgen-small"
XTTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# Everything below is loaded at import time and shared by every request.
# The text processor turns prompts into MusicGen inputs.
processor = AutoProcessor.from_pretrained(MUSICGEN_CHECKPOINT)
# The text-to-music model.
model_musicgen = MusicgenForConditionalGeneration.from_pretrained(
    MUSICGEN_CHECKPOINT
)
# The multilingual TTS model, explicitly placed on the CPU.
model_tts = TTS(XTTS_MODEL_NAME).to("cpu")
# Upper bound on a single generation. The Hugging Face MusicGen documentation
# states the model can generate at most about 30 seconds of audio per call.
MUSICGEN_MAX_DURATION_SEC = 30.0


def _mix_tracks(music, voice):
    """Overlay *voice* on *music*, zero-padding whichever track is shorter.

    Both arguments are 1-D float waveforms that already share a sampling
    rate. The sum is scaled down only when its peak exceeds 1.0, so mixes
    that are already within range are returned unchanged.
    """
    length = max(len(music), len(voice))
    mixed = np.zeros(length, dtype=np.float32)
    mixed[: len(music)] += music
    mixed[: len(voice)] += voice
    peak = float(np.max(np.abs(mixed))) if length else 0.0
    if peak > 1.0:
        mixed /= peak
    return mixed


def generate_music(text_prompt, audio_prompt=None, tts_text=None, duration_sec=30.0):
    """Generate music from a text prompt, optionally mixed with a cloned voice.

    Args:
        text_prompt: Description of the music to generate.
        audio_prompt: Optional path to a reference voice recording. Used as
            the speaker sample for voice cloning.
        tts_text: Optional text to be spoken (in Spanish) with the cloned
            voice. Ignored when empty or when ``audio_prompt`` is missing.
        duration_sec: Requested music length in seconds. Clamped to the range
            ``[1, MUSICGEN_MAX_DURATION_SEC]``.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, the format expected by a
        ``gr.Audio(type="numpy")`` output. ``waveform`` is a 1-D float32
        array at the MusicGen audio encoder's sampling rate.
    """
    audio_cfg = model_musicgen.config.audio_encoder
    sampling_rate = int(audio_cfg.sampling_rate)

    # The decoder emits `frame_rate` tokens per second of audio, so the token
    # budget is simply duration * frame_rate.
    duration_sec = min(max(float(duration_sec), 1.0), MUSICGEN_MAX_DURATION_SEC)
    max_new_tokens = int(duration_sec * audio_cfg.frame_rate)

    inputs = processor(text=text_prompt, padding=True, return_tensors="pt")
    audio_values = model_musicgen.generate(
        **inputs.to("cpu"), max_new_tokens=max_new_tokens
    )
    # Output is (batch, channels, samples); take the single mono waveform.
    music = audio_values[0, 0].cpu().numpy().astype(np.float32)

    # Gradio sends "" (not None) for an empty textbox, so test truthiness.
    wants_voice = bool(audio_prompt) and bool(tts_text and tts_text.strip())
    if not wants_voice:
        return (sampling_rate, music)

    # XTTS clones the reference speaker natively when given `speaker_wav`.
    voice = np.asarray(
        model_tts.tts(text=tts_text, speaker_wav=audio_prompt, language="es"),
        dtype=np.float32,
    ).ravel()

    # Bring the voice to the music's sampling rate before mixing; summing
    # waveforms recorded at different rates would shift pitch and tempo.
    voice_rate = int(model_tts.synthesizer.output_sample_rate)
    if voice_rate != sampling_rate:
        voice = librosa.resample(voice, orig_sr=voice_rate, target_sr=sampling_rate)

    return (sampling_rate, _mix_tracks(music, voice))
# Web UI: one required text prompt plus two optional inputs (a microphone
# recording and the text to speak with the cloned voice). Inputs left blank
# are still passed to `generate_music`, which decides whether to add a voice.
# The deprecated no-op `optional=True` keyword is intentionally not passed:
# it has no effect in Gradio 3.x and is rejected by later releases.
iface = gr.Interface(
    fn=generate_music,
    inputs=[
        gr.Textbox(label="Descripción de la música"),
        # NOTE(review): `source=` is the Gradio 3.x spelling; newer releases
        # expect `sources=["microphone"]` — confirm the pinned Gradio version.
        gr.Audio(
            source="microphone",
            type="filepath",
            label="Audio de voz (Opcional)",
        ),
        gr.Textbox(label="Texto para clonar la voz (Opcional)"),
    ],
    outputs=gr.Audio(label="Música generada", type="numpy"),
    title="Generador de Música con MusicGen y XTTS",
    description="Introduce una descripción de la música que deseas generar y opcionalmente un audio de voz para clonar con texto.",
)
iface.launch()