music / app.py
Hjgugugjhuhjggg's picture
Update app.py
6cc672e verified
raw
history blame
2.57 kB
import torch
import gradio as gr
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from TTS.api import TTS
import librosa
import numpy as np
import os
# Module-level setup: everything below runs once at import time.

# Set the Coqui terms-of-service flag up front so the XTTS load below is
# not blocked by an interactive prompt (presumably what the flag is for).
os.environ["COQUI_TOS_AGREED"] = "1"

# Model identifiers, named once so the processor and the model cannot drift apart.
_MUSICGEN_CHECKPOINT = "facebook/musicgen-small"
_XTTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# Text-to-music: the processor prepares the prompt, the model generates audio.
processor = AutoProcessor.from_pretrained(_MUSICGEN_CHECKPOINT)
model_musicgen = MusicgenForConditionalGeneration.from_pretrained(_MUSICGEN_CHECKPOINT)

# Text-to-speech model, placed explicitly on the CPU.
model_tts = TTS(model_name=_XTTS_MODEL_NAME, progress_bar=False)
model_tts = model_tts.to("cpu")
def _mix_tracks(music, voice):
    """Sum two mono tracks, zero-padding the shorter one to the longer one's length."""
    mixed = np.zeros(max(len(music), len(voice)), dtype=np.float32)
    mixed[: len(music)] += music
    mixed[: len(voice)] += voice
    return mixed


def generate_music(text_prompt, audio_file=None, audio_mic=None, tts_text=None):
    """Generate music from a text prompt, optionally overlaid with a cloned voice.

    Args:
        text_prompt: Description of the music to generate.
        audio_file: Path to an uploaded reference recording of the voice to
            clone, or None.
        audio_mic: Path to a microphone recording of the voice to clone, or
            None. Only used when ``audio_file`` is empty.
        tts_text: Text to be spoken with the cloned voice, or None.

    Returns:
        A ``(sample_rate, samples)`` tuple in the form expected by a
        ``gr.Audio(type="numpy")`` output. ``samples`` is a 1-D float array.
        It holds the music alone unless both a reference recording and
        ``tts_text`` are given, in which case the spoken text is mixed in.
    """
    inputs = processor(text=text_prompt, padding=True, return_tensors="pt")
    audio_values = model_musicgen.generate(**inputs.to("cpu"))

    # Per the Transformers MusicGen docs, generate() returns a tensor shaped
    # (batch, channels, samples). Take the first item and first channel so the
    # result is 1-D; a (1, N) array would be read as one sample with N channels.
    music = audio_values[0, 0].cpu().numpy()

    # Report the rate the model actually produces. Labelling the output with
    # any other rate makes it play back at the wrong speed and pitch.
    music_rate = model_musicgen.config.audio_encoder.sampling_rate

    audio_prompt = audio_file or audio_mic
    if not (audio_prompt and tts_text):
        return (music_rate, music)

    # XTTS clones a voice directly from a reference recording via tts();
    # tts_with_vc() is the separate voice-conversion path for other models.
    voice = np.asarray(
        model_tts.tts(text=tts_text, speaker_wav=audio_prompt, language="es"),
        dtype=np.float32,
    )

    # Bring the speech to the music's rate before mixing. resample() returns
    # a single array, and current librosa accepts the rates only as keywords.
    voice_rate = model_tts.synthesizer.output_sample_rate
    if voice_rate != music_rate:
        voice = librosa.resample(voice, orig_sr=voice_rate, target_sr=music_rate)

    return (music_rate, _mix_tracks(music, voice))
def _generate_from_ui(text_prompt, audio_file, tts_text):
    """Route the three UI fields to generate_music by keyword.

    The interface exposes three inputs while generate_music takes four
    parameters. Passing the fields positionally would bind the text box
    meant for ``tts_text`` to ``audio_mic``, so the voice text would never
    reach the function.
    """
    return generate_music(text_prompt, audio_file=audio_file, tts_text=tts_text)


# Web UI: prompt text, optional reference voice recording, optional text to speak.
iface = gr.Interface(
    fn=_generate_from_ui,
    inputs=[
        gr.Textbox(label="Descripción de la música"),
        gr.Audio(type="filepath", label="Subir audio de voz"),
        gr.Textbox(label="Texto para clonar la voz (Opcional)"),
    ],
    outputs=gr.Audio(label="Música generada", type="numpy"),
    title="Generador de Música con MusicGen y XTTS",
    description="Introduce una descripción de la música que deseas generar y opcionalmente un audio de voz para clonar con texto.",
)
iface.launch()