music / app.py
Hjgugugjhuhjggg's picture
Update app.py
3f2ba73 verified
raw
history blame
2.64 kB
import torch
import gradio as gr
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from TTS.api import TTS
import librosa
import numpy as np
import os
import spaces
# Set before any Coqui model is instantiated below; presumably this marks the
# Coqui terms of service as accepted so start-up is not blocked by a prompt.
os.environ.update(COQUI_TOS_AGREED="1")

_MUSICGEN_CHECKPOINT = "facebook/musicgen-small"
_XTTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# The text processor and the music model are loaded from the same checkpoint.
processor = AutoProcessor.from_pretrained(_MUSICGEN_CHECKPOINT)
model_musicgen = MusicgenForConditionalGeneration.from_pretrained(
    _MUSICGEN_CHECKPOINT
)

# Build the TTS model without a progress bar, then place it on the CPU.
_tts_unplaced = TTS(model_name=_XTTS_MODEL_NAME, progress_bar=False)
model_tts = _tts_unplaced.to("cpu")
def _as_mono(audio):
    """Return *audio* as a 1-D float32 waveform, averaging channels if present."""
    audio = np.asarray(audio, dtype=np.float32)
    if audio.ndim > 1:
        # The caller passes (channels, samples); collapse the channel axis.
        audio = audio.mean(axis=0)
    return audio


def _to_int16_pcm(audio):
    """Peak-normalise a float waveform and convert it to int16 PCM."""
    peak = float(np.max(np.abs(audio))) if audio.size else 0.0
    if peak > 0.0:
        # Skip the division for silent/empty audio to avoid NaNs.
        audio = audio / peak
    return (audio * 32767).astype(np.int16)


def _overlay_voice(music, voice, voice_sr, music_sr):
    """Resample *voice* to *music_sr* and sum it with *music*.

    The shorter of the two waveforms is zero-padded at the end so both have
    the same length before they are added.
    """
    voice = np.asarray(voice, dtype=np.float32)
    if voice_sr != music_sr:
        # Keyword arguments: recent librosa releases reject positional rates,
        # and the call returns a single array (not a tuple).
        voice = librosa.resample(voice, orig_sr=voice_sr, target_sr=music_sr)
    length = max(len(music), len(voice))
    music = np.pad(music, (0, length - len(music)))
    voice = np.pad(voice, (0, length - len(voice)))
    return music + voice


@spaces.GPU()
def generate_music(text_prompt, audio_file=None, tts_text=None):
    """Generate music from a text prompt, optionally mixed with a cloned voice.

    Args:
        text_prompt: Description of the music to generate.
        audio_file: Optional path to a reference voice recording. Used only
            when ``tts_text`` is also given.
        tts_text: Optional text to speak (in Spanish) with the cloned voice.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a 1-D int16
        array. ``sample_rate`` is the MusicGen audio encoder's sampling rate.
        If mixing the voice fails, the music alone is returned.
    """
    inputs = processor(text=text_prompt, padding=True, return_tensors="pt")
    # Generation length is left at the model's default generation settings.
    audio_values = model_musicgen.generate(**inputs.to("cpu"))

    # The waveform is produced at the audio encoder's rate; reporting any
    # other rate to the player would shift pitch and speed.
    sample_rate = model_musicgen.config.audio_encoder.sampling_rate

    # First batch item is (channels, samples); the UI needs a 1-D signal.
    music = _as_mono(audio_values.cpu().numpy()[0])
    music_pcm = _to_int16_pcm(music)

    wants_voice = bool(audio_file) and bool(tts_text and tts_text.strip())
    if not wants_voice:
        return (sample_rate, music_pcm)

    # XTTS clones the reference speaker directly from `speaker_wav`, so the
    # separate voice-conversion pipeline (`tts_with_vc`) is not needed.
    voice = model_tts.tts(text=tts_text, speaker_wav=audio_file, language="es")

    try:
        mixed = _overlay_voice(
            music,
            voice,
            model_tts.synthesizer.output_sample_rate,
            sample_rate,
        )
        return (sample_rate, _to_int16_pcm(mixed))
    except Exception as e:
        # Best effort: fall back to the music alone if mixing fails.
        print(f"Error combining audio: {e}")
        return (sample_rate, music_pcm)
# Input widgets, in the positional order expected by generate_music.
_prompt_box = gr.Textbox(label="Descripción de la música")
_voice_upload = gr.Audio(type="filepath", label="Subir audio de voz")
_speech_box = gr.Textbox(label="Texto para clonar la voz (Opcional)")

# Output player fed with the (sample_rate, samples) tuple from generate_music.
_result_player = gr.Audio(label="Música generada", type="numpy")

iface = gr.Interface(
    fn=generate_music,
    inputs=[_prompt_box, _voice_upload, _speech_box],
    outputs=_result_player,
    title="Generador de Música con MusicGen y XTTS",
    description="Introduce una descripción de la música que deseas generar y opcionalmente un audio de voz para clonar con texto.",
)

# Start the web UI as soon as the module is executed.
iface.launch()