File size: 2,642 Bytes
8add5a6
d59877f
 
 
 
 
79bbaad
3f2ba73
79bbaad
 
d59877f
 
 
79bbaad
d59877f
3f2ba73
 
d59877f
 
 
80bc806
 
3f2ba73
6cc672e
d59877f
 
3f2ba73
9d512be
 
d59877f
 
 
 
 
 
 
 
3f2ba73
d59877f
3f2ba73
 
 
d59877f
 
3f2ba73
 
 
 
 
 
 
 
d59877f
 
 
 
 
 
 
 
7cd71e2
6cff445
d59877f
 
 
 
 
 
3f2ba73
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import torch
import gradio as gr
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from TTS.api import TTS
import librosa
import numpy as np
import os
import spaces

# Coqui reads this variable to treat the model terms of service as accepted,
# so loading XTTS below does not stop at an interactive prompt.
os.environ.update({"COQUI_TOS_AGREED": "1"})

# Checkpoint identifiers used by the loaders below.
_MUSICGEN_CHECKPOINT = "facebook/musicgen-small"
_XTTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# Text-to-music: the processor prepares the prompt, the model generates audio.
processor = AutoProcessor.from_pretrained(_MUSICGEN_CHECKPOINT)
model_musicgen = MusicgenForConditionalGeneration.from_pretrained(_MUSICGEN_CHECKPOINT)

# Text-to-speech model used for the optional cloned-voice track; kept on CPU.
model_tts = TTS(model_name=_XTTS_MODEL_NAME, progress_bar=False)
model_tts = model_tts.to("cpu")

def _mix_voice_over_music(music, voice):
    """Overlay ``voice`` on ``music`` and return peak-normalized int16 samples.

    Both arguments are 1-D float arrays that already share one sample rate.
    The shorter signal is zero-padded at its end so the sum spans the longer
    one.
    """
    length = max(len(music), len(voice))
    music = np.pad(music, (0, length - len(music)))
    voice = np.pad(voice, (0, length - len(voice)))
    mixed = music + voice

    # A silent (or empty) mix has a zero peak; dividing by it would fill the
    # output with NaNs, so only rescale when there is signal.
    peak = np.max(np.abs(mixed)) if mixed.size else 0.0
    if peak > 0:
        mixed = mixed / peak
    return (mixed * 32767).astype(np.int16)


@spaces.GPU()
def generate_music(text_prompt, audio_file=None, tts_text=None):
    """Generate music from a text prompt, optionally mixed with a cloned voice.

    Args:
        text_prompt: Description of the music, passed to the MusicGen processor.
        audio_file: Optional path to a reference voice recording.
        tts_text: Optional text to be spoken (in Spanish) with the cloned voice.

    Returns:
        A ``(sample_rate, samples)`` tuple for a ``gr.Audio(type="numpy")``
        output. ``samples`` is a 1-D array: float music when no voice is
        mixed in (or when mixing fails), otherwise peak-normalized int16.
        ``sample_rate`` is the rate reported by the MusicGen audio encoder.
    """
    inputs = processor(text=text_prompt, padding=True, return_tensors="pt")
    audio_values = model_musicgen.generate(**inputs.to("cpu"))

    # generate() yields (batch, channels, samples). Take the first channel of
    # the first item so the result is 1-D; a (1, samples) array would be read
    # by Gradio as a single sample with many channels.
    music = audio_values[0, 0].cpu().numpy()
    # Report the rate the model actually produces instead of a hard-coded
    # 44100, which would play the clip at the wrong speed and pitch.
    music_sr = model_musicgen.config.audio_encoder.sampling_rate

    if not (audio_file and tts_text):
        return (music_sr, music)

    # NOTE(review): tts_with_vc goes through a separate voice-conversion
    # model; confirm that its output rate equals synthesizer.output_sample_rate
    # and that this call is supported for XTTS in the installed TTS version.
    cloned_audio = model_tts.tts_with_vc(tts_text, speaker_wav=audio_file, language="es")
    tts_sr = model_tts.synthesizer.output_sample_rate

    try:
        voice = np.asarray(cloned_audio, dtype=np.float32).reshape(-1)
        if tts_sr != music_sr:
            # resample() returns one array; rates are passed by keyword.
            voice = librosa.resample(voice, orig_sr=tts_sr, target_sr=music_sr)
        return (music_sr, _mix_voice_over_music(music, voice))
    except Exception as e:
        # Best effort: if the voice track cannot be mixed, still return the music.
        print(f"Error combining audio: {e}")
        return (music_sr, music)

# Input widgets, listed in the positional order of generate_music's parameters.
_prompt_input = gr.Textbox(label="Descripción de la música")
_voice_input = gr.Audio(label="Subir audio de voz", type="filepath")
_speech_text_input = gr.Textbox(label="Texto para clonar la voz (Opcional)")

# Output widget; the handler returns a (sample_rate, numpy array) pair.
_audio_output = gr.Audio(type="numpy", label="Música generada")

iface = gr.Interface(
    fn=generate_music,
    inputs=[_prompt_input, _voice_input, _speech_text_input],
    outputs=_audio_output,
    title="Generador de Música con MusicGen y XTTS",
    description="Introduce una descripción de la música que deseas generar y opcionalmente un audio de voz para clonar con texto.",
)

# Start the web UI.
iface.launch()