Hjgugugjhuhjggg
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,7 @@ from TTS.api import TTS
|
|
5 |
import librosa
|
6 |
import numpy as np
|
7 |
import os
|
|
|
8 |
|
9 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
10 |
|
@@ -12,16 +13,18 @@ processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
|
|
12 |
model_musicgen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
|
13 |
model_tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False).to("cpu")
|
14 |
|
15 |
-
|
|
|
16 |
inputs = processor(text=text_prompt, padding=True, return_tensors="pt")
|
17 |
max_duration_sec = 600
|
18 |
min_duration_sec = 30
|
19 |
max_new_tokens = int(max_duration_sec * model_musicgen.config.min_length)
|
20 |
min_new_tokens = int(min_duration_sec * model_musicgen.config.min_length)
|
|
|
21 |
audio_values = model_musicgen.generate(**inputs.to("cpu"))
|
22 |
music = audio_values.cpu().numpy()[0]
|
23 |
|
24 |
-
audio_prompt = audio_file
|
25 |
|
26 |
if audio_prompt and tts_text:
|
27 |
cloned_audio = model_tts.tts_with_vc(tts_text, speaker_wav=audio_prompt, language="es")
|
@@ -32,17 +35,21 @@ def generate_music(text_prompt, audio_file=None, audio_mic=None, tts_text=None):
|
|
32 |
try:
|
33 |
music_np = librosa.util.buf_to_float(music, n_bytes=2)
|
34 |
|
|
|
35 |
if len(music_np) > len(cloned_audio):
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
padding = np.zeros(len(cloned_audio) - len(music_np))
|
40 |
-
music_np = np.concatenate([music_np, padding])
|
41 |
|
42 |
combined_audio = music_np + cloned_audio
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
46 |
return (44100, music)
|
47 |
|
48 |
return (44100, music)
|
@@ -59,4 +66,4 @@ iface = gr.Interface(
|
|
59 |
description="Introduce una descripción de la música que deseas generar y opcionalmente un audio de voz para clonar con texto.",
|
60 |
)
|
61 |
|
62 |
-
iface.launch()
|
|
|
5 |
import librosa
|
6 |
import numpy as np
|
7 |
import os
|
8 |
+
import spaces
|
9 |
|
10 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
11 |
|
|
|
13 |
model_musicgen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
|
14 |
model_tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False).to("cpu")
|
15 |
|
16 |
+
@spaces.GPU()
|
17 |
+
def generate_music(text_prompt, audio_file=None, tts_text=None):
|
18 |
inputs = processor(text=text_prompt, padding=True, return_tensors="pt")
|
19 |
max_duration_sec = 600
|
20 |
min_duration_sec = 30
|
21 |
max_new_tokens = int(max_duration_sec * model_musicgen.config.min_length)
|
22 |
min_new_tokens = int(min_duration_sec * model_musicgen.config.min_length)
|
23 |
+
|
24 |
audio_values = model_musicgen.generate(**inputs.to("cpu"))
|
25 |
music = audio_values.cpu().numpy()[0]
|
26 |
|
27 |
+
audio_prompt = audio_file
|
28 |
|
29 |
if audio_prompt and tts_text:
|
30 |
cloned_audio = model_tts.tts_with_vc(tts_text, speaker_wav=audio_prompt, language="es")
|
|
|
35 |
try:
|
36 |
music_np = librosa.util.buf_to_float(music, n_bytes=2)
|
37 |
|
38 |
+
# Pad the shorter array
|
39 |
if len(music_np) > len(cloned_audio):
|
40 |
+
cloned_audio = np.pad(cloned_audio, (0, len(music_np) - len(cloned_audio)))
|
41 |
+
else:
|
42 |
+
music_np = np.pad(music_np, (0, len(cloned_audio) - len(music_np)))
|
|
|
|
|
43 |
|
44 |
combined_audio = music_np + cloned_audio
|
45 |
+
|
46 |
+
# Normalize and convert to int16
|
47 |
+
combined_audio = combined_audio / np.max(np.abs(combined_audio))
|
48 |
+
combined_audio_int16 = (combined_audio * 32767).astype(np.int16)
|
49 |
+
|
50 |
+
return (44100, combined_audio_int16)
|
51 |
+
except Exception as e:
|
52 |
+
print(f"Error combining audio: {e}")
|
53 |
return (44100, music)
|
54 |
|
55 |
return (44100, music)
|
|
|
66 |
description="Introduce una descripción de la música que deseas generar y opcionalmente un audio de voz para clonar con texto.",
|
67 |
)
|
68 |
|
69 |
+
iface.launch()
|