Hjgugugjhuhjggg committed on
Commit
3f2ba73
·
verified ·
1 Parent(s): 6cc672e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -11
app.py CHANGED
@@ -5,6 +5,7 @@ from TTS.api import TTS
5
  import librosa
6
  import numpy as np
7
  import os
 
8
 
9
  os.environ["COQUI_TOS_AGREED"] = "1"
10
 
@@ -12,16 +13,18 @@ processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
12
  model_musicgen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
13
  model_tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False).to("cpu")
14
 
15
- def generate_music(text_prompt, audio_file=None, audio_mic=None, tts_text=None):
 
16
  inputs = processor(text=text_prompt, padding=True, return_tensors="pt")
17
  max_duration_sec = 600
18
  min_duration_sec = 30
19
  max_new_tokens = int(max_duration_sec * model_musicgen.config.min_length)
20
  min_new_tokens = int(min_duration_sec * model_musicgen.config.min_length)
 
21
  audio_values = model_musicgen.generate(**inputs.to("cpu"))
22
  music = audio_values.cpu().numpy()[0]
23
 
24
- audio_prompt = audio_file or audio_mic
25
 
26
  if audio_prompt and tts_text:
27
  cloned_audio = model_tts.tts_with_vc(tts_text, speaker_wav=audio_prompt, language="es")
@@ -32,17 +35,21 @@ def generate_music(text_prompt, audio_file=None, audio_mic=None, tts_text=None):
32
  try:
33
  music_np = librosa.util.buf_to_float(music, n_bytes=2)
34
 
 
35
  if len(music_np) > len(cloned_audio):
36
- padding = np.zeros(len(music_np) - len(cloned_audio))
37
- cloned_audio = np.concatenate([cloned_audio, padding])
38
- elif len(cloned_audio) > len(music_np):
39
- padding = np.zeros(len(cloned_audio) - len(music_np))
40
- music_np = np.concatenate([music_np, padding])
41
 
42
  combined_audio = music_np + cloned_audio
43
- return (44100, combined_audio)
44
- except ImportError:
45
- print("Error: Se requiere librosa para combinar audio. Instale con 'pip install librosa'")
 
 
 
 
 
46
  return (44100, music)
47
 
48
  return (44100, music)
@@ -59,4 +66,4 @@ iface = gr.Interface(
59
  description="Introduce una descripción de la música que deseas generar y opcionalmente un audio de voz para clonar con texto.",
60
  )
61
 
62
- iface.launch()
 
5
  import librosa
6
  import numpy as np
7
  import os
8
+ import spaces
9
 
10
  os.environ["COQUI_TOS_AGREED"] = "1"
11
 
 
13
  model_musicgen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
14
  model_tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False).to("cpu")
15
 
16
+ @spaces.GPU()
17
+ def generate_music(text_prompt, audio_file=None, tts_text=None):
18
  inputs = processor(text=text_prompt, padding=True, return_tensors="pt")
19
  max_duration_sec = 600
20
  min_duration_sec = 30
21
  max_new_tokens = int(max_duration_sec * model_musicgen.config.min_length)
22
  min_new_tokens = int(min_duration_sec * model_musicgen.config.min_length)
23
+
24
  audio_values = model_musicgen.generate(**inputs.to("cpu"))
25
  music = audio_values.cpu().numpy()[0]
26
 
27
+ audio_prompt = audio_file
28
 
29
  if audio_prompt and tts_text:
30
  cloned_audio = model_tts.tts_with_vc(tts_text, speaker_wav=audio_prompt, language="es")
 
35
  try:
36
  music_np = librosa.util.buf_to_float(music, n_bytes=2)
37
 
38
+ # Pad the shorter array
39
  if len(music_np) > len(cloned_audio):
40
+ cloned_audio = np.pad(cloned_audio, (0, len(music_np) - len(cloned_audio)))
41
+ else:
42
+ music_np = np.pad(music_np, (0, len(cloned_audio) - len(music_np)))
 
 
43
 
44
  combined_audio = music_np + cloned_audio
45
+
46
+ # Normalize and convert to int16
47
+ combined_audio = combined_audio / np.max(np.abs(combined_audio))
48
+ combined_audio_int16 = (combined_audio * 32767).astype(np.int16)
49
+
50
+ return (44100, combined_audio_int16)
51
+ except Exception as e:
52
+ print(f"Error combining audio: {e}")
53
  return (44100, music)
54
 
55
  return (44100, music)
 
66
  description="Introduce una descripción de la música que deseas generar y opcionalmente un audio de voz para clonar con texto.",
67
  )
68
 
69
+ iface.launch()