aiqcamp committed
Commit d2a875e • 1 Parent(s): 164c335

Update app.py

Files changed (1)
  1. app.py +108 -131
app.py CHANGED
@@ -2,12 +2,15 @@ import spaces
 import logging
 from datetime import datetime
 from pathlib import Path
-
 import gradio as gr
 import torch
 import torchaudio
 import os
+from transformers import pipeline
+from pixabay import Image, Video
+import tempfile
 
+# Basic setup
 try:
     import mmaudio
 except ImportError:
@@ -20,22 +23,80 @@ from mmaudio.model.flow_matching import FlowMatching
 from mmaudio.model.networks import MMAudio, get_my_mmaudio
 from mmaudio.model.sequence_config import SequenceConfig
 from mmaudio.model.utils.features_utils import FeaturesUtils
-import tempfile
 
+# CUDA settings
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 
+# Logging setup
 log = logging.getLogger()
 
+# Device and data type settings
 device = 'cuda'
 dtype = torch.bfloat16
 
+# Model setup
 model: ModelConfig = all_model_cfg['large_44k_v2']
 model.download_if_needed()
 output_dir = Path('./output/gradio')
 
 setup_eval_logging()
 
+# Translator and Pixabay API setup
+translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
+PIXABAY_API_KEY = "33492762-a28a596ec4f286f84cd328b17"
+pixabay_video = Video(PIXABAY_API_KEY)
+
+# CSS style definitions
+custom_css = """
+.gradio-container {
+    background: linear-gradient(45deg, #1a1a1a, #2a2a2a);
+    border-radius: 15px;
+    box-shadow: 0 8px 32px rgba(0,0,0,0.3);
+}
+
+.input-container, .output-container {
+    background: rgba(255,255,255,0.1);
+    backdrop-filter: blur(10px);
+    border-radius: 10px;
+    padding: 20px;
+    transform-style: preserve-3d;
+    transition: transform 0.3s ease;
+}
+
+.input-container:hover {
+    transform: translateZ(20px);
+}
+
+.gallery-item {
+    transition: transform 0.3s ease;
+    border-radius: 8px;
+    overflow: hidden;
+}
+
+.gallery-item:hover {
+    transform: scale(1.05);
+    box-shadow: 0 4px 15px rgba(0,0,0,0.2);
+}
+
+.tabs {
+    background: rgba(255,255,255,0.05);
+    border-radius: 10px;
+    padding: 10px;
+}
+
+button {
+    background: linear-gradient(45deg, #4a90e2, #357abd);
+    border: none;
+    border-radius: 5px;
+    transition: all 0.3s ease;
+}
+
+button:hover {
+    transform: translateY(-2px);
+    box-shadow: 0 4px 15px rgba(74,144,226,0.3);
+}
+"""
 
 def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
    seq_cfg = model.seq_cfg
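One design note on the block above: the Pixabay API key is committed in plain text, so it is exposed to anyone who can read the repository. A minimal sketch of the usual alternative, reading the key from the environment (the variable name here is illustrative, not part of the app):

```python
import os

from pixabay import Video

# Hypothetical: load the key from an environment variable so the
# secret never lands in version control.
PIXABAY_API_KEY = os.environ["PIXABAY_API_KEY"]
pixabay_video = Video(PIXABAY_API_KEY)
```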
@@ -54,14 +115,25 @@ def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
 
     return net, feature_utils, seq_cfg
 
-
 net, feature_utils, seq_cfg = get_model()
 
+def translate_prompt(text):
+    if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
+        translation = translator(text)[0]['translation_text']
+        return translation
+    return text
+
+def search_videos(query):
+    query = translate_prompt(query)
+    videos = pixabay_video.search(q=query, per_page=80)
+    return [hit['videos']['large']['url'] for hit in videos['hits']]
 
 @spaces.GPU
 @torch.inference_mode()
 def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                    cfg_strength: float, duration: float):
+    prompt = translate_prompt(prompt)
+    negative_prompt = translate_prompt(negative_prompt)
 
     rng = torch.Generator(device=device)
     rng.manual_seed(seed)
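For context on the two helpers above: `translate_prompt` only routes text through the ko-en model when it contains code points in the Hangul range (0x3131 to 0xD7A3), and the transformers translation pipeline returns a list of dicts keyed by `'translation_text'`. `search_videos` indexes into the raw JSON the python-pixabay client returns, where each hit is a plain dict, so the large-rendition URL lives at `hit['videos']['large']['url']`. A minimal sketch of both shapes, assuming the abbreviated response below mirrors the public Pixabay Videos API (values are illustrative):

```python
# Shape returned by the transformers translation pipeline:
# translator("안녕하세요") -> [{'translation_text': 'Hello'}]

# Abbreviated sketch of a Pixabay Videos API response:
response = {
    'total': 1,
    'totalHits': 1,
    'hits': [
        {
            'id': 1234,
            'videos': {
                'large':  {'url': 'https://cdn.pixabay.com/video/large.mp4'},
                'medium': {'url': 'https://cdn.pixabay.com/video/medium.mp4'},
            },
        },
    ],
}

# The comprehension in search_videos resolves each hit to its large-rendition URL:
urls = [hit['videos']['large']['url'] for hit in response['hits']]
assert urls == ['https://cdn.pixabay.com/video/large.mp4']
```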
@@ -83,23 +155,20 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
                       cfg_strength=cfg_strength)
     audio = audios.float().cpu()[0]
 
-    # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
     video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
-    # output_dir.mkdir(exist_ok=True, parents=True)
-    # video_save_path = output_dir / f'{current_time_string}.mp4'
     make_video(video,
                video_save_path,
                audio,
                sampling_rate=seq_cfg.sampling_rate,
                duration_sec=seq_cfg.duration)
-    log.info(f'Saved video to {video_save_path}')
     return video_save_path
 
-
 @spaces.GPU
 @torch.inference_mode()
 def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
                   duration: float):
+    prompt = translate_prompt(prompt)
+    negative_prompt = translate_prompt(negative_prompt)
 
     rng = torch.Generator(device=device)
     rng.manual_seed(seed)
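Both generation paths share the same output pattern: results are written to a `NamedTemporaryFile` created with `delete=False`, and the path is returned for Gradio to serve. A minimal sketch of that pattern; the cleanup step is an assumption about what a long-running Space would need, not something this app does:

```python
import os
import tempfile

# delete=False keeps the file on disk after the handle is closed, so the
# path can be returned to Gradio and served to the client.
tmp_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name

# ... write the generated audio or video to tmp_path ...

# Nothing removes these files automatically; a real deployment would need
# its own cleanup, e.g.:
if os.path.exists(tmp_path):
    os.unlink(tmp_path)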
@@ -121,141 +190,49 @@ def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int,
 
     audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
     torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
-    log.info(f'Saved audio to {audio_save_path}')
     return audio_save_path
 
+# Interface definitions
+video_search_tab = gr.Interface(
+    fn=search_videos,
+    inputs=gr.Textbox(label="Search query"),
+    outputs=gr.Gallery(label="Search results", columns=4, rows=20),
+    css=custom_css
+)
 
 video_to_audio_tab = gr.Interface(
     fn=video_to_audio,
     inputs=[
-        gr.Video(),
-        gr.Text(label='Prompt'),
-        gr.Text(label='Negative prompt', value='music'),
-        gr.Number(label='Seed', value=0, precision=0, minimum=0),
-        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
-        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
-        gr.Number(label='Duration (sec)', value=8, minimum=1),
+        gr.Video(label="Video"),
+        gr.Textbox(label="Prompt"),
+        gr.Textbox(label="Negative prompt", value="music"),
+        gr.Number(label="Seed", value=0),
+        gr.Number(label="Num steps", value=25),
+        gr.Number(label="Guidance strength", value=4.5),
+        gr.Number(label="Duration (sec)", value=8),
     ],
-    outputs='playable_video',
-    cache_examples=False,
-    title='MMAudio — Video-to-Audio Synthesis',
-    examples=[
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
-            'waves, seagulls',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_serpent.mp4',
-            '',
-            'music',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_seahorse.mp4',
-            'bubbles',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_india.mp4',
-            'Indian holy music',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_galloping.mp4',
-            'galloping',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
-            'waves, storm',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
-            '',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/mochi_storm.mp4',
-            'storm',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_spring.mp4',
-            '',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_typing.mp4',
-            'typing',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_wake_up.mp4',
-            '',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-    ])
+    outputs="playable_video",
+    css=custom_css
+)
 
 text_to_audio_tab = gr.Interface(
     fn=text_to_audio,
     inputs=[
-        gr.Text(label='Prompt'),
-        gr.Text(label='Negative prompt'),
-        gr.Number(label='Seed', value=0, precision=0, minimum=0),
-        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
-        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
-        gr.Number(label='Duration (sec)', value=8, minimum=1),
+        gr.Textbox(label="Prompt"),
+        gr.Textbox(label="Negative prompt"),
+        gr.Number(label="Seed", value=0),
+        gr.Number(label="Num steps", value=25),
+        gr.Number(label="Guidance strength", value=4.5),
+        gr.Number(label="Duration (sec)", value=8),
     ],
-    outputs='audio',
-    cache_examples=False,
-    title='MMAudio — Text-to-Audio Synthesis',
+    outputs="audio",
+    css=custom_css
 )
 
+# Main entry point
 if __name__ == "__main__":
-    gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],
-                       ['Video-to-Audio', 'Text-to-Audio']).launch(allowed_paths=[output_dir])
+    gr.TabbedInterface(
+        [video_search_tab, video_to_audio_tab, text_to_audio_tab],
+        ["Video Search", "Video-to-Audio", "Text-to-Audio"],
+        css=custom_css
+    ).launch(allowed_paths=[output_dir])
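The three `gr.Interface` objects are composed with `gr.TabbedInterface`, which takes the list of interfaces and their tab titles positionally. A minimal self-contained sketch of the same wiring, with toy echo functions standing in for the real generation functions (all names here are illustrative):

```python
import gradio as gr

def echo(text: str) -> str:
    # Toy stand-in for the real generation functions.
    return text

tab_a = gr.Interface(fn=echo, inputs=gr.Textbox(label="Input"), outputs="text")
tab_b = gr.Interface(fn=echo, inputs=gr.Textbox(label="Input"), outputs="text")

# One tab per interface; titles are matched by position.
demo = gr.TabbedInterface([tab_a, tab_b], ["Tab A", "Tab B"])

if __name__ == "__main__":
    demo.launch()
```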
 
 
 
 