aiqcamp committed
Commit d2a875e • 1 Parent(s): 164c335

Update app.py

Files changed (1)
  1. app.py +108 -131
app.py CHANGED
@@ -2,12 +2,15 @@ import spaces
 import logging
 from datetime import datetime
 from pathlib import Path
-
 import gradio as gr
 import torch
 import torchaudio
 import os
+from transformers import pipeline
+from pixabay import Image, Video
+import tempfile
 
+# Basic setup
 try:
     import mmaudio
 except ImportError:
@@ -20,22 +23,80 @@ from mmaudio.model.flow_matching import FlowMatching
 from mmaudio.model.networks import MMAudio, get_my_mmaudio
 from mmaudio.model.sequence_config import SequenceConfig
 from mmaudio.model.utils.features_utils import FeaturesUtils
-import tempfile
 
+# CUDA settings
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 
+# Logging setup
 log = logging.getLogger()
 
+# Device and data type settings
 device = 'cuda'
 dtype = torch.bfloat16
 
+# Model setup
 model: ModelConfig = all_model_cfg['large_44k_v2']
 model.download_if_needed()
 output_dir = Path('./output/gradio')
 
 setup_eval_logging()
 
+# Translator and Pixabay API setup
+translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
+PIXABAY_API_KEY = "33492762-a28a596ec4f286f84cd328b17"
+pixabay_video = Video(PIXABAY_API_KEY)
+
+# CSS style definitions
+custom_css = """
+.gradio-container {
+    background: linear-gradient(45deg, #1a1a1a, #2a2a2a);
+    border-radius: 15px;
+    box-shadow: 0 8px 32px rgba(0,0,0,0.3);
+}
+
+.input-container, .output-container {
+    background: rgba(255,255,255,0.1);
+    backdrop-filter: blur(10px);
+    border-radius: 10px;
+    padding: 20px;
+    transform-style: preserve-3d;
+    transition: transform 0.3s ease;
+}
+
+.input-container:hover {
+    transform: translateZ(20px);
+}
+
+.gallery-item {
+    transition: transform 0.3s ease;
+    border-radius: 8px;
+    overflow: hidden;
+}
+
+.gallery-item:hover {
+    transform: scale(1.05);
+    box-shadow: 0 4px 15px rgba(0,0,0,0.2);
+}
+
+.tabs {
+    background: rgba(255,255,255,0.05);
+    border-radius: 10px;
+    padding: 10px;
+}
+
+button {
+    background: linear-gradient(45deg, #4a90e2, #357abd);
+    border: none;
+    border-radius: 5px;
+    transition: all 0.3s ease;
+}
+
+button:hover {
+    transform: translateY(-2px);
+    box-shadow: 0 4px 15px rgba(74,144,226,0.3);
+}
+"""
 
 def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
    seq_cfg = model.seq_cfg
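One design note on the block above: the Pixabay API key is committed in plain text, so it is exposed to anyone who can read the repository. A minimal sketch of the usual alternative, reading the key from the environment (the variable name here is illustrative, not part of the app):

```python
import os

from pixabay import Video

# Hypothetical: load the key from an environment variable so the
# secret never lands in version control.
PIXABAY_API_KEY = os.environ["PIXABAY_API_KEY"]
pixabay_video = Video(PIXABAY_API_KEY)
```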
@@ -54,14 +115,25 @@ def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
 
     return net, feature_utils, seq_cfg
 
-
 net, feature_utils, seq_cfg = get_model()
 
+def translate_prompt(text):
+    if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
+        translation = translator(text)[0]['translation_text']
+        return translation
+    return text
+
+def search_videos(query):
+    query = translate_prompt(query)
+    videos = pixabay_video.search(q=query, per_page=80)
+    return [hit['videos']['large']['url'] for hit in videos['hits']]
 
 @spaces.GPU
 @torch.inference_mode()
 def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                    cfg_strength: float, duration: float):
+    prompt = translate_prompt(prompt)
+    negative_prompt = translate_prompt(negative_prompt)
 
     rng = torch.Generator(device=device)
     rng.manual_seed(seed)
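For context on the two helpers above: `translate_prompt` only routes text through the ko-en model when it contains code points in the Hangul range (0x3131 to 0xD7A3), and the transformers translation pipeline returns a list of dicts keyed by `'translation_text'`. `search_videos` indexes into the raw JSON the python-pixabay client returns, where each hit is a plain dict, so the large-rendition URL lives at `hit['videos']['large']['url']`. A minimal sketch of both shapes, assuming the abbreviated response below mirrors the public Pixabay Videos API (values are illustrative):

```python
# Shape returned by the transformers translation pipeline:
# translator("안녕하세요") -> [{'translation_text': 'Hello'}]

# Abbreviated sketch of a Pixabay Videos API response:
response = {
    'total': 1,
    'totalHits': 1,
    'hits': [
        {
            'id': 1234,
            'videos': {
                'large':  {'url': 'https://cdn.pixabay.com/video/large.mp4'},
                'medium': {'url': 'https://cdn.pixabay.com/video/medium.mp4'},
            },
        },
    ],
}

# The comprehension in search_videos resolves each hit to its large-rendition URL:
urls = [hit['videos']['large']['url'] for hit in response['hits']]
assert urls == ['https://cdn.pixabay.com/video/large.mp4']
```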
@@ -83,23 +155,20 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
                       cfg_strength=cfg_strength)
     audio = audios.float().cpu()[0]
 
-    # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
     video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
-    # output_dir.mkdir(exist_ok=True, parents=True)
-    # video_save_path = output_dir / f'{current_time_string}.mp4'
     make_video(video,
                video_save_path,
                audio,
                sampling_rate=seq_cfg.sampling_rate,
                duration_sec=seq_cfg.duration)
-    log.info(f'Saved video to {video_save_path}')
     return video_save_path
 
-
 @spaces.GPU
 @torch.inference_mode()
 def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
                   duration: float):
+    prompt = translate_prompt(prompt)
+    negative_prompt = translate_prompt(negative_prompt)
 
     rng = torch.Generator(device=device)
     rng.manual_seed(seed)
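Both generation paths share the same output pattern: results are written to a `NamedTemporaryFile` created with `delete=False`, and the path is returned for Gradio to serve. A minimal sketch of that pattern; the cleanup step is an assumption about what a long-running Space would need, not something this app does:

```python
import os
import tempfile

# delete=False keeps the file on disk after the handle is closed, so the
# path can be returned to Gradio and served to the client.
tmp_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name

# ... write the generated audio or video to tmp_path ...

# Nothing removes these files automatically; a real deployment would need
# its own cleanup, e.g.:
if os.path.exists(tmp_path):
    os.unlink(tmp_path)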
@@ -121,141 +190,49 @@ def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int,
 
     audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
     torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
-    log.info(f'Saved audio to {audio_save_path}')
     return audio_save_path
 
+# Interface definitions
+video_search_tab = gr.Interface(
+    fn=search_videos,
+    inputs=gr.Textbox(label="Search query"),
+    outputs=gr.Gallery(label="Search results", columns=4, rows=20),
+    css=custom_css
+)
 
 video_to_audio_tab = gr.Interface(
     fn=video_to_audio,
     inputs=[
-        gr.Video(),
-        gr.Text(label='Prompt'),
-        gr.Text(label='Negative prompt', value='music'),
-        gr.Number(label='Seed', value=0, precision=0, minimum=0),
-        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
-        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
-        gr.Number(label='Duration (sec)', value=8, minimum=1),
+        gr.Video(label="Video"),
+        gr.Textbox(label="Prompt"),
+        gr.Textbox(label="Negative prompt", value="music"),
+        gr.Number(label="Seed", value=0),
+        gr.Number(label="Num steps", value=25),
+        gr.Number(label="Guidance strength", value=4.5),
+        gr.Number(label="Duration (sec)", value=8),
     ],
-    outputs='playable_video',
-    cache_examples=False,
-    title='MMAudio — Video-to-Audio Synthesis',
-    examples=[
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
-            'waves, seagulls',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_serpent.mp4',
-            '',
-            'music',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_seahorse.mp4',
-            'bubbles',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_india.mp4',
-            'Indian holy music',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_galloping.mp4',
-            'galloping',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
-            'waves, storm',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
-            '',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/mochi_storm.mp4',
-            'storm',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_spring.mp4',
-            '',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_typing.mp4',
-            'typing',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_wake_up.mp4',
-            '',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-    ])
+    outputs="playable_video",
+    css=custom_css
+)
 
 text_to_audio_tab = gr.Interface(
     fn=text_to_audio,
     inputs=[
-        gr.Text(label='Prompt'),
-        gr.Text(label='Negative prompt'),
-        gr.Number(label='Seed', value=0, precision=0, minimum=0),
-        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
-        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
-        gr.Number(label='Duration (sec)', value=8, minimum=1),
+        gr.Textbox(label="Prompt"),
+        gr.Textbox(label="Negative prompt"),
+        gr.Number(label="Seed", value=0),
+        gr.Number(label="Num steps", value=25),
+        gr.Number(label="Guidance strength", value=4.5),
+        gr.Number(label="Duration (sec)", value=8),
     ],
-    outputs='audio',
-    cache_examples=False,
-    title='MMAudio — Text-to-Audio Synthesis',
+    outputs="audio",
+    css=custom_css
 )
 
+# Main entry point
 if __name__ == "__main__":
-    gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],
-                       ['Video-to-Audio', 'Text-to-Audio']).launch(allowed_paths=[output_dir])
+    gr.TabbedInterface(
+        [video_search_tab, video_to_audio_tab, text_to_audio_tab],
+        ["Video Search", "Video-to-Audio", "Text-to-Audio"],
+        css=custom_css
+    ).launch(allowed_paths=[output_dir])
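The three `gr.Interface` objects are composed with `gr.TabbedInterface`, which takes the list of interfaces and their tab titles positionally. A minimal self-contained sketch of the same wiring, with toy echo functions standing in for the real generation functions (all names here are illustrative):

```python
import gradio as gr

def echo(text: str) -> str:
    # Toy stand-in for the real generation functions.
    return text

tab_a = gr.Interface(fn=echo, inputs=gr.Textbox(label="Input"), outputs="text")
tab_b = gr.Interface(fn=echo, inputs=gr.Textbox(label="Input"), outputs="text")

# One tab per interface; titles are matched by position.
demo = gr.TabbedInterface([tab_a, tab_b], ["Tab A", "Tab B"])

if __name__ == "__main__":
    demo.launch()
```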
 
 
 
 