AudioLlama

Running on Zero

App Files Files Community

aiqcamp committed 3 days ago

Commit

59b0bed

•

1 Parent(s): df0d8fc

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -43

app.py CHANGED Viewed

@@ -9,8 +9,15 @@ import os
 import requests
 from transformers import pipeline
 import tempfile
-# 기본 설정
 try:
     import mmaudio
 except ImportError:
@@ -24,15 +31,22 @@ from mmaudio.model.networks import MMAudio, get_my_mmaudio
 from mmaudio.model.sequence_config import SequenceConfig
 from mmaudio.model.utils.features_utils import FeaturesUtils
-# CUDA 설정
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
 # 로깅 설정
 log = logging.getLogger()
-# 장치 및 데이터 타입 설정
-device = 'cuda'
 dtype = torch.bfloat16
 # 모델 설정
@@ -43,23 +57,9 @@ output_dir = Path('./output/gradio')
 setup_eval_logging()
 # 번역기 및 Pixabay API 설정
-translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
 PIXABAY_API_KEY = "33492762-a28a596ec4f286f84cd328b17"
-def search_pixabay_videos(query, api_key):
-    base_url = "https://pixabay.com/api/videos/"
-    params = {
-        "key": api_key,
-        "q": query,
-        "per_page": 80
-    }
-    response = requests.get(base_url, params=params)
-    if response.status_code == 200:
-        data = response.json()
-        return [video['videos']['large']['url'] for video in data.get('hits', [])]
-    return []
 # CSS 스타일 정의
 custom_css = """
 .gradio-container {
@@ -111,34 +111,71 @@ button:hover {
 }
 """
-def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
-    seq_cfg = model.seq_cfg
-    net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
-    net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
-    log.info(f'Loaded weights from {model.model_path}')
-    feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
-                                  synchformer_ckpt=model.synchformer_ckpt,
-                                  enable_conditions=True,
-                                  mode=model.mode,
-                                  bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
-                                  need_vae_encoder=False)
-    feature_utils = feature_utils.to(device, dtype).eval()
-    return net, feature_utils, seq_cfg
 net, feature_utils, seq_cfg = get_model()
 def translate_prompt(text):
-    if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
-        translation = translator(text)[0]['translation_text']
-        return translation
-    return text
 def search_videos(query):
-    query = translate_prompt(query)
-    return search_pixabay_videos(query, PIXABAY_API_KEY)
 @spaces.GPU
 @torch.inference_mode()
@@ -209,7 +246,8 @@ video_search_tab = gr.Interface(
     fn=search_videos,
     inputs=gr.Textbox(label="검색어 입력"),
     outputs=gr.Gallery(label="검색 결과", columns=4, rows=20),
-    css=custom_css
 )
 video_to_audio_tab = gr.Interface(

 import requests
 from transformers import pipeline
 import tempfile
+import numpy as np
+from einops import rearrange
+import cv2
+from scipy.io import wavfile
+import librosa
+import json
+from typing import Optional, Tuple, List
+import atexit
 try:
     import mmaudio
 except ImportError:
 from mmaudio.model.sequence_config import SequenceConfig
 from mmaudio.model.utils.features_utils import FeaturesUtils
 # 로깅 설정
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
 log = logging.getLogger()
+# CUDA 설정
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cudnn.benchmark = True
+else:
+    device = torch.device("cpu")
 dtype = torch.bfloat16
 # 모델 설정
 setup_eval_logging()
 # 번역기 및 Pixabay API 설정
+translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en", device="cpu")
 PIXABAY_API_KEY = "33492762-a28a596ec4f286f84cd328b17"
 # CSS 스타일 정의
 custom_css = """
 .gradio-container {
 }
 """
def cleanup_temp_files():
    """Best-effort removal of leftover media files from the system temp directory.

    Deletes every entry directly inside ``tempfile.gettempdir()`` whose name
    ends in ``.mp4`` or ``.flac``. The function is registered with ``atexit``
    right after its definition, so it must never raise: filesystem errors are
    logged at DEBUG level and otherwise ignored.

    NOTE(review): the suffix match applies to *any* ``.mp4``/``.flac`` file in
    the shared temp directory, not only files created by this app -- confirm
    that is acceptable on the deployment host.
    """
    temp_dir = tempfile.gettempdir()
    try:
        names = os.listdir(temp_dir)
    except OSError as exc:
        # The temp directory may be gone or unreadable at interpreter exit.
        logging.debug("Could not list temp directory %s: %s", temp_dir, exc)
        return

    for name in names:
        if not name.endswith(('.mp4', '.flac')):
            continue
        try:
            os.remove(os.path.join(temp_dir, name))
        except OSError as exc:
            # File in use, already deleted, not ours to delete, or a directory
            # that merely carries a media suffix: skip it and keep going.
            logging.debug("Could not remove temp file %s: %s", name, exc)
+atexit.register(cleanup_temp_files)
def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
    """Build the MMAudio network and its feature utilities on ``device``.

    Reads the module-level ``model`` configuration, ``device`` and ``dtype``.

    Returns:
        A ``(net, feature_utils, seq_cfg)`` tuple: the network with weights
        loaded from ``model.model_path`` and switched to eval mode, the
        ``FeaturesUtils`` instance (also in eval mode), and
        ``model.seq_cfg``.
    """
    # Function-scope import so this block does not rely on a file-level import.
    from contextlib import nullcontext

    # torch.cuda.device() only accepts CUDA devices and raises ValueError for
    # anything else, so the CPU fallback must not enter it.
    target = torch.device(device)
    device_ctx = torch.cuda.device(target) if target.type == 'cuda' else nullcontext()

    with device_ctx:
        seq_cfg = model.seq_cfg

        net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
        net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
        log.info(f'Loaded weights from {model.model_path}')

        feature_utils = FeaturesUtils(
            tod_vae_ckpt=model.vae_path,
            synchformer_ckpt=model.synchformer_ckpt,
            enable_conditions=True,
            mode=model.mode,
            bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
            need_vae_encoder=False
        ).to(device, dtype).eval()

        return net, feature_utils, seq_cfg
 net, feature_utils, seq_cfg = get_model()
 def translate_prompt(text):
+    try:
+        if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
+            with torch.no_grad():
+                translation = translator(text)[0]['translation_text']
+            return translation
+        return text
+    except Exception as e:
+        logging.error(f"Translation error: {e}")
+        return text
def search_pixabay_videos(query, api_key, timeout=10):
    """Search the Pixabay video API and return URLs of the "large" renditions.

    Args:
        query: Search term sent as the ``q`` parameter.
        api_key: Pixabay API key sent as the ``key`` parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of non-empty ``videos.large.url`` strings, one per usable hit.
        An empty list is returned on a non-200 response, a network or
        decoding error, or an unexpected payload shape.
    """
    base_url = "https://pixabay.com/api/videos/"
    params = {
        "key": api_key,
        "q": query,
        "per_page": 80
    }
    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code != 200:
            return []
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        logging.error(f"Pixabay API error: {e}")
        return []

    hits = data.get('hits', []) if isinstance(data, dict) else []
    urls = []
    for hit in hits:
        try:
            url = hit['videos']['large']['url']
        except (KeyError, TypeError):
            # One malformed hit must not discard the rest of the results.
            continue
        # Skip hits whose "large" URL is empty: there is nothing to display.
        if url:
            urls.append(url)
    return urls
@torch.no_grad()
def search_videos(query):
    """Translate the search term if needed and look it up on Pixabay.

    Args:
        query: Search text entered by the user.

    Returns:
        Whatever ``search_pixabay_videos`` returns for the (possibly
        translated) query, using the module-level ``PIXABAY_API_KEY``.
    """
    # No device context is entered here: torch.cuda.device() rejects non-CUDA
    # devices with ValueError, so wrapping this in torch.cuda.device("cpu")
    # made every search fail before any work was done.
    english_query = translate_prompt(query)
    return search_pixabay_videos(english_query, PIXABAY_API_KEY)
 @spaces.GPU
 @torch.inference_mode()
     fn=search_videos,
     inputs=gr.Textbox(label="검색어 입력"),
     outputs=gr.Gallery(label="검색 결과", columns=4, rows=20),
+    css=custom_css,
+    api_name=False
 )
 video_to_audio_tab = gr.Interface(