Update app.py

app.py CHANGED
@@ -29,6 +29,31 @@ tokenizer, model, image_processor, context_len = load_pretrained_model(model_pat
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model.to(device).eval()

+cur_dir = os.path.dirname(os.path.abspath(__file__))
+
+title_markdown = """
+<div style="display: flex; justify-content: left; align-items: center; text-align: left; background: linear-gradient(45deg, rgba(204,255,231, 0.8), rgba(204,255,231, 0.3)); border-radius: 10px; box-shadow: 0 8px 16px 0 rgba(0,0,0,0.1);"> <a href="https://llava-vl.github.io/blog/2024-04-30-llava-next-video/" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
+    <img src="https://oryx-mllm.github.io/static/images/icon.png" alt="Oryx" style="max-width: 80px; height: auto; border-radius: 10px;">
+  </a>
+  <div>
+    <h2><a href="https://github.com/Oryx-mllm/Oryx">Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution</a></h2>
+    <h5 style="margin: 0;"><a href="https://oryx-mllm.github.io/">Project Page</a> | <a href="https://github.com/Oryx-mllm/Oryx">Github</a> | <a href="https://huggingface.co/collections/THUdyh/oryx-66ebe5d0cfb61a2837a103ff">Huggingface</a> | <a href="https://arxiv.org/abs/2409.12961">Paper</a> | <a href="https://x.com/_akhaliq/status/1836963718887866400">Twitter</a></h5>
+  </div>
+</div>
+"""
+
+bibtext = """
+### Citation
+```
+@article{liu2024oryx,
+  title={Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution},
+  author={Liu, Zuyan and Dong, Yuhao and Liu, Ziwei and Hu, Winston and Lu, Jiwen and Rao, Yongming},
+  journal={arXiv preprint arXiv:2409.12961},
+  year={2024}
+}
+```
+"""
+
 def preprocess_qwen(sources, tokenizer: transformers.PreTrainedTokenizer, has_image: bool = False, max_len=2048, system_message: str = "You are a helpful assistant.") -> Dict:
     roles = {"human": "<|im_start|>user", "gpt": "<|im_start|>assistant"}

@@ -85,6 +110,9 @@ def preprocess_qwen(sources, tokenizer: transformers.PreTrainedTokenizer, has_im
 @spaces.GPU(duration=120)
 def oryx_inference(multimodal):
     visual, text = multimodal["files"][0], multimodal["text"]
+    if visual.endswith("case/image2.png"):
+        modality = "video"
+        visual = f"{cur_dir}/case/case1.mp4"
     if visual.endswith(".mp4"):
         modality = "video"
     else:
@@ -190,14 +218,13 @@ def oryx_inference(multimodal):
     return outputs

 # Define input and output for the Gradio interface
-cur_dir = os.path.dirname(os.path.abspath(__file__))
 demo = gr.Interface(
     fn=oryx_inference,
     inputs=gr.MultimodalTextbox(file_types=[".mp4", "image"],placeholder="Enter message or upload file..."),
     outputs="text",
     examples=[
         {
-            "files":[f"{cur_dir}/case/
+            "files":[f"{cur_dir}/case/image2.png"],
             "text":"Describe what is happening in this video in detail.",
         },
         {
@@ -206,7 +233,8 @@ demo = gr.Interface(
         },
     ],
     title="Oryx Demo",
-    description=
+    description=title_markdown,
+    article=bibtext,
 )

 # Launch the Gradio app
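The added `case/image2.png` branch reroutes the image thumbnail used for the example onto the bundled demo clip before the modality check runs. A minimal sketch of that routing, assuming the demo assets live in a `case/` directory next to the script; the `resolve_example` helper is illustrative and not part of the app:

```python
import os

# Directory of this script; the demo assets are assumed to live in ./case/
cur_dir = os.path.dirname(os.path.abspath(__file__))

def resolve_example(visual: str) -> tuple[str, str]:
    """Hypothetical helper mirroring the added branch in oryx_inference:
    the image2.png thumbnail stands in for the video example in the UI,
    so it is swapped for the actual clip before picking the modality."""
    if visual.endswith("case/image2.png"):
        # Example thumbnail -> bundled demo video
        visual = f"{cur_dir}/case/case1.mp4"
    modality = "video" if visual.endswith(".mp4") else "image"
    return visual, modality
```

With this sketch, `resolve_example("case/image2.png")` yields the `case1.mp4` path and the `"video"` modality, matching what the real `oryx_inference` now does for the example input.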
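For reference, a self-contained sketch of how the new `description=title_markdown` and `article=bibtext` arguments fit around a `gr.MultimodalTextbox` interface. The `echo_fn` callback and the abbreviated markdown strings are placeholders for `oryx_inference` and the full banner and citation blocks; Gradio renders `description` under the title and `article` below the component area.

```python
import gradio as gr

# Abbreviated stand-ins for the title_markdown / bibtext strings added above.
title_markdown = "### Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution"
bibtext = "### Citation\nSee arXiv:2409.12961."

def echo_fn(multimodal):
    # gr.MultimodalTextbox passes {"text": str, "files": [local paths]}.
    files = multimodal["files"]
    return f"Received {len(files)} file(s) and text: {multimodal['text']!r}"

demo = gr.Interface(
    fn=echo_fn,  # placeholder for the real oryx_inference
    inputs=gr.MultimodalTextbox(file_types=[".mp4", "image"],
                                placeholder="Enter message or upload file..."),
    outputs="text",
    title="Oryx Demo",
    description=title_markdown,  # rendered under the title, accepts Markdown/HTML
    article=bibtext,             # rendered below the interface
)

if __name__ == "__main__":
    demo.launch()
```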