Update app.py

app.py CHANGED
@@ -29,6 +29,31 @@ tokenizer, model, image_processor, context_len = load_pretrained_model(model_pat
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model.to(device).eval()

+cur_dir = os.path.dirname(os.path.abspath(__file__))
+
+title_markdown = """
+<div style="display: flex; justify-content: left; align-items: center; text-align: left; background: linear-gradient(45deg, rgba(204,255,231, 0.8), rgba(204,255,231, 0.3)); border-radius: 10px; box-shadow: 0 8px 16px 0 rgba(0,0,0,0.1);"> <a href="https://llava-vl.github.io/blog/2024-04-30-llava-next-video/" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
+    <img src="https://oryx-mllm.github.io/static/images/icon.png" alt="Oryx" style="max-width: 80px; height: auto; border-radius: 10px;">
+  </a>
+  <div>
+    <h2><a href="https://github.com/Oryx-mllm/Oryx">Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution</a></h2>
+    <h5 style="margin: 0;"><a href="https://oryx-mllm.github.io/">Project Page</a> | <a href="https://github.com/Oryx-mllm/Oryx">Github</a> | <a href="https://huggingface.co/collections/THUdyh/oryx-66ebe5d0cfb61a2837a103ff">Huggingface</a> | <a href="https://arxiv.org/abs/2409.12961">Paper</a> | <a href="https://x.com/_akhaliq/status/1836963718887866400">Twitter</a></h5>
+  </div>
+</div>
+"""
+
+bibtext = """
+### Citation
+```
+@article{liu2024oryx,
+  title={Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution},
+  author={Liu, Zuyan and Dong, Yuhao and Liu, Ziwei and Hu, Winston and Lu, Jiwen and Rao, Yongming},
+  journal={arXiv preprint arXiv:2409.12961},
+  year={2024}
+}
+```
+"""
+
 def preprocess_qwen(sources, tokenizer: transformers.PreTrainedTokenizer, has_image: bool = False, max_len=2048, system_message: str = "You are a helpful assistant.") -> Dict:
     roles = {"human": "<|im_start|>user", "gpt": "<|im_start|>assistant"}

@@ -85,6 +110,9 @@ def preprocess_qwen(sources, tokenizer: transformers.PreTrainedTokenizer, has_im
 @spaces.GPU(duration=120)
 def oryx_inference(multimodal):
     visual, text = multimodal["files"][0], multimodal["text"]
+    if visual.endswith("case/image2.png"):
+        modality = "video"
+        visual = f"{cur_dir}/case/case1.mp4"
     if visual.endswith(".mp4"):
         modality = "video"
     else:
@@ -190,14 +218,13 @@ def oryx_inference(multimodal):
     return outputs

 # Define input and output for the Gradio interface
-cur_dir = os.path.dirname(os.path.abspath(__file__))
 demo = gr.Interface(
     fn=oryx_inference,
     inputs=gr.MultimodalTextbox(file_types=[".mp4", "image"],placeholder="Enter message or upload file..."),
     outputs="text",
     examples=[
         {
-            "files":[f"{cur_dir}/case/
+            "files":[f"{cur_dir}/case/image2.png"],
             "text":"Describe what is happening in this video in detail.",
         },
         {
@@ -206,7 +233,8 @@ demo = gr.Interface(
         },
     ],
     title="Oryx Demo",
-    description=
+    description=title_markdown,
+    article=bibtext,
 )

 # Launch the Gradio app
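The added `case/image2.png` branch reroutes the image thumbnail used for the example onto the bundled demo clip before the modality check runs. A minimal sketch of that routing, assuming the demo assets live in a `case/` directory next to the script; the `resolve_example` helper is illustrative and not part of the app:

```python
import os

# Directory of this script; the demo assets are assumed to live in ./case/
cur_dir = os.path.dirname(os.path.abspath(__file__))

def resolve_example(visual: str) -> tuple[str, str]:
    """Hypothetical helper mirroring the added branch in oryx_inference:
    the image2.png thumbnail stands in for the video example in the UI,
    so it is swapped for the actual clip before picking the modality."""
    if visual.endswith("case/image2.png"):
        # Example thumbnail -> bundled demo video
        visual = f"{cur_dir}/case/case1.mp4"
    modality = "video" if visual.endswith(".mp4") else "image"
    return visual, modality
```

With this sketch, `resolve_example("case/image2.png")` yields the `case1.mp4` path and the `"video"` modality, matching what the real `oryx_inference` now does for the example input.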
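For reference, a self-contained sketch of how the new `description=title_markdown` and `article=bibtext` arguments fit around a `gr.MultimodalTextbox` interface. The `echo_fn` callback and the abbreviated markdown strings are placeholders for `oryx_inference` and the full banner and citation blocks; Gradio renders `description` under the title and `article` below the component area.

```python
import gradio as gr

# Abbreviated stand-ins for the title_markdown / bibtext strings added above.
title_markdown = "### Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution"
bibtext = "### Citation\nSee arXiv:2409.12961."

def echo_fn(multimodal):
    # gr.MultimodalTextbox passes {"text": str, "files": [local paths]}.
    files = multimodal["files"]
    return f"Received {len(files)} file(s) and text: {multimodal['text']!r}"

demo = gr.Interface(
    fn=echo_fn,  # placeholder for the real oryx_inference
    inputs=gr.MultimodalTextbox(file_types=[".mp4", "image"],
                                placeholder="Enter message or upload file..."),
    outputs="text",
    title="Oryx Demo",
    description=title_markdown,  # rendered under the title, accepts Markdown/HTML
    article=bibtext,             # rendered below the interface
)

if __name__ == "__main__":
    demo.launch()
```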