THUdyh committed
Commit 08e8c65
1 Parent(s): e93fc05

Update app.py

Files changed (1)
  1. app.py +31 -3
app.py CHANGED
@@ -29,6 +29,31 @@ tokenizer, model, image_processor, context_len = load_pretrained_model(model_pat
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model.to(device).eval()
 
+cur_dir = os.path.dirname(os.path.abspath(__file__))
+
+title_markdown = """
+<div style="display: flex; justify-content: left; align-items: center; text-align: left; background: linear-gradient(45deg, rgba(204,255,231, 0.8), rgba(204,255,231, 0.3)); border-radius: 10px; box-shadow: 0 8px 16px 0 rgba(0,0,0,0.1);"> <a href="https://llava-vl.github.io/blog/2024-04-30-llava-next-video/" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
+  <img src="https://oryx-mllm.github.io/static/images/icon.png" alt="Oryx" style="max-width: 80px; height: auto; border-radius: 10px;">
+  </a>
+  <div>
+    <h2><a href="https://github.com/Oryx-mllm/Oryx">Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution</a></h2>
+    <h5 style="margin: 0;"><a href="https://oryx-mllm.github.io/">Project Page</a> | <a href="https://github.com/Oryx-mllm/Oryx">Github</a> | <a href="https://huggingface.co/collections/THUdyh/oryx-66ebe5d0cfb61a2837a103ff">Huggingface</a> | <a href="https://arxiv.org/abs/2409.12961">Paper</a> | <a href="https://x.com/_akhaliq/status/1836963718887866400">Twitter</a></h5>
+  </div>
+</div>
+"""
+
+bibtext = """
+### Citation
+```
+@article{liu2024oryx,
+  title={Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution},
+  author={Liu, Zuyan and Dong, Yuhao and Liu, Ziwei and Hu, Winston and Lu, Jiwen and Rao, Yongming},
+  journal={arXiv preprint arXiv:2409.12961},
+  year={2024}
+}
+```
+"""
+
 def preprocess_qwen(sources, tokenizer: transformers.PreTrainedTokenizer, has_image: bool = False, max_len=2048, system_message: str = "You are a helpful assistant.") -> Dict:
     roles = {"human": "<|im_start|>user", "gpt": "<|im_start|>assistant"}
 
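Note: the `roles` mapping in the `preprocess_qwen` context above follows the ChatML convention used by Qwen tokenizers. As a minimal sketch of how such markers are typically assembled into a prompt (illustrative only: `build_chatml` is not a function in app.py, and the real `preprocess_qwen` additionally tokenizes and builds label masks):

```python
# Hedged sketch of ChatML-style prompt assembly; not code from app.py.
def build_chatml(messages, system_message="You are a helpful assistant."):
    roles = {"human": "<|im_start|>user", "gpt": "<|im_start|>assistant"}
    parts = [f"<|im_start|>system\n{system_message}<|im_end|>"]
    for msg in messages:
        parts.append(f"{roles[msg['from']]}\n{msg['value']}<|im_end|>")
    parts.append("<|im_start|>assistant\n")  # leave the assistant turn open for generation
    return "\n".join(parts)

print(build_chatml([{"from": "human", "value": "Describe this video."}]))
```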
@@ -85,6 +110,9 @@ def preprocess_qwen(sources, tokenizer: transformers.PreTrainedTokenizer, has_im
 @spaces.GPU(duration=120)
 def oryx_inference(multimodal):
     visual, text = multimodal["files"][0], multimodal["text"]
+    if visual.endswith("case/image2.png"):
+        modality = "video"
+        visual = f"{cur_dir}/case/case1.mp4"
     if visual.endswith(".mp4"):
         modality = "video"
     else:
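
Note: the guard added above appears to let the examples gallery show a PNG thumbnail (Gradio previews images more readily than video files in `examples`) while inference still runs on the original clip. A minimal sketch of that resolution step, assuming the same file layout (`resolve_visual` is an illustrative helper, not from app.py):

```python
import os

cur_dir = os.path.dirname(os.path.abspath(__file__))

def resolve_visual(visual: str) -> tuple[str, str]:
    """Return (modality, path), mapping the stand-in thumbnail back to its video."""
    if visual.endswith("case/image2.png"):           # example preview image
        return "video", f"{cur_dir}/case/case1.mp4"  # actual example input
    if visual.endswith(".mp4"):
        return "video", visual
    return "image", visual
```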
@@ -190,14 +218,13 @@ def oryx_inference(multimodal):
     return outputs
 
 # Define input and output for the Gradio interface
-cur_dir = os.path.dirname(os.path.abspath(__file__))
 demo = gr.Interface(
     fn=oryx_inference,
     inputs=gr.MultimodalTextbox(file_types=[".mp4", "image"], placeholder="Enter message or upload file..."),
     outputs="text",
     examples=[
         {
-            "files":[f"{cur_dir}/case/case1.mp4"],
+            "files":[f"{cur_dir}/case/image2.png"],
             "text":"Describe what is happening in this video in detail.",
         },
         {
@@ -206,7 +233,8 @@ demo = gr.Interface(
         },
     ],
     title="Oryx Demo",
-    description="A huggingface space for Oryx-7B."
+    description=title_markdown,
+    article=bibtext,
 )
 
 # Launch the Gradio app
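
Note: `gr.Interface` renders `description` beneath the title and `article` below the output component, and both accept Markdown/HTML, which is why the banner and citation strings added above can be passed in directly. A stripped-down sketch of the resulting wiring (the placeholder strings and `echo` stand-in are illustrative; the launch call itself sits outside this diff, with `demo.launch()` being the usual pattern):

```python
import gradio as gr

title_markdown = "<h2>Oryx MLLM</h2>"                 # placeholder for the HTML banner above
bibtext = "### Citation\n@article{liu2024oryx, ...}"  # placeholder for the BibTeX block above

def echo(multimodal) -> str:
    return multimodal["text"]  # stand-in for oryx_inference

demo = gr.Interface(
    fn=echo,
    inputs=gr.MultimodalTextbox(file_types=[".mp4", "image"], placeholder="Enter message or upload file..."),
    outputs="text",
    title="Oryx Demo",
    description=title_markdown,  # rendered under the title
    article=bibtext,             # rendered below the outputs
)

if __name__ == "__main__":
    demo.launch()
```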
 