nyu-visionx
/

Cambrian-S-7B

@@ -4,13 +4,15 @@ base_model: Qwen/Qwen2.5-7B-Instruct
 library_name: transformers
 pipeline_tag: image-to-text
 tags:
-  - multimodal
-  - video-understanding
-  - spatial-reasoning
-  - vision-language
 datasets:
-  - nyu-visionx/VSI-590K
-eval_results:
   - task:
       type: visual-question-answering
       name: VSI-Bench
@@ -19,12 +21,18 @@ eval_results:
       name: VSI-Bench
     metrics:
       - type: accuracy
-        value: 67.5
         name: accuracy
-        verified: false
   - task:
       type: visual-question-answering
       name: VSI-Bench-Debiased
   - task:
       type: visual-question-answering
       name: EgoSchema
@@ -33,9 +41,8 @@ eval_results:
       name: EgoSchema
     metrics:
       - type: accuracy
-        value: 76.8
         name: accuracy
-        verified: false
   - task:
       type: visual-question-answering
       name: Perception Test
@@ -44,9 +51,8 @@ eval_results:
       name: Perception Test
     metrics:
       - type: accuracy
-        value: 69.9
         name: accuracy
-        verified: false
   - task:
       type: visual-question-answering
       name: VideoMME
@@ -55,9 +61,8 @@ eval_results:
       name: VideoMME
     metrics:
       - type: accuracy
-        value: 63.4
         name: accuracy
-        verified: false
   - task:
       type: visual-question-answering
       name: MVBench
@@ -66,9 +71,8 @@ eval_results:
       name: MVBench
     metrics:
       - type: accuracy
-        value: 64.5
         name: accuracy
-        verified: false
   - task:
       type: visual-question-answering
       name: LongVideoBench
@@ -77,9 +81,8 @@ eval_results:
       name: LongVideoBench
     metrics:
       - type: accuracy
-        value: 59.4
         name: accuracy
-        verified: false
   - task:
       type: visual-question-answering
       name: VideoMMMU
@@ -88,9 +91,8 @@ eval_results:
       name: VideoMMMU
     metrics:
       - type: accuracy
-        value: 38.6
         name: accuracy
-        verified: false
   - task:
       type: visual-question-answering
       name: MMVP
@@ -99,9 +101,8 @@ eval_results:
       name: MMVP
     metrics:
       - type: accuracy
-        value: 60.0
         name: accuracy
-        verified: false
   - task:
       type: visual-question-answering
       name: 3DSR
@@ -110,9 +111,8 @@ eval_results:
       name: 3DSR
     metrics:
       - type: accuracy
-        value: 54.8
         name: accuracy
-        verified: false
   - task:
       type: visual-question-answering
       name: CV-Bench
@@ -121,9 +121,10 @@ eval_results:
       name: CV-Bench
     metrics:
       - type: accuracy
-        value: 76.9
         name: accuracy
-        verified: false
 ---
@@ -175,4 +176,4 @@ output_ids = model.generate(input_ids, images=image_tensor, image_sizes=image_si
   journal={arXiv preprint arXiv:2025},
   year={2025}
 }
-```

 library_name: transformers
 pipeline_tag: image-to-text
 tags:
+- multimodal
+- video-understanding
+- spatial-reasoning
+- vision-language
 datasets:
+- nyu-visionx/VSI-590K
+model-index:
+- name: Cambrian-S-7B
+  results:
   - task:
       type: visual-question-answering
       name: VSI-Bench
       name: VSI-Bench
     metrics:
       - type: accuracy
         name: accuracy
+        value: 67.5
   - task:
       type: visual-question-answering
       name: VSI-Bench-Debiased
+    dataset:
+      type: vsi-bench-debiased
+      name: VSI-Bench-Debiased
+    metrics:
+      - type: accuracy
+        name: accuracy
+        value: 59.9
   - task:
       type: visual-question-answering
       name: EgoSchema
       name: EgoSchema
     metrics:
       - type: accuracy
         name: accuracy
+        value: 76.8
   - task:
       type: visual-question-answering
       name: Perception Test
       name: Perception Test
     metrics:
       - type: accuracy
         name: accuracy
+        value: 69.9
   - task:
       type: visual-question-answering
       name: VideoMME
       name: VideoMME
     metrics:
       - type: accuracy
         name: accuracy
+        value: 63.4
   - task:
       type: visual-question-answering
       name: MVBench
       name: MVBench
     metrics:
       - type: accuracy
         name: accuracy
+        value: 64.5
   - task:
       type: visual-question-answering
       name: LongVideoBench
       name: LongVideoBench
     metrics:
       - type: accuracy
         name: accuracy
+        value: 59.4
   - task:
       type: visual-question-answering
       name: VideoMMMU
       name: VideoMMMU
     metrics:
       - type: accuracy
         name: accuracy
+        value: 38.6
   - task:
       type: visual-question-answering
       name: MMVP
       name: MMVP
     metrics:
       - type: accuracy
         name: accuracy
+        value: 60.0
   - task:
       type: visual-question-answering
       name: 3DSR
       name: 3DSR
     metrics:
       - type: accuracy
         name: accuracy
+        value: 54.8
   - task:
       type: visual-question-answering
       name: CV-Bench
       name: CV-Bench
     metrics:
       - type: accuracy
         name: accuracy
+        value: 76.9
+language:
+- en
 ---
   journal={arXiv preprint arXiv:2025},
   year={2025}
 }
+```