update md
lb_info.py +17 -9

lb_info.py CHANGED
@@ -48,14 +48,21 @@ LEADERBOARD_MD['MAIN'] = """
 
 - Avg Score: The average score on all video understanding Benchmarks (normalized to 0 - 100, the higher the better).
 - Avg Rank: The average rank on all video understanding Benchmarks (the lower the better).
-- The overall evaluation results on
+- The overall evaluation results on 5 video understanding benchmarks, sorted by the ascending order of Avg Rank.
+- Tip: The total score of MLVU is calculated as a weighted sum of M-Avg and G-Avg, with weights based on the proportion of the number of questions in each category relative to the total. The maximum possible score is 100.
 """
 
 LEADERBOARD_MD['Video-MME (w/o subs)'] = """
 ## Video-MME (w/o subs) Evaluation Results
 
 - We give the total scores for the three video lengths (short, medium and long), as well as the total scores for each task type.
-
+"""
+
+LEADERBOARD_MD['MLVU'] = """
+## MLVU Evaluation Results
+
+- The ranking here is determined by sorting the M-Avg scores in descending order.
+- The number of evaluation questions used here is consistent with the official Hugging Face benchmark.
 """
 
 # LEADERBOARD_MD['MVBench'] = """
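The tip added above and the code change in the second hunk below describe the same computation: the MLVU total is a weighted sum of M-Avg (reported on a 0-100 scale) and G-Avg (reported on a 0-10 scale, hence rescaled by 10), with weights 0.84 and 0.16. The sketch below only illustrates how such weights would follow from question proportions; the counts are hypothetical placeholders chosen to reproduce 0.84/0.16, not the official MLVU numbers.

```python
# Sketch of the weighted MLVU total described in the tip above.
# N_MCQ and N_GEN are hypothetical question counts chosen to reproduce the
# 0.84 / 0.16 weights used in the code change; they are not official numbers.
N_MCQ = 2100   # multiple-choice questions, scored via M-Avg (0-100 scale)
N_GEN = 400    # generation questions, scored via G-Avg (0-10 scale)

w_m = N_MCQ / (N_MCQ + N_GEN)   # 0.84
w_g = N_GEN / (N_MCQ + N_GEN)   # 0.16


def mlvu_total(m_avg: float, g_avg: float) -> float:
    """Combine M-Avg (0-100) and G-Avg (0-10, rescaled by 10) into a 0-100 total."""
    return m_avg * w_m + g_avg * 10 * w_g


print(round(mlvu_total(100.0, 10.0), 2))  # 100.0 -> matches the stated maximum
print(round(mlvu_total(60.0, 4.5), 2))    # 57.6
```

A perfect M-Avg of 100 and G-Avg of 10 combine to exactly 100, consistent with the stated maximum in the tip.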
@@ -139,13 +146,14 @@ def BUILD_L1_DF(results, fields):
 # elif d == 'TempCompass':
 #     item[d]['Overall'] = item[d]['overall']
 if d == 'MLVU':
-    res[d].append(
-        f'M-Avg: {item[d]["M-Avg"]}, G-Avg: {item[d]["G-Avg"]}'
-        # {
-        #     'M-Avg': item[d]['M-Avg'],
-        #     'G-Avg': item[d]['G-Avg']
-        # }
-    )
+    # res[d].append(
+    #     f'M-Avg: {item[d]["M-Avg"]}, G-Avg: {item[d]["G-Avg"]}'
+    #     # {
+    #     #     'M-Avg': item[d]['M-Avg'],
+    #     #     'G-Avg': item[d]['G-Avg']
+    #     # }
+    # )
+    res[d].append(item[d]['M-Avg'] * 0.84 + item[d]['G-Avg'] * 10 * 0.16)
 elif d == 'TempCompass':
     res[d].append(item[d]['overall'])
 else:
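For context, the changed branch sits inside BUILD_L1_DF, which is only partially visible in this diff. The sketch below is one plausible shape for the surrounding aggregation loop under that assumption: the function name build_rows, the results/fields layout, and the else fallback are invented for illustration, while the MLVU and TempCompass branches are copied from the hunk above.

```python
# Plausible shape of the aggregation around the changed branch (assumption);
# only the MLVU / TempCompass handling is taken from the diff itself.
def build_rows(results: dict, fields: list) -> dict:
    res = {d: [] for d in fields}
    for model, item in results.items():
        for d in fields:
            if d == 'MLVU':
                # Weighted sum from the diff: M-Avg is on a 0-100 scale,
                # G-Avg on a 0-10 scale (hence the * 10), weights 0.84 / 0.16.
                res[d].append(item[d]['M-Avg'] * 0.84 + item[d]['G-Avg'] * 10 * 0.16)
            elif d == 'TempCompass':
                res[d].append(item[d]['overall'])
            else:
                # Assumed fallback: most benchmarks expose a single 'Overall' score.
                res[d].append(item[d]['Overall'])
    return res


rows = build_rows(
    {'model-A': {'MLVU': {'M-Avg': 60.0, 'G-Avg': 4.5},
                 'TempCompass': {'overall': 71.2}}},
    ['MLVU', 'TempCompass'],
)
print(round(rows['MLVU'][0], 2), rows['TempCompass'][0])  # 57.6 71.2
```

Rescaling G-Avg by 10 puts both sub-scores on a 0-100 scale before weighting, so the combined MLVU value stays within 0 to 100, in line with the Avg Score normalization mentioned in the first hunk.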