nebulae09 committed
Commit 62c094e
Parent: 5ab1442
Files changed (1)
1. lb_info.py  +17 -9

lb_info.py CHANGED
@@ -48,14 +48,21 @@ LEADERBOARD_MD['MAIN'] = """
 
 - Avg Score: The average score on all video understanding Benchmarks (normalized to 0 - 100, the higher the better).
 - Avg Rank: The average rank on all video understanding Benchmarks (the lower the better).
-- The overall evaluation results on 3 video understanding benchmarks, sorted by the ascending order of Avg Rank.
+- The overall evaluation results on 5 video understanding benchmarks, sorted by the ascending order of Avg Rank.
+- Tip: The total score of MLVU is calculated as a weighted sum of M-Avg and G-Avg, with weights based on the proportion of the number of questions in each category relative to the total. The maximum possible score is 100.
 """
 
 LEADERBOARD_MD['Video-MME (w/o subs)'] = """
 ## Video-MME (w/o subs) Evaluation Results
 
 - We give the total scores for the three video lengths (short, medium and long), as well as the total scores for each task type.
-- Video-MME (w subs) will update as evaluation is completed.
+"""
+
+LEADERBOARD_MD['MLVU'] = """
+## MLVU Evaluation Results
+
+- The ranking here is determined by sorting the M-Avg scores in descending order.
+- The number of evaluation questions used here is consistent with the official Hugging Face benchmark.
 """
 
 # LEADERBOARD_MD['MVBench'] = """
@@ -139,13 +146,14 @@ def BUILD_L1_DF(results, fields):
             # elif d == 'TempCompass':
             #     item[d]['Overall'] = item[d]['overall']
             if d == 'MLVU':
-                res[d].append(
-                    f'M-Avg: {item[d]["M-Avg"]}, G-Avg: {item[d]["G-Avg"]}'
-                    # {
-                    #     'M-Avg': item[d]['M-Avg'],
-                    #     'G-Avg': item[d]['G-Avg']
-                    # }
-                )
+                # res[d].append(
+                #     f'M-Avg: {item[d]["M-Avg"]}, G-Avg: {item[d]["G-Avg"]}'
+                #     # {
+                #     #     'M-Avg': item[d]['M-Avg'],
+                #     #     'G-Avg': item[d]['G-Avg']
+                #     # }
+                # )
+                res[d].append(item[d]['M-Avg'] * 0.84 + item[d]['G-Avg'] * 10 * 0.16)
             elif d == 'TempCompass':
                 res[d].append(item[d]['overall'])
             else:
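
For reference, a minimal sketch of the aggregation the new append line performs, assuming (as the x10 rescaling suggests) that M-Avg is reported on a 0-100 scale and G-Avg on a 0-10 scale; the 0.84 / 0.16 weights are the question-count proportions mentioned in the leaderboard tip. The helper name `mlvu_total` is illustrative, not part of the repo.

```python
def mlvu_total(m_avg: float, g_avg: float) -> float:
    """Weighted MLVU score on a 0-100 scale (sketch of the commit's formula).

    m_avg: multiple-choice average, assumed 0-100.
    g_avg: generation average, assumed 0-10, hence the x10 rescaling.
    0.84 / 0.16 are the weights used in the new append line.
    """
    return m_avg * 0.84 + g_avg * 10 * 0.16


# Example: 62.5 * 0.84 + (4.2 * 10) * 0.16 = 52.5 + 6.72 = 59.22
print(mlvu_total(62.5, 4.2))  # 59.22
```

Rescaling G-Avg before weighting keeps the combined score on the same 0-100 scale as M-Avg, which is why the leaderboard tip can state a maximum possible score of 100.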