kennymckormick committed on
Commit
e06d81a
·
1 Parent(s): 4a9f248
Files changed (3) hide show
  1. app.py +2 -2
  2. gen_table.py +41 -24
  3. meta_data.py +5 -6
app.py CHANGED
@@ -55,7 +55,8 @@ with gr.Blocks() as demo:
55
  filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
56
  headers = check_box['essential'] + fields
57
  new_fields = [field for field in fields if field not in filter_list]
58
- df = cp.deepcopy(table)
 
59
  df['flag'] = [model_size_flag(x, model_size) for x in df['Parameters (B)']]
60
  df = df[df['flag']]
61
  df.pop('flag')
@@ -64,7 +65,6 @@ with gr.Blocks() as demo:
64
  df = df[df['flag']]
65
  df.pop('flag')
66
 
67
- df = generate_table(results, new_fields, df)
68
  comp = gr.components.DataFrame(
69
  value=df[headers],
70
  type='pandas',
 
55
  filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
56
  headers = check_box['essential'] + fields
57
  new_fields = [field for field in fields if field not in filter_list]
58
+ df = generate_table(results, new_fields)
59
+
60
  df['flag'] = [model_size_flag(x, model_size) for x in df['Parameters (B)']]
61
  df = df[df['flag']]
62
  df.pop('flag')
 
65
  df = df[df['flag']]
66
  df.pop('flag')
67
 
 
68
  comp = gr.components.DataFrame(
69
  value=df[headers],
70
  type='pandas',
gen_table.py CHANGED
@@ -38,7 +38,9 @@ def model_size_flag(sz, FIELDS):
38
  return True
39
  if pd.isna(sz):
40
  return False
41
- if '<10B' in FIELDS and sz < 10:
 
 
42
  return True
43
  if '10B-20B' in FIELDS and sz >= 10 and sz < 20:
44
  return True
@@ -71,10 +73,7 @@ def BUILD_L1_DF(results, fields):
71
  type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
72
  check_box['type_map'] = type_map
73
 
74
- res = generate_table(results, fields)
75
- df = pd.DataFrame(res)
76
- df = df.sort_values('Avg Score')
77
- df = df.iloc[::-1]
78
  return df, check_box
79
 
80
 
@@ -131,7 +130,14 @@ def BUILD_L2_DF(results, dataset):
131
  return df, check_box
132
 
133
 
134
- def generate_table(results, fields, df=None):
 
 
 
 
 
 
 
135
  res = defaultdict(list)
136
  for i, m in enumerate(results):
137
  item = results[m]
@@ -149,23 +155,34 @@ def generate_table(results, fields, df=None):
149
  scores, ranks = [], []
150
  for d in fields:
151
  key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
152
- res[d].append(item[d][key_name])
153
- if d == 'MME':
154
- scores.append(item[d][key_name] / 28)
155
- elif d == 'OCRBench':
156
- scores.append(item[d][key_name] / 10)
 
 
 
 
 
 
 
 
 
 
157
  else:
158
- scores.append(item[d][key_name])
159
- ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values()]))
160
- res['Avg Score'].append(round(np.mean(scores), 1))
161
- res['Avg Rank'].append(round(np.mean(ranks), 2))
162
- if df is None:
163
- return res
164
- else:
165
- res = pd.DataFrame(res)
166
- df.set_index('name', inplace=True)
167
- res.set_index('name', inplace=True)
168
- df.update(res)
169
- df = df.sort_values('Avg Score')
170
- df = df.iloc[::-1]
 
171
  return df
 
38
  return True
39
  if pd.isna(sz):
40
  return False
41
+ if '<4B' in FIELDS and sz < 4:
42
+ return True
43
+ if '4B-10B' in FIELDS and sz >= 4 and sz < 10:
44
  return True
45
  if '10B-20B' in FIELDS and sz >= 10 and sz < 20:
46
  return True
 
73
  type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
74
  check_box['type_map'] = type_map
75
 
76
+ df = generate_table(results, fields)
 
 
 
77
  return df, check_box
78
 
79
 
 
130
  return df, check_box
131
 
132
 
133
+ def generate_table(results, fields):
134
+
135
+ def get_mmbench_v11(item):
136
+ assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
137
+ val = (item['MMBench_TEST_CN_V11'] + item['MMBench_TEST_EN_V11']) / 2
138
+ val = float(f'{val:.1f}')
139
+ return val
140
+
141
  res = defaultdict(list)
142
  for i, m in enumerate(results):
143
  item = results[m]
 
155
  scores, ranks = [], []
156
  for d in fields:
157
  key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
158
+ # Every Model should have MMBench_V11 results
159
+ if d == 'MMBench_V11':
160
+ val = get_mmbench_v11(item)
161
+ res[d].append(val)
162
+ scores.append(val)
163
+ ranks.append(nth_large(val, [get_mmbench_v11(x) for x in results.values()]))
164
+ elif d in item:
165
+ res[d].append(item[d][key_name])
166
+ if d == 'MME':
167
+ scores.append(item[d][key_name] / 28)
168
+ elif d == 'OCRBench':
169
+ scores.append(item[d][key_name] / 10)
170
+ else:
171
+ scores.append(item[d][key_name])
172
+ ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values() if d in x]))
173
  else:
174
+ res[d].append(None)
175
+ scores.append(None)
176
+ ranks.append(None)
177
+
178
+ res['Avg Score'].append(round(np.mean(scores), 1) if None not in scores else None)
179
+ res['Avg Rank'].append(round(np.mean(ranks), 2) if None not in ranks else None)
180
+
181
+ df = pd.DataFrame(res)
182
+ valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
183
+ valid = valid.sort_values('Avg Score')
184
+ valid = valid.iloc[::-1]
185
+ missing = missing.sort_values('MMBench_V11')
186
+ missing = missing.iloc[::-1]
187
+ df = pd.concat([valid, missing])
188
  return df
meta_data.py CHANGED
@@ -22,17 +22,16 @@ OpenVLM Leaderboard only includes open-source VLMs or API models that are public
22
  # CONSTANTS-FIELDS
23
  META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
24
  MAIN_FIELDS = [
25
- 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMStar', 'MME',
26
  'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
27
  'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench', 'CCBench', 'RealWorldQA'
28
  ]
29
  DEFAULT_BENCH = [
30
- 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMStar', 'MME',
31
- 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
32
- 'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench'
33
  ]
34
- MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
35
- MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
36
  MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
37
 
38
  # The README file for each benchmark
 
22
  # CONSTANTS-FIELDS
23
  META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
24
  MAIN_FIELDS = [
25
+ 'MMBench_V11', 'MMStar', 'MME',
26
  'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
27
  'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench', 'CCBench', 'RealWorldQA'
28
  ]
29
  DEFAULT_BENCH = [
30
+ 'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
31
+ 'HallusionBench', 'MMVet'
 
32
  ]
33
+ MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
34
+ MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
35
  MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
36
 
37
  # The README file for each benchmark