kennymckormick
committed on
Commit
·
e06d81a
1
Parent(s):
4a9f248
update
Browse files
- app.py +2 -2
- gen_table.py +41 -24
- meta_data.py +5 -6
app.py
CHANGED
@@ -55,7 +55,8 @@ with gr.Blocks() as demo:
|
|
55 |
filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
|
56 |
headers = check_box['essential'] + fields
|
57 |
new_fields = [field for field in fields if field not in filter_list]
|
58 |
-
df =
|
|
|
59 |
df['flag'] = [model_size_flag(x, model_size) for x in df['Parameters (B)']]
|
60 |
df = df[df['flag']]
|
61 |
df.pop('flag')
|
@@ -64,7 +65,6 @@ with gr.Blocks() as demo:
|
|
64 |
df = df[df['flag']]
|
65 |
df.pop('flag')
|
66 |
|
67 |
-
df = generate_table(results, new_fields, df)
|
68 |
comp = gr.components.DataFrame(
|
69 |
value=df[headers],
|
70 |
type='pandas',
|
|
|
55 |
filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
|
56 |
headers = check_box['essential'] + fields
|
57 |
new_fields = [field for field in fields if field not in filter_list]
|
58 |
+
df = generate_table(results, new_fields)
|
59 |
+
|
60 |
df['flag'] = [model_size_flag(x, model_size) for x in df['Parameters (B)']]
|
61 |
df = df[df['flag']]
|
62 |
df.pop('flag')
|
|
|
65 |
df = df[df['flag']]
|
66 |
df.pop('flag')
|
67 |
|
|
|
68 |
comp = gr.components.DataFrame(
|
69 |
value=df[headers],
|
70 |
type='pandas',
|
gen_table.py
CHANGED
@@ -38,7 +38,9 @@ def model_size_flag(sz, FIELDS):
|
|
38 |
return True
|
39 |
if pd.isna(sz):
|
40 |
return False
|
41 |
-
if '<
|
|
|
|
|
42 |
return True
|
43 |
if '10B-20B' in FIELDS and sz >= 10 and sz < 20:
|
44 |
return True
|
@@ -71,10 +73,7 @@ def BUILD_L1_DF(results, fields):
|
|
71 |
type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
|
72 |
check_box['type_map'] = type_map
|
73 |
|
74 |
-
|
75 |
-
df = pd.DataFrame(res)
|
76 |
-
df = df.sort_values('Avg Score')
|
77 |
-
df = df.iloc[::-1]
|
78 |
return df, check_box
|
79 |
|
80 |
|
@@ -131,7 +130,14 @@ def BUILD_L2_DF(results, dataset):
|
|
131 |
return df, check_box
|
132 |
|
133 |
|
134 |
-
def generate_table(results, fields
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
res = defaultdict(list)
|
136 |
for i, m in enumerate(results):
|
137 |
item = results[m]
|
@@ -149,23 +155,34 @@ def generate_table(results, fields, df=None):
|
|
149 |
scores, ranks = [], []
|
150 |
for d in fields:
|
151 |
key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
|
152 |
-
|
153 |
-
if d == '
|
154 |
-
|
155 |
-
|
156 |
-
scores.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
else:
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
171 |
return df
|
|
|
38 |
return True
|
39 |
if pd.isna(sz):
|
40 |
return False
|
41 |
+
if '<4B' in FIELDS and sz < 4:
|
42 |
+
return True
|
43 |
+
if '4B-10B' in FIELDS and sz >= 4 and sz < 10:
|
44 |
return True
|
45 |
if '10B-20B' in FIELDS and sz >= 10 and sz < 20:
|
46 |
return True
|
|
|
73 |
type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
|
74 |
check_box['type_map'] = type_map
|
75 |
|
76 |
+
df = generate_table(results, fields)
|
|
|
|
|
|
|
77 |
return df, check_box
|
78 |
|
79 |
|
|
|
130 |
return df, check_box
|
131 |
|
132 |
|
133 |
+
def generate_table(results, fields):
|
134 |
+
|
135 |
+
def get_mmbench_v11(item):
|
136 |
+
assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
|
137 |
+
val = (item['MMBench_TEST_CN_V11'] + item['MMBench_TEST_EN_V11']) / 2
|
138 |
+
val = float(f'{val:.1f}')
|
139 |
+
return val
|
140 |
+
|
141 |
res = defaultdict(list)
|
142 |
for i, m in enumerate(results):
|
143 |
item = results[m]
|
|
|
155 |
scores, ranks = [], []
|
156 |
for d in fields:
|
157 |
key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
|
158 |
+
# Every Model should have MMBench_V11 results
|
159 |
+
if d == 'MMBench_V11':
|
160 |
+
val = get_mmbench_v11(item)
|
161 |
+
res[d].append(val)
|
162 |
+
scores.append(val)
|
163 |
+
ranks.append(nth_large(val, [get_mmbench_v11(x) for x in results.values()]))
|
164 |
+
elif d in item:
|
165 |
+
res[d].append(item[d][key_name])
|
166 |
+
if d == 'MME':
|
167 |
+
scores.append(item[d][key_name] / 28)
|
168 |
+
elif d == 'OCRBench':
|
169 |
+
scores.append(item[d][key_name] / 10)
|
170 |
+
else:
|
171 |
+
scores.append(item[d][key_name])
|
172 |
+
ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values() if d in x]))
|
173 |
else:
|
174 |
+
res[d].append(None)
|
175 |
+
scores.append(None)
|
176 |
+
ranks.append(None)
|
177 |
+
|
178 |
+
res['Avg Score'].append(round(np.mean(scores), 1) if None not in scores else None)
|
179 |
+
res['Avg Rank'].append(round(np.mean(ranks), 2) if None not in ranks else None)
|
180 |
+
|
181 |
+
df = pd.DataFrame(res)
|
182 |
+
valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
|
183 |
+
valid = valid.sort_values('Avg Score')
|
184 |
+
valid = valid.iloc[::-1]
|
185 |
+
missing = missing.sort_values('MMBench_V11')
|
186 |
+
missing = missing.iloc[::-1]
|
187 |
+
df = pd.concat([valid, missing])
|
188 |
return df
|
meta_data.py
CHANGED
@@ -22,17 +22,16 @@ OpenVLM Leaderboard only includes open-source VLMs or API models that are public
|
|
22 |
# CONSTANTS-FIELDS
|
23 |
META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
|
24 |
MAIN_FIELDS = [
|
25 |
-
'
|
26 |
'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
|
27 |
'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench', 'CCBench', 'RealWorldQA'
|
28 |
]
|
29 |
DEFAULT_BENCH = [
|
30 |
-
'
|
31 |
-
'
|
32 |
-
'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench'
|
33 |
]
|
34 |
-
MMBENCH_FIELDS = ['
|
35 |
-
MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
|
36 |
MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
|
37 |
|
38 |
# The README file for each benchmark
|
|
|
22 |
# CONSTANTS-FIELDS
|
23 |
META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
|
24 |
MAIN_FIELDS = [
|
25 |
+
'MMBench_V11', 'MMStar', 'MME',
|
26 |
'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
|
27 |
'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench', 'CCBench', 'RealWorldQA'
|
28 |
]
|
29 |
DEFAULT_BENCH = [
|
30 |
+
'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
|
31 |
+
'HallusionBench', 'MMVet'
|
|
|
32 |
]
|
33 |
+
MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
|
34 |
+
MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
|
35 |
MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
|
36 |
|
37 |
# The README file for each benchmark
|