Junming Yang
committed on
[Leaderboard] Support leaderboard dynamic avg score calculation (#193)
Browse files
* add VQA meta_data
* Support leaderboard dynamic avg score calculation
- app.py +3 -0
- gen_table.py +50 -33
- meta_data.py +2 -2
app.py
CHANGED
@@ -52,7 +52,9 @@ with gr.Blocks() as demo:
|
|
52 |
visible=True)
|
53 |
|
54 |
def filter_df(fields, model_size, model_type):
|
|
|
55 |
headers = check_box['essential'] + fields
|
|
|
56 |
df = cp.deepcopy(table)
|
57 |
df['flag'] = [model_size_flag(x, model_size) for x in df['Parameters (B)']]
|
58 |
df = df[df['flag']]
|
@@ -62,6 +64,7 @@ with gr.Blocks() as demo:
|
|
62 |
df = df[df['flag']]
|
63 |
df.pop('flag')
|
64 |
|
|
|
65 |
comp = gr.components.DataFrame(
|
66 |
value=df[headers],
|
67 |
type='pandas',
|
|
|
52 |
visible=True)
|
53 |
|
54 |
def filter_df(fields, model_size, model_type):
|
55 |
+
filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
|
56 |
headers = check_box['essential'] + fields
|
57 |
+
new_fields = [field for field in fields if field not in filter_list]
|
58 |
df = cp.deepcopy(table)
|
59 |
df['flag'] = [model_size_flag(x, model_size) for x in df['Parameters (B)']]
|
60 |
df = df[df['flag']]
|
|
|
64 |
df = df[df['flag']]
|
65 |
df.pop('flag')
|
66 |
|
67 |
+
df = generate_table(results, new_fields, df)
|
68 |
comp = gr.components.DataFrame(
|
69 |
value=df[headers],
|
70 |
type='pandas',
|
gen_table.py
CHANGED
@@ -60,45 +60,22 @@ def model_type_flag(line, FIELDS):
|
|
60 |
|
61 |
|
62 |
def BUILD_L1_DF(results, fields):
|
63 |
-
res = defaultdict(list)
|
64 |
-
for i, m in enumerate(results):
|
65 |
-
item = results[m]
|
66 |
-
meta = item['META']
|
67 |
-
for k in META_FIELDS:
|
68 |
-
if k == 'Parameters (B)':
|
69 |
-
param = meta['Parameters']
|
70 |
-
res[k].append(float(param.replace('B', '')) if param != '' else None)
|
71 |
-
elif k == 'Method':
|
72 |
-
name, url = meta['Method']
|
73 |
-
res[k].append(f'<a href="{url}">{name}</a>')
|
74 |
-
else:
|
75 |
-
res[k].append(meta[k])
|
76 |
-
scores, ranks = [], []
|
77 |
-
for d in fields:
|
78 |
-
key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
|
79 |
-
res[d].append(item[d][key_name])
|
80 |
-
if d == 'MME':
|
81 |
-
scores.append(item[d][key_name] / 28)
|
82 |
-
elif d == 'OCRBench':
|
83 |
-
scores.append(item[d][key_name] / 10)
|
84 |
-
else:
|
85 |
-
scores.append(item[d][key_name])
|
86 |
-
ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values()]))
|
87 |
-
res['Avg Score'].append(round(np.mean(scores), 1))
|
88 |
-
res['Avg Rank'].append(round(np.mean(ranks), 2))
|
89 |
-
|
90 |
-
df = pd.DataFrame(res)
|
91 |
-
df = df.sort_values('Avg Score')
|
92 |
-
df = df.iloc[::-1]
|
93 |
-
|
94 |
check_box = {}
|
95 |
check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
|
96 |
-
|
97 |
-
|
|
|
|
|
|
|
98 |
type_map = defaultdict(lambda: 'number')
|
99 |
type_map['Method'] = 'html'
|
100 |
type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
|
101 |
check_box['type_map'] = type_map
|
|
|
|
|
|
|
|
|
|
|
102 |
return df, check_box
|
103 |
|
104 |
|
@@ -153,3 +130,43 @@ def BUILD_L2_DF(results, dataset):
|
|
153 |
type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
|
154 |
check_box['type_map'] = type_map
|
155 |
return df, check_box
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
|
62 |
def BUILD_L1_DF(results, fields):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
check_box = {}
|
64 |
check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
|
65 |
+
# revise here to set the default dataset
|
66 |
+
defualt_dataset = ['MMBench_TEST_EN', 'MMStar', 'MME', 'MMMU_VAL', 'MathVista', 'OCRBench', 'MMVet']
|
67 |
+
check_box['required'] = ['Avg Score', 'Avg Rank'] + defualt_dataset
|
68 |
+
check_box['avg'] = ['Avg Score', 'Avg Rank']
|
69 |
+
check_box['all'] = check_box['avg'] + fields
|
70 |
type_map = defaultdict(lambda: 'number')
|
71 |
type_map['Method'] = 'html'
|
72 |
type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
|
73 |
check_box['type_map'] = type_map
|
74 |
+
|
75 |
+
res = generate_table(results, fields)
|
76 |
+
df = pd.DataFrame(res)
|
77 |
+
df = df.sort_values('Avg Score')
|
78 |
+
df = df.iloc[::-1]
|
79 |
return df, check_box
|
80 |
|
81 |
|
|
|
130 |
type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
|
131 |
check_box['type_map'] = type_map
|
132 |
return df, check_box
|
133 |
+
|
134 |
+
|
135 |
+
def generate_table(results, fields, df=None):
|
136 |
+
res = defaultdict(list)
|
137 |
+
for i, m in enumerate(results):
|
138 |
+
item = results[m]
|
139 |
+
meta = item['META']
|
140 |
+
for k in META_FIELDS:
|
141 |
+
if k == 'Parameters (B)':
|
142 |
+
param = meta['Parameters']
|
143 |
+
res[k].append(float(param.replace('B', '')) if param != '' else None)
|
144 |
+
elif k == 'Method':
|
145 |
+
name, url = meta['Method']
|
146 |
+
res[k].append(f'<a href="{url}">{name}</a>')
|
147 |
+
res['name'].append(name)
|
148 |
+
else:
|
149 |
+
res[k].append(meta[k])
|
150 |
+
scores, ranks = [], []
|
151 |
+
for d in fields:
|
152 |
+
key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
|
153 |
+
res[d].append(item[d][key_name])
|
154 |
+
if d == 'MME':
|
155 |
+
scores.append(item[d][key_name] / 28)
|
156 |
+
elif d == 'OCRBench':
|
157 |
+
scores.append(item[d][key_name] / 10)
|
158 |
+
else:
|
159 |
+
scores.append(item[d][key_name])
|
160 |
+
ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values()]))
|
161 |
+
res['Avg Score'].append(round(np.mean(scores), 1))
|
162 |
+
res['Avg Rank'].append(round(np.mean(ranks), 2))
|
163 |
+
if df is None:
|
164 |
+
return res
|
165 |
+
else:
|
166 |
+
res = pd.DataFrame(res)
|
167 |
+
df.set_index('name', inplace=True)
|
168 |
+
res.set_index('name', inplace=True)
|
169 |
+
df.update(res)
|
170 |
+
df = df.sort_values('Avg Score')
|
171 |
+
df = df.iloc[::-1]
|
172 |
+
return df
|
meta_data.py
CHANGED
@@ -21,8 +21,8 @@ This leaderboard was last updated: {}.
|
|
21 |
META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
|
22 |
MAIN_FIELDS = [
|
23 |
'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMStar', 'MME',
|
24 |
-
'MMMU_VAL', 'MathVista', '
|
25 |
-
'
|
26 |
]
|
27 |
MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
|
28 |
MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
|
|
|
21 |
META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
|
22 |
MAIN_FIELDS = [
|
23 |
'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMStar', 'MME',
|
24 |
+
'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
|
25 |
+
'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench'
|
26 |
]
|
27 |
MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
|
28 |
MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
|