kennymckormick
committed on
Commit
·
e401827
1
Parent(s):
ef4756a
update
Browse files
- gen_table.py +5 -6
- meta_data.py +8 -2
gen_table.py
CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
|
|
7 |
import numpy as np
|
8 |
import pandas as pd
|
9 |
|
10 |
-
from meta_data import META_FIELDS, URL
|
11 |
|
12 |
|
13 |
def listinstr(lst, s):
|
@@ -62,16 +62,15 @@ def model_type_flag(line, FIELDS):
|
|
62 |
def BUILD_L1_DF(results, fields):
|
63 |
check_box = {}
|
64 |
check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
|
65 |
-
# revise there to set
|
66 |
-
|
67 |
-
check_box['required'] = ['Avg Score', 'Avg Rank'] + defualt_dataset
|
68 |
check_box['avg'] = ['Avg Score', 'Avg Rank']
|
69 |
check_box['all'] = check_box['avg'] + fields
|
70 |
type_map = defaultdict(lambda: 'number')
|
71 |
type_map['Method'] = 'html'
|
72 |
type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
|
73 |
check_box['type_map'] = type_map
|
74 |
-
|
75 |
res = generate_table(results, fields)
|
76 |
df = pd.DataFrame(res)
|
77 |
df = df.sort_values('Avg Score')
|
@@ -169,4 +168,4 @@ def generate_table(results, fields, df=None):
|
|
169 |
df.update(res)
|
170 |
df = df.sort_values('Avg Score')
|
171 |
df = df.iloc[::-1]
|
172 |
-
return df
|
|
|
7 |
import numpy as np
|
8 |
import pandas as pd
|
9 |
|
10 |
+
from meta_data import DEFAULT_BENCH, META_FIELDS, URL
|
11 |
|
12 |
|
13 |
def listinstr(lst, s):
|
|
|
62 |
def BUILD_L1_DF(results, fields):
|
63 |
check_box = {}
|
64 |
check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
|
65 |
+
# revise there to set default dataset
|
66 |
+
check_box['required'] = ['Avg Score', 'Avg Rank'] + DEFAULT_BENCH
|
|
|
67 |
check_box['avg'] = ['Avg Score', 'Avg Rank']
|
68 |
check_box['all'] = check_box['avg'] + fields
|
69 |
type_map = defaultdict(lambda: 'number')
|
70 |
type_map['Method'] = 'html'
|
71 |
type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
|
72 |
check_box['type_map'] = type_map
|
73 |
+
|
74 |
res = generate_table(results, fields)
|
75 |
df = pd.DataFrame(res)
|
76 |
df = df.sort_values('Avg Score')
|
|
|
168 |
df.update(res)
|
169 |
df = df.sort_values('Avg Score')
|
170 |
df = df.iloc[::-1]
|
171 |
+
return df
|
meta_data.py
CHANGED
@@ -24,6 +24,11 @@ MAIN_FIELDS = [
|
|
24 |
'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
|
25 |
'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench', 'CCBench', 'RealWorldQA'
|
26 |
]
|
|
|
|
|
|
|
|
|
|
|
27 |
MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
|
28 |
MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
|
29 |
MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
|
@@ -37,8 +42,9 @@ LEADERBOARD_MD['MAIN'] = f"""
|
|
37 |
- Metrics:
|
38 |
- Avg Score: The average score on all VLM Benchmarks (normalized to 0 - 100, the higher the better).
|
39 |
- Avg Rank: The average rank on all VLM Benchmarks (the lower the better).
|
40 |
-
-
|
41 |
-
|
|
|
42 |
- Detailed evaluation results for each dataset (included or not included in main) are provided in the consequent tabs.
|
43 |
"""
|
44 |
|
|
|
24 |
'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
|
25 |
'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench', 'CCBench', 'RealWorldQA'
|
26 |
]
|
27 |
+
DEFAULT_BENCH = [
|
28 |
+
'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMStar', 'MME',
|
29 |
+
'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
|
30 |
+
'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench'
|
31 |
+
]
|
32 |
MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
|
33 |
MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
|
34 |
MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
|
|
|
42 |
- Metrics:
|
43 |
- Avg Score: The average score on all VLM Benchmarks (normalized to 0 - 100, the higher the better).
|
44 |
- Avg Rank: The average rank on all VLM Benchmarks (the lower the better).
|
45 |
+
- Avg Score & Rank are calculated based on selected benchmark.
|
46 |
+
- By default, we present the overall evaluation results based on {len(DEFAULT_BENCH)} VLM benchmarks, sorted by the descending order of Avg Score.
|
47 |
+
- The following datasets are included in the main results: {', '.join(DEFAULT_BENCH)}.
|
48 |
- Detailed evaluation results for each dataset (included or not included in main) are provided in the consequent tabs.
|
49 |
"""
|
50 |
|