kennymckormick committed on
Commit
e401827
·
1 Parent(s): ef4756a
Files changed (2) hide show
  1. gen_table.py +5 -6
  2. meta_data.py +8 -2
gen_table.py CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
7
  import numpy as np
8
  import pandas as pd
9
 
10
- from meta_data import META_FIELDS, URL
11
 
12
 
13
  def listinstr(lst, s):
@@ -62,16 +62,15 @@ def model_type_flag(line, FIELDS):
62
  def BUILD_L1_DF(results, fields):
63
  check_box = {}
64
  check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
65
- # revise there to set defualt dataset
66
- defualt_dataset = ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMStar', 'MME', 'MMMU_VAL', 'MathVista', 'OCRBench', 'MMVet', 'AI2D', 'HallusionBench', 'LLaVABench', 'SEEDBench_IMG']
67
- check_box['required'] = ['Avg Score', 'Avg Rank'] + defualt_dataset
68
  check_box['avg'] = ['Avg Score', 'Avg Rank']
69
  check_box['all'] = check_box['avg'] + fields
70
  type_map = defaultdict(lambda: 'number')
71
  type_map['Method'] = 'html'
72
  type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
73
  check_box['type_map'] = type_map
74
-
75
  res = generate_table(results, fields)
76
  df = pd.DataFrame(res)
77
  df = df.sort_values('Avg Score')
@@ -169,4 +168,4 @@ def generate_table(results, fields, df=None):
169
  df.update(res)
170
  df = df.sort_values('Avg Score')
171
  df = df.iloc[::-1]
172
- return df
 
7
  import numpy as np
8
  import pandas as pd
9
 
10
+ from meta_data import DEFAULT_BENCH, META_FIELDS, URL
11
 
12
 
13
  def listinstr(lst, s):
 
62
  def BUILD_L1_DF(results, fields):
63
  check_box = {}
64
  check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
65
+ # revise there to set default dataset
66
+ check_box['required'] = ['Avg Score', 'Avg Rank'] + DEFAULT_BENCH
 
67
  check_box['avg'] = ['Avg Score', 'Avg Rank']
68
  check_box['all'] = check_box['avg'] + fields
69
  type_map = defaultdict(lambda: 'number')
70
  type_map['Method'] = 'html'
71
  type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
72
  check_box['type_map'] = type_map
73
+
74
  res = generate_table(results, fields)
75
  df = pd.DataFrame(res)
76
  df = df.sort_values('Avg Score')
 
168
  df.update(res)
169
  df = df.sort_values('Avg Score')
170
  df = df.iloc[::-1]
171
+ return df
meta_data.py CHANGED
@@ -24,6 +24,11 @@ MAIN_FIELDS = [
24
  'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
25
  'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench', 'CCBench', 'RealWorldQA'
26
  ]
 
 
 
 
 
27
  MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
28
  MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
29
  MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
@@ -37,8 +42,9 @@ LEADERBOARD_MD['MAIN'] = f"""
37
  - Metrics:
38
  - Avg Score: The average score on all VLM Benchmarks (normalized to 0 - 100, the higher the better).
39
  - Avg Rank: The average rank on all VLM Benchmarks (the lower the better).
40
- - The overall evaluation results on {len(MAIN_FIELDS)} VLM benchmarks, sorted by the descending order of Avg Score.
41
- - The following datasets are included in the main results: {', '.join(MAIN_FIELDS)}.
 
42
  - Detailed evaluation results for each dataset (included or not included in main) are provided in the consequent tabs.
43
  """
44
 
 
24
  'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
25
  'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench', 'CCBench', 'RealWorldQA'
26
  ]
27
+ DEFAULT_BENCH = [
28
+ 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMStar', 'MME',
29
+ 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
30
+ 'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench'
31
+ ]
32
  MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
33
  MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
34
  MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
 
42
  - Metrics:
43
  - Avg Score: The average score on all VLM Benchmarks (normalized to 0 - 100, the higher the better).
44
  - Avg Rank: The average rank on all VLM Benchmarks (the lower the better).
45
+ - Avg Score & Rank are calculated based on selected benchmark.
46
+ - By default, we present the overall evaluation results based on {len(DEFAULT_BENCH)} VLM benchmarks, sorted by the descending order of Avg Score.
47
+ - The following datasets are included in the main results: {', '.join(DEFAULT_BENCH)}.
48
  - Detailed evaluation results for each dataset (included or not included in main) are provided in the consequent tabs.
49
  """
50