Spaces:
Running
Running
MINGYISU
commited on
Commit
·
0c9e3fb
1
Parent(s):
498bdf4
init commit
Browse files- .gitignore +13 -0
- .pre-commit-config.yaml +53 -0
- app.py +102 -196
- index.html +0 -19
- results.csv +15 -0
- src/about.py +1 -1
- src/display/css_html_js.py +1 -0
- style.css +0 -28
- utils.py +183 -0
.gitignore
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
auto_evals/
|
2 |
+
venv/
|
3 |
+
__pycache__/
|
4 |
+
.env
|
5 |
+
.ipynb_checkpoints
|
6 |
+
*ipynb
|
7 |
+
.vscode/
|
8 |
+
|
9 |
+
eval-queue/
|
10 |
+
eval-results/
|
11 |
+
eval-queue-bk/
|
12 |
+
eval-results-bk/
|
13 |
+
logs/
|
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
default_language_version:
|
16 |
+
python: python3
|
17 |
+
|
18 |
+
ci:
|
19 |
+
autofix_prs: true
|
20 |
+
autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
|
21 |
+
autoupdate_schedule: quarterly
|
22 |
+
|
23 |
+
repos:
|
24 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
25 |
+
rev: v4.3.0
|
26 |
+
hooks:
|
27 |
+
- id: check-yaml
|
28 |
+
- id: check-case-conflict
|
29 |
+
- id: detect-private-key
|
30 |
+
- id: check-added-large-files
|
31 |
+
args: ['--maxkb=1000']
|
32 |
+
- id: requirements-txt-fixer
|
33 |
+
- id: end-of-file-fixer
|
34 |
+
- id: trailing-whitespace
|
35 |
+
|
36 |
+
- repo: https://github.com/PyCQA/isort
|
37 |
+
rev: 5.12.0
|
38 |
+
hooks:
|
39 |
+
- id: isort
|
40 |
+
name: Format imports
|
41 |
+
|
42 |
+
- repo: https://github.com/psf/black
|
43 |
+
rev: 22.12.0
|
44 |
+
hooks:
|
45 |
+
- id: black
|
46 |
+
name: Format code
|
47 |
+
additional_dependencies: ['click==8.0.2']
|
48 |
+
|
49 |
+
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
50 |
+
# Ruff version.
|
51 |
+
rev: 'v0.0.267'
|
52 |
+
hooks:
|
53 |
+
- id: ruff
|
app.py
CHANGED
@@ -1,204 +1,110 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
)
|
15 |
-
|
16 |
-
|
17 |
-
BENCHMARK_COLS,
|
18 |
-
COLS,
|
19 |
-
EVAL_COLS,
|
20 |
-
EVAL_TYPES,
|
21 |
-
AutoEvalColumn,
|
22 |
-
ModelType,
|
23 |
-
fields,
|
24 |
-
WeightType,
|
25 |
-
Precision
|
26 |
-
)
|
27 |
-
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
28 |
-
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
29 |
-
from src.submission.submit import add_new_eval
|
30 |
-
|
31 |
-
|
32 |
-
def restart_space():
|
33 |
-
API.restart_space(repo_id=REPO_ID)
|
34 |
-
|
35 |
-
### Space initialisation
|
36 |
-
try:
|
37 |
-
print(EVAL_REQUESTS_PATH)
|
38 |
-
snapshot_download(
|
39 |
-
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
40 |
-
)
|
41 |
-
except Exception:
|
42 |
-
restart_space()
|
43 |
-
try:
|
44 |
-
print(EVAL_RESULTS_PATH)
|
45 |
-
snapshot_download(
|
46 |
-
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
47 |
-
)
|
48 |
-
except Exception:
|
49 |
-
restart_space()
|
50 |
-
|
51 |
-
|
52 |
-
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
53 |
-
|
54 |
-
(
|
55 |
-
finished_eval_queue_df,
|
56 |
-
running_eval_queue_df,
|
57 |
-
pending_eval_queue_df,
|
58 |
-
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
59 |
-
|
60 |
-
def init_leaderboard(dataframe):
|
61 |
-
if dataframe is None or dataframe.empty:
|
62 |
-
raise ValueError("Leaderboard DataFrame is empty or None.")
|
63 |
-
return Leaderboard(
|
64 |
-
value=dataframe,
|
65 |
-
datatype=[c.type for c in fields(AutoEvalColumn)],
|
66 |
-
select_columns=SelectColumns(
|
67 |
-
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
68 |
-
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
|
69 |
-
label="Select Columns to Display:",
|
70 |
-
),
|
71 |
-
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
|
72 |
-
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
73 |
-
filter_columns=[
|
74 |
-
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
75 |
-
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
76 |
-
ColumnFilter(
|
77 |
-
AutoEvalColumn.params.name,
|
78 |
-
type="slider",
|
79 |
-
min=0.01,
|
80 |
-
max=150,
|
81 |
-
label="Select the number of parameters (B)",
|
82 |
-
),
|
83 |
-
ColumnFilter(
|
84 |
-
AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
|
85 |
-
),
|
86 |
-
],
|
87 |
-
bool_checkboxgroup_label="Hide models",
|
88 |
-
interactive=False,
|
89 |
-
)
|
90 |
-
|
91 |
-
|
92 |
-
demo = gr.Blocks(css=custom_css)
|
93 |
-
with demo:
|
94 |
-
gr.HTML(TITLE)
|
95 |
-
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
96 |
-
|
97 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
102 |
-
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
103 |
-
|
104 |
-
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
105 |
-
with gr.Column():
|
106 |
-
with gr.Row():
|
107 |
-
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
108 |
-
|
109 |
-
with gr.Column():
|
110 |
-
with gr.Accordion(
|
111 |
-
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
112 |
-
open=False,
|
113 |
-
):
|
114 |
-
with gr.Row():
|
115 |
-
finished_eval_table = gr.components.Dataframe(
|
116 |
-
value=finished_eval_queue_df,
|
117 |
-
headers=EVAL_COLS,
|
118 |
-
datatype=EVAL_TYPES,
|
119 |
-
row_count=5,
|
120 |
-
)
|
121 |
-
with gr.Accordion(
|
122 |
-
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
123 |
-
open=False,
|
124 |
-
):
|
125 |
-
with gr.Row():
|
126 |
-
running_eval_table = gr.components.Dataframe(
|
127 |
-
value=running_eval_queue_df,
|
128 |
-
headers=EVAL_COLS,
|
129 |
-
datatype=EVAL_TYPES,
|
130 |
-
row_count=5,
|
131 |
-
)
|
132 |
-
|
133 |
-
with gr.Accordion(
|
134 |
-
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
135 |
-
open=False,
|
136 |
-
):
|
137 |
-
with gr.Row():
|
138 |
-
pending_eval_table = gr.components.Dataframe(
|
139 |
-
value=pending_eval_queue_df,
|
140 |
-
headers=EVAL_COLS,
|
141 |
-
datatype=EVAL_TYPES,
|
142 |
-
row_count=5,
|
143 |
-
)
|
144 |
with gr.Row():
|
145 |
-
gr.
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
model_type = gr.Dropdown(
|
152 |
-
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
153 |
-
label="Model type",
|
154 |
-
multiselect=False,
|
155 |
-
value=None,
|
156 |
-
interactive=True,
|
157 |
)
|
|
|
158 |
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
choices=[i.value.name for i in WeightType],
|
169 |
-
label="Weights type",
|
170 |
-
multiselect=False,
|
171 |
-
value="Original",
|
172 |
-
interactive=True,
|
173 |
-
)
|
174 |
-
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
175 |
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
)
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
|
201 |
-
|
202 |
-
scheduler.add_job(restart_space, "interval", seconds=1800)
|
203 |
-
scheduler.start()
|
204 |
-
demo.queue(default_concurrency_limit=40).launch()
|
|
|
1 |
+
from utils import *
|
2 |
+
|
3 |
+
global data_component
|
4 |
+
|
5 |
+
def update_table(query, min_size, max_size, selected_subjects=None):
    """Rebuild the leaderboard table for the current search/filter controls.

    Reloads the data with ``get_df``, narrows the rows with
    ``search_and_filter_models`` and, when at least one subject is selected,
    keeps only the fixed columns followed by the selected subject columns.
    """
    rows = search_and_filter_models(get_df(), query, min_size, max_size)
    if not selected_subjects:
        # Nothing selected (None or empty): return the filtered rows unchanged.
        return rows
    # NOTE(review): 'IND' and 'OOD' are not part of the fixed columns, so they
    # disappear as soon as any subject is selected -- confirm this is intended.
    fixed_columns = ['Models', 'Model Size(B)', 'Data Source', 'Overall']
    return rows[fixed_columns + selected_subjects]
|
13 |
+
|
14 |
+
with gr.Blocks() as block:
|
15 |
+
gr.Markdown(LEADERBOARD_INTRODUCTION)
|
16 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
18 |
+
# Table 1
|
19 |
+
with gr.TabItem("📊 MMLU-Pro", elem_id="qa-tab-table1", id=1):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
with gr.Row():
|
21 |
+
with gr.Accordion("Citation", open=False):
|
22 |
+
citation_button = gr.Textbox(
|
23 |
+
value=CITATION_BUTTON_TEXT,
|
24 |
+
label=CITATION_BUTTON_LABEL,
|
25 |
+
elem_id="citation-button",
|
26 |
+
lines=10,
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
)
|
28 |
+
gr.Markdown(TABLE_INTRODUCTION)
|
29 |
|
30 |
+
with gr.Row():
|
31 |
+
search_bar = gr.Textbox(
|
32 |
+
placeholder="Search models...",
|
33 |
+
show_label=False,
|
34 |
+
elem_id="search-bar"
|
35 |
+
)
|
36 |
+
|
37 |
+
df = get_df()
|
38 |
+
min_size, max_size = get_size_range(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
+
with gr.Row():
|
41 |
+
min_size_slider = gr.Slider(
|
42 |
+
minimum=min_size,
|
43 |
+
maximum=max_size,
|
44 |
+
value=min_size,
|
45 |
+
step=0.1,
|
46 |
+
label="Minimum number of parameters (B)",
|
47 |
+
)
|
48 |
+
max_size_slider = gr.Slider(
|
49 |
+
minimum=min_size,
|
50 |
+
maximum=max_size,
|
51 |
+
value=max_size,
|
52 |
+
step=0.1,
|
53 |
+
label="Maximum number of parameters (B)",
|
54 |
+
)
|
55 |
+
|
56 |
+
subject_choices = [col for col in COLUMN_NAMES if col not in ['Models', 'Model Size(B)', 'Data Source', 'Overall', 'IND', 'OOD']]
|
57 |
+
with gr.Row():
|
58 |
+
subjects_select = gr.CheckboxGroup(
|
59 |
+
choices=subject_choices,
|
60 |
+
value=subject_choices,
|
61 |
+
label="Select Subjects to Display",
|
62 |
+
elem_id="subjects-select"
|
63 |
+
)
|
64 |
+
|
65 |
+
data_component = gr.components.Dataframe(
|
66 |
+
value=df[COLUMN_NAMES],
|
67 |
+
headers=COLUMN_NAMES,
|
68 |
+
type="pandas",
|
69 |
+
datatype=DATA_TITLE_TYPE,
|
70 |
+
interactive=False,
|
71 |
+
visible=True,
|
72 |
)
|
73 |
+
|
74 |
+
refresh_button = gr.Button("Refresh")
|
75 |
+
|
76 |
+
def update_with_subjects(*args):
|
77 |
+
return update_table(*args)
|
78 |
+
|
79 |
+
search_bar.change(
|
80 |
+
fn=update_with_subjects,
|
81 |
+
inputs=[search_bar, min_size_slider, max_size_slider, subjects_select],
|
82 |
+
outputs=data_component
|
83 |
+
)
|
84 |
+
min_size_slider.change(
|
85 |
+
fn=update_with_subjects,
|
86 |
+
inputs=[search_bar, min_size_slider, max_size_slider, subjects_select],
|
87 |
+
outputs=data_component
|
88 |
+
)
|
89 |
+
max_size_slider.change(
|
90 |
+
fn=update_with_subjects,
|
91 |
+
inputs=[search_bar, min_size_slider, max_size_slider, subjects_select],
|
92 |
+
outputs=data_component
|
93 |
+
)
|
94 |
+
subjects_select.change(
|
95 |
+
fn=update_with_subjects,
|
96 |
+
inputs=[search_bar, min_size_slider, max_size_slider, subjects_select],
|
97 |
+
outputs=data_component
|
98 |
)
|
99 |
+
refresh_button.click(fn=refresh_data, outputs=data_component)
|
100 |
+
|
101 |
+
# table 2
|
102 |
+
with gr.TabItem("📝 About", elem_id="qa-tab-table2", id=2):
|
103 |
+
gr.Markdown(LEADERBOARD_INFO, elem_classes="markdown-text")
|
104 |
+
|
105 |
+
# table 3
|
106 |
+
with gr.TabItem("🚀 Submit here! ", elem_id="submit-tab", id=3):
|
107 |
+
with gr.Row():
|
108 |
+
gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
|
109 |
|
110 |
+
block.launch(share=True)
|
|
|
|
|
|
index.html
DELETED
@@ -1,19 +0,0 @@
|
|
1 |
-
<!doctype html>
|
2 |
-
<html>
|
3 |
-
<head>
|
4 |
-
<meta charset="utf-8" />
|
5 |
-
<meta name="viewport" content="width=device-width" />
|
6 |
-
<title>My static Space</title>
|
7 |
-
<link rel="stylesheet" href="style.css" />
|
8 |
-
</head>
|
9 |
-
<body>
|
10 |
-
<div class="card">
|
11 |
-
<h1>Welcome to your static Space!</h1>
|
12 |
-
<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
|
13 |
-
<p>
|
14 |
-
Also don't forget to check the
|
15 |
-
<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
|
16 |
-
</p>
|
17 |
-
</div>
|
18 |
-
</body>
|
19 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results.csv
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Models,Model Size(B),Data Source,Overall,IND,OOD,Classification,VQA,Retrieval,Grounding
|
2 |
+
CLIP,unk,unk,37.8,37.1,38.7,42.8,9.1,53.0,51.8
|
3 |
+
BLIP2,unk,unk,25.2,25.3,25.1,27.0,4.2,33.9,47.0
|
4 |
+
SigLIP,unk,unk,34.8,32.3,38.0,40.3,8.4,31.6,59.5
|
5 |
+
OpenCLIP,unk,unk,39.7,39.3,40.2,47.8,10.9,52.3,53.3
|
6 |
+
UniIR (BLIP_FF),unk,unk,42.8,44.7,40.4,42.1,15.0,60.1,62.2
|
7 |
+
UniIR (CLIP_SF),unk,unk,44.7,47.1,41.7,44.3,16.2,61.8,65.3
|
8 |
+
E5-V,unk,unk,13.3,14.9,11.5,21.8,4.9,11.5,19.0
|
9 |
+
Magiclens,unk,unk,27.8,31.0,23.7,38.8,8.3,35.4,26.0
|
10 |
+
CLIP-FFT,unk,TIGER-Lab,45.4,47.6,42.8,55.2,19.7,53.2,62.2
|
11 |
+
OpenCLIP-FFT,unk,unk,47.2,50.5,43.1,56.0,21.9,55.4,64.1
|
12 |
+
VLM2Vec (Phi-3.5-V-FFT),unk,TIGER-Lab,55.9,62.8,47.4,52.8,50.3,57.8,72.3
|
13 |
+
VLM2Vec (Phi-3.5-V-LoRA),unk,TIGER-Lab,60.1,66.5,52.0,54.8,54.9,62.3,79.5
|
14 |
+
VLM2Vec (LLaVA-1.6-LoRA-LowRes),unk,TIGER-Lab,55.0,61.0,47.5,54.7,50.3,56.2,64.0
|
15 |
+
VLM2Vec (LLaVA-1.6-LoRA-HighRes),unk,TIGER-Lab,62.9,67.5,57.1,61.2,49.9,67.4,86.1
|
src/about.py
CHANGED
@@ -21,7 +21,7 @@ NUM_FEWSHOT = 0 # Change with your few shot
|
|
21 |
|
22 |
|
23 |
# Your leaderboard name
|
24 |
-
TITLE = """<h1 align="center" id="space-title">
|
25 |
|
26 |
# What does your leaderboard evaluate?
|
27 |
INTRODUCTION_TEXT = """
|
|
|
21 |
|
22 |
|
23 |
# Your leaderboard name
|
24 |
+
TITLE = """<h1 align="center" id="space-title">MMEB Leaderboard</h1>"""
|
25 |
|
26 |
# What does your leaderboard evaluate?
|
27 |
INTRODUCTION_TEXT = """
|
src/display/css_html_js.py
CHANGED
@@ -48,6 +48,7 @@ custom_css = """
|
|
48 |
|
49 |
.tab-buttons button {
|
50 |
font-size: 20px;
|
|
|
51 |
}
|
52 |
|
53 |
#scale-logo {
|
|
|
48 |
|
49 |
.tab-buttons button {
|
50 |
font-size: 20px;
|
51 |
+
height: 1500px;
|
52 |
}
|
53 |
|
54 |
#scale-logo {
|
style.css
DELETED
@@ -1,28 +0,0 @@
|
|
1 |
-
body {
|
2 |
-
padding: 2rem;
|
3 |
-
font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
|
4 |
-
}
|
5 |
-
|
6 |
-
h1 {
|
7 |
-
font-size: 16px;
|
8 |
-
margin-top: 0;
|
9 |
-
}
|
10 |
-
|
11 |
-
p {
|
12 |
-
color: rgb(107, 114, 128);
|
13 |
-
font-size: 15px;
|
14 |
-
margin-bottom: 10px;
|
15 |
-
margin-top: 5px;
|
16 |
-
}
|
17 |
-
|
18 |
-
.card {
|
19 |
-
max-width: 620px;
|
20 |
-
margin: 0 auto;
|
21 |
-
padding: 16px;
|
22 |
-
border: 1px solid lightgray;
|
23 |
-
border-radius: 16px;
|
24 |
-
}
|
25 |
-
|
26 |
-
.card p:last-child {
|
27 |
-
margin-bottom: 0;
|
28 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils.py
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import gradio as gr
|
3 |
+
import csv
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
import shutil
|
7 |
+
from huggingface_hub import Repository
|
8 |
+
|
9 |
+
HF_TOKEN = os.environ.get("HF_TOKEN")
|
10 |
+
|
11 |
+
SUBJECTS = ["Classification", "VQA", "Retrieval", "Grounding"]
|
12 |
+
|
13 |
+
MODEL_INFO = [
|
14 |
+
"Models", "Model Size(B)", "Data Source",
|
15 |
+
"Overall", "IND", "OOD",
|
16 |
+
"Classification", "VQA", "Retrieval", "Grounding"
|
17 |
+
]
|
18 |
+
|
19 |
+
DATA_TITLE_TYPE = ['markdown', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
|
20 |
+
|
21 |
+
# TODO: submission process not implemented yet
|
22 |
+
SUBMISSION_NAME = ""
|
23 |
+
SUBMISSION_URL = ""
|
24 |
+
CSV_DIR = "results.csv" # TODO: Temporary file, to be updated with the actual file
|
25 |
+
|
26 |
+
COLUMN_NAMES = MODEL_INFO
|
27 |
+
|
28 |
+
LEADERBOARD_INTRODUCTION = """# MMEB Leaderboard
|
29 |
+
|
30 |
+
## Introduction
|
31 |
+
We introduce MMEB, a benchmark for multimodal evaluation of models. The benchmark consists of four tasks: Classification, VQA, Retrieval, and Grounding. Models are evaluated based on 36 datasets.
|
32 |
+
|
33 |
+
|
34 |
+
"""
|
35 |
+
|
36 |
+
TABLE_INTRODUCTION = """"""
|
37 |
+
|
38 |
+
LEADERBOARD_INFO = """
|
39 |
+
## Dataset Summary
|
40 |
+
"""
|
41 |
+
|
42 |
+
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
43 |
+
CITATION_BUTTON_TEXT = """"""
|
44 |
+
|
45 |
+
SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
|
46 |
+
|
47 |
+
## ⚠ Please note that you need to submit the JSON file with the following format:
|
48 |
+
```json
|
49 |
+
[
|
50 |
+
{
|
51 |
+
"question_id": 123,
|
52 |
+
"question": "abc",
|
53 |
+
"options": ["abc", "xyz", ...],
|
54 |
+
"answer": "ABC",
|
55 |
+
"answer_index": 1,
|
56 |
+
"category": "abc",
|
57 |
+
"pred": "B",
|
58 |
+
"model_outputs": ""
|
59 |
+
}, ...
|
60 |
+
]
|
61 |
+
```
|
62 |
+
...
|
63 |
+
"""
|
64 |
+
|
65 |
+
def get_df():
    """Load the leaderboard CSV and return it ordered by overall score.

    Reads ``CSV_DIR``, passes every 'Model Size(B)' value through
    ``process_model_size`` and sorts the rows by 'Overall', highest first.
    """
    # TODO: Update this after the hf dataset has been created!
    # repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
    # repo.git_pull()
    table = pd.read_csv(CSV_DIR)
    table['Model Size(B)'] = table['Model Size(B)'].apply(process_model_size)
    return table.sort_values(by=['Overall'], ascending=False)
|
73 |
+
|
74 |
+
|
75 |
+
def add_new_eval(
    input_file,
):
    """Append a submitted result to the leaderboard CSV and push the repo.

    Args:
        input_file: JSON text (``str`` or ``bytes``) describing one model. It
            must contain "Model", "Overall" and one score per entry of
            ``SUBJECTS``. The keys "Model Size(B)", "Data Source", "IND" and
            "OOD" are optional and are used when present.

    Returns:
        A short status message: an error for a missing file, a notice when the
        model is already listed, or a success message after the push.

    Raises:
        KeyError: if a required key is missing from the submitted JSON.
    """
    if input_file is None:
        return "Error! Empty file!"

    upload_data = json.loads(input_file)
    print("upload_data:\n", upload_data)

    # Build the row in the exact column order of the CSV header (MODEL_INFO).
    # Writing only [model, overall, subjects...] would shift the scores under
    # the wrong columns when the file is read back.
    row_values = {
        "Models": f'{upload_data["Model"]}',
        "Model Size(B)": upload_data.get("Model Size(B)", "unk"),
        "Data Source": upload_data.get("Data Source", "unk"),
        "Overall": upload_data["Overall"],
        "IND": upload_data.get("IND", ""),
        "OOD": upload_data.get("OOD", ""),
    }
    for subject in SUBJECTS:
        row_values[subject] = upload_data[subject]
    data_row = [row_values.get(column, "") for column in MODEL_INFO]
    print("data_row:\n", data_row)

    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                 use_auth_token=HF_TOKEN, repo_type="dataset")
    submission_repo.git_pull()

    # Collect the names already present; `if row` skips blank lines, which
    # csv.reader yields as empty lists and would otherwise break row[0].
    with open(CSV_DIR, mode='r', newline='', encoding='utf-8') as file:
        already_submitted = {row[0] for row in csv.reader(file, delimiter=',') if row}

    if data_row[0] in already_submitted:
        print('The entry already exists')
        return 'The entry already exists'

    with open(CSV_DIR, mode='a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow(data_row)

    submission_repo.push_to_hub()
    print('Submission Successful')
    return 'Submission Successful'
|
106 |
+
|
107 |
+
def refresh_data():
    """Reload the leaderboard via ``get_df`` and return its ``COLUMN_NAMES`` columns."""
    return get_df()[COLUMN_NAMES]
|
110 |
+
|
111 |
+
|
112 |
+
def search_and_filter_models(df, query, min_size, max_size):
    """Filter the leaderboard by model name and parameter-count range.

    Args:
        df: DataFrame with at least the 'Models' and 'Model Size(B)' columns
            plus every column listed in ``COLUMN_NAMES``.
        query: Text to look for in 'Models' (case-insensitive, literal
            substring). Empty or ``None`` disables the name filter.
        min_size: Lower bound (inclusive) for 'Model Size(B)'.
        max_size: Upper bound (inclusive) for 'Model Size(B)'.

    Returns:
        A new DataFrame restricted to the matching rows and to
        ``COLUMN_NAMES``; the input frame is not modified.
    """
    filtered_df = df.copy()

    if query:
        # regex=False: treat the query as plain text. Model names contain
        # regex metacharacters such as "(" and ".", and a partial query like
        # "(Phi" would otherwise raise re.error.
        name_mask = filtered_df['Models'].str.contains(
            query, case=False, na=False, regex=False
        )
        filtered_df = filtered_df[name_mask]

    def in_range(size):
        # Rows whose size is 'unknown' are compared as if they were 1000.0.
        effective = 1000.0 if size == 'unknown' else size
        return min_size <= effective <= max_size

    # astype(bool) keeps the mask boolean even when no rows are left; an empty
    # object-dtype Series is not treated as a boolean indexer.
    size_mask = filtered_df['Model Size(B)'].apply(in_range).astype(bool)
    filtered_df = filtered_df[size_mask]

    return filtered_df[COLUMN_NAMES]
|
125 |
+
|
126 |
+
|
127 |
+
# def search_and_filter_models(df, query, min_size, max_size):
|
128 |
+
# filtered_df = df.copy()
|
129 |
+
|
130 |
+
# if query:
|
131 |
+
# filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]
|
132 |
+
|
133 |
+
# def size_filter(x):
|
134 |
+
# if isinstance(x, (int, float)):
|
135 |
+
# return min_size <= x <= max_size
|
136 |
+
# return True
|
137 |
+
|
138 |
+
# filtered_df = filtered_df[filtered_df['Model Size(B)'].apply(size_filter)]
|
139 |
+
|
140 |
+
# return filtered_df[COLUMN_NAMES]
|
141 |
+
|
142 |
+
|
143 |
+
def search_models(df, query):
    """Return the rows of ``df`` whose 'Models' value contains ``query``.

    The match is a case-insensitive, literal substring test; missing names
    never match. An empty or ``None`` query returns ``df`` itself unchanged.
    """
    if not query:
        return df
    # regex=False: model names contain characters like "(" that would make a
    # partial query an invalid regular expression.
    mask = df['Models'].str.contains(query, case=False, na=False, regex=False)
    return df[mask]
|
147 |
+
|
148 |
+
|
149 |
+
# def get_size_range(df):
|
150 |
+
# numeric_sizes = df[df['Model Size(B)'].apply(lambda x: isinstance(x, (int, float)))]['Model Size(B)']
|
151 |
+
# if len(numeric_sizes) > 0:
|
152 |
+
# return float(numeric_sizes.min()), float(numeric_sizes.max())
|
153 |
+
# return 0, 1000
|
154 |
+
|
155 |
+
|
156 |
+
def get_size_range(df):
    """Return ``(min, max)`` of the 'Model Size(B)' column as floats.

    Entries equal to 'unknown' are counted as 1000.0. When the frame has no
    rows there is nothing to measure, so the fallback range ``(0.0, 1000.0)``
    is returned instead of NaN bounds.
    """
    sizes = df['Model Size(B)'].apply(lambda x: 1000.0 if x == 'unknown' else x)
    if sizes.empty:
        # min()/max() of an empty Series are NaN, which is unusable as a bound.
        return 0.0, 1000.0
    return float(sizes.min()), float(sizes.max())
|
159 |
+
|
160 |
+
|
161 |
+
def process_model_size(size):
    """Normalise one raw 'Model Size(B)' cell.

    Args:
        size: Raw cell value (number, numeric string, 'unk', or missing).

    Returns:
        The size as a ``float``, or the string 'unknown' when the value is
        missing, is the 'unk' placeholder, or cannot be read as a number.
    """
    if pd.isna(size) or size == 'unk':
        return 'unknown'
    try:
        val = float(size)
    except (ValueError, TypeError):
        return 'unknown'
    # float("nan") parses successfully; report it as unknown rather than
    # letting a NaN size flow into range computations.
    return 'unknown' if pd.isna(val) else val
|
169 |
+
|
170 |
+
|
171 |
+
def filter_columns_by_subjects(df, selected_subjects=None):
    """Restrict ``df`` to the fixed columns plus the selected subject columns.

    Args:
        df: Leaderboard DataFrame.
        selected_subjects: Sequence of column names to show (list, tuple, ...).
            ``None`` or an empty sequence selects ``COLUMN_NAMES`` instead.

    Returns:
        ``df`` limited to 'Models', 'Model Size(B)', 'Data Source', 'Overall'
        and the selected subjects, in that order. Names that are not columns
        of ``df`` are skipped and each column appears at most once.
    """
    if selected_subjects is None or len(selected_subjects) == 0:
        return df[COLUMN_NAMES]

    base_columns = ['Models', 'Model Size(B)', 'Data Source', 'Overall']
    # list(...) accepts any sequence type; dict.fromkeys drops repeated names
    # while preserving order, so picking e.g. 'Overall' does not duplicate it.
    wanted = dict.fromkeys(base_columns + list(selected_subjects))

    available_columns = [col for col in wanted if col in df.columns]
    return df[available_columns]
|
180 |
+
|
181 |
+
def get_subject_choices():
    """Return the module-level ``SUBJECTS`` list (the same object, not a copy)."""
    return SUBJECTS
|
183 |
+
|