AtsuMiyai commited on
Commit
6ec92ad
β€’
0 Parent(s):

initial commit

Browse files
Files changed (10) hide show
  1. .gitattributes +55 -0
  2. .gitignore +13 -0
  3. .pre-commit-config.yaml +53 -0
  4. Makefile +13 -0
  5. README.md +45 -0
  6. app.py +368 -0
  7. constants.py +67 -0
  8. download_from_dataset +1 -0
  9. pyproject.toml +13 -0
  10. requirements.txt +19 -0
.gitattributes ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.npy filter=lfs diff=lfs merge=lfs -text
16
+ *.npz filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ # Audio files - uncompressed
38
+ *.pcm filter=lfs diff=lfs merge=lfs -text
39
+ *.sam filter=lfs diff=lfs merge=lfs -text
40
+ *.raw filter=lfs diff=lfs merge=lfs -text
41
+ # Audio files - compressed
42
+ *.aac filter=lfs diff=lfs merge=lfs -text
43
+ *.flac filter=lfs diff=lfs merge=lfs -text
44
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ *.ogg filter=lfs diff=lfs merge=lfs -text
46
+ *.wav filter=lfs diff=lfs merge=lfs -text
47
+ # Image files - uncompressed
48
+ *.bmp filter=lfs diff=lfs merge=lfs -text
49
+ *.gif filter=lfs diff=lfs merge=lfs -text
50
+ *.png filter=lfs diff=lfs merge=lfs -text
51
+ *.tiff filter=lfs diff=lfs merge=lfs -text
52
+ # Image files - compressed
53
+ *.jpg filter=lfs diff=lfs merge=lfs -text
54
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
55
+ *.webp filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .vscode/
8
+
9
+ eval-queue/
10
+ eval-results/
11
+ eval-queue-bk/
12
+ eval-results-bk/
13
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: style format
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
README.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: JMMMU Leaderboard
3
+ emoji: πŸ₯‡
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ ---
12
+
13
+ # Start the configuration
14
+
15
+ Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
16
+
17
+ Results files should have the following format and be stored as json files:
18
+ ```json
19
+ {
20
+ "config": {
21
+ "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
22
+ "model_name": "path of the model on the hub: org/model",
23
+ "model_sha": "revision on the hub",
24
+ },
25
+ "results": {
26
+ "task_name": {
27
+ "metric_name": score,
28
+ },
29
+ "task_name2": {
30
+ "metric_name": score,
31
+ }
32
+ }
33
+ }
34
+ ```
35
+
36
+ Request files are created automatically by this tool.
37
+
38
+ If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
39
+
40
+ # Code logic for more complex edits
41
+
42
+ You'll find
43
+ - the main table's column names and properties in `src/display/utils.py`
44
+ - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
45
+ - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ import re
6
+ import pandas as pd
7
+ import numpy as np
8
+ from collections import defaultdict
9
+ from constants import *
10
+ import os
11
+ from huggingface_hub import Repository
12
+ import json
13
+
14
+
15
+ global data_component, filter_component
16
+
17
+
18
+ TOKEN = os.environ.get("TOKEN")
19
+ repo = Repository(local_dir="./download_from_dataset", clone_from="JMMMU/leaderboard_result", repo_type="dataset", use_auth_token=TOKEN)
20
+
21
+ current_directory = os.getcwd()
22
+
23
+
24
def validate_model_size(s):
    """Normalise a user-supplied model-size string.

    Accepted formats are '<digits>B' (e.g. '7B', '70B') or the literal '-'.
    Returns ``s`` unchanged when it matches, otherwise the placeholder '-'.
    """
    size_format = r'^\d+B$|^-$'
    return s if re.match(size_format, s) else '-'
30
+
31
+
32
def upload_file(files):
    """Return the local path (``.name``) of every uploaded gradio file."""
    return [uploaded.name for uploaded in files]
35
+
36
+
37
def get_acc(data, subject_list):
    """Average the 'jmmmu_acc,none' metric over ``subject_list``.

    ``data`` is a parsed lm-eval results dict with a top-level "results"
    key.  Returns the mean accuracy as a percentage rounded to one
    decimal place.
    """
    total = sum(data["results"][subject]['jmmmu_acc,none'] for subject in subject_list)
    return round(total / len(subject_list) * 100, 1)
45
+
46
+
47
def calculate_score(input_file):
    """Compute leaderboard scores from an uploaded lm-eval JSON result.

    Parameters
    ----------
    input_file : bytes
        Raw bytes of the submitted JSON file (UTF-8 encoded).

    Returns
    -------
    dict
        Percentages rounded to one decimal place: the overall score, the
        culture-specific/agnostic splits, and one average per subject group.
    """
    data = json.loads(input_file.decode('utf-8'))

    def group_avg(subject_list):
        # Mean of 'jmmmu_acc,none' over the given subjects, as a percentage
        # rounded to one decimal (same arithmetic as the module-level get_acc).
        total = sum(data["results"][s]['jmmmu_acc,none'] for s in subject_list)
        return round(total / len(subject_list) * 100, 1)

    # Subject groupings mirror the JMMMU benchmark categories.
    # Art_Psychology
    art_psychology_subjects = ["jmmmu_design", "jmmmu_music", "jmmmu_psychology"]
    # Science
    # BUGFIX: this list previously contained "jmmmu_physics" twice, which
    # double-weighted physics and omitted math; the fourth Science subject
    # in the JMMMU dataset is "jmmmu_math".
    science_subjects = ["jmmmu_biology", "jmmmu_chemistry", "jmmmu_math", "jmmmu_physics"]
    # Business
    business_subjects = ["jmmmu_accounting", "jmmmu_economics", "jmmmu_finance", "jmmmu_manage", "jmmmu_marketing"]
    # Medicine
    medicine_subjects = ["jmmmu_basic_medical_science", "jmmmu_clinical_medicine", "jmmmu_diagnostics_and_laboratory_medicine", "jmmmu_pharmacy", "jmmmu_public_health"]
    # Tech_Eng.
    tech_eng_subjects = ["jmmmu_agriculture", "jmmmu_architecture_and_engineering", "jmmmu_computer_science", "jmmmu_electronics", "jmmmu_energy_and_power", "jmmmu_materials", "jmmmu_mechanical_engineering"]

    return {
        "overall": group_avg(["jmmmu"]),
        "cultureSpecific": group_avg(["culture_specific"]),
        "cultureAgnostic": group_avg(["culture_agnostic"]),
        "japaneseArt": group_avg(["jmmmu_japanese_art"]),
        "japaneseHeritage": group_avg(["jmmmu_japanese_heritage"]),
        "japaneseHistory": group_avg(["jmmmu_japanese_history"]),
        "worldHistory": group_avg(["jmmmu_world_history"]),
        "artPsychology": group_avg(art_psychology_subjects),
        "business": group_avg(business_subjects),
        "science": group_avg(science_subjects),
        "healthMedicine": group_avg(medicine_subjects),
        "techEngineering": group_avg(tech_eng_subjects)
    }
100
+
101
+
102
def add_new_eval(
    input_file,
    model_type: str,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_link: str,
    model_size: str,
):
    """Validate a submission, update the result CSV and push it to the dataset repo.

    Parameters
    ----------
    input_file : bytes or None
        Raw bytes of the uploaded lm-eval JSON result file.
    model_type : str
        "LMM" or "LLM" (value of the Model type dropdown).
    model_name_textbox : str
        Display name for a first-time submission.
    revision_name_textbox : str
        Existing leaderboard name when updating a previous entry.
    model_link : str
        Optional URL; when given, the model name is stored as a markdown link.
    model_size : str
        Size string, normalised to '<digits>B' or '-' by validate_model_size.

    Returns
    -------
    str or int
        A warning message string on failure, 0 on success.
    """
    if input_file is None:
        warning_text = "Error! Empty file!"
        print(warning_text)
        return warning_text

    model_size = validate_model_size(model_size)
    csv_path = CSV_RESULT_PATH

    csv_data = pd.read_csv(csv_path)
    result_dict = calculate_score(input_file)

    if revision_name_textbox == '':
        # First-time submission: append a new row at the bottom.
        col = csv_data.shape[0]
        model_name = model_name_textbox
    else:
        # Update an existing entry: locate its row by the bare model name.
        # NOTE(review): the split strips the leading '[' of a markdown link;
        # plain (unlinked) names lose their first character here -- confirm
        # that all stored names use the '[name](url)' format.
        model_name = revision_name_textbox
        model_name_list = csv_data['Model']
        name_list = [name.split(']')[0][1:] for name in model_name_list]
        if revision_name_textbox not in name_list:
            col = csv_data.shape[0]
        else:
            col = name_list.index(revision_name_textbox)

    model_name_wo_link = model_name
    if model_link != '':
        model_name = '[' + model_name + '](' + model_link + ')'

    # Row layout must match the columns of the result CSV.
    new_data = [
        model_type,
        model_name,
        model_size,
        result_dict["overall"],
        result_dict["cultureSpecific"],
        result_dict["cultureAgnostic"],
        result_dict["japaneseArt"],
        result_dict["japaneseHeritage"],
        result_dict["japaneseHistory"],
        result_dict["worldHistory"],
        result_dict["artPsychology"],
        result_dict["business"],
        result_dict["science"],
        result_dict["healthMedicine"],
        result_dict["techEngineering"]
    ]

    existing_rows = csv_data.values.tolist()
    # If the same data already exists, return an error.
    if new_data in existing_rows:
        warning_text = "Error! The same data already exists!"
        print(warning_text)
        return warning_text
    # BUGFIX: the previous check compared the 5-element prefix `new_data[:5]`
    # against whole rows and could therefore never match. For first-time
    # submissions, reject a row whose model type/name/size already exist and
    # ask the user to fill in the revision name instead.
    if revision_name_textbox == '' and any(row[:3] == new_data[:3] for row in existing_rows):
        warning_text = "Error! The same data already exists! Please fill revision_name."
        print(warning_text)
        return warning_text

    csv_data.loc[col] = new_data
    # Write back without rebinding csv_data (to_csv returns None).
    csv_data.to_csv(csv_path, index=False)

    absolute_result_path = os.path.abspath(csv_path)
    if not os.path.exists(absolute_result_path):
        raise FileNotFoundError(f"File {absolute_result_path} not found")

    repo.git_pull()
    repo.git_add(absolute_result_path)

    # Archive the raw submission in the queue directory of the dataset repo.
    save_path = os.path.join(CSV_QUEUE_DIR, f"{model_name_wo_link}.json")
    with open(save_path, "wb") as f:
        f.write(input_file)

    absolute_queue_path = os.path.abspath(save_path)

    repo.git_add(absolute_queue_path)
    repo.git_commit(f"add {model_name_wo_link} results")
    repo.git_push()
    print(f"Success! Your {model_name_wo_link} has been added!")

    return 0
198
+
199
+
200
def get_baseline_df():
    """Pull the latest results and return the default leaderboard view.

    The view contains the model-info columns plus whichever evaluation
    dimensions are currently ticked in the checkbox group, sorted by the
    overall score in descending order.
    """
    repo.git_pull()
    full_table = pd.read_csv(CSV_RESULT_PATH)
    full_table = full_table.sort_values(by="Overall", ascending=False)
    visible_columns = MODEL_INFO + checkbox_group.value
    return full_table[visible_columns]
207
+
208
+
209
def get_all_df():
    """Pull the latest results and return the full table sorted by 'Overall' (descending)."""
    repo.git_pull()
    table = pd.read_csv(CSV_RESULT_PATH).sort_values(by="Overall", ascending=False)
    return table
214
+
215
+
216
+
217
# --- Gradio UI -----------------------------------------------------------
# Builds the leaderboard interface: a benchmark tab with column/size filters
# and a submission tab that accepts lm-eval JSON result files.
# NOTE(review): indentation/nesting reconstructed from a flattened source --
# confirm widget placement against the running space.
block = gr.Blocks()


with block:
    gr.Markdown(
        LEADERBORAD_INTRODUCTION
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # table jmmmu bench
        with gr.TabItem("πŸ… JMMMU Benchmark", elem_id="jmmmu-benchmark-tab-table", id=1):
            # Checkboxes selecting which score columns to display.
            checkbox_group = gr.CheckboxGroup(
                choices=TASK_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )  # user can select the evaluation dimension

            with gr.Row():
                # Checkboxes filtering rows by model-size bucket.
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )

            # Initial table: default columns, sorted by Overall.
            baseline_value = get_baseline_df()
            baseline_header = MODEL_INFO + checkbox_group.value
            baseline_datatype = ['markdown'] * 2 + ['number'] * len(checkbox_group.value)

            data_component = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_change(selected_model_size, selected_columns):
                # Rebuild the leaderboard table whenever a size filter or a
                # column checkbox changes.

                updated_data = get_all_df()
                # model_size

                def custom_filter(row, model_size_filters):
                    # True when the row's size bucket is among the selected filters.
                    model_size = row['Model Size']
                    model_size = model_size.upper()

                    if model_size == '-':
                        size_filter = '-' in model_size_filters
                    elif 'B' in model_size:
                        size = float(model_size.replace('B', ''))
                        size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
                    else:
                        size_filter = False

                    return size_filter

                mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size)
                updated_data = updated_data[mask]

                # columns: keep TASK_INFO ordering regardless of click order.
                selected_columns = [item for item in TASK_INFO if item in selected_columns]
                present_columns = MODEL_INFO + selected_columns
                updated_data = updated_data[present_columns]
                # NOTE(review): raises IndexError when no column is selected --
                # confirm the UI guarantees at least one ticked checkbox.
                updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
                updated_headers = present_columns
                update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]

                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )
                return filter_component

            # Both filters re-render the main table through the same callback.
            model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, checkbox_group], outputs=data_component)
            checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, checkbox_group], outputs=data_component)

        # table 5
        with gr.TabItem("πŸš€ Submit here! ", elem_id="jmmmu-benchmark-tab-table", id=5):
            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown("# βœ‰οΈβœ¨ Submit your model evaluation json file here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_type = gr.Dropdown(
                        choices=["LMM", "LLM"],
                        label="Model type",
                        multiselect=False,
                        value="LMM",
                        interactive=True,
                    )
                    model_name_textbox = gr.Textbox(
                        label="Model name", placeholder="LLaMA-7B"
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision Model Name", placeholder="LLaMA-7B"
                    )
                    model_link = gr.Textbox(
                        label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
                    )
                    # NOTE(review): rebinding `model_size` shadows the size-filter
                    # CheckboxGroup from the benchmark tab; the .change handlers
                    # above were bound before this reassignment, so they still
                    # reference the checkbox -- confirm this is intentional.
                    model_size = gr.Textbox(
                        label="Model size", placeholder="7B(Input content format must be 'number+B' or '-', default is '-')"
                    )


                with gr.Column():
                    # type='binary' hands add_new_eval the raw file bytes.
                    input_file = gr.components.File(label="Click to Upload a JSON File", file_count="single", type='binary')
                    submit_button = gr.Button("Submit Eval")

            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                inputs = [
                    input_file,
                    model_type,
                    model_name_textbox,
                    revision_name_textbox,
                    model_link,
                    model_size
                ],
            )

    def refresh_data():
        # Re-read the CSV (after a git pull) and return the default view.
        value = get_baseline_df()

        return value

    with gr.Row():
        data_run = gr.Button("Refresh")
        data_run.click(
            refresh_data, outputs=[data_component]
        )

    with gr.Accordion("Citation", open=False):
        citation_button = gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            elem_id="citation-button",
            show_copy_button=True,
        )

block.launch()
constants.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Constants shared by app.py: column layout, file paths and UI text.

# Left-most identification columns of the leaderboard table.
MODEL_INFO = ["Model Type", "Model"]
# Size buckets used by the model-size filter.
MODEL_SIZE = ["<10B", ">=10B", "-"]
LEADERBOARD_VERSION = ["Version1"]
# All selectable score columns (evaluation dimensions).
TASK_INFO = ["Overall", "Culture-Specific", "Culture-Agnostic", "Japanese Art", "Japanese Heritage", "Japanese History", "World History", "Art & Psychology", "Business", "Science", "Health & Medicine", "Tech & Engineering"]
# Overall, Culture-Specific, Culture-Agnostic, English Original, Japanese Art, Japanese Heritage, Japanese History, World History, Art & Psychology, Business, Science, Health & Medicine, Tech & Engineering
# Columns shown by default.
AVG_INFO = ["Overall"]


# Gradio datatype for each column in COLUMN_NAMES order: two markdown
# columns (MODEL_INFO) followed by one "number" per task.
# NOTE: the identifier misspelling ("TITILE") is kept because app.py
# references it by this exact name.
DATA_TITILE_TYPE = ["markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]

# Local paths inside the cloned leaderboard-result dataset repository.
CSV_RESULT_PATH = "./download_from_dataset/result.csv"
CSV_QUEUE_DIR = "./download_from_dataset/queue"
COLUMN_NAMES = MODEL_INFO + TASK_INFO

# Identifier misspelling ("LEADERBORAD") kept for compatibility.
LEADERBORAD_VERSION = ["JMMMU"]


LEADERBORAD_INTRODUCTION = """
# JMMMU Leaderboard

[🌐 **Homepage**](https://mmmu-japanese-benchmark.github.io/JMMMU/) | [πŸ€— **Dataset**](https://huggingface.co/datasets/JMMMU/JMMMU/) | [πŸ† **HF Leaderboard**](https://huggingface.co/spaces/JMMMU/JMMMU_Leaderboard) | [πŸ“– **arXiv (coming soon)**]() | [**GitHub**](https://github.com/EvolvingLMMs-Lab/lmms-eval)


### *"Which LMM is expert in Japanese subjects?"* πŸ† Welcome to the leaderboard of JMMMU
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
</div>

We introduce **JMMMU** (***Japanese MMMU***), a multimodal benchmark that can truly evaluate LMM performance in Japanese.
JMMMU consists of **720 translation-based (Culture Agnostic)** and **600 brand-new questions (Culture Specific)**, for a **total of 1,320 questions**, updating the size of the existing culture-aware Japanese benchmark by >10x.

"""


# BUGFIX: the "Submit Example" list previously jumped from step 5 to
# steps 10/11/12; renumbered to 6/7/8.
SUBMIT_INTRODUCTION = """# Submit on JMMMU Benchmark Introduction
1. Obtain Result JSON File from [lmms-eval code base](https://github.com/EvolvingLMMs-Lab/lmms-eval).
2. If you want to update existing model performance by uploading new results, please ensure 'Model Name Revision' is the same as what's shown in the leaderboard. For example, if you want to modify LLaVA-OV 7B's performance, you need to fill in 'LLaVA-OV 7B' in 'Revision Model Name'.
3. Please provide the correct link of your model's repository for each submission.
4. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.

Note: The example of the submitted JSON file is this url: [result.json](https://drive.google.com/file/d/10CF1c24BhoK9OM8De-2gLXDDnNcnMWvy/view?usp=sharing).

## Submit Example
If you want to upload LLaVA-OV 7B's result in the leaderboard, you need to:
1. Select LMM in 'Model Type'.
2. Fill in 'LLaVA-OV 7B' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
3. Fill in 'LLaVA-OV 7B' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
4. Fill in 'https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov' in 'Model Link'.
5. Fill in '7B' in 'Model size'.
6. Upload results.json.
7. Click the 'Submit Eval' button.
8. Click 'Refresh' to obtain the uploaded leaderboard.

To check whether the submission is successful, you can click the 'Logs' button. If the message 'Success! Your submission has been added!' appears, the submission is successful.

### If you have any questions or deletion requests, please contact [miyai@cvm.t.u-tokyo.ac.jp](miyai@cvm.t.u-tokyo.ac.jp).
### ⚠️ Please do not submit any malicious file (e.g, files you manually edited).
"""


CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{onohara2024jmmmu,
title={JMMMU: A Japanese Massive Multi-discipline Multimodal Understanding Benchmark},
author={Shota Onohara and Atsuyuki Miyai and Yuki Imajuku and Kazuki Egashira and Jeonghun Baek and Xiang Yue and Graham Neubig and Kiyoharu Aizawa},
url={https://huggingface.co/datasets/JMMMU/JMMMU},
year={2024}
}"""
download_from_dataset ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit fd8fab23dd3d21d86514740c10690fbe6f30ae31
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ APScheduler==3.10.1
2
+ black==23.11.0
3
+ click==8.1.3
4
+ datasets==2.14.5
5
+ gradio==4.44.0
6
+ gradio_client==0.7.0
7
+ huggingface-hub>=0.23.2
8
+ matplotlib==3.7.1
9
+ numpy==1.24.2
10
+ pandas==2.0.0
11
+ python-dateutil==2.8.2
12
+ requests==2.28.2
13
+ tqdm==4.65.0
14
+ transformers==4.35.2
15
+ tokenizers>=0.15.0
16
+ git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
+ accelerate==0.24.1
18
+ sentencepiece
19
+ openpyxl