__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions'] import gradio as gr import pandas as pd import re import pandas as pd import numpy as np from collections import defaultdict from constants import * import os from huggingface_hub import Repository import json global data_component, filter_component TOKEN = os.environ.get("TOKEN") repo = Repository(local_dir="./download_from_dataset", clone_from="JMMMU/leaderboard_result", repo_type="dataset", use_auth_token=TOKEN) current_directory = os.getcwd() def validate_model_size(s): pattern = r'^\d+B$|^-$' if re.match(pattern, s): return s else: return '-' def upload_file(files): file_paths = [file.name for file in files] return file_paths def get_acc(data, subject_list): acc = 0 for subject in subject_list: acc += data["results"][subject]['jmmmu_acc,none'] acc = acc/len(subject_list) acc = acc * 100 acc = round(acc, 1) return acc def calculate_score(input_file): json_string = input_file.decode('utf-8') data = json.loads(json_string) result_dict = {} overall = data["results"]["jmmmu"]['jmmmu_acc,none']*100 ca = data["results"]["culture_agnostic"]['jmmmu_acc,none']*100 cs = data["results"]["culture_specific"]['jmmmu_acc,none']*100 overall = round(overall, 1) ca = round(ca, 1) cs = round(cs, 1) # Art_Psychology art_psychology_subject_list = ["jmmmu_design", "jmmmu_music", "jmmmu_psychology"] # Science science_subject_list = ["jmmmu_biology", "jmmmu_chemistry", "jmmmu_physics", "jmmmu_math"] # Business business_subject_list = ["jmmmu_accounting", "jmmmu_economics", "jmmmu_finance", "jmmmu_manage", "jmmmu_marketing"] # Medicine medicine_subject_list = ["jmmmu_basic_medical_science", "jmmmu_clinical_medicine", "jmmmu_diagnostics_and_laboratory_medicine", "jmmmu_pharmacy", "jmmmu_public_health"] # Tech_Eng. tech_eng_subject_list = ["jmmmu_agriculture", "jmmmu_architecture_and_engineering", "jmmmu_computer_science", "jmmmu_electronics", "jmmmu_energy_and_power", "jmmmu_materials", "jmmmu_mechanical_engineering"] jmmmu_japanese_art_subject_list = ["jmmmu_japanese_art"] jmmmu_japanese_heritage_subject_list = ["jmmmu_japanese_heritage"] jmmmu_japanese_history_subject_list = ["jmmmu_japanese_history"] jmmmu_world_history_subject_list = ["jmmmu_world_history"] art_psychology = get_acc(data, art_psychology_subject_list) science = get_acc(data, science_subject_list) business = get_acc(data, business_subject_list) medicine = get_acc(data, medicine_subject_list) tech_eng = get_acc(data, tech_eng_subject_list) japanese_art = get_acc(data, jmmmu_japanese_art_subject_list) japanese_heritage = get_acc(data, jmmmu_japanese_heritage_subject_list) japanese_history = get_acc(data, jmmmu_japanese_history_subject_list) world_history = get_acc(data, jmmmu_world_history_subject_list) result_dict =\ { "overall": overall, "cultureSpecific": cs, "cultureAgnostic": ca, "japaneseArt": japanese_art, "japaneseHeritage": japanese_heritage, "japaneseHistory": japanese_history, "worldHistory": world_history, "artPsychology": art_psychology, "business": business, "science": science, "healthMedicine": medicine, "techEngineering": tech_eng } return result_dict def add_new_eval( input_file, model_type: str, model_name_textbox: str, revision_name_textbox: str, model_link: str, model_size: str, # upd_type: str, # question_type: str ): if input_file is None: warning_text = "Error! Empty file!" print(warning_text) return warning_text else: model_size = validate_model_size(model_size) # if upd_type == 'AAD': csv_path = CSV_RESULT_PATH # validity_check(input_file) csv_data = pd.read_csv(csv_path) result_dict = calculate_score(input_file) if revision_name_textbox == '': col = csv_data.shape[0] model_name = model_name_textbox else: model_name = revision_name_textbox model_name_list = csv_data['Model'] name_list = [name.split(']')[0][1:] for name in model_name_list] if revision_name_textbox not in name_list: col = csv_data.shape[0] else: col = name_list.index(revision_name_textbox) model_name_wo_link = model_name if model_link == '': model_name = model_name # no url else: model_name = '[' + model_name + '](' + model_link + ')' # add new data new_data = [ model_type, model_name, model_size, result_dict["overall"], result_dict["cultureSpecific"], result_dict["cultureAgnostic"], result_dict["japaneseArt"], result_dict["japaneseHeritage"], result_dict["japaneseHistory"], result_dict["worldHistory"], result_dict["artPsychology"], result_dict["business"], result_dict["science"], result_dict["healthMedicine"], result_dict["techEngineering"] ] # If the same data already exists, return an error. if new_data in csv_data.values.tolist(): warning_text = "Error! The same data already exists!" print(warning_text) return warning_text # If the same model name already exists, return an error. elif new_data[:5] in csv_data.values.tolist(): warning_text = "Error! The same data already exists! Please fill revision_name." print(warning_text) return warning_text csv_data.loc[col] = new_data csv_data = csv_data.to_csv(csv_path, index=False) absolute_result_path = os.path.abspath(csv_path) if not os.path.exists(absolute_result_path): raise FileNotFoundError(f"File {absolute_result_path} not found") repo.git_pull() repo.git_add(absolute_result_path) save_path = os.path.join(CSV_QUEUE_DIR, f"{model_name_wo_link}.json") with open(save_path, "wb") as f: f.write(input_file) absolute_queue_path = os.path.abspath(save_path) repo.git_add(absolute_queue_path) repo.git_commit(f"add {model_name_wo_link} results") repo.git_push() print(f"Success! Your {model_name_wo_link} has been added!") return 0 def get_baseline_df(): repo.git_pull() df = pd.read_csv(CSV_RESULT_PATH) df = df.sort_values(by="Overall", ascending=False) present_columns = MODEL_INFO + checkbox_group.value df = df[present_columns] return df def get_all_df(): repo.git_pull() df = pd.read_csv(CSV_RESULT_PATH) df = df.sort_values(by="Overall", ascending=False) return df block = gr.Blocks() with block: gr.Markdown( LEADERBORAD_INTRODUCTION ) with gr.Tabs(elem_classes="tab-buttons") as tabs: # table jmmmu bench with gr.TabItem("🏅 JMMMU Benchmark", elem_id="jmmmu-benchmark-tab-table", id=1): # selection for column part: checkbox_group = gr.CheckboxGroup( choices=TASK_INFO, value=AVG_INFO, label="Evaluation Dimension", interactive=True, ) # user can select the evaluation dimension with gr.Row(): # selection for model size part: model_size = gr.CheckboxGroup( choices=MODEL_SIZE, value=MODEL_SIZE, label="Model Size", interactive=True, ) baseline_value = get_baseline_df() baseline_header = MODEL_INFO + checkbox_group.value baseline_datatype = ['markdown'] * 2 + ['number'] * len(checkbox_group.value) data_component = gr.components.Dataframe( value=baseline_value, headers=baseline_header, type="pandas", datatype=baseline_datatype, interactive=False, visible=True, ) def on_filter_model_size_method_change(selected_model_size, selected_columns): updated_data = get_all_df() # model_size def custom_filter(row, model_size_filters): model_size = row['Model Size'] model_size = model_size.upper() if model_size == '-': size_filter = '-' in model_size_filters elif 'B' in model_size: size = float(model_size.replace('B', '')) size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10) else: size_filter = False return size_filter mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size) updated_data = updated_data[mask] # columns: selected_columns = [item for item in TASK_INFO if item in selected_columns] present_columns = MODEL_INFO + selected_columns updated_data = updated_data[present_columns] updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False) updated_headers = present_columns update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers] filter_component = gr.components.Dataframe( value=updated_data, headers=updated_headers, type="pandas", datatype=update_datatype, interactive=False, visible=True, ) return filter_component model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, checkbox_group], outputs=data_component) checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, checkbox_group], outputs=data_component) # table 5 with gr.TabItem("🚀 Submit here! ", elem_id="jmmmu-benchmark-tab-table", id=5): with gr.Row(): gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text") with gr.Row(): gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text") with gr.Row(): with gr.Column(): model_type = gr.Dropdown( choices=["LMM", "LLM"], label="Model type", multiselect=False, value="LMM", interactive=True, ) model_name_textbox = gr.Textbox( label="Model name", placeholder="LLaMA-7B" ) revision_name_textbox = gr.Textbox( label="Revision Model Name", placeholder="LLaMA-7B" ) model_link = gr.Textbox( label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf" ) model_size = gr.Textbox( label="Model size", placeholder="7B(Input content format must be 'number+B' or '-', default is '-')" ) with gr.Column(): input_file = gr.components.File(label="Click to Upload a JSON File", file_count="single", type='binary') submit_button = gr.Button("Submit Eval") submission_result = gr.Markdown() submit_button.click( add_new_eval, inputs = [ input_file, model_type, model_name_textbox, revision_name_textbox, model_link, model_size ], ) def refresh_data(): value = get_baseline_df() return value with gr.Row(): data_run = gr.Button("Refresh") data_run.click( refresh_data, outputs=[data_component] ) with gr.Accordion("Citation", open=False): citation_button = gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button", show_copy_button=True, ) block.launch()