import gradio as gr
import pandas as pd
import os
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from uploads import add_new_eval

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@article{wei2024evaluating,
  title={Evaluating Copyright Takedown Methods for Language Models},
  author={Wei, Boyi and Shi, Weijia and Huang, Yangsibo and Smith, Noah A and Zhang, Chiyuan and Zettlemoyer, Luke and Li, Kai and Henderson, Peter},
  journal={arXiv preprint arXiv:2406.18664},
  year={2024}
}"""

api = HfApi()
TOKEN = os.environ.get("TOKEN", None)
LEADERBOARD_PATH = "boyiwei/CoTaEval_leaderboard"


def restart_space():
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


def format_floats(x):
    if isinstance(x, float):
        return f"{x:.3f}"
    return x


# Load the leaderboard data for one (model, dataset, setting, criteria) combination.
# Each combination is stored as a single CSV under versions/, e.g.
# versions/llama2-7b-chat-hf_news_rag_mean.csv.
def baseline_load_data(model, dataset, setting, criteria):
    file_path = f'versions/{model}_{dataset}_{setting}_{criteria}.csv'
    df = pd.read_csv(file_path)
    df = df.map(format_floats)  # element-wise; requires pandas >= 2.1 (use df.applymap on older versions)

    # Keep only the columns we want to display, in a fixed order.
    if dataset == 'news':
        column_names = ["model_name", "method", "rouge1", "rougeL", "semantic_sim",
                        "LCS(character)", "LCS(word)", "ACS(word)", "Levenshtein Distance",
                        "Minhash Similarity", "MMLU", "MT-Bench",
                        "Blocklisted F1", "In-Domain F1", "Efficiency"]
    elif dataset == 'books':
        column_names = ["model_name", "method", "bleu", "rouge1", "rougeL", "semantic_sim",
                        "LCS(character)", "LCS(word)", "ACS(word)", "Levenshtein Distance",
                        "Minhash Similarity", "MMLU", "MT-Bench",
                        "Blocklisted rougeL", "In-Domain rougeL", "Efficiency"]
    df = df[column_names]
    return df


# Keep the four dropdowns mutually consistent: the "memorization" setting is only
# evaluated on news with the newsqa model, and the books dataset is RAG-only.
def update_dropdowns(setting, dataset, model, criteria):
    updates = {
        "setting": gr.update(interactive=True),
        "dataset": gr.update(interactive=True),
        "model": gr.update(interactive=True),
        "criteria": gr.update(interactive=True),
    }
    if setting == "memorization":
        updates["dataset"] = gr.update(value="news", interactive=False)
        updates["model"] = gr.update(value="llama2-7b-chat-hf-newsqa", interactive=False)
    elif dataset == "books":
        updates["setting"] = gr.update(value="rag", interactive=False)
        if model == "llama2-7b-chat-hf-newsqa":
            updates["model"] = gr.update(value="llama2-7b-chat-hf", interactive=True)
    elif model == "llama2-7b-chat-hf-newsqa":
        updates["setting"] = gr.update(value="memorization", interactive=False)
        updates["dataset"] = gr.update(value="news", interactive=False)
    else:  # any other model is only evaluated in the RAG setting
        updates["setting"] = gr.update(value="rag", interactive=False)
    return updates["model"], updates["dataset"], updates["setting"], updates["criteria"]


def load_data(model, dataset, setting, criteria):
    baseline_df = baseline_load_data(model, dataset, setting, criteria)
    # Leftover from the leaderboard template: optionally concatenate extra CSVs from
    # "versions/{model}-{version}/". Not used by CoTaEval, which stores one CSV per
    # combination (note `version` is not defined in this scope).
    # version = version.replace("%", "p")
    # for file in os.listdir(f'versions/{model}-{version}'):
    #     if file == f"{model}-{version}.csv":
    #         continue
    #     df = pd.read_csv(f'versions/{model}-{version}/{file}')
    #     df = df[baseline_df.columns]
    #     baseline_df = pd.concat([baseline_df, df])
    return baseline_df


# Filter the leaderboard by method name (the column is "method", lowercase).
def search_leaderboard(df, query):
    if query == "":
        return df
    return df[df['method'].str.contains(query)]


# Reload the table whenever any dropdown changes.
def change_version(model, dataset, setting, criteria):
    new_df = load_data(model, dataset, setting, criteria)
    return new_df
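
# search_leaderboard above is defined but never connected to the UI. A minimal
# sketch of one possible wiring (an assumption, not part of the current app;
# `search_box` is a hypothetical component that would live inside the Blocks
# below, next to the dropdowns):
#
#   search_box = gr.Textbox(label="🔍 Search by method")
#   search_box.change(
#       lambda model, dataset, setting, criteria, query:
#           search_leaderboard(load_data(model, dataset, setting, criteria), query),
#       inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown, search_box],
#       outputs=leaderboard_table,
#   )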
# Initialize the Gradio app.
demo = gr.Blocks()

with demo:
    gr.Markdown("""
    ## 🥇 CoTaEval Leaderboard
    CoTaEval is a benchmark to evaluate the feasibility and side effects of copyright takedown methods for language models.

    Project website: [https://cotaeval.github.io/](https://cotaeval.github.io/).
    """)

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )

    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            with gr.Row():
                setting_dropdown = gr.Dropdown(
                    choices=["rag", "memorization"],
                    label="🔄 Select Setting",
                    value="rag",
                )
                dataset_dropdown = gr.Dropdown(
                    choices=['news', 'books'],
                    label="🔄 Select Dataset",
                    value="news",
                )
                model_dropdown = gr.Dropdown(
                    choices=["llama2-7b-chat-hf", "llama2-70b-chat-hf", "dbrx-instruct", "llama2-7b-chat-hf-newsqa"],
                    label="🔄 Select Model",
                    value="llama2-7b-chat-hf",
                )
                criteria_dropdown = gr.Dropdown(
                    choices=['mean', 'max'],
                    label="🔄 Select Criteria",
                    value='mean',
                )

            leaderboard_table = gr.components.Dataframe(
                value=load_data("llama2-7b-chat-hf", "news", "rag", "mean"),
                interactive=True,
                visible=True,
            )

            # The update_dropdowns handlers are currently disabled. Note that they
            # pass inputs in (model, dataset, setting, criteria) order while the
            # function signature is (setting, dataset, model, criteria); a corrected
            # wiring sketch appears after the Links section below.
            # setting_dropdown.change(
            #     update_dropdowns,
            #     inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #     outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            # )
            # dataset_dropdown.change(
            #     update_dropdowns,
            #     inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #     outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            # )
            # model_dropdown.change(
            #     update_dropdowns,
            #     inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #     outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            # )

            setting_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table,
            )
            dataset_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table,
            )
            model_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table,
            )
            criteria_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table,
            )

    # with gr.Accordion("Submit a new model for evaluation"):
    #     with gr.Row():
    #         with gr.Column():
    #             method_name_textbox = gr.Textbox(label="Method name")
    #             # llama, phi
    #             model_family_radio = gr.Radio(["llama", "phi"], value="llama", label="Model family")
    #             forget_rate_radio = gr.Radio(["1%", "5%", "10%"], value="10%", label="Forget rate")
    #             url_textbox = gr.Textbox(label="Url to model information")
    #         with gr.Column():
    #             organisation = gr.Textbox(label="Organisation")
    #             mail = gr.Textbox(label="Contact email")
    #             file_output = gr.File()
    #     submit_button = gr.Button("Submit Eval")
    #     submission_result = gr.Markdown()
    #     submit_button.click(
    #         add_new_eval,
    #         [
    #             method_name_textbox,
    #             model_family_radio,
    #             forget_rate_radio,
    #             url_textbox,
    #             file_output,
    #             organisation,
    #             mail,
    #         ],
    #         submission_result,
    #     )

    gr.Markdown("""
    ## Links
    - [**Website**](https://cotaeval.github.io): The website of the CoTaEval project.
    - [**GitHub Repository**](https://github.com/boyiwei/CoTaEval): Source code for evaluating takedown methods with CoTaEval.
    - [**Datasets**](https://huggingface.co/datasets/boyiwei/CoTaEval): The datasets for evaluation and unlearning.

    This leaderboard is based on the design of the [TOFU Leaderboard](https://huggingface.co/spaces/locuslab/tofu_leaderboard).
    """)
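
    # If the dropdown constraints should be enforced again, a corrected sketch
    # (an assumption about the intended wiring, based on update_dropdowns'
    # signature and return order) is:
    #
    #   for dd in (setting_dropdown, dataset_dropdown, model_dropdown):
    #       dd.change(
    #           update_dropdowns,
    #           inputs=[setting_dropdown, dataset_dropdown, model_dropdown, criteria_dropdown],
    #           outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
    #       )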

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)  # restart the Space hourly
scheduler.start()

custom_css = """
"""

# Alternative launches:
# demo.queue(default_concurrency_limit=40).launch()
# demo.launch(debug=True, custom_css=custom_css)
demo.launch(debug=True)
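
# For local testing (a suggestion, not part of the deployed Space): restart_space
# needs a TOKEN with write access to the Space, so the scheduler could be guarded
# on the token being available instead of always starting:
#
#   if TOKEN is not None:
#       scheduler = BackgroundScheduler()
#       scheduler.add_job(restart_space, "interval", seconds=3600)
#       scheduler.start()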