import gradio as gr
import subprocess
import os
import sys
import time
import pandas as pd
from threading import Thread
import numpy as np

# Add the path to the "src" directory of detect-pretrain-code-contamination to sys.path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination"))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

import run as evaluator  # Import the run module
from src.css_html import custom_css
from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT, SUBMISSION_TEXT_2
from src.envs import API, H4_TOKEN, REPO_ID
from huggingface_hub import HfApi
from src.utils import (
    AutoEvalColumn,
    fields,
    is_model_on_hub,
    make_clickable_names,
    styled_error,
    styled_message,
    EVAL_COLS,
    EVAL_TYPES,
)

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]

# CONFIGURATION:
test_datasets = ["truthful_qa", "cais/mmlu", "ai2_arc", "gsm8k", "Rowan/hellaswag", "winogrande"]
modelQueue = pd.read_csv("data/queue.csv").values.tolist()
print(modelQueue)


def restart_space():
    # Crude, but restarting the Space is the only way I've found to make Gradio refresh the leaderboard.
    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)


def formatr(result):
    # Keep only the third comma-separated field of the stringified result returned by evaluator.main.
    result = str(result)
    result = result.split(",")[2].replace(")", "")
    result = result.replace(" ", "")
    return result


def save_to_txt(model, results, model_type, ref_model):
    # Append one leaderboard row to the CSV backing the evaluations table.
    file_path = "data/code_eval_board.csv"

    row = (
        f"\n{model_type},{model},"
        + str(formatr(results["arc"])) + ","
        + str(formatr(results["hellaswag"])) + ","
        + str(formatr(results["mmlu"])) + ","
        + str(formatr(results["truthfulQA"])) + ","
        + str(formatr(results["winogrande"])) + ","
        + str(formatr(results["gsm8k"]))
        + f",{ref_model}"
    )

    with open(file_path, "a") as f:
        f.write(row)

    print(f"Finished evaluation of model: {model} using ref_model: {ref_model}")
    print(row)


def run_test(model, ref_model, data):
    print(f"|| TESTING {data} ||")
    # Call the main function in detect-pretrain-code-contamination/src/run.py
    return evaluator.main(
        target_model=f"{model}",
        ref_model=f"{ref_model}",
        output_dir="out",
        data=f"{data}",
        length=64,
        key_name="input",
        ratio_gen=0.4,
    )


def evaluate(model, model_type, ref_model):
    print(f"|| EVALUATING {model} ||")
    results = {
        "arc": run_test(model, ref_model, test_datasets[2]),
        "hellaswag": run_test(model, ref_model, test_datasets[4]),
        "mmlu": run_test(model, ref_model, test_datasets[1]),
        "truthfulQA": run_test(model, ref_model, test_datasets[0]),
        "winogrande": run_test(model, ref_model, test_datasets[5]),
        "gsm8k": run_test(model, ref_model, test_datasets[3]),
        "ref_model": ref_model,
    }

    # Append the scores to data/code_eval_board.csv.
    save_to_txt(model, results, model_type, ref_model)
    return "\n".join([f"{k}:{results[k]}" for k in results])


def worker_thread():
    global modelQueue, server
    while True:
        for submission in modelQueue:
            #evaluate(submission[1], submission[0].split(" ")[0], submission[2])
            #modelQueue.pop(modelQueue.index(submission))
            #exit()
            # The exit above is temporary while I figure out how to unload a model from a thread or similar.

            # Uncomment the lines above to begin testing. I run these evaluations outside of this Space and commit the results back later.
            # I highly encourage you to try to reproduce the results I get using your own implementation.
            # Do NOT take anything listed here as fact, as I'm not 100% sure my implementation works as intended.
            # Take whatever you see in the leaderboard with a grain of salt, and do NOT accuse models of cheating just because of their placement here alone.
            time.sleep(1)
        time.sleep(1)


def queue(model, model_type, ref_model):
    global modelQueue
    modelQueue.append([model_type, model, ref_model])

    file_path = "data/queue.csv"
    with open(file_path, "a") as f:
        model = model.strip()
        ref_model = ref_model.strip()
        f.write(f"\n{model_type},{model},{ref_model}")

    print(f"QUEUE:\n{modelQueue}")


# Adapted from bigcode/bigcode-models-leaderboard
def add_new_eval(
    model: str,
    revision: str,
    ref_model: str,
    model_type: str,
):
    if model_type is None or model_type == "" or model_type == []:
        return styled_error("Please select a model type.")
    print(model_type)

    # Check the model actually exists before adding the eval.
    if revision == "":
        revision = "main"

    model_on_hub, error = is_model_on_hub(model, revision)
    if not model_on_hub:
        return styled_error(f'Model "{model}" {error}')

    print("Adding new eval")
    queue(model, model_type, ref_model)
    return styled_message("Your request has been submitted to the evaluation queue!\n")


def select_columns(df, columns):
    always_here_cols = [
        AutoEvalColumn.model_type_symbol.name,
        AutoEvalColumn.model.name,
    ]
    # We use COLS to maintain sorting
    filtered_df = df[
        always_here_cols + [c for c in COLS if c in df.columns and c in columns]
    ]
    return filtered_df


def filter_items(df, leaderboard_table, query):
    if query == "All":
        return df[leaderboard_table.columns]
    else:
        query = query[0]  # take only the emoji character
    filtered_df = df[(df["T"] == query)]
    return filtered_df[leaderboard_table.columns]


def search_table(df, leaderboard_table, query):
    filtered_df = df[(df["Models"].str.contains(query, case=False))]
    return filtered_df[leaderboard_table.columns]


demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Row():
        gr.Markdown(
            """<div style="text-align: center;"><h1> 📄 LLM Contamination Detector </h1></div>\
            <br>\
            <p>Inspired by the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and the <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">🤗 Big Code Models Leaderboard ⭐</a>, we use an implementation of the <a href="https://huggingface.co/papers/2310.16789">Detecting Pretraining Data from Large Language Models</a> paper, found in <a href="https://github.com/swj0419/detect-pretrain-code-contamination/tree/master">this GitHub repo</a>, to provide contamination scores for LLMs on the datasets used by the Open LLM Leaderboard.\
            This space should NOT be used to flag or accuse models of cheating / being contaminated; instead, it should form part of a holistic assessment by the parties involved.</p>""",
            elem_classes="markdown-text",
        )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Column():
            with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                with gr.TabItem("🔍 Evaluations", id=0):
                    with gr.Column():
                        with gr.Accordion("➡️ See filters", open=False):
                            shown_columns = gr.CheckboxGroup(
                                choices=[
                                    c
                                    for c in COLS
                                    if c
                                    not in [
                                        AutoEvalColumn.dummy.name,
                                        AutoEvalColumn.model.name,
                                        AutoEvalColumn.model_type_symbol.name,
                                    ]
                                ],
                                value=[
                                    c
                                    for c in COLS_LITE
                                    if c
                                    not in [
                                        AutoEvalColumn.dummy.name,
                                        AutoEvalColumn.model.name,
                                        AutoEvalColumn.model_type_symbol.name,
                                    ]
                                ],
                                label="",
                                elem_id="column-select",
                                interactive=True,
                            )
                        # with gr.Column(min_width=780):
                        with gr.Row():
                            search_bar = gr.Textbox(
                                placeholder="🔍 Search for a model and press ENTER...",
                                show_label=False,
                                elem_id="search-bar",
                            )
                            filter_columns = gr.Radio(
                                label="⏚ Filter model types",
                                choices=["All", "🟢 Base", "🔶 Finetuned"],
                                value="All",
                                elem_id="filter-columns",
                            )

                        df = pd.read_csv("data/code_eval_board.csv")
                        leaderboard_df = gr.components.Dataframe(
                            value=df[
                                [
                                    AutoEvalColumn.model_type_symbol.name,
                                    AutoEvalColumn.model.name,
                                ]
                                + shown_columns.value
                            ],
                            headers=[
                                AutoEvalColumn.model_type_symbol.name,
                                AutoEvalColumn.model.name,
                            ]
                            + shown_columns.value,
                            datatype=TYPES,
                            elem_id="leaderboard-table",
                            interactive=False,
                        )

                        hidden_leaderboard_df = gr.components.Dataframe(
                            value=df,
                            headers=COLS,
                            datatype=["str" for _ in range(len(COLS))],
                            visible=False,
                        )

                        search_bar.submit(
                            search_table,
                            [hidden_leaderboard_df, leaderboard_df, search_bar],
                            leaderboard_df,
                        )

                        filter_columns.change(
                            filter_items,
                            [hidden_leaderboard_df, leaderboard_df, filter_columns],
                            leaderboard_df,
                        )

                        shown_columns.change(
                            select_columns,
                            [hidden_leaderboard_df, shown_columns],
                            leaderboard_df,
                        )

                        gr.Markdown(
                            """
                            **Notes:**
                            - The Hugging Face team is working on their own implementation of this paper as a space; I'll be leaving this space up until that's available.
                            - Some scores may not be entirely accurate with respect to the cited paper while I work out the kinks and inaccuracies of this implementation.
                            - For any issues, questions, or comments, either open a discussion in this space's community tab or message me directly on Discord: yeyito777.
                            - Make sure to check the pinned discussion in this space's community tab for implementation details I'm not 100% sure about.
                            """,
                            elem_classes="markdown-text",
                        )

                with gr.TabItem("📝 About", id=2):
                    gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

                with gr.TabItem("🛠️ Submit models", id=3):
                    gr.Markdown(SUBMISSION_TEXT)
                    gr.Markdown(
                        "## 📤 Submit a model here:", elem_classes="markdown-text"
                    )
                    with gr.Column():
                        with gr.Column():
                            with gr.Accordion(
                                f"⏳ Evaluation Queue ({len(modelQueue)})",
                                open=False,
                            ):
                                with gr.Row():
                                    finished_eval_table = gr.components.Dataframe(
                                        value=pd.DataFrame(modelQueue, columns=["Type", "Model", "Reference Model"]),
                                    )
                        with gr.Row():
                            model_name = gr.Textbox(label="Model name")
                            revision_name = gr.Textbox(
                                label="revision", placeholder="main"
                            )
                        with gr.Row():
                            ref_model = gr.Dropdown(
                                choices=[
                                    "mistralai/Mistral-7B-v0.1",
                                    "huggyllama/llama-7b",
                                    "NousResearch/Llama-2-7b-hf",
                                    "upstage/SOLAR-10.7B-v1.0",
                                ],
                                label="Reference Model",
                                multiselect=False,
                                value="mistralai/Mistral-7B-v0.1",
                                interactive=True,
                            )
                            model_type = gr.Dropdown(
                                choices=["🟢 base", "🔶 finetuned"],
                                label="Model type",
                                multiselect=False,
                                value=None,
                                interactive=True,
                            )
                        submit_button = gr.Button("Submit Eval")
                        submission_result = gr.Markdown()
                        submit_button.click(
                            add_new_eval,
                            inputs=[model_name, revision_name, ref_model, model_type],
                            outputs=[submission_result],
                        )
                        gr.Markdown(SUBMISSION_TEXT_2)

thread = Thread(target=worker_thread)
thread.start()
demo.launch(share=True)

# Some worries:
# 1. Am I testing things correctly in eval.py, following the template format?

# 2. Am I choosing the correct splits in run.py? The hierarchy I use is: test > val > train
#    (as in: if a test split exists I go with that, then validation, then train).
# 3. I decided to go with winogrande_debiased instead of winogrande_l arbitrarily.
#    (Not sure which one the Open LLM Leaderboard uses, or what the standard is.)

# 4. I'm unsure why in eval.py we append the output at the end of the input.

# 5. Currently I'm using huggyllama/llama-7b as ref_model. Should I switch to Llama-2-7B? Maybe Mistral-7B?
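
# Regarding worry #2: the split-selection hierarchy described above amounts to something like
# the sketch below. This is purely illustrative and not wired into the app; the helper name
# (pick_split) and its use of a datasets.DatasetDict-style mapping are my own assumptions,
# not necessarily how run.py actually selects splits.
def pick_split(dataset_dict):
    # Prefer the test split, then validation, then train, mirroring the test > val > train hierarchy.
    for split_name in ("test", "validation", "train"):
        if split_name in dataset_dict:
            return dataset_dict[split_name]
    raise ValueError(f"No test/validation/train split found among {list(dataset_dict.keys())}")

# Hypothetical usage: pick_split(load_dataset("gsm8k", "main")) would return the test split,
# since gsm8k ships train and test splits and test takes priority.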