Clémentine
committed on
Commit
·
ed1fdef
1
Parent(s):
6fefae4
Added a 'forbidden models' list, to allow orgs to request that their models not be submitted (e.g. in case of contamination)
Browse files
app.py
CHANGED
@@ -10,7 +10,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
|
|
10 |
from huggingface_hub import HfApi
|
11 |
from transformers import AutoConfig
|
12 |
|
13 |
-
from src.auto_leaderboard.get_model_metadata import apply_metadata
|
14 |
from src.assets.text_content import *
|
15 |
from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
|
16 |
from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
|
@@ -227,9 +227,13 @@ def add_new_eval(
|
|
227 |
os.makedirs(OUT_DIR, exist_ok=True)
|
228 |
out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
|
229 |
|
|
|
|
|
|
|
|
|
230 |
# Check for duplicate submission
|
231 |
if out_path.split("eval-queue/")[1].lower() in requested_models:
|
232 |
-
return styled_warning("This model has been already submitted.")
|
233 |
|
234 |
with open(out_path, "w") as f:
|
235 |
f.write(json.dumps(eval_entry))
|
|
|
10 |
from huggingface_hub import HfApi
|
11 |
from transformers import AutoConfig
|
12 |
|
13 |
+
from src.auto_leaderboard.get_model_metadata import apply_metadata, DO_NOT_SUBMIT_MODELS
|
14 |
from src.assets.text_content import *
|
15 |
from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
|
16 |
from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
|
|
|
227 |
os.makedirs(OUT_DIR, exist_ok=True)
|
228 |
out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
|
229 |
|
230 |
+
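# Check if the model has been forbidden. NOTE(review): this compares the request file name (which ends in "_eval_request_..._.json") against DO_NOT_SUBMIT_MODELS, which appears to hold bare model names - confirm this check can actually match.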
|
231 |
+
if out_path.split("eval-queue/")[1] in DO_NOT_SUBMIT_MODELS:
|
232 |
+
return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
|
233 |
+
|
234 |
# Check for duplicate submission
|
235 |
if out_path.split("eval-queue/")[1].lower() in requested_models:
|
236 |
+
return styled_warning("This model has been already submitted.")
|
237 |
|
238 |
with open(out_path, "w") as f:
|
239 |
f.write(json.dumps(eval_entry))
|
src/auto_leaderboard/get_model_metadata.py
CHANGED
@@ -8,7 +8,7 @@ from tqdm import tqdm
|
|
8 |
|
9 |
from src.utils_display import AutoEvalColumn, model_hyperlink
|
10 |
from src.auto_leaderboard.model_metadata_type import ModelType, model_type_from_str, MODEL_TYPE_METADATA
|
11 |
-
from src.auto_leaderboard.model_metadata_flags import FLAGGED_MODELS
|
12 |
|
13 |
from huggingface_hub import HfApi
|
14 |
import huggingface_hub
|
@@ -106,7 +106,18 @@ def flag_models(leaderboard_data:List[dict]):
|
|
106 |
issue_link = model_hyperlink(FLAGGED_MODELS[model_data["model_name_for_query"]], f"See discussion #{issue_num}")
|
107 |
model_data[AutoEvalColumn.model.name] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
|
108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
def apply_metadata(leaderboard_data: List[dict]):
|
|
|
110 |
get_model_type(leaderboard_data)
|
111 |
get_model_infos_from_hub(leaderboard_data)
|
112 |
flag_models(leaderboard_data)
|
|
|
8 |
|
9 |
from src.utils_display import AutoEvalColumn, model_hyperlink
|
10 |
from src.auto_leaderboard.model_metadata_type import ModelType, model_type_from_str, MODEL_TYPE_METADATA
|
11 |
+
from src.auto_leaderboard.model_metadata_flags import FLAGGED_MODELS, DO_NOT_SUBMIT_MODELS
|
12 |
|
13 |
from huggingface_hub import HfApi
|
14 |
import huggingface_hub
|
|
|
106 |
issue_link = model_hyperlink(FLAGGED_MODELS[model_data["model_name_for_query"]], f"See discussion #{issue_num}")
|
107 |
model_data[AutoEvalColumn.model.name] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
|
108 |
|
109 |
+
def remove_forbidden_models(leaderboard_data: List[dict]):
|
110 |
+
indices_to_remove = []
|
111 |
+
for ix, model in enumerate(leaderboard_data):
|
112 |
+
if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
|
113 |
+
indices_to_remove.append(ix)
|
114 |
+
|
115 |
+
for ix in reversed(indices_to_remove):
|
116 |
+
leaderboard_data.pop(ix)
|
117 |
+
return leaderboard_data
|
118 |
+
|
119 |
def apply_metadata(leaderboard_data: List[dict]):
|
120 |
+
leaderboard_data = remove_forbidden_models(leaderboard_data)
|
121 |
get_model_type(leaderboard_data)
|
122 |
get_model_infos_from_hub(leaderboard_data)
|
123 |
flag_models(leaderboard_data)
|
src/auto_leaderboard/model_metadata_flags.py
CHANGED
@@ -1,6 +1,12 @@
|
|
1 |
-
#
|
|
|
2 |
FLAGGED_MODELS = {
|
3 |
"Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
|
4 |
"deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
|
5 |
"Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
|
6 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Models which have been flagged by users as being problematic for one reason or another
|
2 |
+
# (Model name to forum discussion link)
|
3 |
FLAGGED_MODELS = {
|
4 |
"Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
|
5 |
"deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
|
6 |
"Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
|
7 |
+
}
|
8 |
+
|
9 |
+
# Models whose orgs have requested that they not be submitted to the leaderboard
|
10 |
+
DO_NOT_SUBMIT_MODELS = [
|
11 |
+
"Voicelab/trurl-2-13b", # trained on MMLU
|
12 |
+
]
|