simplify_ux

#944
by clefourrier - opened
app.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import logging
 import time
 import schedule
@@ -60,18 +61,8 @@ NEW_DATA_ON_LEADERBOARD = True
 LEADERBOARD_DF = None
 
 def restart_space():
-    logging.info(f"Restarting space with repo ID: {REPO_ID}")
-    try:
-        # Check if new data is pending and download if necessary
-        if NEW_DATA_ON_LEADERBOARD:
-            logging.info("Fetching latest leaderboard data before restart.")
-            get_latest_data_leaderboard()
+    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
 
-        # Now restart the space
-        API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
-        logging.info("Space restarted successfully.")
-    except Exception as e:
-        logging.error(f"Failed to restart space: {e}")
 
 def time_diff_wrapper(func):
     def wrapper(*args, **kwargs):
@@ -109,35 +100,29 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
             attempt += 1
     raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
 
-def get_latest_data_leaderboard(leaderboard_initial_df=None):
+def get_latest_data_leaderboard(leaderboard_initial_df = None):
     global NEW_DATA_ON_LEADERBOARD
     global LEADERBOARD_DF
     if NEW_DATA_ON_LEADERBOARD:
-        logging.info("Leaderboard updated at reload!")
-        try:
-            leaderboard_dataset = datasets.load_dataset(
-                AGGREGATED_REPO,
-                "default",
-                split="train",
-                cache_dir=HF_HOME,
-                download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD, # Always download fresh data
-                verification_mode="no_checks"
-            )
-            LEADERBOARD_DF = get_leaderboard_df(
-                leaderboard_dataset=leaderboard_dataset,
-                cols=COLS,
-                benchmark_cols=BENCHMARK_COLS,
-            )
-            logging.info("Leaderboard dataset successfully downloaded.")
-        except Exception as e:
-            logging.error(f"Failed to download leaderboard dataset: {e}")
-            return
-
-        # Reset the flag after successful download
+        print("Leaderboard updated at reload!")
+        leaderboard_dataset = datasets.load_dataset(
+            AGGREGATED_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        )
+        LEADERBOARD_DF = get_leaderboard_df(
+            leaderboard_dataset=leaderboard_dataset,
+            cols=COLS,
+            benchmark_cols=BENCHMARK_COLS,
+        )
         NEW_DATA_ON_LEADERBOARD = False
+
     else:
         LEADERBOARD_DF = leaderboard_initial_df
+
     return LEADERBOARD_DF
 
 
@@ -147,9 +132,6 @@ def get_latest_data_queue():
 
 def init_space():
     """Initializes the application space, loading only necessary data."""
-    global NEW_DATA_ON_LEADERBOARD
-    NEW_DATA_ON_LEADERBOARD = True # Ensure new data is always pulled on restart
-
     if DO_FULL_INIT:
         # These downloads only occur on full initialization
         try:
@@ -184,6 +166,12 @@ LEADERBOARD_DF, eval_queue_dfs = init_space()
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
 
 
+# Function to check if a user is logged in
+def check_login(profile: gr.OAuthProfile | None) -> bool:
+    if profile is None:
+        return False
+    return True
+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -461,13 +449,18 @@ def update_leaderboard(payload: WebhookPayload) -> None:
     """Redownloads the leaderboard dataset each time it updates"""
     if payload.repo.type == "dataset" and payload.event.action == "update":
         global NEW_DATA_ON_LEADERBOARD
-        logging.info("New data detected, downloading updated leaderboard dataset.")
-
-        # Mark the flag for new data
+        if NEW_DATA_ON_LEADERBOARD:
+            return
         NEW_DATA_ON_LEADERBOARD = True
 
-        # Now actually download the latest data immediately
-        get_latest_data_leaderboard()
+        datasets.load_dataset(
+            AGGREGATED_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
+            verification_mode="no_checks"
+        )
 
 # The below code is not used at the moment, as we can manage the queue file locally
 LAST_UPDATE_QUEUE = datetime.datetime.now()
@@ -487,6 +480,5 @@ def update_queue(payload: WebhookPayload) -> None:
 webhooks_server.launch()
 
 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", hours=1) # Restart every 1h
-logging.info("Scheduler initialized to restart space every 1 hour.")
+scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h as backup in case automatic updates are not working
 scheduler.start()
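
Taken together, the app.py changes invert the refresh flow: the webhook force-redownloads the dataset into the Hugging Face cache (deduplicating update bursts via the NEW_DATA_ON_LEADERBOARD flag), the next UI reload rebuilds the DataFrame from that cache, and the scheduled restart becomes a 3-hour fallback. A minimal sketch of the pattern; AGGREGATED_REPO, HF_HOME, and build_df below are placeholders standing in for the app's config and get_leaderboard_df:

import datasets

AGGREGATED_REPO = "org/aggregated-results"  # placeholder; the app reads the real id from its config
HF_HOME = "."                               # placeholder cache directory
NEW_DATA_ON_LEADERBOARD = False

def build_df(ds):
    # stand-in for get_leaderboard_df
    return ds.to_pandas()

def on_dataset_update():
    """Webhook side: pull a fresh copy into the local cache, at most once per burst."""
    global NEW_DATA_ON_LEADERBOARD
    if NEW_DATA_ON_LEADERBOARD:  # a refresh is already pending, skip duplicate downloads
        return
    NEW_DATA_ON_LEADERBOARD = True
    datasets.load_dataset(
        AGGREGATED_REPO, "default", split="train", cache_dir=HF_HOME,
        download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,  # refresh the cache
        verification_mode="no_checks",
    )

def on_reload(initial_df):
    """UI side: rebuild from the already-refreshed cache, without a network hit."""
    global NEW_DATA_ON_LEADERBOARD
    if not NEW_DATA_ON_LEADERBOARD:
        return initial_df  # nothing new, keep serving the current DataFrame
    ds = datasets.load_dataset(
        AGGREGATED_REPO, "default", split="train", cache_dir=HF_HOME,
        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # reuse the webhook's copy
        verification_mode="no_checks",
    )
    NEW_DATA_ON_LEADERBOARD = False
    return build_df(ds)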
pyproject.toml CHANGED
@@ -11,18 +11,19 @@ dependencies = [
     "black>=24.8.0",
     "click>=8.1.7",
     "datasets>=3.0.0",
-    "huggingface-hub>=0.26.2",
+    "huggingface-hub>=0.24.7",
     "pandas>=2.2.2",
     "python-dateutil>=2.9.0",
     "sentencepiece>=0.2.0",
-    "transformers==4.46.1",
+    "transformers==4.44.2",
     "tokenizers>=0.19.0",
-    "gradio-space-ci",
+    "gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3",
     "isort>=5.13.2",
     "ruff>=0.6.4",
-    "gradio-leaderboard==0.0.12",
-    "gradio[oauth]==4.44.1",
+    "gradio-leaderboard==0.0.11",
+    "gradio[oauth]==4.44.0",
     "schedule>=1.2.2",
+    "pigar>=2.1.6",
 ]
 
 [tool.ruff]
@@ -33,16 +34,16 @@ ignore=["I","EM","FBT","TRY003","S101","D101","D102","D103","D104","D105","G004"
 fixable=["ALL"]
 select=["ALL"]
 
-[tool.ruff.lint]
+[tool.ruff.lint]
 select = ["E", "F"]
 fixable = ["ALL"]
 ignore = ["E501"] # line too long (black is taking care of this)
 
-[tool.isort]
+[tool.isort]
 profile = "black"
 
 [tool.black]
 line-length = 119
 
-[tool.uv.sources]
-gradio-space-ci = { git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci" }
+[tool.hatch.metadata]
+allow-direct-references = true
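
Note on the [tool.hatch.metadata] addition: hatchling rejects direct references in project dependencies by default, such as the git+https URL now used for gradio-space-ci, unless allow-direct-references = true is set, so the two changes belong together.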
requirements.txt CHANGED
@@ -2,16 +2,16 @@ APScheduler==3.10.4
 black==24.8.0
 click==8.1.7
 datasets==3.0.0
-huggingface-hub>=0.26.2
+huggingface-hub>=0.24.7
 pandas==2.2.2
 python-dateutil==2.9.0
 sentencepiece==0.2.0
-transformers==4.46.1
+transformers==4.44.2
 tokenizers>=0.19.0
 gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
 isort==5.13.2
 ruff===0.6.4
-gradio==4.44.1
+gradio==4.44.0
 gradio[oauth]
-gradio_leaderboard==0.0.12
+gradio_leaderboard==0.0.11
 schedule == 1.2.2
src/display/about.py CHANGED
@@ -13,7 +13,6 @@ icons = f"""
 - {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
 - {ModelType.chat.to_str(" : ")} model: chat like fine-tunes, either using IFT (datasets of task instruction), RLHF or DPO (changing the model loss a bit with an added policy), etc
 - {ModelType.merges.to_str(" : ")} model: merges or MoErges, models which have been merged or fused without additional fine-tuning.
-- {ModelType.MM.to_str(" : ")} model: models integrating multiple data types (e.g., text, image, audio) for tasks like image captioning and visual question answering.
 """
 LLM_BENCHMARKS_TEXT = """
 ## ABOUT
src/display/utils.py CHANGED
@@ -128,8 +128,6 @@ auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname
 auto_eval_column_dict.append(["generation", ColumnContent, ColumnContent("Generation", "number", False)])
 auto_eval_column_dict.append(["base_model", ColumnContent, ColumnContent("Base Model", "str", False)])
 
-auto_eval_column_dict.append(["co2_emissions_kg", ColumnContent, ColumnContent("CO₂ cost (kg)", "number", True)])
-
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
@@ -198,7 +196,6 @@ class ModelType(Enum):
     PT = ModelDetails(name="🟢 pretrained", symbol="🟢")
     CPT = ModelDetails(name="🟩 continuously pretrained", symbol="🟩")
     FT = ModelDetails(name="🔶 fine-tuned on domain-specific datasets", symbol="🔶")
-    MM = ModelDetails(name="🌸 multimodal", symbol="🌸")
     chat = ModelDetails(name="💬 chat models (RLHF, DPO, IFT, ...)", symbol="💬")
     merges = ModelDetails(name="🤝 base merges and moerges", symbol="🤝")
     Unknown = ModelDetails(name="❓ other", symbol="❓")
@@ -218,10 +215,9 @@ class ModelType(Enum):
             return ModelType.chat
         if "merge" in m_type or "🤝" in m_type:
             return ModelType.merges
-        if "multimodal" in m_type or "🌸" in m_type:
-            return ModelType.MM
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
src/submission/check_validity.py CHANGED
@@ -6,7 +6,7 @@ from collections import defaultdict
 from datetime import datetime, timedelta, timezone
 
 import huggingface_hub
-from huggingface_hub import ModelCard, hf_hub_download
+from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata, parse_safetensors_file_metadata
 from transformers import AutoConfig, AutoTokenizer
 
@@ -179,28 +179,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
 
     return set(file_names), users_to_submission_dates
 
-def check_chat_template(model: str, revision: str) -> tuple[bool, str]:
-    try:
-        # Attempt to download only the tokenizer_config.json file
-        config_file = hf_hub_download(
-            repo_id=model,
-            filename="tokenizer_config.json",
-            revision=revision,
-            repo_type="model"
-        )
-
-        # Read and parse the tokenizer_config.json file
-        with open(config_file, 'r') as f:
-            tokenizer_config = json.load(f)
 
-        # Check if chat_template exists in the tokenizer configuration
-        if 'chat_template' not in tokenizer_config:
-            return False, f"The model {model} doesn't have a chat_template in its tokenizer_config.json. Please add a chat_template before submitting or submit without it."
-
-        return True, ""
-    except Exception as e:
-        return False, f"Error checking chat_template for model {model}: {str(e)}"
-
 def get_model_tags(model_card, model: str):
     is_merge_from_metadata = False
     is_moe_from_metadata = False
src/submission/submit.py CHANGED
@@ -24,7 +24,6 @@ from src.submission.check_validity import (
     get_model_size,
     is_model_on_hub,
     user_submission_permission,
-    check_chat_template,
 )
 
 from src.voting.vote_system import VoteManager
@@ -115,7 +114,6 @@ def add_new_eval(
     except Exception as e:
         return styled_error("Could not get your model information. Please fill it up properly.")
 
-    # Has it been submitted already?
    model_key = f"{model}_{model_info.sha}_{precision}"
     if model_key in requested_models:
         return styled_error(f"The model '{model}' with revision '{model_info.sha}' and precision '{precision}' has already been submitted.")
@@ -125,12 +123,12 @@ def add_new_eval(
     if model_size is None:
         return styled_error(error_text)
 
-    # Absolute size limit for float16 and bfloat16
+    # First check: Absolute size limit for float16 and bfloat16
     if precision in ["float16", "bfloat16"] and model_size > 100:
         return styled_error(f"Sadly, models larger than 100B parameters cannot be submitted in {precision} precision at this time. "
                             f"Your model size: {model_size:.2f}B parameters.")
 
-    # Precision-adjusted size limit for 8bit, 4bit, and GPTQ
+    # Second check: Precision-adjusted size limit for 8bit, 4bit, and GPTQ
     if precision in ["8bit", "4bit", "GPTQ"]:
         size_checker = ModelSizeChecker(model=model, precision=precision, model_size_in_b=model_size)
 
@@ -165,12 +163,6 @@ def add_new_eval(
     modelcard_OK, error_msg, model_card = check_model_card(model)
     if not modelcard_OK:
         return styled_error(error_msg)
-
-    # Check the chat template submission
-    if use_chat_template:
-        chat_template_valid, chat_template_error = check_chat_template(model, revision)
-        if not chat_template_valid:
-            return styled_error(chat_template_error)
 
     # Seems good, creating the eval
     print("Adding new eval")