Commit 608184c
Parent(s): 966ae7b

Support comparing environmental impact

Files changed:
- app.py +27 -6
- src/constants.py +12 -0
- src/env_impact.py +109 -0
- src/requests.py +20 -0
- src/results.py +11 -5
app.py CHANGED
@@ -11,6 +11,7 @@ from src.details import (
     update_subtasks_component,
     update_task_description_component,
 )
+from src.env_impact import plot_env_impact
 from src.model_tree import load_model_tree
 from src.results import (
     clear_results,
@@ -120,6 +121,21 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         details_show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
         details = gr.HTML()
         details_dataframe = gr.State()
+    with gr.Tab("Environmental impact"):
+        gr.Markdown(
+            "The environmental impact calculations we display are derived from the specific inference setup used "
+            "for evaluation. We leverage 🤗 [Accelerate](https://huggingface.co/docs/accelerate) to efficiently "
+            "parallelize the model across 8 Nvidia H100 SXM GPUs in a compute cluster located in Northern Virginia. "
+            "These results reflect the energy consumption and associated emissions of this configuration, "
+            "providing transparency and insight into the resource requirements of large language model evaluations. "
+            "You can find more details in our documentation about the [environmental impact](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions)."
+        )
+        load_env_impact_btn = gr.Button("Load", interactive=False)
+        clear_env_impact_btn = gr.Button("Clear")
+        with gr.Row():
+            env_impact_plot_1 = gr.Plot(visible=True)
+            env_impact_plot_2 = gr.Plot(visible=True)
+        env_impact = gr.HTML()
 
     # DEMO:
     demo.load(
@@ -134,15 +150,15 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
     # Buttons:
     gr.on(
         triggers=[model_ids.input],
-        fn=lambda: (gr.Button(interactive=True),) * 3,
-        outputs=[load_model_tree_btn, load_results_btn, load_configs_btn],
+        fn=lambda: (gr.Button(interactive=True),) * 4,
+        outputs=[load_model_tree_btn, load_results_btn, load_configs_btn, load_env_impact_btn],
     )
 
     # RESULTS:
     gr.on(
-        triggers=[load_results_btn.click, load_configs_btn.click],
+        triggers=[load_results_btn.click, load_configs_btn.click, load_env_impact_btn.click],
         fn=display_loading_message_for_results,
-        outputs=[results, configs],
+        outputs=[results, configs, env_impact],
     ).then(
         fn=load_results,
         inputs=[
@@ -178,11 +194,15 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         ],
         fn=display_results,
         inputs=[results_dataframe, results_task, hide_std_errors, show_only_differences],
-        outputs=[results, configs],
+        outputs=[results, configs, env_impact],
     ).then(
         fn=plot_results,
         inputs=[results_dataframe, results_task],
         outputs=[results_plot_1, results_plot_2],
+    ).then(
+        fn=plot_env_impact,
+        inputs=[results_dataframe],
+        outputs=[env_impact_plot_1, env_impact_plot_2],
     ).then(
         fn=clear_results_file,
         outputs=results_file,
@@ -193,13 +213,14 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         outputs=results_file,
     )
     gr.on(
-        triggers=[clear_results_btn.click, clear_configs_btn.click],
+        triggers=[clear_results_btn.click, clear_configs_btn.click, clear_env_impact_btn.click],
         fn=clear_results,
         outputs=[
             model_ids,
             results_dataframe,
             load_results_btn,
             load_configs_btn,
+            load_env_impact_btn,
             results_task,
             configs_task,
         ],
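For readers unfamiliar with Gradio's event wiring, here is a minimal, self-contained sketch of the shared-trigger pattern this commit extends: gr.on binds one handler to several triggers, and the handler must return exactly one value per declared output (which is why display_results grows a third return value and the loading message becomes a 3-tuple below). All component names here are hypothetical, for illustration only.

import gradio as gr

with gr.Blocks() as demo:
    # Hypothetical components standing in for the load/clear buttons and tabs above
    btn_a = gr.Button("A")
    btn_b = gr.Button("B")
    out_1 = gr.HTML()
    out_2 = gr.HTML()

    # One handler, several triggers: clicking either button fires it,
    # and it must return one value per entry in `outputs`.
    gr.on(
        triggers=[btn_a.click, btn_b.click],
        fn=lambda: ("<p>first output</p>", "<p>second output</p>"),
        outputs=[out_1, out_2],
    )

demo.launch()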
src/constants.py CHANGED
@@ -1,3 +1,4 @@
+REQUESTS_DATASET_ID = "datasets/open-llm-leaderboard/requests"
 RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
 
 DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
@@ -82,3 +83,14 @@ DERIVED_MODEL_TYPES = [
     ("Merges", "merge"),
     ("Quantizations", "quantized"),
 ]
+
+
+MODEL_TYPE_LABEL_TO_TYPE = {
+    "🟢 : 🟢 pretrained": "pretrained",
+    "🟩 : 🟩 continuously pretrained": "pretrained",
+    "🔶 : 🔶 fine-tuned on domain-specific datasets": "fine_tuned_chat",
+    "💬 : 💬 chat models (RLHF, DPO, IFT, ...)": "fine_tuned_chat",
+    "🤝 : 🤝 base merges and moerges": "merges",
+    "🌸 : 🌸 multimodal": "multimodal",
+    "❓ : ❓ other": "other",
+}
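A quick illustration of how src/env_impact.py consumes this mapping: lookups fall back to the raw label via dict.get, so an unrecognized request-file label still shows up in the table rather than raising. The label value below is taken from the mapping's own keys.

import src.constants as constants

label = "🟢 : 🟢 pretrained"  # example label, as stored in request files
# Mirrors get_env_impact: unknown labels fall back to the label itself.
model_type = constants.MODEL_TYPE_LABEL_TO_TYPE.get(label, label)
print(model_type)  # -> "pretrained"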
src/env_impact.py ADDED
@@ -0,0 +1,109 @@
+from datetime import timedelta
+
+import plotly.express as px
+
+import src.constants as constants
+from src.hub import load_model_card
+from src.requests import load_request
+
+
+async def get_env_impact(data):
+    total_evaluation_time_seconds = data.get("total_evaluation_time_seconds")
+    if total_evaluation_time_seconds:
+        total_evaluation_time_seconds = float(total_evaluation_time_seconds)
+    env_impact = {
+        "co2_emissions": calculate_co2_emissions(total_evaluation_time_seconds),
+        "total_evaluation_time": str(timedelta(seconds=total_evaluation_time_seconds)),
+        "num_parameters_billions": data.get("config", {}).get("model_num_parameters") / 10**9,
+        "precision": data.get("config", {}).get("model_dtype"),
+    }
+    request = await load_request(data["model_name"], env_impact["precision"])
+    if request:
+        model_type_label = request.get("model_type", "unknown")
+        env_impact["model_type"] = constants.MODEL_TYPE_LABEL_TO_TYPE.get(model_type_label, model_type_label)
+        env_impact["architecture"] = request.get("architectures", "Unknown")
+    # MoE
+    model_card = await load_model_card(data["model_name"])
+    model_tags = get_moe_model_tags(model_card.data, data["model_name"])
+    moe = "moe" in model_tags or "moe" in data["model_name"].lower()
+    env_impact["moe"] = moe
+    return env_impact
+
+
+# Source: https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions#function-for-c02-calculation
+def calculate_co2_emissions(total_evaluation_time_seconds: float | None) -> float:
+    if total_evaluation_time_seconds is None or total_evaluation_time_seconds <= 0:
+        return -1
+    # Power consumption for 8 H100 SXM GPUs in kilowatts (kW)
+    power_consumption_kW = 5.6
+    # Carbon intensity in grams CO₂ per kWh in Virginia
+    carbon_intensity_g_per_kWh = 269.8
+    # Convert evaluation time to hours
+    total_evaluation_time_hours = total_evaluation_time_seconds / 3600
+    # Calculate energy consumption in kWh
+    energy_consumption_kWh = power_consumption_kW * total_evaluation_time_hours
+    # Calculate CO₂ emissions in grams
+    co2_emissions_g = energy_consumption_kWh * carbon_intensity_g_per_kWh
+    # Convert grams to kilograms
+    return co2_emissions_g / 1000
+
+
+# Source: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard_parser/blob/main/src/submission/check_validity.py#L33
+def get_moe_model_tags(model_card, model_id):
+    # is_merge_from_metadata = False
+    is_moe_from_metadata = False
+    is_moe_from_model_card = False
+    # is_merge_from_model_card = False
+    tags = []
+    if model_card is None:
+        return tags
+    if model_card.tags:
+        # is_merge_from_metadata = any(tag in model_card.tags for tag in ["merge", "moerge", "mergekit", "lazymergekit"])
+        is_moe_from_metadata = any(tag in model_card.tags for tag in ["moe", "moerge", "mixtral"])
+    if model_card.get("text", False):
+        # is_merge_from_model_card = any(
+        #     keyword in model_card.text.lower() for keyword in ["merged model", "merge model", "moerge"]
+        # )
+        is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in ["moe", "mixtral"])
+    # if is_merge_from_model_card or is_merge_from_metadata:
+    #     tags.append("merge")
+    is_moe_from_name = any(
+        key in model_id.lower().replace("/", "-").replace("_", "-").split("-") for key in ["moe", "mixtral"]
+    )
+    # Hardcoded check for "rhymes-ai/Aria" model
+    if model_id == "rhymes-ai/Aria":
+        tags.append("moe")
+    elif is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
+        tags.append("moe")
+    return tags
+
+
+def plot_env_impact(df):
+    if df is None:
+        return None, None
+    fig_1 = px.scatter(
+        df.rename_axis(index="Model").reset_index(),
+        x="env_impact.num_parameters_billions",
+        y="env_impact.co2_emissions",
+        color="Model",
+        title="Evaluation CO₂ Emissions (kg) vs. #Params (B)",
+        labels={
+            "env_impact.num_parameters_billions": "#Params (B)",
+            "env_impact.co2_emissions": "Evaluation CO₂ Emissions (kg)",
+        },
+        color_discrete_sequence=px.colors.qualitative.Safe,  # TODO: https://plotly.com/python/discrete-color/
+    )
+    fig_2 = px.scatter(
+        df.rename_axis(index="Model").reset_index(),
+        x="results.leaderboard.acc_norm,none",
+        y="env_impact.co2_emissions",
+        color="Model",
+        title="Evaluation CO₂ Emissions (kg) vs. Score",
+        labels={
+            "results.leaderboard.acc_norm,none": "Mean Score",
+            "env_impact.co2_emissions": "Evaluation CO₂ Emissions (kg)",
+        },
+        color_discrete_sequence=px.colors.qualitative.Safe,  # TODO: https://plotly.com/python/discrete-color/
+    )
+    fig_2.update_xaxes(range=[0, 1])
+    return fig_1, fig_2
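As a sanity check on calculate_co2_emissions, here is the arithmetic for a hypothetical two-hour evaluation under the constants above (5.6 kW for 8 H100 SXM GPUs, 269.8 g CO₂/kWh in Virginia):

from src.env_impact import calculate_co2_emissions

# 2 h * 5.6 kW = 11.2 kWh; 11.2 kWh * 269.8 g/kWh = 3021.76 g ≈ 3.02 kg
print(calculate_co2_emissions(2 * 3600))  # -> 3.02176
print(calculate_co2_emissions(None))      # -> -1 (sentinel for missing timing data)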
src/requests.py ADDED
@@ -0,0 +1,20 @@
+import asyncio
+
+import src.constants as constants
+from src.hub import glob, load_json_file
+
+
+def fetch_request_paths(model_id):
+    path = f"{constants.REQUESTS_DATASET_ID}/{model_id}_eval_request_*.json"
+    return glob(path)
+
+
+async def load_request(model_id, precision):
+    paths = await asyncio.to_thread(fetch_request_paths, model_id)
+    if not paths:
+        return
+    # TODO: Why sorted and reversed? https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard_parser/blob/main/src/leaderboard/read_evals.py#L254
+    for path in sorted(paths, reverse=True):
+        data = await load_json_file(path)
+        if data["precision"] == precision.split(".")[-1]:
+            return data
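The precision argument comes from the results config's model_dtype, typically a dtype string such as "torch.float16", while request files store only the bare dtype name; splitting on the dot reconciles the two. A minimal illustration, with an assumed example value:

precision = "torch.float16"      # hypothetical model_dtype from a results file
print(precision.split(".")[-1])  # -> "float16", the form stored in request files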
src/results.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
 import plotly.express as px
 
 import src.constants as constants
+from src.env_impact import get_env_impact
 from src.hub import glob, load_json_file
 
 
@@ -37,10 +38,11 @@ async def load_results_dataframe(model_id, result_paths_per_model=None):
     results = [result for result in results if result]
     if not results:
         return
-    data = {"results": {}, "configs": {}}
+    data = {"results": {}, "configs": {}, "env_impact": {}}
     for result in results:
         data["results"].update(result["results"])
         data["configs"].update(result["configs"])
+        data["env_impact"].update(await get_env_impact(result))
         model_name = result.get("model_name", "Model")
     df = pd.json_normalize([data])
     # df.columns = df.columns.str.split(".")  # .split return a list instead of a tuple
@@ -70,6 +72,7 @@ def display_results(df, task, hide_std_errors, show_only_differences):
     return (
         display_tab("results", df, task, hide_std_errors=hide_std_errors),
         display_tab("configs", df, task, show_only_differences=show_only_differences),
+        display_tab("env_impact", df, task),
     )
 
 
@@ -113,7 +116,10 @@ def display_tab(tab, df, task, hide_std_errors=True, show_only_differences=False):
         subset = idx[colored_rows, idx[:]]
         df.background_gradient(cmap="PiYG", vmin=0, vmax=1, subset=subset, axis=None)
     # Format index values: remove prefix and suffix
-    start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
+    if tab == "env_impact":
+        start = len(f"{tab}.")
+    else:
+        start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
     df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
     # Fix overflow
     df.set_table_styles(
@@ -144,11 +150,11 @@ def update_tasks_component():
 
 
 def clear_results():
-    # model_ids, dataframe, load_results_btn, load_configs_btn, results_task, configs_task
+    # model_ids, dataframe, load_results_btn, load_configs_btn, load_env_impact_btn, results_task, configs_task
     return (
         gr.Dropdown(value=[]),
         None,
-        *(gr.Button("Load", interactive=False),) * 2,
+        *(gr.Button("Load", interactive=False),) * 3,
         *(
             gr.Radio(
                 ["All"] + list(constants.TASKS.values()),
@@ -163,7 +169,7 @@ def clear_results():
 
 
 def display_loading_message_for_results():
-    return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
+    return ("<h3 style='text-align: center;'>Loading...</h3>",) * 3
 
 
 def plot_results(df, task):
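The dotted column names that plot_env_impact references (e.g. "env_impact.co2_emissions") come from pd.json_normalize flattening the nested dict built in load_results_dataframe. A minimal sketch with made-up numbers:

import pandas as pd

data = {"env_impact": {"co2_emissions": 3.02, "num_parameters_billions": 7.24}}
df = pd.json_normalize([data])
print(df.columns.tolist())
# -> ['env_impact.co2_emissions', 'env_impact.num_parameters_billions']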