Commit 608184c
Parent(s): 966ae7b

Support comparing environmental impact

Files changed:
- app.py +27 -6
- src/constants.py +12 -0
- src/env_impact.py +109 -0
- src/requests.py +20 -0
- src/results.py +11 -5
app.py CHANGED
@@ -11,6 +11,7 @@ from src.details import (
     update_subtasks_component,
     update_task_description_component,
 )
+from src.env_impact import plot_env_impact
 from src.model_tree import load_model_tree
 from src.results import (
     clear_results,
@@ -120,6 +121,21 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         details_show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
         details = gr.HTML()
         details_dataframe = gr.State()
+    with gr.Tab("Environmental impact"):
+        gr.Markdown(
+            "The environmental impact calculations we display are derived from the specific inference setup used "
+            "for evaluation. We leverage 🤗 [Accelerate](https://huggingface.co/docs/accelerate) to efficiently "
+            "parallelize the model across 8 Nvidia H100 SXM GPUs in a compute cluster located in Northern Virginia. "
+            "These results reflect the energy consumption and associated emissions of this configuration, "
+            "providing transparency and insight into the resource requirements of large language model evaluations. "
+            "You can find more details in our documentation about the [environmental impact](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions)."
+        )
+        load_env_impact_btn = gr.Button("Load", interactive=False)
+        clear_env_impact_btn = gr.Button("Clear")
+        with gr.Row():
+            env_impact_plot_1 = gr.Plot(visible=True)
+            env_impact_plot_2 = gr.Plot(visible=True)
+        env_impact = gr.HTML()
 
     # DEMO:
     demo.load(
@@ -134,15 +150,15 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
     # Buttons:
     gr.on(
         triggers=[model_ids.input],
-        fn=lambda: (gr.Button(interactive=True),) * 3,
-        outputs=[load_model_tree_btn, load_results_btn, load_configs_btn],
+        fn=lambda: (gr.Button(interactive=True),) * 4,
+        outputs=[load_model_tree_btn, load_results_btn, load_configs_btn, load_env_impact_btn],
     )
 
     # RESULTS:
     gr.on(
-        triggers=[load_results_btn.click, load_configs_btn.click],
+        triggers=[load_results_btn.click, load_configs_btn.click, load_env_impact_btn.click],
         fn=display_loading_message_for_results,
-        outputs=[results, configs],
+        outputs=[results, configs, env_impact],
     ).then(
         fn=load_results,
         inputs=[
@@ -178,11 +194,15 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         ],
         fn=display_results,
         inputs=[results_dataframe, results_task, hide_std_errors, show_only_differences],
-        outputs=[results, configs],
+        outputs=[results, configs, env_impact],
     ).then(
         fn=plot_results,
         inputs=[results_dataframe, results_task],
         outputs=[results_plot_1, results_plot_2],
+    ).then(
+        fn=plot_env_impact,
+        inputs=[results_dataframe],
+        outputs=[env_impact_plot_1, env_impact_plot_2],
     ).then(
         fn=clear_results_file,
         outputs=results_file,
@@ -193,13 +213,14 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         outputs=results_file,
     )
     gr.on(
-        triggers=[clear_results_btn.click, clear_configs_btn.click],
+        triggers=[clear_results_btn.click, clear_configs_btn.click, clear_env_impact_btn.click],
         fn=clear_results,
         outputs=[
             model_ids,
             results_dataframe,
             load_results_btn,
             load_configs_btn,
+            load_env_impact_btn,
             results_task,
             configs_task,
         ],
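For readers unfamiliar with Gradio's event wiring, here is a minimal, self-contained sketch of the shared-trigger pattern this commit extends: gr.on binds one handler to several triggers, and the handler must return exactly one value per declared output (which is why display_results grows a third return value and the loading message becomes a 3-tuple below). All component names here are hypothetical, for illustration only.

import gradio as gr

with gr.Blocks() as demo:
    # Hypothetical components standing in for the load/clear buttons and tabs above
    btn_a = gr.Button("A")
    btn_b = gr.Button("B")
    out_1 = gr.HTML()
    out_2 = gr.HTML()

    # One handler, several triggers: clicking either button fires it,
    # and it must return one value per entry in `outputs`.
    gr.on(
        triggers=[btn_a.click, btn_b.click],
        fn=lambda: ("<p>first output</p>", "<p>second output</p>"),
        outputs=[out_1, out_2],
    )

demo.launch()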
src/constants.py CHANGED
@@ -1,3 +1,4 @@
+REQUESTS_DATASET_ID = "datasets/open-llm-leaderboard/requests"
 RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
 
 DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
@@ -82,3 +83,14 @@ DERIVED_MODEL_TYPES = [
     ("Merges", "merge"),
     ("Quantizations", "quantized"),
 ]
+
+
+MODEL_TYPE_LABEL_TO_TYPE = {
+    "🟢 : 🟢 pretrained": "pretrained",
+    "🟩 : 🟩 continuously pretrained": "pretrained",
+    "🔶 : 🔶 fine-tuned on domain-specific datasets": "fine_tuned_chat",
+    "💬 : 💬 chat models (RLHF, DPO, IFT, ...)": "fine_tuned_chat",
+    "🤝 : 🤝 base merges and moerges": "merges",
+    "🌸 : 🌸 multimodal": "multimodal",
+    "❓ : ❓ other": "other",
+}
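A quick illustration of how src/env_impact.py consumes this mapping: lookups fall back to the raw label via dict.get, so an unrecognized request-file label still shows up in the table rather than raising. The label value below is taken from the mapping's own keys.

import src.constants as constants

label = "🟢 : 🟢 pretrained"  # example label, as stored in request files
# Mirrors get_env_impact: unknown labels fall back to the label itself.
model_type = constants.MODEL_TYPE_LABEL_TO_TYPE.get(label, label)
print(model_type)  # -> "pretrained"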
src/env_impact.py ADDED
@@ -0,0 +1,109 @@
+from datetime import timedelta
+
+import plotly.express as px
+
+import src.constants as constants
+from src.hub import load_model_card
+from src.requests import load_request
+
+
+async def get_env_impact(data):
+    total_evaluation_time_seconds = data.get("total_evaluation_time_seconds")
+    if total_evaluation_time_seconds:
+        total_evaluation_time_seconds = float(total_evaluation_time_seconds)
+    env_impact = {
+        "co2_emissions": calculate_co2_emissions(total_evaluation_time_seconds),
+        "total_evaluation_time": str(timedelta(seconds=total_evaluation_time_seconds)),
+        "num_parameters_billions": data.get("config", {}).get("model_num_parameters") / 10**9,
+        "precision": data.get("config", {}).get("model_dtype"),
+    }
+    request = await load_request(data["model_name"], env_impact["precision"])
+    if request:
+        model_type_label = request.get("model_type", "unknown")
+        env_impact["model_type"] = constants.MODEL_TYPE_LABEL_TO_TYPE.get(model_type_label, model_type_label)
+        env_impact["architecture"] = request.get("architectures", "Unknown")
+    # MoE
+    model_card = await load_model_card(data["model_name"])
+    model_tags = get_moe_model_tags(model_card.data, data["model_name"])
+    moe = "moe" in model_tags or "moe" in data["model_name"].lower()
+    env_impact["moe"] = moe
+    return env_impact
+
+
+# Source: https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions#function-for-c02-calculation
+def calculate_co2_emissions(total_evaluation_time_seconds: float | None) -> float:
+    if total_evaluation_time_seconds is None or total_evaluation_time_seconds <= 0:
+        return -1
+    # Power consumption for 8 H100 SXM GPUs in kilowatts (kW)
+    power_consumption_kW = 5.6
+    # Carbon intensity in grams CO₂ per kWh in Virginia
+    carbon_intensity_g_per_kWh = 269.8
+    # Convert evaluation time to hours
+    total_evaluation_time_hours = total_evaluation_time_seconds / 3600
+    # Calculate energy consumption in kWh
+    energy_consumption_kWh = power_consumption_kW * total_evaluation_time_hours
+    # Calculate CO₂ emissions in grams
+    co2_emissions_g = energy_consumption_kWh * carbon_intensity_g_per_kWh
+    # Convert grams to kilograms
+    return co2_emissions_g / 1000
+
+
+# Source: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard_parser/blob/main/src/submission/check_validity.py#L33
+def get_moe_model_tags(model_card, model_id):
+    # is_merge_from_metadata = False
+    is_moe_from_metadata = False
+    is_moe_from_model_card = False
+    # is_merge_from_model_card = False
+    tags = []
+    if model_card is None:
+        return tags
+    if model_card.tags:
+        # is_merge_from_metadata = any(tag in model_card.tags for tag in ["merge", "moerge", "mergekit", "lazymergekit"])
+        is_moe_from_metadata = any(tag in model_card.tags for tag in ["moe", "moerge", "mixtral"])
+    if model_card.get("text", False):
+        # is_merge_from_model_card = any(
+        #     keyword in model_card.text.lower() for keyword in ["merged model", "merge model", "moerge"]
+        # )
+        is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in ["moe", "mixtral"])
+    # if is_merge_from_model_card or is_merge_from_metadata:
+    #     tags.append("merge")
+    is_moe_from_name = any(
+        key in model_id.lower().replace("/", "-").replace("_", "-").split("-") for key in ["moe", "mixtral"]
+    )
+    # Hardcoded check for "rhymes-ai/Aria" model
+    if model_id == "rhymes-ai/Aria":
+        tags.append("moe")
+    elif is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
+        tags.append("moe")
+    return tags
+
+
+def plot_env_impact(df):
+    if df is None:
+        return None, None
+    fig_1 = px.scatter(
+        df.rename_axis(index="Model").reset_index(),
+        x="env_impact.num_parameters_billions",
+        y="env_impact.co2_emissions",
+        color="Model",
+        title="Evaluation CO₂ Emissions (kg) vs. #Params (B)",
+        labels={
+            "env_impact.num_parameters_billions": "#Params (B)",
+            "env_impact.co2_emissions": "Evaluation CO₂ Emissions (kg)",
+        },
+        color_discrete_sequence=px.colors.qualitative.Safe,  # TODO: https://plotly.com/python/discrete-color/
+    )
+    fig_2 = px.scatter(
+        df.rename_axis(index="Model").reset_index(),
+        x="results.leaderboard.acc_norm,none",
+        y="env_impact.co2_emissions",
+        color="Model",
+        title="Evaluation CO₂ Emissions (kg) vs. Score",
+        labels={
+            "results.leaderboard.acc_norm,none": "Mean Score",
+            "env_impact.co2_emissions": "Evaluation CO₂ Emissions (kg)",
+        },
+        color_discrete_sequence=px.colors.qualitative.Safe,  # TODO: https://plotly.com/python/discrete-color/
+    )
+    fig_2.update_xaxes(range=[0, 1])
+    return fig_1, fig_2
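As a sanity check on calculate_co2_emissions, here is the arithmetic for a hypothetical two-hour evaluation under the constants above (5.6 kW for 8 H100 SXM GPUs, 269.8 g CO₂/kWh in Virginia):

from src.env_impact import calculate_co2_emissions

# 2 h * 5.6 kW = 11.2 kWh; 11.2 kWh * 269.8 g/kWh = 3021.76 g ≈ 3.02 kg
print(calculate_co2_emissions(2 * 3600))  # -> 3.02176
print(calculate_co2_emissions(None))      # -> -1 (sentinel for missing timing data)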
src/requests.py ADDED
@@ -0,0 +1,20 @@
+import asyncio
+
+import src.constants as constants
+from src.hub import glob, load_json_file
+
+
+def fetch_request_paths(model_id):
+    path = f"{constants.REQUESTS_DATASET_ID}/{model_id}_eval_request_*.json"
+    return glob(path)
+
+
+async def load_request(model_id, precision):
+    paths = await asyncio.to_thread(fetch_request_paths, model_id)
+    if not paths:
+        return
+    # TODO: Why sorted and reversed? https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard_parser/blob/main/src/leaderboard/read_evals.py#L254
+    for path in sorted(paths, reverse=True):
+        data = await load_json_file(path)
+        if data["precision"] == precision.split(".")[-1]:
+            return data
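The precision argument comes from the results config's model_dtype, typically a dtype string such as "torch.float16", while request files store only the bare dtype name; splitting on the dot reconciles the two. A minimal illustration, with an assumed example value:

precision = "torch.float16"      # hypothetical model_dtype from a results file
print(precision.split(".")[-1])  # -> "float16", the form stored in request files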
src/results.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
 import plotly.express as px
 
 import src.constants as constants
+from src.env_impact import get_env_impact
 from src.hub import glob, load_json_file
 
 
@@ -37,10 +38,11 @@ async def load_results_dataframe(model_id, result_paths_per_model=None):
     results = [result for result in results if result]
     if not results:
         return
-    data = {"results": {}, "configs": {}}
+    data = {"results": {}, "configs": {}, "env_impact": {}}
     for result in results:
         data["results"].update(result["results"])
         data["configs"].update(result["configs"])
+        data["env_impact"].update(await get_env_impact(result))
         model_name = result.get("model_name", "Model")
     df = pd.json_normalize([data])
     # df.columns = df.columns.str.split(".")  # .split return a list instead of a tuple
@@ -70,6 +72,7 @@ def display_results(df, task, hide_std_errors, show_only_differences):
     return (
         display_tab("results", df, task, hide_std_errors=hide_std_errors),
         display_tab("configs", df, task, show_only_differences=show_only_differences),
+        display_tab("env_impact", df, task),
     )
 
 
@@ -113,7 +116,10 @@ def display_tab(tab, df, task, hide_std_errors=True, show_only_differences=False):
         subset = idx[colored_rows, idx[:]]
         df.background_gradient(cmap="PiYG", vmin=0, vmax=1, subset=subset, axis=None)
     # Format index values: remove prefix and suffix
-    start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
+    if tab == "env_impact":
+        start = len(f"{tab}.")
+    else:
+        start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
     df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
     # Fix overflow
     df.set_table_styles(
@@ -144,11 +150,11 @@ def update_tasks_component():
 
 
 def clear_results():
-    # model_ids, dataframe, load_results_btn, load_configs_btn, results_task, configs_task
+    # model_ids, dataframe, load_results_btn, load_configs_btn, load_env_impact_btn, results_task, configs_task
     return (
         gr.Dropdown(value=[]),
         None,
-        *(gr.Button("Load", interactive=False),) * 2,
+        *(gr.Button("Load", interactive=False),) * 3,
         *(
             gr.Radio(
                 ["All"] + list(constants.TASKS.values()),
@@ -163,7 +169,7 @@ def clear_results():
 
 
 def display_loading_message_for_results():
-    return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
+    return ("<h3 style='text-align: center;'>Loading...</h3>",) * 3
 
 
 def plot_results(df, task):
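The dotted column names that plot_env_impact references (e.g. "env_impact.co2_emissions") come from pd.json_normalize flattening the nested dict built in load_results_dataframe. A minimal sketch with made-up numbers:

import pandas as pd

data = {"env_impact": {"co2_emissions": 3.02, "num_parameters_billions": 7.24}}
df = pd.json_normalize([data])
print(df.columns.tolist())
# -> ['env_impact.co2_emissions', 'env_impact.num_parameters_billions']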