albertvillanova (HF staff) committed

Commit 608184c
1 Parent(s): 966ae7b

Support comparing environmental impact
Files changed (5):
  1. app.py +27 -6
  2. src/constants.py +12 -0
  3. src/env_impact.py +109 -0
  4. src/requests.py +20 -0
  5. src/results.py +11 -5
app.py CHANGED

@@ -11,6 +11,7 @@ from src.details import (
     update_subtasks_component,
     update_task_description_component,
 )
+from src.env_impact import plot_env_impact
 from src.model_tree import load_model_tree
 from src.results import (
     clear_results,
@@ -120,6 +121,21 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         details_show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
         details = gr.HTML()
         details_dataframe = gr.State()
+    with gr.Tab("Environmental impact"):
+        gr.Markdown(
+            "The environmental impact calculations we display are derived from the specific inference setup used "
+            "for evaluation. We leverage 🤗 [Accelerate](https://huggingface.co/docs/accelerate) to efficiently "
+            "parallelize the model across 8 Nvidia H100 SXM GPUs in a compute cluster located in Northern Virginia. "
+            "These results reflect the energy consumption and associated emissions of this configuration, "
+            "providing transparency and insight into the resource requirements of large language model evaluations. "
+            "You can find more details in our documentation about the [environmental impact](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions)."
+        )
+        load_env_impact_btn = gr.Button("Load", interactive=False)
+        clear_env_impact_btn = gr.Button("Clear")
+        with gr.Row():
+            env_impact_plot_1 = gr.Plot(visible=True)
+            env_impact_plot_2 = gr.Plot(visible=True)
+        env_impact = gr.HTML()

     # DEMO:
     demo.load(
@@ -134,15 +150,15 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
     # Buttons:
     gr.on(
         triggers=[model_ids.input],
-        fn=lambda: (gr.Button(interactive=True),) * 3,
-        outputs=[load_model_tree_btn, load_results_btn, load_configs_btn],
+        fn=lambda: (gr.Button(interactive=True),) * 4,
+        outputs=[load_model_tree_btn, load_results_btn, load_configs_btn, load_env_impact_btn],
     )

     # RESULTS:
     gr.on(
-        triggers=[load_results_btn.click, load_configs_btn.click],
+        triggers=[load_results_btn.click, load_configs_btn.click, load_env_impact_btn.click],
         fn=display_loading_message_for_results,
-        outputs=[results, configs],
+        outputs=[results, configs, env_impact],
     ).then(
         fn=load_results,
         inputs=[
@@ -178,11 +194,15 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         ],
         fn=display_results,
         inputs=[results_dataframe, results_task, hide_std_errors, show_only_differences],
-        outputs=[results, configs],
+        outputs=[results, configs, env_impact],
     ).then(
         fn=plot_results,
         inputs=[results_dataframe, results_task],
         outputs=[results_plot_1, results_plot_2],
+    ).then(
+        fn=plot_env_impact,
+        inputs=[results_dataframe],
+        outputs=[env_impact_plot_1, env_impact_plot_2],
     ).then(
         fn=clear_results_file,
         outputs=results_file,
@@ -193,13 +213,14 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         outputs=results_file,
     )
     gr.on(
-        triggers=[clear_results_btn.click, clear_configs_btn.click],
+        triggers=[clear_results_btn.click, clear_configs_btn.click, clear_env_impact_btn.click],
         fn=clear_results,
         outputs=[
             model_ids,
             results_dataframe,
             load_results_btn,
             load_configs_btn,
+            load_env_impact_btn,
             results_task,
             configs_task,
         ],
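For readers unfamiliar with Gradio's event API, the wiring above follows one pattern throughout: gr.on() binds a single handler to several triggers, and each .then() step runs after the previous one finishes. A minimal, self-contained sketch of that pattern (all component names here are hypothetical stand-ins, not parts of this app):

import gradio as gr

with gr.Blocks() as demo:
    load_a_btn = gr.Button("Load A")
    load_b_btn = gr.Button("Load B")
    status = gr.HTML()

    # Either button fires the same chain: first show a loading message,
    # then run the (stand-in) slow step and overwrite it.
    gr.on(
        triggers=[load_a_btn.click, load_b_btn.click],
        fn=lambda: "<h3>Loading...</h3>",
        outputs=status,
    ).then(
        fn=lambda: "<h3>Done.</h3>",
        outputs=status,
    )

demo.launch()

This is why adding the environmental-impact tab only required appending one trigger, one output, and one .then() step to the existing chains.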
src/constants.py CHANGED

@@ -1,3 +1,4 @@
+REQUESTS_DATASET_ID = "datasets/open-llm-leaderboard/requests"
 RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"

 DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
@@ -82,3 +83,14 @@ DERIVED_MODEL_TYPES = [
     ("Merges", "merge"),
     ("Quantizations", "quantized"),
 ]
+
+
+MODEL_TYPE_LABEL_TO_TYPE = {
+    "🟢 : 🟢 pretrained": "pretrained",
+    "🟩 : 🟩 continuously pretrained": "pretrained",
+    "🔶 : 🔶 fine-tuned on domain-specific datasets": "fine_tuned_chat",
+    "💬 : 💬 chat models (RLHF, DPO, IFT, ...)": "fine_tuned_chat",
+    "🤝 : 🤝 base merges and moerges": "merges",
+    "🌸 : 🌸 multimodal": "multimodal",
+    "❓ : ❓ other": "other",
+}
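MODEL_TYPE_LABEL_TO_TYPE collapses the emoji-prefixed labels stored in request files into the coarse buckets used for comparison; get_env_impact (in src/env_impact.py below) looks labels up with the raw label as the fallback, so unseen labels pass through unchanged. A quick illustration of the lookup:

from src.constants import MODEL_TYPE_LABEL_TO_TYPE

label = "💬 : 💬 chat models (RLHF, DPO, IFT, ...)"
print(MODEL_TYPE_LABEL_TO_TYPE.get(label, label))  # "fine_tuned_chat"
print(MODEL_TYPE_LABEL_TO_TYPE.get("new label", "new label"))  # unseen: falls back to "new label"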
src/env_impact.py ADDED

@@ -0,0 +1,109 @@
+from datetime import timedelta
+
+import plotly.express as px
+
+import src.constants as constants
+from src.hub import load_model_card
+from src.requests import load_request
+
+
+async def get_env_impact(data):
+    total_evaluation_time_seconds = data.get("total_evaluation_time_seconds")
+    if total_evaluation_time_seconds:
+        total_evaluation_time_seconds = float(total_evaluation_time_seconds)
+    env_impact = {
+        "co2_emissions": calculate_co2_emissions(total_evaluation_time_seconds),
+        "total_evaluation_time": str(timedelta(seconds=total_evaluation_time_seconds)),
+        "num_parameters_billions": data.get("config", {}).get("model_num_parameters") / 10**9,
+        "precision": data.get("config", {}).get("model_dtype"),
+    }
+    request = await load_request(data["model_name"], env_impact["precision"])
+    if request:
+        model_type_label = request.get("model_type", "unknown")
+        env_impact["model_type"] = constants.MODEL_TYPE_LABEL_TO_TYPE.get(model_type_label, model_type_label)
+        env_impact["architecture"] = request.get("architectures", "Unknown")
+    # MoE
+    model_card = await load_model_card(data["model_name"])
+    model_tags = get_moe_model_tags(model_card.data, data["model_name"])
+    moe = "moe" in model_tags or "moe" in data["model_name"].lower()
+    env_impact["moe"] = moe
+    return env_impact
+
+
+# Source: https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions#function-for-c02-calculation
+def calculate_co2_emissions(total_evaluation_time_seconds: float | None) -> float:
+    if total_evaluation_time_seconds is None or total_evaluation_time_seconds <= 0:
+        return -1
+    # Power consumption for 8 H100 SXM GPUs in kilowatts (kW)
+    power_consumption_kW = 5.6
+    # Carbon intensity in grams CO₂ per kWh in Virginia
+    carbon_intensity_g_per_kWh = 269.8
+    # Convert evaluation time to hours
+    total_evaluation_time_hours = total_evaluation_time_seconds / 3600
+    # Calculate energy consumption in kWh
+    energy_consumption_kWh = power_consumption_kW * total_evaluation_time_hours
+    # Calculate CO₂ emissions in grams
+    co2_emissions_g = energy_consumption_kWh * carbon_intensity_g_per_kWh
+    # Convert grams to kilograms
+    return co2_emissions_g / 1000
+
+
+# Source: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard_parser/blob/main/src/submission/check_validity.py#L33
+def get_moe_model_tags(model_card, model_id):
+    # is_merge_from_metadata = False
+    is_moe_from_metadata = False
+    is_moe_from_model_card = False
+    # is_merge_from_model_card = False
+    tags = []
+    if model_card is None:
+        return tags
+    if model_card.tags:
+        # is_merge_from_metadata = any(tag in model_card.tags for tag in ["merge", "moerge", "mergekit", "lazymergekit"])
+        is_moe_from_metadata = any(tag in model_card.tags for tag in ["moe", "moerge", "mixtral"])
+    if model_card.get("text", False):
+        # is_merge_from_model_card = any(
+        #     keyword in model_card.text.lower() for keyword in ["merged model", "merge model", "moerge"]
+        # )
+        is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in ["moe", "mixtral"])
+    # if is_merge_from_model_card or is_merge_from_metadata:
+    #     tags.append("merge")
+    is_moe_from_name = any(
+        key in model_id.lower().replace("/", "-").replace("_", "-").split("-") for key in ["moe", "mixtral"]
+    )
+    # Hardcoded check for "rhymes-ai/Aria" model
+    if model_id == "rhymes-ai/Aria":
+        tags.append("moe")
+    elif is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
+        tags.append("moe")
+    return tags
+
+
+def plot_env_impact(df):
+    if df is None:
+        return None, None
+    fig_1 = px.scatter(
+        df.rename_axis(index="Model").reset_index(),
+        x="env_impact.num_parameters_billions",
+        y="env_impact.co2_emissions",
+        color="Model",
+        title="Evaluation CO₂ Emissions (kg) vs. #Params (B)",
+        labels={
+            "env_impact.num_parameters_billions": "#Params (B)",
+            "env_impact.co2_emissions": "Evaluation CO₂ Emissions (kg)",
+        },
+        color_discrete_sequence=px.colors.qualitative.Safe,  # TODO: https://plotly.com/python/discrete-color/
+    )
+    fig_2 = px.scatter(
+        df.rename_axis(index="Model").reset_index(),
+        x="results.leaderboard.acc_norm,none",
+        y="env_impact.co2_emissions",
+        color="Model",
+        title="Evaluation CO₂ Emissions (kg) vs. Score",
+        labels={
+            "results.leaderboard.acc_norm,none": "Mean Score",
+            "env_impact.co2_emissions": "Evaluation CO₂ Emissions (kg)",
+        },
+        color_discrete_sequence=px.colors.qualitative.Safe,  # TODO: https://plotly.com/python/discrete-color/
+    )
+    fig_2.update_xaxes(range=[0, 1])
+    return fig_1, fig_2
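As a sanity check on calculate_co2_emissions, take a hypothetical two-hour evaluation: 7200 s is 2 h; 2 h × 5.6 kW gives 11.2 kWh; and 11.2 kWh × 269.8 g/kWh is 3021.76 g, i.e. about 3.02 kg of CO₂:

from src.env_impact import calculate_co2_emissions

print(calculate_co2_emissions(7200))  # 3.02176 (kg CO₂)
print(calculate_co2_emissions(None))  # -1, the sentinel for a missing duration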
src/requests.py ADDED

@@ -0,0 +1,20 @@
+import asyncio
+
+import src.constants as constants
+from src.hub import glob, load_json_file
+
+
+def fetch_request_paths(model_id):
+    path = f"{constants.REQUESTS_DATASET_ID}/{model_id}_eval_request_*.json"
+    return glob(path)
+
+
+async def load_request(model_id, precision):
+    paths = await asyncio.to_thread(fetch_request_paths, model_id)
+    if not paths:
+        return
+    # TODO: Why sorted and reversed? https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard_parser/blob/main/src/leaderboard/read_evals.py#L254
+    for path in sorted(paths, reverse=True):
+        data = await load_json_file(path)
+        if data["precision"] == precision.split(".")[-1]:
+            return data
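A usage sketch, assuming the comparator's src.hub helpers are importable: load_request is a coroutine, so it must be awaited; "org/model" is a placeholder id, and only the part of the dtype string after the last dot is matched against the request file's "precision" field:

import asyncio

from src.requests import load_request

async def main():
    # Matches a request JSON whose "precision" field is "float16".
    request = await load_request("org/model", "torch.float16")
    print(request)

asyncio.run(main())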
src/results.py CHANGED

@@ -7,6 +7,7 @@ import pandas as pd
 import plotly.express as px

 import src.constants as constants
+from src.env_impact import get_env_impact
 from src.hub import glob, load_json_file


@@ -37,10 +38,11 @@ async def load_results_dataframe(model_id, result_paths_per_model=None):
     results = [result for result in results if result]
     if not results:
         return
-    data = {"results": {}, "configs": {}}
+    data = {"results": {}, "configs": {}, "env_impact": {}}
     for result in results:
         data["results"].update(result["results"])
         data["configs"].update(result["configs"])
+        data["env_impact"].update(await get_env_impact(result))
         model_name = result.get("model_name", "Model")
     df = pd.json_normalize([data])
     # df.columns = df.columns.str.split(".")  # .split return a list instead of a tuple
@@ -70,6 +72,7 @@ def display_results(df, task, hide_std_errors, show_only_differences):
     return (
         display_tab("results", df, task, hide_std_errors=hide_std_errors),
         display_tab("configs", df, task, show_only_differences=show_only_differences),
+        display_tab("env_impact", df, task),
     )


@@ -113,7 +116,10 @@ def display_tab(tab, df, task, hide_std_errors=True, show_only_differences=False
         subset = idx[colored_rows, idx[:]]
         df.background_gradient(cmap="PiYG", vmin=0, vmax=1, subset=subset, axis=None)
     # Format index values: remove prefix and suffix
-    start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
+    if tab == "env_impact":
+        start = len(f"{tab}.")
+    else:
+        start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
     df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
     # Fix overflow
     df.set_table_styles(
@@ -144,11 +150,11 @@ def update_tasks_component():


 def clear_results():
-    # model_ids, dataframe, load_results_btn, load_configs_btn, results_task, configs_task
+    # model_ids, dataframe, load_results_btn, load_configs_btn, load_env_impact_btn, results_task, configs_task
     return (
         gr.Dropdown(value=[]),
         None,
-        *(gr.Button("Load", interactive=False),) * 2,
+        *(gr.Button("Load", interactive=False),) * 3,
         *(
             gr.Radio(
                 ["All"] + list(constants.TASKS.values()),
@@ -163,7 +169,7 @@ def clear_results():


 def display_loading_message_for_results():
-    return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
+    return ("<h3 style='text-align: center;'>Loading...</h3>",) * 3


 def plot_results(df, task):
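A note on why display_tab can strip a bare "env_impact." prefix: pd.json_normalize flattens nested dictionaries into dot-separated column names, so the new entries surface as env_impact.<key> with no leaderboard_ or task prefix to remove. A small illustration with made-up values:

import pandas as pd

data = {"env_impact": {"co2_emissions": 3.02, "precision": "float16"}}
df = pd.json_normalize([data])
print(df.columns.tolist())  # ['env_impact.co2_emissions', 'env_impact.precision']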