Commit 5639a81 by eduagarcia
Parent(s): 03f7287
Add new tasks and make leaderboard work without new tasks evals
Files changed:
- src/display/utils.py              +1 -1
- src/leaderboard/read_evals.py     +18 -3
- src/populate.py                   +1 -1
- src/submission/check_validity.py  +0 -1
- src/tools/plots.py                +2 -0
- tasks_config/pt_config.yaml       +42 -18
src/display/utils.py CHANGED
@@ -65,7 +65,7 @@ auto_eval_column_dict.append(["eval_time", ColumnContent, ColumnContent("Evaluat
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("Model Name", "str", False, dummy=True)])
 if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
     auto_eval_column_dict.append(["original_benchmark_average", ColumnContent, ColumnContent("🤗 Leaderboard Average", "number", False)])
-
+auto_eval_column_dict.append(["npm", ColumnContent, ColumnContent("NPM (Average) ⬆️", "number", False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
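For context, the new "npm" entry becomes a regular field of the dynamically generated AutoEvalColumn dataclass. Below is a minimal, self-contained sketch of that make_dataclass pattern; the simplified ColumnContent class and the "Evaluation Time" label are stand-ins for illustration, not the repo's exact definitions.

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool

auto_eval_column_dict = []
# Placeholder entries mirroring the pattern in src/display/utils.py:
auto_eval_column_dict.append(["eval_time", ColumnContent, ColumnContent("Evaluation Time", "number", False)])
auto_eval_column_dict.append(["npm", ColumnContent, ColumnContent("NPM (Average) ⬆️", "number", False)])

# Each [field_name, field_type, default] entry becomes a field of the frozen dataclass.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.npm.name)  # -> "NPM (Average) ⬆️"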
src/leaderboard/read_evals.py CHANGED
@@ -3,6 +3,7 @@ import json
 import math
 import os
 from dataclasses import dataclass
+from typing import List
 
 import dateutil
 import numpy as np
@@ -155,7 +156,19 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average =
+        average = []
+        npm = []
+        for task in Tasks:
+            if task.value.benchmark not in self.results:
+                continue
+            res = self.results[task.value.benchmark]
+            if res is None or np.isnan(res) or not (isinstance(res, float) or isinstance(res, int)):
+                continue
+            average.append(res)
+            npm.append((res-task.value.baseline)*100.0 / (100.0-task.value.baseline))
+        average = sum(average)/len(average)
+        npm = sum(npm)/len(npm)
+
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -174,11 +187,13 @@ class EvalResult:
             AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
             AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
             AutoEvalColumn.flagged.name: self.flagged,
-            AutoEvalColumn.eval_time.name: self.eval_time
+            AutoEvalColumn.eval_time.name: self.eval_time,
+            AutoEvalColumn.npm.name: npm
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            if task.value.benchmark in self.results:
+                data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
             data_dict[AutoEvalColumn.original_benchmark_average.name] = self.original_benchmark_average
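The heart of this change is the NPM aggregation in to_dict: each task score is rescaled so that the task's baseline (taken from tasks_config/pt_config.yaml) maps to 0 and a perfect score of 100 maps to 100, and tasks with no result are simply skipped, which is what lets the leaderboard run before the new evals exist. A standalone sketch with made-up scores (the helper name npm_average is illustrative, not part of the repo):

def npm_average(results: dict, baselines: dict) -> float:
    """Average of per-task normalized scores: a score at the baseline maps to 0, a perfect 100 maps to 100."""
    npm_scores = []
    for benchmark, baseline in baselines.items():
        if benchmark not in results:  # tolerate tasks that have not been evaluated yet
            continue
        res = results[benchmark]
        npm_scores.append((res - baseline) * 100.0 / (100.0 - baseline))
    return sum(npm_scores) / len(npm_scores)

# Hypothetical model scores against the baselines added in tasks_config/pt_config.yaml:
results = {"hatebr_offensive": 75.0, "tweetsentbr": 32.8}
baselines = {"hatebr_offensive": 50.0, "portuguese_hate_speech": 47.9, "tweetsentbr": 32.8}
print(npm_average(results, baselines))  # (50.0 + 0.0) / 2 = 25.0 — the missing task is ignored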
src/populate.py CHANGED
@@ -21,7 +21,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    #df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
 
 
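Commenting out the has_no_nan_values filter means models that are still missing results for the newly added benchmarks keep their rows instead of being dropped from the dataframe. A toy illustration of the filter's assumed behavior (has_no_nan_values is reimplemented here from its name and usage; the column names are made up):

import numpy as np
import pandas as pd

def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
    # True for rows where every benchmark column has a value
    return df[columns].notna().all(axis=1)

df = pd.DataFrame({
    "model": ["model-a", "model-b"],
    "hatebr_offensive": [68.1, 71.4],
    "tweetsentbr": [55.2, np.nan],  # model-b has not run the new task yet
})
benchmark_cols = ["hatebr_offensive", "tweetsentbr"]

print(df[has_no_nan_values(df, benchmark_cols)])  # filter on: only model-a survives
print(df)                                         # filter off (this commit): both rows stay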
src/submission/check_validity.py CHANGED
@@ -22,7 +22,6 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
     except huggingface_hub.utils.EntryNotFoundError:
         return False, "Please add a model card to your model to explain how you trained/fine-tuned it.", None
     except Exception as e:
-        traceback.print_exc()
         return False, f"Error while loading the model card. Exception: {str(e)}", None
 
     # Enforce license metadata
src/tools/plots.py CHANGED
@@ -41,6 +41,8 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
         if task.benchmark == "Average":
             current_score = np.mean(list(row["results"].values()))
         else:
+            if task.benchmark not in row["results"]:
+                continue
             current_score = row["results"][task.benchmark]
 
         if current_score > current_max:
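The same defensive pattern appears in create_scores_df: while walking results to build best-score-over-time curves, a benchmark that a row has no result for is now skipped instead of raising a KeyError. A simplified sketch of that loop with made-up data:

rows = [
    {"date": "2024-01-01", "results": {"hatebr_offensive": 60.0}},
    {"date": "2024-02-01", "results": {"hatebr_offensive": 72.0, "tweetsentbr": 50.0}},
]

benchmark = "tweetsentbr"
current_max = 0.0
best_over_time = []
for row in rows:
    if benchmark not in row["results"]:  # the guard added by this commit
        continue
    current_score = row["results"][benchmark]
    if current_score > current_max:
        current_max = current_score
    best_over_time.append((row["date"], current_max))

print(best_over_time)  # [('2024-02-01', 50.0)] — the first row is skipped, not a KeyError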
tasks_config/pt_config.yaml CHANGED
@@ -1,4 +1,4 @@
-version: 1.
+version: 1.1.0
 config:
   REPO_ID: "eduagarcia/open_pt_llm_leaderboard"
   QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests
@@ -160,25 +160,49 @@ tasks:
       entailment task between a question and its possible answers."
     link: https://huggingface.co/datasets/ruanchaves/faquad-nli
     sources: ["https://github.com/liafacom/faquad/"]
-
-    benchmark:
-    col_name:
+  hatebr_offensive:
+    benchmark: hatebr_offensive
+    col_name: HateBR Offensive
     task_list:
-      -
-      - sparrow_hate-2019-fortuna-por
-      - sparrow_sentiment-2016-mozetic-por
-      - sparrow_sentiment-2018-brum-por
+      - hatebr_offensive
     metric: f1_macro
     few_shot: 25
-
-    baseline: 29.5 #random baseline [3.3, 48.8, 33.1, 33.0]
+    baseline: 50.0
     human_baseline: null
     expert_human_baseline: null
-    description: "
-
-
-
-
-
-
-
+    description: "HateBR is the first large-scale expert annotated dataset of Brazilian Instagram comments for abusive language detection
+      on the web and social media. The HateBR was collected from Brazilian Instagram comments of politicians and manually annotated
+      by specialists. It is composed of 7,000 documents annotated with a binary classification (offensive
+      versus non-offensive comments)."
+    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
+    sources: ["https://github.com/franciellevargas/HateBR", "https://huggingface.co/datasets/ruanchaves/hatebr"]
+  portuguese_hate_speech:
+    benchmark: portuguese_hate_speech
+    col_name: PT Hate Speech
+    task_list:
+      - portuguese_hate_speech
+    metric: f1_macro
+    few_shot: 25
+    baseline: 47.9
+    human_baseline: null
+    expert_human_baseline: null
+    description: "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate')"
+    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
+    sources: ["https://github.com/paulafortuna/Portuguese-Hate-Speech-Dataset", "https://huggingface.co/datasets/hate_speech_portuguese"]
+  tweetsentbr:
+    benchmark: tweetsentbr
+    col_name: tweetSentBR
+    task_list:
+      - tweetsentbr
+    metric: f1_macro
+    few_shot: 25
+    baseline: 32.8
+    human_baseline: null
+    expert_human_baseline: null
+    description: "TweetSentBR is a corpus of Tweets in Brazilian Portuguese.
+      It was labeled by several annotators following steps stablished on the literature for
+      improving reliability on the task of Sentiment Analysis. Each Tweet was annotated
+      in one of the three following classes: Positive, Negative, Neutral."
+    link: https://bitbucket.org/HBrum/tweetsentbr
+    sources: ["https://bitbucket.org/HBrum/tweetsentbr", "https://arxiv.org/abs/1712.08917"]
+