eduagarcia committed
Commit 5639a81
1 Parent(s): 03f7287

Add new tasks and make the leaderboard work without the new tasks' evals

src/display/utils.py CHANGED
@@ -65,7 +65,7 @@ auto_eval_column_dict.append(["eval_time", ColumnContent, ColumnContent("Evaluat
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("Model Name", "str", False, dummy=True)])
 if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
     auto_eval_column_dict.append(["original_benchmark_average", ColumnContent, ColumnContent("🤗 Leaderboard Average", "number", False)])
-
+    auto_eval_column_dict.append(["npm", ColumnContent, ColumnContent("NPM (Average) ⬆️", "number", False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
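
For context on the new column: entries in auto_eval_column_dict become attributes of the dynamically built AutoEvalColumn dataclass, which is why read_evals.py below can refer to AutoEvalColumn.npm.name. A minimal, self-contained sketch of that pattern; the ColumnContent definition here is a simplified stand-in (frozen for the example), not the repo's exact class:

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:          # simplified stand-in; the real class has more flags
    name: str
    type: str
    displayed_by_default: bool = False
    dummy: bool = False

# same (attribute_name, type, default_instance) shape as auto_eval_column_dict above
columns = [
    ("npm", ColumnContent, ColumnContent("NPM (Average) ⬆️", "number", False)),
]

# make_dataclass stores each default instance as a class attribute,
# so AutoEvalColumn.npm is the ColumnContent describing the new column
AutoEvalColumn = make_dataclass("AutoEvalColumn", columns, frozen=True)

print(AutoEvalColumn.npm.name)  # -> NPM (Average) ⬆️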
src/leaderboard/read_evals.py CHANGED
@@ -3,6 +3,7 @@ import json
 import math
 import os
 from dataclasses import dataclass
+from typing import List
 
 import dateutil
 import numpy as np
@@ -155,7 +156,19 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        average = []
+        npm = []
+        for task in Tasks:
+            if task.value.benchmark not in self.results:
+                continue
+            res = self.results[task.value.benchmark]
+            if res is None or np.isnan(res) or not (isinstance(res, float) or isinstance(res, int)):
+                continue
+            average.append(res)
+            npm.append((res-task.value.baseline)*100.0 / (100.0-task.value.baseline))
+        average = sum(average)/len(average)
+        npm = sum(npm)/len(npm)
+
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -174,11 +187,13 @@ class EvalResult:
             AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
             AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
             AutoEvalColumn.flagged.name: self.flagged,
-            AutoEvalColumn.eval_time.name: self.eval_time
+            AutoEvalColumn.eval_time.name: self.eval_time,
+            AutoEvalColumn.npm.name: npm
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            if task.value.benchmark in self.results:
+                data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
             data_dict[AutoEvalColumn.original_benchmark_average.name] = self.original_benchmark_average
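
The npm value computed above is a normalized average: each raw score is rescaled so that the task's baseline (defined per task in tasks_config/pt_config.yaml) maps to 0 and a perfect score maps to 100, then the rescaled values are averaged. A standalone sketch of the same arithmetic; the function and variable names are illustrative, not part of the repo:

from statistics import mean

def normalized_average(scores: dict[str, float], baselines: dict[str, float]) -> float:
    """Average of per-task scores rescaled so that baseline -> 0 and 100 -> 100."""
    npm = [
        (score - baselines[task]) * 100.0 / (100.0 - baselines[task])
        for task, score in scores.items()
    ]
    return mean(npm)

# A model that only matches the random/majority baseline on every task scores 0.0,
# so chance-level performance no longer inflates the leaderboard average.
print(normalized_average({"hatebr_offensive": 50.0, "tweetsentbr": 32.8},
                         {"hatebr_offensive": 50.0, "tweetsentbr": 32.8}))  # 0.0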
src/populate.py CHANGED
@@ -21,7 +21,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    #df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
 
 
src/submission/check_validity.py CHANGED
@@ -22,7 +22,6 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
     except huggingface_hub.utils.EntryNotFoundError:
         return False, "Please add a model card to your model to explain how you trained/fine-tuned it.", None
     except Exception as e:
-        traceback.print_exc()
         return False, f"Error while loading the model card. Exception: {str(e)}", None
 
     # Enforce license metadata
src/tools/plots.py CHANGED
@@ -41,6 +41,8 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
             if task.benchmark == "Average":
                 current_score = np.mean(list(row["results"].values()))
             else:
+                if task.benchmark not in row["results"]:
+                    continue
                 current_score = row["results"][task.benchmark]
 
             if current_score > current_max:
tasks_config/pt_config.yaml CHANGED
@@ -1,4 +1,4 @@
-version: 1.0.0
+version: 1.1.0
 config:
   REPO_ID: "eduagarcia/open_pt_llm_leaderboard"
   QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests
@@ -160,25 +160,49 @@ tasks:
       entailment task between a question and its possible answers."
     link: https://huggingface.co/datasets/ruanchaves/faquad-nli
     sources: ["https://github.com/liafacom/faquad/"]
-  sparrow_pt:
-    benchmark: sparrow_pt
-    col_name: Sparrow POR
+  hatebr_offensive:
+    benchmark: hatebr_offensive
+    col_name: HateBR Offensive
     task_list:
-      - sparrow_emotion-2021-cortiz-por
-      - sparrow_hate-2019-fortuna-por
-      - sparrow_sentiment-2016-mozetic-por
-      - sparrow_sentiment-2018-brum-por
+      - hatebr_offensive
     metric: f1_macro
     few_shot: 25
-    limit: 500
-    baseline: 29.5 #random baseline [3.3, 48.8, 33.1, 33.0]
+    baseline: 50.0
     human_baseline: null
     expert_human_baseline: null
-    description: "SPARROW is a multilingual evaluation benchmark for sociopragmatic meaning understanding.
-      SPARROW comprises 169 datasets encompassing 64 different languages,
-      this split evaluates only on the validation set of 4 datasets avaliable for the Portuguese language.
-      One on hate speech detection by Fortuna et al. (2019), one on emotion detection by Cortiz et al. (2021)
-      and two on sentiment analysis by Mozetic et al. (2016) and Brum et al. (2018).
-      All were extracted and manually annotated from Twitter/X."
-    link: https://huggingface.co/datasets/UBC-NLP/sparrow
-    sources: ["https://sparrow.dlnlp.ai/", "https://aclanthology.org/W19-3510/", "https://arxiv.org/abs/2108.07638", "https://aclanthology.org/L18-1658/", "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0155036"]
+    description: "HateBR is the first large-scale expert annotated dataset of Brazilian Instagram comments for abusive language detection
+      on the web and social media. The HateBR was collected from Brazilian Instagram comments of politicians and manually annotated
+      by specialists. It is composed of 7,000 documents annotated with a binary classification (offensive
+      versus non-offensive comments)."
+    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
+    sources: ["https://github.com/franciellevargas/HateBR", "https://huggingface.co/datasets/ruanchaves/hatebr"]
+  portuguese_hate_speech:
+    benchmark: portuguese_hate_speech
+    col_name: PT Hate Speech
+    task_list:
+      - portuguese_hate_speech
+    metric: f1_macro
+    few_shot: 25
+    baseline: 47.9
+    human_baseline: null
+    expert_human_baseline: null
+    description: "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate')"
+    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
+    sources: ["https://github.com/paulafortuna/Portuguese-Hate-Speech-Dataset", "https://huggingface.co/datasets/hate_speech_portuguese"]
+  tweetsentbr:
+    benchmark: tweetsentbr
+    col_name: tweetSentBR
+    task_list:
+      - tweetsentbr
+    metric: f1_macro
+    few_shot: 25
+    baseline: 32.8
+    human_baseline: null
+    expert_human_baseline: null
+    description: "TweetSentBR is a corpus of Tweets in Brazilian Portuguese.
+      It was labeled by several annotators following steps stablished on the literature for
+      improving reliability on the task of Sentiment Analysis. Each Tweet was annotated
+      in one of the three following classes: Positive, Negative, Neutral."
+    link: https://bitbucket.org/HBrum/tweetsentbr
+    sources: ["https://bitbucket.org/HBrum/tweetsentbr", "https://arxiv.org/abs/1712.08917"]
+
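
To see how a new entry's fields line up with the code above, the col_name, metric and baseline can be read straight from the YAML. A rough sketch assuming the file is parsed with PyYAML (the leaderboard's own config loader may differ):

import yaml  # PyYAML

with open("tasks_config/pt_config.yaml") as f:
    cfg = yaml.safe_load(f)

task = cfg["tasks"]["tweetsentbr"]
print(task["col_name"], task["metric"], task["baseline"])  # tweetSentBR f1_macro 32.8

# hypothetical raw f1_macro of 60.0, rescaled the same way read_evals.py does it
score = 60.0
npm = (score - task["baseline"]) * 100.0 / (100.0 - task["baseline"])
print(round(npm, 1))  # 40.5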