eduagarcia committed
Commit 5639a81
1 Parent(s): 03f7287

Add new tasks and make the leaderboard work without the new tasks' evals

src/display/utils.py CHANGED
@@ -65,7 +65,7 @@ auto_eval_column_dict.append(["eval_time", ColumnContent, ColumnContent("Evaluat
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("Model Name", "str", False, dummy=True)])
 if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
     auto_eval_column_dict.append(["original_benchmark_average", ColumnContent, ColumnContent("🤗 Leaderboard Average", "number", False)])
-
+    auto_eval_column_dict.append(["npm", ColumnContent, ColumnContent("NPM (Average) ⬆️", "number", False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
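
For context on the new column: entries in auto_eval_column_dict become attributes of the dynamically built AutoEvalColumn dataclass, which is why read_evals.py below can refer to AutoEvalColumn.npm.name. A minimal, self-contained sketch of that pattern; the ColumnContent definition here is a simplified stand-in (frozen for the example), not the repo's exact class:

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:          # simplified stand-in; the real class has more flags
    name: str
    type: str
    displayed_by_default: bool = False
    dummy: bool = False

# same (attribute_name, type, default_instance) shape as auto_eval_column_dict above
columns = [
    ("npm", ColumnContent, ColumnContent("NPM (Average) ⬆️", "number", False)),
]

# make_dataclass stores each default instance as a class attribute,
# so AutoEvalColumn.npm is the ColumnContent describing the new column
AutoEvalColumn = make_dataclass("AutoEvalColumn", columns, frozen=True)

print(AutoEvalColumn.npm.name)  # -> NPM (Average) ⬆️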
src/leaderboard/read_evals.py CHANGED
@@ -3,6 +3,7 @@ import json
 import math
 import os
 from dataclasses import dataclass
+from typing import List
 
 import dateutil
 import numpy as np
@@ -155,7 +156,19 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        average = []
+        npm = []
+        for task in Tasks:
+            if task.value.benchmark not in self.results:
+                continue
+            res = self.results[task.value.benchmark]
+            if res is None or np.isnan(res) or not (isinstance(res, float) or isinstance(res, int)):
+                continue
+            average.append(res)
+            npm.append((res-task.value.baseline)*100.0 / (100.0-task.value.baseline))
+        average = sum(average)/len(average)
+        npm = sum(npm)/len(npm)
+
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -174,11 +187,13 @@ class EvalResult:
             AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
             AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
             AutoEvalColumn.flagged.name: self.flagged,
-            AutoEvalColumn.eval_time.name: self.eval_time
+            AutoEvalColumn.eval_time.name: self.eval_time,
+            AutoEvalColumn.npm.name: npm
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            if task.value.benchmark in self.results:
+                data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
             data_dict[AutoEvalColumn.original_benchmark_average.name] = self.original_benchmark_average
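
The npm value computed above is a normalized average: each raw score is rescaled so that the task's baseline (defined per task in tasks_config/pt_config.yaml) maps to 0 and a perfect score maps to 100, then the rescaled values are averaged. A standalone sketch of the same arithmetic; the function and variable names are illustrative, not part of the repo:

from statistics import mean

def normalized_average(scores: dict[str, float], baselines: dict[str, float]) -> float:
    """Average of per-task scores rescaled so that baseline -> 0 and 100 -> 100."""
    npm = [
        (score - baselines[task]) * 100.0 / (100.0 - baselines[task])
        for task, score in scores.items()
    ]
    return mean(npm)

# A model that only matches the random/majority baseline on every task scores 0.0,
# so chance-level performance no longer inflates the leaderboard average.
print(normalized_average({"hatebr_offensive": 50.0, "tweetsentbr": 32.8},
                         {"hatebr_offensive": 50.0, "tweetsentbr": 32.8}))  # 0.0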
src/populate.py CHANGED
@@ -21,7 +21,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    #df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
 
 
src/submission/check_validity.py CHANGED
@@ -22,7 +22,6 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
     except huggingface_hub.utils.EntryNotFoundError:
         return False, "Please add a model card to your model to explain how you trained/fine-tuned it.", None
     except Exception as e:
-        traceback.print_exc()
         return False, f"Error while loading the model card. Exception: {str(e)}", None
 
     # Enforce license metadata
src/tools/plots.py CHANGED
@@ -41,6 +41,8 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
             if task.benchmark == "Average":
                 current_score = np.mean(list(row["results"].values()))
             else:
+                if task.benchmark not in row["results"]:
+                    continue
                 current_score = row["results"][task.benchmark]
 
             if current_score > current_max:
tasks_config/pt_config.yaml CHANGED
@@ -1,4 +1,4 @@
-version: 1.0.0
+version: 1.1.0
 config:
   REPO_ID: "eduagarcia/open_pt_llm_leaderboard"
   QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests
@@ -160,25 +160,49 @@ tasks:
       entailment task between a question and its possible answers."
     link: https://huggingface.co/datasets/ruanchaves/faquad-nli
     sources: ["https://github.com/liafacom/faquad/"]
-  sparrow_pt:
-    benchmark: sparrow_pt
-    col_name: Sparrow POR
+  hatebr_offensive:
+    benchmark: hatebr_offensive
+    col_name: HateBR Offensive
     task_list:
-      - sparrow_emotion-2021-cortiz-por
-      - sparrow_hate-2019-fortuna-por
-      - sparrow_sentiment-2016-mozetic-por
-      - sparrow_sentiment-2018-brum-por
+      - hatebr_offensive
     metric: f1_macro
     few_shot: 25
-    limit: 500
-    baseline: 29.5 #random baseline [3.3, 48.8, 33.1, 33.0]
+    baseline: 50.0
     human_baseline: null
     expert_human_baseline: null
-    description: "SPARROW is a multilingual evaluation benchmark for sociopragmatic meaning understanding.
-      SPARROW comprises 169 datasets encompassing 64 different languages,
-      this split evaluates only on the validation set of 4 datasets avaliable for the Portuguese language.
-      One on hate speech detection by Fortuna et al. (2019), one on emotion detection by Cortiz et al. (2021)
-      and two on sentiment analysis by Mozetic et al. (2016) and Brum et al. (2018).
-      All were extracted and manually annotated from Twitter/X."
-    link: https://huggingface.co/datasets/UBC-NLP/sparrow
-    sources: ["https://sparrow.dlnlp.ai/", "https://aclanthology.org/W19-3510/", "https://arxiv.org/abs/2108.07638", "https://aclanthology.org/L18-1658/", "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0155036"]
+    description: "HateBR is the first large-scale expert annotated dataset of Brazilian Instagram comments for abusive language detection
+      on the web and social media. The HateBR was collected from Brazilian Instagram comments of politicians and manually annotated
+      by specialists. It is composed of 7,000 documents annotated with a binary classification (offensive
+      versus non-offensive comments)."
+    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
+    sources: ["https://github.com/franciellevargas/HateBR", "https://huggingface.co/datasets/ruanchaves/hatebr"]
+  portuguese_hate_speech:
+    benchmark: portuguese_hate_speech
+    col_name: PT Hate Speech
+    task_list:
+      - portuguese_hate_speech
+    metric: f1_macro
+    few_shot: 25
+    baseline: 47.9
+    human_baseline: null
+    expert_human_baseline: null
+    description: "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate')"
+    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
+    sources: ["https://github.com/paulafortuna/Portuguese-Hate-Speech-Dataset", "https://huggingface.co/datasets/hate_speech_portuguese"]
+  tweetsentbr:
+    benchmark: tweetsentbr
+    col_name: tweetSentBR
+    task_list:
+      - tweetsentbr
+    metric: f1_macro
+    few_shot: 25
+    baseline: 32.8
+    human_baseline: null
+    expert_human_baseline: null
+    description: "TweetSentBR is a corpus of Tweets in Brazilian Portuguese.
+      It was labeled by several annotators following steps stablished on the literature for
+      improving reliability on the task of Sentiment Analysis. Each Tweet was annotated
+      in one of the three following classes: Positive, Negative, Neutral."
+    link: https://bitbucket.org/HBrum/tweetsentbr
+    sources: ["https://bitbucket.org/HBrum/tweetsentbr", "https://arxiv.org/abs/1712.08917"]
+
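
To see how a new entry's fields line up with the code above, the col_name, metric and baseline can be read straight from the YAML. A rough sketch assuming the file is parsed with PyYAML (the leaderboard's own config loader may differ):

import yaml  # PyYAML

with open("tasks_config/pt_config.yaml") as f:
    cfg = yaml.safe_load(f)

task = cfg["tasks"]["tweetsentbr"]
print(task["col_name"], task["metric"], task["baseline"])  # tweetSentBR f1_macro 32.8

# hypothetical raw f1_macro of 60.0, rescaled the same way read_evals.py does it
score = 60.0
npm = (score - task["baseline"]) * 100.0 / (100.0 - task["baseline"])
print(round(npm, 1))  # 40.5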