Spaces:

T145
/

open-llm-leaderboard-results-to-modelcard

Running

App Files Files Community

T145 committed on Dec 26, 2024

Commit

8323cf6

1 Parent(s): 4e2bd19

Updated datasets

Browse files

Files changed (1) hide show

functions.py +21 -23

functions.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import gradio as gr
 import pandas as pd
 from datasets import load_dataset
@@ -40,34 +39,34 @@ def get_query_url(repo):
 def get_task_summary(results):
     return {
         "IFEval": {
-            "dataset_type": "HuggingFaceH4/ifeval",
             "dataset_name": "IFEval (0-Shot)",
             "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
             "metric_value": round(results["IFEval"], 2),
-            "dataset_config": None,  # don't know
-            "dataset_split": None,  # don't know
             "dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
-            "metric_name": "strict accuracy",
         },
         "BBH": {
-            "dataset_type": "BBH",
             "dataset_name": "BBH (3-Shot)",
             "metric_type": "acc_norm",
             "metric_value": round(results["BBH"], 2),
-            "dataset_config": None,  # don't know
-            "dataset_split": None,  # don't know
             "dataset_revision": None,
             "dataset_args": {"num_few_shot": 3},
             "metric_name": "normalized accuracy",
         },
         "MATH Lvl 5": {
-            "dataset_type": "hendrycks/competition_math",
             "dataset_name": "MATH Lvl 5 (4-Shot)",
             "metric_type": "exact_match",
             "metric_value": round(results["MATH Lvl 5"], 2),
-            "dataset_config": None,  # don't know
-            "dataset_split": None,  # don't know
             "dataset_revision": None,
             "dataset_args": {"num_few_shot": 4},
             "metric_name": "exact match",
@@ -77,8 +76,8 @@ def get_task_summary(results):
             "dataset_name": "GPQA (0-shot)",
             "metric_type": "acc_norm",
             "metric_value": round(results["GPQA"], 2),
-            "dataset_config": None,  # don't know
-            "dataset_split": None,  # don't know
             "dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
@@ -88,8 +87,8 @@ def get_task_summary(results):
             "dataset_name": "MuSR (0-shot)",
             "metric_type": "acc_norm",
             "metric_value": round(results["MUSR"], 2),
-            "dataset_config": None,  # don't know
-            "dataset_split": None,  # don't know
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
         },
@@ -109,18 +108,17 @@ def get_task_summary(results):
 def get_eval_results(df, repo):
     results = search(df, repo)
     task_summary = get_task_summary(results)
-    md_writer = MarkdownTableWriter()
-    md_writer.headers = ["Metric", "% Value"]
-    md_writer.value_matrix = [["Avg.", round(results["Average ⬆️"], 2)]] + [
         [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
     ]
-    text = f"""
-# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
 Detailed results can be found [here]({get_details_url(repo)})!
 Summarized results can be found [here]({get_contents_url(repo)})!
-{md_writer.dumps()}
 """
     return text
@@ -175,8 +173,8 @@ def commit(
     if repo.startswith("https://huggingface.co/"):
         try:
             repo = RepoUrl(repo).repo_id
-        except Exception:
-            raise gr.Error(f"Not a valid repo id: {str(repo)}")
     edited = {"revision": f"refs/pr/{pr_number}"} if pr_number else {"create_pr": True}

 import gradio as gr
 import pandas as pd
 from datasets import load_dataset
 def get_task_summary(results):
     return {
         "IFEval": {
+            "dataset_type": "wis-k/instruction-following-eval",
             "dataset_name": "IFEval (0-Shot)",
             "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
             "metric_value": round(results["IFEval"], 2),
+            "dataset_config": None,
+            "dataset_split": "train",
             "dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
+            "metric_name": "averaged accuracy",
         },
         "BBH": {
+            "dataset_type": "SaylorTwift/bbh",
             "dataset_name": "BBH (3-Shot)",
             "metric_type": "acc_norm",
             "metric_value": round(results["BBH"], 2),
+            "dataset_config": None,
+            "dataset_split": "test",
             "dataset_revision": None,
             "dataset_args": {"num_few_shot": 3},
             "metric_name": "normalized accuracy",
         },
         "MATH Lvl 5": {
+            "dataset_type": "lighteval/MATH-Hard",
             "dataset_name": "MATH Lvl 5 (4-Shot)",
             "metric_type": "exact_match",
             "metric_value": round(results["MATH Lvl 5"], 2),
+            "dataset_config": None,
+            "dataset_split": "test",
             "dataset_revision": None,
             "dataset_args": {"num_few_shot": 4},
             "metric_name": "exact match",
             "dataset_name": "GPQA (0-shot)",
             "metric_type": "acc_norm",
             "metric_value": round(results["GPQA"], 2),
+            "dataset_config": None,
+            "dataset_split": "train",
             "dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
             "dataset_name": "MuSR (0-shot)",
             "metric_type": "acc_norm",
             "metric_value": round(results["MUSR"], 2),
+            "dataset_config": None,
+            "dataset_split": None,  # three test splits
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
         },
 def get_eval_results(df, repo):
     results = search(df, repo)
     task_summary = get_task_summary(results)
+    table = MarkdownTableWriter()
+    table.headers = ["Metric", "% Value"]
+    table.value_matrix = [["Avg.", round(results["Average ⬆️"], 2)]] + [
         [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
     ]
+    text = f"""# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
 Detailed results can be found [here]({get_details_url(repo)})!
 Summarized results can be found [here]({get_contents_url(repo)})!
+{table.dumps()}
 """
     return text
     if repo.startswith("https://huggingface.co/"):
         try:
             repo = RepoUrl(repo).repo_id
+        except Exception as e:
+            raise gr.Error(f"Not a valid repo id: {str(repo)}") from e
     edited = {"revision": f"refs/pr/{pr_number}"} if pr_number else {"create_pr": True}