xuanricheng committed · Commit b6b9254 · Parent: 85128b4
update result format
Files changed:
- README.md +2 -2
- src/display/about.py +5 -5
- src/display/utils.py +9 -6
- src/leaderboard/read_evals.py +2 -2
- src/scripts/create_request_file.py +1 -1
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Open LLM Leaderboard
+title: Chinese Open LLM Leaderboard
 emoji: 🏆
 colorFrom: green
 colorTo: indigo
@@ -8,7 +8,7 @@ sdk_version: 4.9.0
 app_file: app.py
 pinned: true
 license: apache-2.0
-duplicated_from: HuggingFaceH4/open_llm_leaderboard
+# duplicated_from: HuggingFaceH4/open_llm_leaderboard
 fullWidth: true
 space_ci: # See https://huggingface.co/spaces/Wauplin/gradio-space-ci
 private: true
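The front-matter change above only retitles the Space and comments out the duplicated_from pointer. As a sanity check, the resulting metadata can be read back with huggingface_hub's RepoCard API; this is a minimal sketch, assuming that standard API and the BAAI/open_cn_llm_leaderboard Space id taken from links elsewhere in this commit (the Space is marked private, so a token with access would be needed).

```python
# Sketch: read the Space's README front matter back after this change.
# Assumes the standard huggingface_hub RepoCard API; the Space id comes from
# links elsewhere in this commit.
from huggingface_hub import RepoCard

card = RepoCard.load("BAAI/open_cn_llm_leaderboard", repo_type="space")
meta = card.data.to_dict()
print(meta.get("title"))            # expected: "Chinese Open LLM Leaderboard"
print(meta.get("duplicated_from"))  # expected: None, the key is now commented out
```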
src/display/about.py
CHANGED
@@ -1,10 +1,10 @@
 from src.display.utils import ModelType
 
-TITLE = """<h1 align="center" id="space-title">🤗
+TITLE = """<h1 align="center" id="space-title">🤗 Open Chinese LLM Leaderboard</h1>"""
 
 INTRODUCTION_TEXT = """
-📐 The 🤗
-[FlagEval](https://flageval.baai.ac.cn/)
+📐 The 🤗 Open Chinese LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
+This leaderboard is subset of the [FlagEval](https://flageval.baai.ac.cn/)
 
 🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
 The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
@@ -69,8 +69,8 @@ To get more information about quantization, see:
 - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
 
 ## Useful links
-- [Community resources](https://huggingface.co/spaces/
-- [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-
+- [Community resources](https://huggingface.co/spaces/BAAI/open_cn_llm_leaderboard/discussions/174)
+- [Collection of best models](https://huggingface.co/collections/open-cn-llm-leaderboard/chinese-llm-leaderboard-best-models-65b0d4511dbd85fd0c3ad9cd)
 """
 
 FAQ_TEXT = """
src/display/utils.py
CHANGED
@@ -14,12 +14,13 @@ class Task:
     col_name: str
 
 class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm", "ARC")
-    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
-    …
-    …
-    …
-    …
+    arc = Task("arc:challenge", "acc_norm", "C-ARC")
+    hellaswag = Task("hellaswag", "acc_norm", "C-HellaSwag")
+    truthfulqa = Task("truthfulqa:mc", "mc2", "C-TruthfulQA")
+    winogrande = Task("winogrande", "acc", "C-Winogrande")
+    gsm8k = Task("gsm8k", "acc", "C-GSM8K")
+    c_sem = Task("c-sem-v2", "acc", "C-SEM")
+    mmlu = Task("cmmlu", "acc", "C-MMLU")
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -82,6 +83,7 @@ baseline_row = {
     AutoEvalColumn.truthfulqa.name: 25.0,
     AutoEvalColumn.winogrande.name: 50.0,
     AutoEvalColumn.gsm8k.name: 0.21,
+    AutoEvalColumn.c_sem.name: 25.0,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
@@ -107,6 +109,7 @@ human_baseline_row = {
     AutoEvalColumn.truthfulqa.name: 94.0,
     AutoEvalColumn.winogrande.name: 94.0,
     AutoEvalColumn.gsm8k.name: 100,
+    AutoEvalColumn.c_sem.name: 100,
     AutoEvalColumn.dummy.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
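The new task list above pairs each benchmark key with the metric to read and the column name to display in the table. A minimal sketch of that pattern follows; the Task field names `benchmark` and `metric` are inferred from how the enum is constructed here and consumed in read_evals.py, so treat them as assumptions rather than the exact class definition.

```python
# Minimal sketch of the Task/Tasks pattern above. Field names `benchmark` and
# `metric` are inferred from usage; only `col_name` is visible in this hunk.
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # key used in the raw result files, e.g. "cmmlu"
    metric: str     # which metric to read for that benchmark, e.g. "acc"
    col_name: str   # user-facing column header, e.g. "C-MMLU"

class Tasks(Enum):
    c_sem = Task("c-sem-v2", "acc", "C-SEM")
    mmlu = Task("cmmlu", "acc", "C-MMLU")

# Iterating the enum yields one Task per leaderboard column:
for task in Tasks:
    print(task.value.benchmark, "->", task.value.col_name)
```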
src/leaderboard/read_evals.py
CHANGED
@@ -87,7 +87,7 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs)
+            mean_acc = np.mean(accs)
             results[task.benchmark] = mean_acc
 
         return self(
@@ -149,7 +149,7 @@
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results
+            data_dict[task.value.col_name] = self.results.get(task.value.benchmark, 0)
 
         return data_dict
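The to_dict() change above swaps direct indexing for dict.get with a default of 0, so a model that is missing a score for one benchmark still gets a full leaderboard row instead of raising KeyError. A toy illustration follows; the score values are invented for the example, only the benchmark keys come from this commit.

```python
# Toy illustration of the .get(..., 0) fallback used above; the scores below
# are made up, only the benchmark keys appear in this commit.
results = {"arc:challenge": 52.3, "cmmlu": 41.7}   # no "c-sem-v2" score yet
benchmarks = ["arc:challenge", "cmmlu", "c-sem-v2"]

row = {b: results.get(b, 0) for b in benchmarks}
print(row)  # {'arc:challenge': 52.3, 'cmmlu': 41.7, 'c-sem-v2': 0}

# The previous code was equivalent to direct indexing, which fails on the gap:
# row = {b: results[b] for b in benchmarks}  # KeyError: 'c-sem-v2'
```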
src/scripts/create_request_file.py
CHANGED
@@ -11,7 +11,7 @@ from src.submission.check_validity import get_model_size
 from src.display.utils import ModelType, WeightType
 
 EVAL_REQUESTS_PATH = "eval-queue"
-QUEUE_REPO = "
+QUEUE_REPO = "open-cn-llm-leaderboard/requests"
 
 precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
 model_types = [e.name for e in ModelType]