meg-huggingface committed • 130a6d2
Commit message: UI
Parent(s): 1075b83
Files changed:
- app.py +20 -15
- src/about.py +6 -4
- src/display/utils.py +2 -1
app.py
CHANGED
@@ -156,21 +156,26 @@ with demo:
                 elem_id="search-bar",
             )
         with gr.Row():
-            (removed lines 159-173; their content is not shown in this diff view)
+            with gr.Column():
+                shown_columns = gr.CheckboxGroup(
+                    choices=[
+                        c.name
+                        for c in fields(AutoEvalColumn)
+                        if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced and not c.dummy
+                    ],
+                    value=[
+                        c.name
+                        for c in fields(AutoEvalColumn)
+                        if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced
+                    ],
+                    label="Select metrics to show",
+                    elem_id="column-select",
+                    interactive=True,
+                )
+            with gr.Column():
+                for c in fields(AutoEvalColumn):
+                    if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced and not c.dummy:
+                        gr.Markdown("**" + c.name + "**. " + c.cite, elem_classes="markdown-text")
         with gr.Row():
             with gr.Accordion("Advanced options [WIP]", open=False):
                 shown_columns_advanced = gr.CheckboxGroup(
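For context, a minimal standalone sketch of the layout this hunk builds: the metric checkboxes sit in one column and the matching citations render as Markdown in a second column. The real field names (advanced, dummy, cite) live on AutoEvalColumn in src/display/utils.py; the Col stand-in class, the toy column list, the visible() helper, and the gr.Blocks() wrapper below are illustrative only, not code from this repo.

# Minimal sketch of the new two-column UI (illustrative data, not the repo's AutoEvalColumn).
from dataclasses import dataclass
import gradio as gr

@dataclass
class Col:  # stand-in for the repo's ColumnContent/AutoEvalColumn fields
    name: str
    cite: str = ""
    displayed_by_default: bool = True
    hidden: bool = False
    never_hidden: bool = False
    advanced: bool = False
    dummy: bool = False

cols = [
    Col("Toxicity (lower is better)", cite="ToxiGen, Hartvigsen et al., ACL 2022."),
    Col("ANLI", cite="Adversarial NLI, Nie et al., ACL 2020."),
]

def visible(c):
    # Same filter the diff applies to AutoEvalColumn fields.
    return c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced and not c.dummy

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            shown_columns = gr.CheckboxGroup(
                choices=[c.name for c in cols if visible(c)],
                value=[c.name for c in cols if visible(c)],
                label="Select metrics to show",
                elem_id="column-select",
                interactive=True,
            )
        with gr.Column():
            # One citation line per default-visible metric.
            for c in cols:
                if visible(c):
                    gr.Markdown("**" + c.name + "**. " + c.cite, elem_classes="markdown-text")

demo.launch()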
src/about.py
CHANGED
@@ -6,15 +6,17 @@ class Task:
     benchmark: str
     metric: str
     col_name: str
+    # Relevant citation for the task
+    cite: str = ""
 
 
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("toxigen", "acc", "Toxicity")
-    task1 = Task("anli_r1", "acc", "ANLI")
-    task2 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("toxigen", "acc", "Toxicity (lower is better)", cite="_ToxiGen: A Large-Scale Machine-Generated Dataset for Adversarial and Implicit Hate Speech Detection._ Hartvigsen et al., ACL 2022.")
+    task1 = Task("anli_r1", "acc", "ANLI", cite="_Adversarial NLI: A New Benchmark for Natural Language Understanding._ Nie et al., ACL 2020.")
+    task2 = Task("logiqa", "acc_norm", "LogiQA", cite="_LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning_. Liu et al., IJCAI 2020.")
 
 NUM_FEWSHOT = 0 # Change with your few shot MEG NOTE: Not sure what that means.
 # ---------------------------------------------------
@@ -24,7 +26,7 @@ TITLE = """<h1 align="center" id="space-title">Toxicity Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Evaluate the toxicity of open LLMs
+<h2 align="center">Evaluate the toxicity of open LLMs.</h2>
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
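A small sketch of how the new optional cite field behaves on Task, assuming Task is a plain @dataclass (the annotated fields with a string default in the hunk suggest this): because cite defaults to an empty string, existing Task(...) calls keep working, and each enum member's citation is read through .value. The snippet is trimmed for illustration.

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    # Relevant citation for the task; optional, so older Task(...) calls stay valid.
    cite: str = ""

class Tasks(Enum):
    task0 = Task("toxigen", "acc", "Toxicity (lower is better)",
                 cite="ToxiGen, Hartvigsen et al., ACL 2022.")
    task1 = Task("anli_r1", "acc", "ANLI")  # cite falls back to ""

# Enum members wrap Task instances, so the citation is reached via .value:
print(Tasks.task0.value.cite)        # -> ToxiGen, Hartvigsen et al., ACL 2022.
print(repr(Tasks.task1.value.cite))  # -> ''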
src/display/utils.py
CHANGED
@@ -17,6 +17,7 @@ class ColumnContent:
     name: str
     type: str
     displayed_by_default: bool
+    cite: str = ""
     hidden: bool = False
     never_hidden: bool = False
     dummy: bool = False
@@ -30,7 +31,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent(name="Model"
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent(name="Average ⬆️", type="number", never_hidden=False, displayed_by_default=False)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(name=task.value.col_name, type="number", displayed_by_default=True)])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(name=task.value.col_name, cite=task.value.cite, type="number", displayed_by_default=True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent(name="Type", type="str", never_hidden=False, displayed_by_default=False, advanced=True)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", never_hidden=False, displayed_by_default=False, advanced=True)])
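Finally, a sketch of how the citation travels end to end: from a Task in src/about.py, through the new cite field on ColumnContent, into the per-task entries of auto_eval_column_dict that app.py later reads. The class bodies are trimmed; the frozen=True decorator, the advanced field, and the final read loop follow the usual leaderboard-template pattern and are assumptions here, not code taken from this commit.

from dataclasses import dataclass

@dataclass
class Task:                      # trimmed copy of src/about.py's Task
    benchmark: str
    metric: str
    col_name: str
    cite: str = ""

@dataclass(frozen=True)          # frozen=True assumed, per the usual leaderboard-template pattern
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    cite: str = ""               # new field added by this commit
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False
    advanced: bool = False       # assumed: the advanced=True appends in this file rely on it

toxigen = Task("toxigen", "acc", "Toxicity (lower is better)",
               cite="ToxiGen, Hartvigsen et al., ACL 2022.")

auto_eval_column_dict = []
# Same shape as the loop in the hunk above, with one task standing in for the Tasks enum.
auto_eval_column_dict.append([
    "task0",
    ColumnContent,
    ColumnContent(name=toxigen.col_name, cite=toxigen.cite,
                  type="number", displayed_by_default=True),
])

# app.py can then pair each displayed metric with its citation:
for _, _, col in auto_eval_column_dict:
    print(col.name, "-", col.cite)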