Commit ba515db (1 parent: 85b4142)
[ADD] CI intervals for med-safety
Files changed:
- src/about.py (+11 -9)
- src/display/utils.py (+4 -2)
- src/leaderboard/read_evals.py (+9 -4)
- src/populate.py (+1 -1)
src/about.py
CHANGED
@@ -48,15 +48,17 @@ class MedSafetyColumn:
 
 class MedSafetyColumns(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    med_safety_column0 = MedSafetyColumn("
-    med_safety_column1 = MedSafetyColumn("
-    med_safety_column2 = MedSafetyColumn("
-    med_safety_column3 = MedSafetyColumn("
-    med_safety_column4 = MedSafetyColumn("
-    med_safety_column5 = MedSafetyColumn("
-    med_safety_column6 = MedSafetyColumn("
-    med_safety_column7 = MedSafetyColumn("
-    med_safety_column8 = MedSafetyColumn("
+    med_safety_column0 = MedSafetyColumn("Harmfulness Score", "score", "Harmfulness Score")
+    med_safety_column1 = MedSafetyColumn("95% CI", "score", "95% CI")
+    med_safety_column2 = MedSafetyColumn("Competence, Compassion, and Respect for Human Dignity", "score", "Competence, Compassion, and Respect for Human Dignity")
+    med_safety_column3 = MedSafetyColumn("Patient Rights and Confidentiality", "score", "Patient Rights and Confidentiality")
+    med_safety_column4 = MedSafetyColumn("Continued Study and Information Sharing", "score", "Continued Study and Information Sharing")
+    med_safety_column5 = MedSafetyColumn("Medical Care for All", "score", "Medical Care for All")
+    med_safety_column6 = MedSafetyColumn("Community and Public Health", "score", "Community and Public Health")
+    med_safety_column7 = MedSafetyColumn("Physician's Freedom of Choice", "score", "Physician's Freedom of Choice")
+    med_safety_column8 = MedSafetyColumn("Professionalism and Honesty", "score", "Professionalism and Honesty")
+    med_safety_column9 = MedSafetyColumn("Responsibility to Patient", "score", "Responsibility to Patient")
+    med_safety_column10 = MedSafetyColumn("Law and Responsibility to Society", "score", "Law and Responsibility to Society")
 
 @dataclass
 class MedicalSummarizationColumn:
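For reference (the removed entries above are truncated in this capture), the three positional arguments in each constructor call line up with the comment above the enum. A minimal sketch of the MedSafetyColumn dataclass these members wrap, assuming the field names benchmark, metric, and col_name (the first and last are what the read_evals.py diff below accesses as task.benchmark and task.value.col_name):

```python
from dataclasses import dataclass

@dataclass
class MedSafetyColumn:
    benchmark: str  # task_key in the results json, e.g. "95% CI"
    metric: str     # metric_key in the results json; "score" for every med-safety column
    col_name: str   # header text displayed in the leaderboard

# e.g. the new CI entry added above:
ci = MedSafetyColumn("95% CI", "score", "95% CI")
print(ci.benchmark, ci.col_name)  # -> 95% CI 95% CI
```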
src/display/utils.py
CHANGED
@@ -41,14 +41,16 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, invariant=False)])
 auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
-auto_eval_column_dict.append(["harmfulness", ColumnContent, ColumnContent("Harmfulness Score", "number", True, False, med_safety_col=True, invariant=False)])
 for task in HarnessTasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
 for column in OpenEndedColumns:
     auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_col=True, invariant=False)])
 # changes to be made here
 for column in MedSafetyColumns:
-    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, med_safety_col=True, invariant=False)])
+    if column.value.col_name == "95% CI" or column.value.col_name == "Harmfulness Score":
+        auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, med_safety_col=True, invariant=False)])
+    else:
+        auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, med_safety_col=True, invariant=False)])
 for column in MedicalSummarizationColumns:
     auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, medical_summarization_col=True, invariant=False)])
 for column in ACIColumns:
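The new if/else keeps only the aggregate score and its interval visible by default, while the nine per-principle columns stay selectable. Note the removed loop body was truncated in this capture; the reconstruction above assumes it was the same unconditional append pattern used by the neighboring loops. A minimal, self-contained sketch of the visibility split, assuming (as in the stock Hugging Face leaderboard template) that ColumnContent's third positional argument is displayed_by_default:

```python
from dataclasses import dataclass

# Simplified stand-in for ColumnContent; the real one also carries
# flags such as med_safety_col and invariant, omitted here.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False

names = ["Harmfulness Score", "95% CI",
         "Patient Rights and Confidentiality", "Medical Care for All"]  # abbreviated

# Mirrors the new if/else: only the aggregate and its CI are shown by default.
columns = [ColumnContent(n, "number", n in ("Harmfulness Score", "95% CI")) for n in names]

print([c.name for c in columns if c.displayed_by_default])
# -> ['Harmfulness Score', '95% CI']
```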
src/leaderboard/read_evals.py
CHANGED
@@ -117,8 +117,15 @@ class EvalResult:
         if "med-safety" in data["results"]:
             for task in MedSafetyColumns:
                 task = task.value
-                accs = data["results"]["med-safety"][task.benchmark]["score"]
-                med_safety_results[task.benchmark] = accs
+                if task.benchmark == "Harmfulness Score":
+                    accs = data["results"]["med-safety"][task.benchmark]
+                    med_safety_results[task.benchmark] = accs
+                elif task.benchmark == "95% CI":
+                    accs = data["results"]["med-safety"][task.benchmark]
+                    med_safety_results[task.benchmark] = "+" + str(round(accs[1], 3)) + "/-" + str(round(abs(accs[0]), 3))
+                else:
+                    accs = data["results"]["med-safety"][task.benchmark]["score"]
+                    med_safety_results[task.benchmark] = accs
         medical_summarization_results = {}
         if "medical-summarization" in data["results"]:
             for task in MedicalSummarizationColumns:
@@ -258,8 +265,6 @@ class EvalResult:
             return data_dict
         # changes to be made here
         if subset == "med_safety":
-            average = sum([v for v in self.med_safety_results.values() if v is not None]) / len(MedSafetyColumns)
-            data_dict[AutoEvalColumn.harmfulness.name] = average
             if len(self.med_safety_results) > 0:
                 for task in MedSafetyColumns:
                     data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
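Two notes on this file. The two removed lines in the first hunk were truncated in this capture; they are reconstructed above on the assumption that the old code was the unconditional per-category lookup now kept as the else branch. The second hunk then drops the old client-side averaging, since the Harmfulness Score is now read directly from the results file. The CI string concatenation is worth a sanity check with a concrete value; a quick sketch, assuming the results json stores the interval as (lower, upper) offsets around the harmfulness score (only the indexing is visible in the diff):

```python
# Hypothetical interval offsets: negative lower bound, positive upper bound.
accs = [-0.082, 0.0751]

ci = "+" + str(round(accs[1], 3)) + "/-" + str(round(abs(accs[0]), 3))
print(ci)  # -> +0.075/-0.082
```

An f-string such as f"+{accs[1]:.3f}/-{abs(accs[0]):.3f}" would pin the output to three decimal places, whereas round() drops trailing zeros (round(0.1, 3) renders as 0.1).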
src/populate.py
CHANGED
@@ -21,7 +21,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     if subset == "datasets":
         df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     elif subset == "med_safety":
-        df = df.sort_values(by=[
+        df = df.sort_values(by=["Harmfulness Score"], ascending=True)
     elif subset == "open_ended":
         df = df.sort_values(by=["ELO"], ascending=False)
     elif subset == "medical_summarization":
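(The removed sort line above is truncated in this capture and is left as-is.) Sorting ascending is consistent with harmfulness being a lower-is-better metric, so the safest model lands at the top; note the sort must stay on the numeric Harmfulness Score column, since the new 95% CI column holds strings like "+0.075/-0.082". A small sketch with hypothetical rows:

```python
import pandas as pd

# Hypothetical leaderboard rows (lower harmfulness presumed better).
df = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    "Harmfulness Score": [0.42, 0.17, 0.88],
})

df = df.sort_values(by=["Harmfulness Score"], ascending=True)
print(df["Model"].tolist())  # -> ['model-b', 'model-a', 'model-c']
```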