tathagataraha committed on
Commit
ba515db
·
1 Parent(s): 85b4142

[ADD] CI intervals for med-safety

Browse files
src/about.py CHANGED
@@ -48,15 +48,17 @@ class MedSafetyColumn:
48
 
49
  class MedSafetyColumns(Enum):
50
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
51
- med_safety_column0 = MedSafetyColumn("Competence, Compassion, and Respect for Human Dignity", "score", "Competence, Compassion, and Respect for Human Dignity")
52
- med_safety_column1 = MedSafetyColumn("Patient Rights and Confidentiality", "score", "Patient Rights and Confidentiality")
53
- med_safety_column2 = MedSafetyColumn("Continued Study and Information Sharing", "score", "Continued Study and Information Sharing")
54
- med_safety_column3 = MedSafetyColumn("Medical Care for All", "score", "Medical Care for All")
55
- med_safety_column4 = MedSafetyColumn("Community and Public Health", "score", "Community and Public Health")
56
- med_safety_column5 = MedSafetyColumn("Physician's Freedom of Choice", "score", "Physician's Freedom of Choice")
57
- med_safety_column6 = MedSafetyColumn("Professionalism and Honesty", "score", "Professionalism and Honesty")
58
- med_safety_column7 = MedSafetyColumn("Responsibility to Patient", "score", "Responsibility to Patient")
59
- med_safety_column8 = MedSafetyColumn("Law and Responsibility to Society", "score", "Law and Responsibility to Society")
 
 
60
 
61
  @dataclass
62
  class MedicalSummarizationColumn:
 
48
 
49
  class MedSafetyColumns(Enum):
50
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
51
+ med_safety_column0 = MedSafetyColumn("Harmfulness Score", "score", "Harmfulness Score")
52
+ med_safety_column1 = MedSafetyColumn("95% CI", "score", "95% CI")
53
+ med_safety_column2 = MedSafetyColumn("Competence, Compassion, and Respect for Human Dignity", "score", "Competence, Compassion, and Respect for Human Dignity")
54
+ med_safety_column3 = MedSafetyColumn("Patient Rights and Confidentiality", "score", "Patient Rights and Confidentiality")
55
+ med_safety_column4 = MedSafetyColumn("Continued Study and Information Sharing", "score", "Continued Study and Information Sharing")
56
+ med_safety_column5 = MedSafetyColumn("Medical Care for All", "score", "Medical Care for All")
57
+ med_safety_column6 = MedSafetyColumn("Community and Public Health", "score", "Community and Public Health")
58
+ med_safety_column7 = MedSafetyColumn("Physician's Freedom of Choice", "score", "Physician's Freedom of Choice")
59
+ med_safety_column8 = MedSafetyColumn("Professionalism and Honesty", "score", "Professionalism and Honesty")
60
+ med_safety_column9 = MedSafetyColumn("Responsibility to Patient", "score", "Responsibility to Patient")
61
+ med_safety_column10 = MedSafetyColumn("Law and Responsibility to Society", "score", "Law and Responsibility to Society")
62
 
63
  @dataclass
64
  class MedicalSummarizationColumn:
src/display/utils.py CHANGED
@@ -41,14 +41,16 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
41
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
42
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, invariant=False)])
43
  auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
44
- auto_eval_column_dict.append(["harmfulness", ColumnContent, ColumnContent("Harmfulness Score", "number", True, False, med_safety_col=True, invariant=False)])
45
  for task in HarnessTasks:
46
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
47
  for column in OpenEndedColumns:
48
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_col=True, invariant=False)])
49
  # changes to be made here
50
  for column in MedSafetyColumns:
51
- auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, med_safety_col=True, invariant=False)])
 
 
 
52
  for column in MedicalSummarizationColumns:
53
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, medical_summarization_col=True, invariant=False)])
54
  for column in ACIColumns:
 
41
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
42
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, invariant=False)])
43
  auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
 
44
  for task in HarnessTasks:
45
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
46
  for column in OpenEndedColumns:
47
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_col=True, invariant=False)])
48
  # changes to be made here
49
  for column in MedSafetyColumns:
50
+ if column.value.col_name == "95% CI" or column.value.col_name == "Harmfulness Score":
51
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, med_safety_col=True, invariant=False)])
52
+ else:
53
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, med_safety_col=True, invariant=False)])
54
  for column in MedicalSummarizationColumns:
55
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, medical_summarization_col=True, invariant=False)])
56
  for column in ACIColumns:
src/leaderboard/read_evals.py CHANGED
@@ -117,8 +117,15 @@ class EvalResult:
117
  if "med-safety" in data["results"]:
118
  for task in MedSafetyColumns:
119
  task = task.value
120
- accs = data["results"]["med-safety"][task.benchmark]["score"]
121
- med_safety_results[task.benchmark] = accs
 
 
 
 
 
 
 
122
  medical_summarization_results = {}
123
  if "medical-summarization" in data["results"]:
124
  for task in MedicalSummarizationColumns:
@@ -258,8 +265,6 @@ class EvalResult:
258
  return data_dict
259
  # changes to be made here
260
  if subset == "med_safety":
261
- average = sum([v for v in self.med_safety_results.values() if v is not None]) / len(MedSafetyColumns)
262
- data_dict[AutoEvalColumn.harmfulness.name] = average
263
  if len(self.med_safety_results) > 0:
264
  for task in MedSafetyColumns:
265
  data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
 
117
  if "med-safety" in data["results"]:
118
  for task in MedSafetyColumns:
119
  task = task.value
120
+ if task.benchmark == "Harmfulness Score":
121
+ accs = data["results"]["med-safety"][task.benchmark]
122
+ med_safety_results[task.benchmark] = accs
123
+ elif task.benchmark == "95% CI":
124
+ accs = data["results"]["med-safety"][task.benchmark]
125
+ med_safety_results[task.benchmark] = "+" + str(round(accs[1], 3)) + "/-" + str(round(abs(accs[0]), 3))
126
+ else:
127
+ accs = data["results"]["med-safety"][task.benchmark]["score"]
128
+ med_safety_results[task.benchmark] = accs
129
  medical_summarization_results = {}
130
  if "medical-summarization" in data["results"]:
131
  for task in MedicalSummarizationColumns:
 
265
  return data_dict
266
  # changes to be made here
267
  if subset == "med_safety":
 
 
268
  if len(self.med_safety_results) > 0:
269
  for task in MedSafetyColumns:
270
  data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
src/populate.py CHANGED
@@ -21,7 +21,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
21
  if subset == "datasets":
22
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
23
  elif subset == "med_safety":
24
- df = df.sort_values(by=[AutoEvalColumn.harmfulness.name], ascending=True)
25
  elif subset == "open_ended":
26
  df = df.sort_values(by=["ELO"], ascending=False)
27
  elif subset == "medical_summarization":
 
21
  if subset == "datasets":
22
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
23
  elif subset == "med_safety":
24
+ df = df.sort_values(by=["Harmfulness Score"], ascending=True)
25
  elif subset == "open_ended":
26
  df = df.sort_values(by=["ELO"], ascending=False)
27
  elif subset == "medical_summarization":