Spaces:
Running
Running
tathagataraha
committed on
Commit
·
5c80286
1
Parent(s):
23fd02c
[ADD] Dataset descriptions for cross-examination framework
Browse files
- app.py +2 -1
- src/about.py +13 -3
app.py
CHANGED
@@ -12,6 +12,7 @@ from src.about import (
|
|
12 |
INTRODUCTION_TEXT,
|
13 |
LLM_BENCHMARKS_TEXT_1,
|
14 |
CROSS_EVALUATION_METRICS,
|
|
|
15 |
# EVALUATION_EXAMPLE_IMG,
|
16 |
# LLM_BENCHMARKS_TEXT_2,
|
17 |
# ENTITY_DISTRIBUTION_IMG,
|
@@ -708,7 +709,7 @@ with demo:
|
|
708 |
with gr.Accordion("Cross Examination", open=False):
|
709 |
system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
|
710 |
with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=4):
|
711 |
-
gr.Markdown(
|
712 |
with gr.Tabs(elem_classes="tab-buttons2") as tabs:
|
713 |
with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-table2", id=0):
|
714 |
with gr.Row():
|
|
|
12 |
INTRODUCTION_TEXT,
|
13 |
LLM_BENCHMARKS_TEXT_1,
|
14 |
CROSS_EVALUATION_METRICS,
|
15 |
+
NOTE_GENERATION_METRICS,
|
16 |
# EVALUATION_EXAMPLE_IMG,
|
17 |
# LLM_BENCHMARKS_TEXT_2,
|
18 |
# ENTITY_DISTRIBUTION_IMG,
|
|
|
709 |
with gr.Accordion("Cross Examination", open=False):
|
710 |
system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
|
711 |
with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=4):
|
712 |
+
gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
|
713 |
with gr.Tabs(elem_classes="tab-buttons2") as tabs:
|
714 |
with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-table2", id=0):
|
715 |
with gr.Row():
|
src/about.py
CHANGED
@@ -58,7 +58,7 @@ class MedSafetyColumns(Enum):
|
|
58 |
med_safety_column7 = MedSafetyColumn("Physician's Freedom of Choice", "score", "Physician's Freedom of Choice")
|
59 |
med_safety_column8 = MedSafetyColumn("Professionalism and Honesty", "score", "Professionalism and Honesty")
|
60 |
med_safety_column9 = MedSafetyColumn("Responsibility to Patient", "score", "Responsibility to Patient")
|
61 |
-
|
62 |
|
63 |
@dataclass
|
64 |
class MedicalSummarizationColumn:
|
@@ -149,7 +149,9 @@ Medical Safety category uses the "med-safety" benchmark dataset, which consists
|
|
149 |
In this evaluation, an LLM acts as a judge. Each response from the evaluated model is assigned a "harmfulness" score on a predefined scale. A low score means the model refused the harmful request, while a high score means it complied with the request.
|
150 |
|
151 |
### Medical Summarization
|
152 |
-
This category evaluates the LLM's ability to summarize medical texts,
|
|
|
|
|
153 |
|
154 |
- Coverage: Measures how thoroughly the summary covers the original document. A higher score means the summary includes more details from the original.
|
155 |
- Conformity: Also called the non-contradiction score, this checks if the summary avoids contradicting the original document. A higher score means the summary aligns better with the original.
|
@@ -157,7 +159,15 @@ This category evaluates the LLM's ability to summarize medical texts, such as cl
|
|
157 |
- Conciseness: Measures how brief the summary is. A higher score means the summary is more concise. A negative score means the summary is longer than the original document.
|
158 |
|
159 |
### Note Generation
|
160 |
-
This category assesses the LLM's ability to generate structured clinical
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
"""
|
162 |
|
163 |
EVALUATION_QUEUE_TEXT = """
|
|
|
58 |
med_safety_column7 = MedSafetyColumn("Physician's Freedom of Choice", "score", "Physician's Freedom of Choice")
|
59 |
med_safety_column8 = MedSafetyColumn("Professionalism and Honesty", "score", "Professionalism and Honesty")
|
60 |
med_safety_column9 = MedSafetyColumn("Responsibility to Patient", "score", "Responsibility to Patient")
|
61 |
+
med_safety_column10 = MedSafetyColumn("Law and Responsibility to Society", "score", "Law and Responsibility to Society")
|
62 |
|
63 |
@dataclass
|
64 |
class MedicalSummarizationColumn:
|
|
|
149 |
In this evaluation, an LLM acts as a judge. Each response from the evaluated model is assigned a "harmfulness" score on a predefined scale. A low score means the model refused the harmful request, while a high score means it complied with the request.
|
150 |
|
151 |
### Medical Summarization
|
152 |
+
This category evaluates the LLM's ability to summarize medical texts, with a focus on clinical trial descriptions from ClinicalTrials.gov. The dataset consists of 1629 carefully selected clinical trial protocols with detailed study descriptions (3000-8000 tokens long). The task is to generate concise and accurate summaries of these protocols.
|
153 |
+
|
154 |
+
It uses a novel "cross-examination" framework, in which questions generated from the original document and from the LLM's summary are used to score the summary. The four key scores calculated are:
|
155 |
|
156 |
- Coverage: Measures how thoroughly the summary covers the original document. A higher score means the summary includes more details from the original.
|
157 |
- Conformity: Also called the non-contradiction score, this checks if the summary avoids contradicting the original document. A higher score means the summary aligns better with the original.
|
|
|
159 |
- Conciseness: Measures how brief the summary is. A higher score means the summary is more concise. A negative score means the summary is longer than the original document.
|
160 |
|
161 |
### Note Generation
|
162 |
+
This category assesses the LLM's ability to generate structured clinical notes from doctor-patient conversations. It uses the same cross-examination framework as Medical Summarization across two datasets:
|
163 |
+
|
164 |
+
- ACI-Bench: A comprehensive collection designed specifically for benchmarking clinical note generation from doctor-patient dialogues. The dataset contains patient visit notes that have been validated by expert medical scribes and physicians.
|
165 |
+
|
166 |
+
- SOAP Notes: Uses the test split of the ChartNote dataset, which contains 250 synthetic patient-doctor conversations generated from real clinical notes. The task involves generating notes in the SOAP format with the following sections:
|
167 |
+
- Subjective: Patient's description of symptoms, medical history, and personal experiences
|
168 |
+
- Objective: Observable data like physical exam findings, vital signs, and diagnostic test results
|
169 |
+
- Assessment: Healthcare provider's diagnosis based on subjective and objective information
|
170 |
+
- Plan: Treatment plan including medications, therapies, follow-ups, and referrals
|
171 |
"""
|
172 |
|
173 |
EVALUATION_QUEUE_TEXT = """
|