Spaces:
Sleeping
Sleeping
ljyflores
Turn reports into table, remove header, use logic to use organ of previous sentence
2b98370
import json | |
import nltk | |
import pandas as pd | |
import re | |
nltk.download('punkt') | |
from dataclasses import asdict, dataclass | |
from nltk.tokenize import sent_tokenize | |
from typing import Dict, List, Mapping, Sequence | |
from utils_report_parser import get_section_from_report | |
from transformers import ( | |
AutoModelForTokenClassification, | |
AutoTokenizer, | |
pipeline, | |
) | |
class Report: | |
patient_id: str | int | |
text: str | |
date: str | |
summary: str | None = None | |
def dict(self): | |
return {k: str(v) for k, v in asdict(self).items()} | |
def clean(s: str) -> str:
    """Normalize a raw report string.

    Joins lines into one string, drops underscores (long rule lines),
    removes bracketed/parenthesized spans, and collapses runs of
    whitespace into single spaces.
    """
    flattened = s.replace("\n", " ").replace("_", "")
    without_brackets = re.sub(r"\[.*?\]", "", flattened)
    without_parens = re.sub(r"\(.*?\)", "", without_brackets)
    # Splitting on any whitespace and re-joining collapses repeated spaces.
    return " ".join(without_parens.split())
def split_into_sentences(text: str) -> list[str]:
    """Tokenize *text* into sentences with NLTK's Punkt sentence tokenizer."""
    sentences = sent_tokenize(text)
    return sentences
def remove_keyword(text: str, keyword: str) -> str:
    """Replace the first case-insensitive occurrence of *keyword* in *text*
    with a single space; return *text* unchanged if the keyword is absent.

    The original implementation used ``str.replace`` on the matched
    substring, which replaced EVERY occurrence of that exact casing
    anywhere in the string, not only the one located by ``find``.
    Slicing at the found index removes exactly one occurrence.

    Args:
        text (str): Text to search.
        keyword (str): Keyword to remove (matched case-insensitively).

    Returns:
        str: Text with the first occurrence replaced by a space.
    """
    start_idx = text.lower().find(keyword.lower())
    if start_idx > -1:
        text = text[:start_idx] + " " + text[start_idx + len(keyword):]
    return text
def format_casemaker_data(
    df: pd.DataFrame, patient_id_column: str, text_column: str, date_column: str
):
    """Take in a pandas dataframe where each row corresponds to one report for
    a patient, and output a mapping from patient ID to that patient's reports
    sorted by date.

    Args:
        df (pd.DataFrame): Input dataframe on report level
        patient_id_column (str): Patient ID
        text_column (str): Text/Report
        date_column (str): Date (will be used to sort)
    """
    renamed = df.rename(
        columns={
            patient_id_column: "patient_id",
            text_column: "text",
            date_column: "date",
        }
    )
    # One Series entry per patient: a list of report-record dicts in date order.
    records_per_patient = (
        renamed.sort_values(by=["patient_id", "date"])
        .groupby("patient_id")
        .apply(lambda g: g[["patient_id", "text", "date"]].to_dict("records"))
    )
    reports_by_patient: dict[str, Sequence[Report]] = {}
    for pid, record_list in records_per_patient.items():
        reports_by_patient[str(pid)] = [Report(**record) for record in record_list]
    return reports_by_patient
class CaseMaker:
    """Splits clinical reports by organ and prepares per-organ summaries.

    The organ-keyword mapping is loaded from a JSON file; a biomedical NER
    pipeline keeps only sentences that mention signs/symptoms or diseases.
    """

    def __init__(self, organ_keywords_dict_path: str = "../assets/terms.json"):
        """Load the organ-keyword dictionary and build the NER pipeline.

        Args:
            organ_keywords_dict_path (str): Path to a JSON file mapping an
                organ name to a list of associated keywords.
        """
        # Context manager closes the file promptly; json.load(open(...))
        # previously leaked the handle.
        with open(organ_keywords_dict_path, "r") as f:
            self.organ_keyword_dict: Mapping[str, list[str]] = json.load(f)
        self.ner_pipe = pipeline(
            "ner",
            model=AutoModelForTokenClassification.from_pretrained(
                "d4data/biomedical-ner-all"
            ),
            tokenizer=AutoTokenizer.from_pretrained("d4data/biomedical-ner-all"),
            aggregation_strategy="simple",
            device_map="auto",
        )
        # Summarization pipeline is deliberately disabled (large model);
        # summarize_report raises a clear error until it is re-enabled.
        # self.summ_pipe = pipeline(
        #     "text2text-generation", model="starmpcc/Asclepius-7B", device_map="auto"
        # )

    def filter_out_irrelevant_sentences(self, lst: list[str]) -> list[str]:
        """Drop administrative boilerplate sentences (signatures, dates, staff)."""
        bad_keywords = (
            "date of procedure",
            "physicians",
            "report initiated by",
            "reported by",
            "reported and signed by",
        )
        # str.startswith accepts a tuple of prefixes: one call per sentence.
        return [s for s in lst if not s.lower().startswith(bad_keywords)]

    def remove_header_names(self, s: str) -> str:
        """Strip known section headers (e.g. "IMPRESSION:") from *s*.

        The compound header is listed FIRST: previously "FINDINGS:" was
        removed before "TECHNIQUE AND FINDINGS:" could match, leaving the
        dangling fragment "TECHNIQUE AND " in the text.
        """
        headers = [
            "TECHNIQUE AND FINDINGS",
            "IMPRESSION",
            "FINDINGS",
            "RECOMMENDATION",
            "COMPARISON",
            "INDICATION",
            "TECHNIQUE",
            "STUDY",
            "MEDICATIONS",
        ]
        for header in headers:
            s = remove_keyword(s, f"{header}:")
        return s

    def pick_organ_by_keyword(self, s: str) -> str | None:
        """Return the first organ whose name or any keyword occurs in *s*
        (case-insensitive substring match), or None if none matches."""
        lowered = s.lower()
        for organ, keywords in self.organ_keyword_dict.items():
            if any(term.lower() in lowered for term in [organ, *keywords]):
                return organ
        return None

    def parse_report_by_organ(self, report: str) -> dict[str, str]:
        """Split *report* into per-organ text.

        A sentence that names no organ inherits the organ of the previous
        sentence ("Other" at the start of the report).

        Args:
            report (str): Input report text.

        Returns:
            dict[str, str]: Organ name -> space-joined sentences.
        """
        report_string_by_organ: dict[str, str] = {}
        sentences = self.filter_out_irrelevant_sentences(split_into_sentences(report))
        previous_sentence_organ = "Other"
        for sentence in sentences:
            selected_organ = self.pick_organ_by_keyword(sentence)
            if selected_organ is None:
                # No organ mentioned: attribute to the previous sentence's organ.
                selected_organ = previous_sentence_organ
            else:
                previous_sentence_organ = selected_organ
            if selected_organ in report_string_by_organ:
                report_string_by_organ[selected_organ] += f" {sentence}"
            else:
                report_string_by_organ[selected_organ] = sentence
        return report_string_by_organ

    def trim_to_relevant_portion(self, report: str) -> str:
        """Keep only sentences whose NER entities include a sign/symptom or
        disease; return them newline-joined (empty string if none)."""
        relevant_sentences: list[str] = []
        for sentence in sent_tokenize(report):
            entities = self.ner_pipe(sentence)
            if any(
                ent["entity_group"] in ("Sign_symptom", "Disease_disorder")
                for ent in entities
            ):
                relevant_sentences.append(str(sentence))
        return "\n".join(relevant_sentences)

    def summarize_report(self, text: str) -> str:
        """Format *text* into a prompt and summarize the clinical text.

        Args:
            text (str): Input report.

        Returns:
            str: Cleaned model answer.

        Raises:
            RuntimeError: If the summarization pipeline is disabled
                (its construction is commented out in __init__).
        """
        if not hasattr(self, "summ_pipe"):
            # Fail loudly with context rather than an opaque AttributeError.
            raise RuntimeError(
                "summ_pipe is not initialized; enable it in CaseMaker.__init__"
            )
        question = (
            "Can you provide a succinct summary of the key clinical findings "
            "and treatment recommendations outlined in this discharge summary?"
        )
        prompt = """
You are an intelligent clinical languge model.
Below is a snippet of patient's discharge summary and a following instruction from healthcare professional.
Write a response that appropriately completes the instruction.
The response should provide the accurate answer to the instruction, while being concise.
[Discharge Summary Begin]
{note}
[Discharge Summary End]
[Instruction Begin]
{question}
[Instruction End]
""".format(
            question=question, note=text
        )
        # Cap generation at roughly half the input length (in words).
        output = self.summ_pipe(prompt, max_new_tokens=len(text.split()) // 2)[0][
            "generated_text"
        ]
        # The model echoes the prompt; keep only what follows the final marker.
        answer = output.split("[Instruction End]")[-1]
        return clean(answer)

    def parse_records(self, reports: Sequence[Report]) -> dict[str, list[Report]]:
        """Split each report by organ, then regroup text per organ across reports.

        NOTE: mutates the per-organ Report objects by assigning ``summary``.

        Args:
            reports (Sequence[Report]): Reports carrying text, date, patient_id.

        Returns:
            dict[str, list[Report]]: Organ -> reports whose ``summary`` holds
            the trimmed relevant text; reports with no relevant text are dropped.
        """
        reports_by_organ: dict[str, list[Report]] = {}
        for report in reports:
            # Cut the report down to its findings section.
            report_findings = get_section_from_report(report.text, "findings")
            # Remove any leftover section headers.
            report_findings = self.remove_header_names(report_findings)
            # For each organ, collect a record containing the text and date.
            for organ, report_text in self.parse_report_by_organ(report_findings).items():
                organ_level_record = Report(
                    text=report_text, date=report.date, patient_id=report.patient_id
                )
                reports_by_organ.setdefault(organ, []).append(organ_level_record)
        # Keep only reports with relevant (symptom/disease) content.
        summarized_reports_by_organ: dict[str, list[Report]] = {}
        for organ, organ_reports in reports_by_organ.items():
            cleaned_reports: list[Report] = []
            for organ_report in organ_reports:
                trimmed = self.trim_to_relevant_portion(organ_report.text)
                if trimmed:
                    organ_report.summary = trimmed
                    cleaned_reports.append(organ_report)
            summarized_reports_by_organ[organ] = cleaned_reports
        return summarized_reports_by_organ

    def format_reports(self, all_reports: Mapping[str, Sequence[Report]]) -> dict[str, str]:
        """Render each organ's reports as one markdown string of dated entries.

        (Annotation fixed: values are sequences of Report, not raw dicts.)
        """
        new_reports: dict[str, str] = {}
        for organ, organ_reports in all_reports.items():
            new_reports[organ] = "\n\n".join(
                f"**Report {str(r.date)}**\n\n{str(r.summary)}" for r in organ_reports
            )
        return new_reports