acres / utils /helpers.py
Patrick Walukagga
Update README instructions
17249df
# utils/helpers.py
import json
import os
from typing import Any, Dict, List
import chromadb
from chromadb.api.types import Document
from llama_index.core import Response
from rag.rag_pipeline import RAGPipeline
from utils.prompts import (
StudyCharacteristics,
VaccineCoverageVariables,
structured_follow_up_prompt,
)
# Initialize ChromaDB client
chromadb_client = chromadb.Client()
def read_study_files(file_path):
"""
Reads a JSON file and returns the parsed JSON data.
Args:
file_path (str): The path to the JSON file to be read.
Returns:
dict: The data from the JSON file as a Python dictionary.
Raises:
FileNotFoundError: If the file is not found at the provided path.
json.JSONDecodeError: If the file contents are not valid JSON.
Example:
Given a JSON file 'study_files.json' with content like:
{
"Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
"Ebola Virus": "data/ebola_virus_zotero_items.json",
"Gene Xpert": "data/gene_xpert_zotero_items.json"
}
Calling `read_json_file("study_files.json")` will return:
{
"Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
"Ebola Virus": "data/ebola_virus_zotero_items.json",
"Gene Xpert": "data/gene_xpert_zotero_items.json"
}
"""
try:
with open(file_path, "r") as file:
data = json.load(file)
return data
except FileNotFoundError as e:
raise FileNotFoundError(f"The file at path {file_path} was not found.") from e
except json.JSONDecodeError as e:
raise ValueError(
f"The file at path {file_path} does not contain valid JSON."
) from e
def append_to_study_files(file_path, new_key, new_value):
"""
Appends a new key-value entry to an existing JSON file.
Args:
file_path (str): The path to the JSON file.
new_key (str): The new key to add to the JSON file.
new_value (any): The value associated with the new key (can be any valid JSON data type).
Raises:
FileNotFoundError: If the file is not found at the provided path.
json.JSONDecodeError: If the file contents are not valid JSON.
IOError: If the file cannot be written.
Example:
If the file 'study_files.json' initially contains:
{
"Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
"Ebola Virus": "data/ebola_virus_zotero_items.json"
}
Calling `append_to_json_file("study_files.json", "Gene Xpert", "data/gene_xpert_zotero_items.json")`
will modify the file to:
{
"Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
"Ebola Virus": "data/ebola_virus_zotero_items.json",
"Gene Xpert": "data/gene_xpert_zotero_items.json"
}
"""
try:
# Read the existing data from the file
with open(file_path, "r") as file:
data = json.load(file)
# Append the new key-value pair to the dictionary
data[new_key] = new_value
# Write the updated data back to the file
with open(file_path, "w") as file:
json.dump(data, file, indent=4) # indent for pretty printing
except FileNotFoundError as e:
raise FileNotFoundError(f"The file at path {file_path} was not found.") from e
except json.JSONDecodeError as e:
raise ValueError(
f"The file at path {file_path} does not contain valid JSON."
) from e
except IOError as e:
raise IOError(f"Failed to write to the file at {file_path}.") from e
def generate_follow_up_questions(
rag: RAGPipeline, response: str, query: str, study_name: str
) -> List[str]:
"""
Generates follow-up questions based on the given RAGPipeline, response, query, and study_name.
Args:
rag (RAGPipeline): The RAGPipeline object used for generating follow-up questions.
response (str): The response to the initial query.
query (str): The initial query.
study_name (str): The name of the study.
Returns:
List[str]: A list of generated follow-up questions.
Raises:
None
"""
# Determine the study type based on the study_name
if "Vaccine Coverage" in study_name:
study_type = "Vaccine Coverage"
key_variables = list(VaccineCoverageVariables.__annotations__.keys())
elif "Ebola Virus" in study_name:
study_type = "Ebola Virus"
key_variables = [
"SAMPLE_SIZE",
"PLASMA_TYPE",
"DOSAGE",
"FREQUENCY",
"SIDE_EFFECTS",
"VIRAL_LOAD_CHANGE",
"SURVIVAL_RATE",
]
elif "Gene Xpert" in study_name:
study_type = "Gene Xpert"
key_variables = [
"OBJECTIVE",
"OUTCOME_MEASURES",
"SENSITIVITY",
"SPECIFICITY",
"COST_COMPARISON",
"TURNAROUND_TIME",
]
else:
study_type = "General"
key_variables = list(StudyCharacteristics.__annotations__.keys())
# Add key variables to the context
context = f"Study type: {study_type}\nKey variables to consider: {', '.join(key_variables)}\n\n{response}"
follow_up_response = rag.query(
structured_follow_up_prompt.format(
context_str=context,
query_str=query,
response_str=response,
study_type=study_type,
)
)
questions = follow_up_response.response.strip().split("\n")
cleaned_questions = []
for q in questions:
# Remove leading numbers and periods, and strip whitespace
cleaned_q = q.split(". ", 1)[-1].strip()
# Ensure the question ends with a question mark
if cleaned_q and not cleaned_q.endswith("?"):
cleaned_q += "?"
if cleaned_q:
cleaned_questions.append(f"✨ {cleaned_q}")
return cleaned_questions[:3]
def add_study_files_to_chromadb(file_path: str, collection_name: str):
"""
Reads the study files data from a JSON file and adds it to the specified ChromaDB collection.
:param file_path: Path to the JSON file containing study files data.
:param collection_name: Name of the ChromaDB collection to store the data.
"""
# Load study files data from JSON file
try:
with open(file_path, "r") as f:
study_files_data = json.load(f)
except FileNotFoundError:
print(f"File '{file_path}' not found.")
return
if not study_files_data:
return
# Get or create the collection in ChromaDB
collection = chromadb_client.get_or_create_collection(collection_name)
# Prepare lists for ids, texts, and metadata to batch insert
ids = []
documents = []
metadatas = []
# Populate lists with data from the JSON file
for name, file_path in study_files_data.items():
ids.append(name) # Document ID
documents.append("") # Optional text, can be left empty if not used
metadatas.append({"file_path": file_path}) # Metadata with file path
# Add documents to the collection in batch
collection.add(ids=ids, documents=documents, metadatas=metadatas)
print("All study files have been successfully added to ChromaDB.")
def create_directory(directory_path):
"""
Create a directory.
Does not raise an error if the directory already exists.
Args:
directory_path (str): Path of the directory to create
Returns:
bool: True if directory was created or already exists, False if creation failed
"""
try:
# Use exist_ok=True to prevent error if directory exists
os.makedirs(directory_path, exist_ok=True)
return True
except PermissionError:
print(f"Permission denied: Cannot create directory {directory_path}")
return False
except Exception as e:
print(f"An unexpected error occurred: {e}")
return False
if __name__ == "__main__":
# Usage example
add_study_files_to_chromadb("study_files.json", "study_files_collection")