# utils/helpers.py
"""Helper utilities for the study-files index, follow-up questions and ChromaDB.

The module reads and updates a JSON file that maps study names to data file
paths, asks a RAG pipeline for follow-up questions, registers the study files
in a ChromaDB collection, and creates directories.
"""

import json
import os
import re
from typing import Any, Dict, List, Tuple

import chromadb
from chromadb.api.types import Document
from llama_index.core import Response

from rag.rag_pipeline import RAGPipeline
from utils.prompts import (
    StudyCharacteristics,
    VaccineCoverageVariables,
    structured_follow_up_prompt,
)

# Initialize ChromaDB client
chromadb_client = chromadb.Client()

# Upper bound on the number of follow-up questions returned to the caller.
_MAX_FOLLOW_UP_QUESTIONS = 3

# Matches a single list marker at the very start of a line ("1. ", "2) ",
# "- ", "* ", "• "). Anchoring at the start keeps periods that occur inside
# the question itself (e.g. "Dr. Smith") untouched.
_LIST_MARKER_RE = re.compile(r"^\s*(?:\d+[.)]|[-*\u2022])\s+")

# Key variables used for studies that have no dedicated model in utils.prompts.
_EBOLA_VIRUS_VARIABLES = (
    "SAMPLE_SIZE",
    "PLASMA_TYPE",
    "DOSAGE",
    "FREQUENCY",
    "SIDE_EFFECTS",
    "VIRAL_LOAD_CHANGE",
    "SURVIVAL_RATE",
)
_GENE_XPERT_VARIABLES = (
    "OBJECTIVE",
    "OUTCOME_MEASURES",
    "SENSITIVITY",
    "SPECIFICITY",
    "COST_COMPARISON",
    "TURNAROUND_TIME",
)


def read_study_files(file_path: str) -> Dict[str, Any]:
    """
    Reads a JSON file and returns the parsed JSON data.

    Args:
        file_path (str): The path to the JSON file to be read.

    Returns:
        dict: The data from the JSON file as a Python dictionary.

    Raises:
        FileNotFoundError: If the file is not found at the provided path.
        ValueError: If the file contents are not valid JSON. The original
            json.JSONDecodeError is attached as the cause.

    Example:
        Given a JSON file 'study_files.json' with content like:
        {
            "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
            "Ebola Virus": "data/ebola_virus_zotero_items.json",
            "Gene Xpert": "data/gene_xpert_zotero_items.json"
        }

        Calling `read_study_files("study_files.json")` will return:
        {
            "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
            "Ebola Virus": "data/ebola_virus_zotero_items.json",
            "Gene Xpert": "data/gene_xpert_zotero_items.json"
        }
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, "r", encoding="utf-8") as file:
            return json.load(file)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"The file at path {file_path} was not found.") from e
    except json.JSONDecodeError as e:
        raise ValueError(
            f"The file at path {file_path} does not contain valid JSON."
        ) from e


def append_to_study_files(file_path: str, new_key: str, new_value: Any) -> None:
    """
    Adds (or overwrites) a key-value entry in an existing JSON file.

    Args:
        file_path (str): The path to the JSON file.
        new_key (str): The new key to add to the JSON file.
        new_value (any): The value associated with the new key (can be any
            valid JSON data type).

    Raises:
        FileNotFoundError: If the file is not found at the provided path.
        ValueError: If the file contents are not valid JSON. The original
            json.JSONDecodeError is attached as the cause.
        IOError: If the file cannot be read or written for any other
            OS-level reason.
        TypeError: If `new_value` is not JSON serializable. The file is left
            untouched in that case.

    Example:
        If the file 'study_files.json' initially contains:
        {
            "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
            "Ebola Virus": "data/ebola_virus_zotero_items.json"
        }

        Calling `append_to_study_files("study_files.json", "Gene Xpert",
        "data/gene_xpert_zotero_items.json")` will modify the file to:
        {
            "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
            "Ebola Virus": "data/ebola_virus_zotero_items.json",
            "Gene Xpert": "data/gene_xpert_zotero_items.json"
        }
    """
    try:
        # Read the existing data from the file
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        # Append the new key-value pair to the dictionary
        data[new_key] = new_value

        # Serialize BEFORE opening for writing: mode "w" truncates the file
        # immediately, so a serialization failure after that point would
        # destroy the existing contents.
        serialized = json.dumps(data, indent=4)  # indent for pretty printing

        # Write the updated data back to the file
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(serialized)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"The file at path {file_path} was not found.") from e
    except json.JSONDecodeError as e:
        raise ValueError(
            f"The file at path {file_path} does not contain valid JSON."
        ) from e
    except IOError as e:
        raise IOError(f"Failed to write to the file at {file_path}.") from e


def _study_type_and_key_variables(study_name: str) -> Tuple[str, List[str]]:
    """Map a study name to its study type label and the key variables for it."""
    if "Vaccine Coverage" in study_name:
        return "Vaccine Coverage", list(VaccineCoverageVariables.__annotations__.keys())
    if "Ebola Virus" in study_name:
        return "Ebola Virus", list(_EBOLA_VIRUS_VARIABLES)
    if "Gene Xpert" in study_name:
        return "Gene Xpert", list(_GENE_XPERT_VARIABLES)
    return "General", list(StudyCharacteristics.__annotations__.keys())


def generate_follow_up_questions(
    rag: RAGPipeline, response: str, query: str, study_name: str
) -> List[str]:
    """
    Generates follow-up questions based on the given RAGPipeline, response,
    query, and study_name.

    The study type is derived from `study_name` by substring match and is
    passed to the pipeline together with the matching key variables. Each
    non-empty line of the pipeline's answer becomes one question.

    Args:
        rag (RAGPipeline): The RAGPipeline object used for generating
            follow-up questions.
        response (str): The response to the initial query.
        query (str): The initial query.
        study_name (str): The name of the study.

    Returns:
        List[str]: At most three follow-up questions, each prefixed with
        "✨ " and ending with a question mark.
    """
    # Determine the study type based on the study_name
    study_type, key_variables = _study_type_and_key_variables(study_name)

    # Add key variables to the context
    context = f"Study type: {study_type}\nKey variables to consider: {', '.join(key_variables)}\n\n{response}"

    follow_up_response = rag.query(
        structured_follow_up_prompt.format(
            context_str=context,
            query_str=query,
            response_str=response,
            study_type=study_type,
        )
    )

    # NOTE(review): the response text is assumed to be a string or None;
    # None is treated as "no questions" instead of crashing on .strip().
    raw_text = follow_up_response.response or ""

    cleaned_questions: List[str] = []
    for line in raw_text.strip().splitlines():
        # Remove only a leading list marker. Splitting on the first ". "
        # would also cut unnumbered questions that contain an abbreviation.
        question = _LIST_MARKER_RE.sub("", line, count=1).strip()
        if not question:
            continue
        # Ensure the question ends with a question mark
        if not question.endswith("?"):
            question += "?"
        cleaned_questions.append(f"✨ {question}")
        if len(cleaned_questions) == _MAX_FOLLOW_UP_QUESTIONS:
            break
    return cleaned_questions


def add_study_files_to_chromadb(file_path: str, collection_name: str) -> None:
    """
    Reads the study files data from a JSON file and adds it to the specified
    ChromaDB collection.

    Each entry is stored with the study name as the document ID, an empty
    document text, and the data file path as metadata. A missing file is
    reported on stdout and nothing is added; an empty mapping is a no-op.
    Invalid JSON is not handled here and propagates to the caller.

    :param file_path: Path to the JSON file containing study files data.
    :param collection_name: Name of the ChromaDB collection to store the data.
    """
    # Load study files data from JSON file
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            study_files_data = json.load(f)
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return

    if not study_files_data:
        return

    # Get or create the collection in ChromaDB
    collection = chromadb_client.get_or_create_collection(collection_name)

    # Prepare parallel lists for the batch insert. The local names are kept
    # distinct from the `file_path` parameter to avoid shadowing it.
    ids = list(study_files_data)  # Document IDs (study names)
    documents = ["" for _ in ids]  # Optional text, left empty
    metadatas = [
        {"file_path": study_file_path}
        for study_file_path in study_files_data.values()
    ]

    # Add documents to the collection in batch
    collection.add(ids=ids, documents=documents, metadatas=metadatas)

    print("All study files have been successfully added to ChromaDB.")


def create_directory(directory_path: str) -> bool:
    """
    Create a directory. Does not raise an error if the directory already exists.

    Failures are reported on stdout instead of being raised.

    Args:
        directory_path (str): Path of the directory to create

    Returns:
        bool: True if directory was created or already exists, False if
        creation failed
    """
    try:
        # Use exist_ok=True to prevent error if directory exists
        os.makedirs(directory_path, exist_ok=True)
        return True
    except PermissionError:
        print(f"Permission denied: Cannot create directory {directory_path}")
        return False
    except Exception as e:
        # Deliberately broad: this helper reports every failure as False.
        print(f"An unexpected error occurred: {e}")
        return False


if __name__ == "__main__":
    # Usage example
    add_study_files_to_chromadb("study_files.json", "study_files_collection")