Spaces:

Sunbird
/

acres

Running

acres / utils / helpers.py

Patrick Walukagga

Update README instructions

17249df 2 months ago

8.23 kB

	# utils/helpers.py

	import json
	import os
	from typing import Any, Dict, List

	import chromadb
	from chromadb.api.types import Document
	from llama_index.core import Response

	from rag.rag_pipeline import RAGPipeline
	from utils.prompts import (
	StudyCharacteristics,
	VaccineCoverageVariables,
	structured_follow_up_prompt,
	)

	# Module-level ChromaDB client, created once at import time and used by
	# add_study_files_to_chromadb() below.
	# NOTE(review): chromadb.Client() with no arguments is presumably the
	# in-memory (non-persistent) client - confirm against the chromadb docs.
	chromadb_client = chromadb.Client()


	def read_study_files(file_path):
	    """
	    Read a JSON file and return the parsed JSON data.

	    Args:
	        file_path (str): The path to the JSON file to be read.

	    Returns:
	        dict: The data from the JSON file (a dictionary for the study-files
	            mapping shown in the example below).

	    Raises:
	        FileNotFoundError: If the file is not found at the provided path.
	        ValueError: If the file contents are not valid JSON. The original
	            json.JSONDecodeError is attached as the exception's cause.

	    Example:
	        Given a JSON file 'study_files.json' with content like:
	        {
	            "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
	            "Ebola Virus": "data/ebola_virus_zotero_items.json",
	            "Gene Xpert": "data/gene_xpert_zotero_items.json"
	        }

	        Calling `read_study_files("study_files.json")` will return:
	        {
	            "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
	            "Ebola Virus": "data/ebola_virus_zotero_items.json",
	            "Gene Xpert": "data/gene_xpert_zotero_items.json"
	        }
	    """
	    try:
	        # Read as UTF-8 explicitly so the result does not depend on the
	        # platform's locale encoding (e.g. cp1252 on Windows).
	        with open(file_path, "r", encoding="utf-8") as file:
	            return json.load(file)
	    except FileNotFoundError as e:
	        raise FileNotFoundError(f"The file at path {file_path} was not found.") from e
	    except json.JSONDecodeError as e:
	        raise ValueError(
	            f"The file at path {file_path} does not contain valid JSON."
	        ) from e


	def append_to_study_files(file_path, new_key, new_value):
	    """
	    Add (or overwrite) a key-value entry in an existing JSON file.

	    The file is read, the entry is set on the loaded dictionary, and the
	    whole dictionary is written back with 4-space indentation.

	    Args:
	        file_path (str): The path to the JSON file.
	        new_key (str): The new key to add to the JSON file.
	        new_value (any): The value associated with the new key (must be a
	            JSON-serializable value).

	    Raises:
	        FileNotFoundError: If the file is not found at the provided path.
	        ValueError: If the file contents are not valid JSON. The original
	            json.JSONDecodeError is attached as the exception's cause.
	        IOError: If the file cannot be read or written for another OS-level reason.
	        TypeError: If `new_value` is not JSON-serializable. The file is left
	            untouched in that case.

	    Example:
	        If the file 'study_files.json' initially contains:
	        {
	            "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
	            "Ebola Virus": "data/ebola_virus_zotero_items.json"
	        }

	        Calling `append_to_study_files("study_files.json", "Gene Xpert", "data/gene_xpert_zotero_items.json")`
	        will modify the file to:
	        {
	            "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
	            "Ebola Virus": "data/ebola_virus_zotero_items.json",
	            "Gene Xpert": "data/gene_xpert_zotero_items.json"
	        }
	    """
	    try:
	        # Read the existing data from the file (explicit UTF-8 so decoding
	        # does not depend on the platform's locale encoding).
	        with open(file_path, "r", encoding="utf-8") as file:
	            data = json.load(file)

	        # Add the new key-value pair to the dictionary
	        data[new_key] = new_value

	        # Serialize BEFORE opening the file for writing: mode "w" truncates
	        # immediately, so a serialization failure after that point would
	        # leave a corrupted, half-written file behind.
	        serialized = json.dumps(data, indent=4)  # indent for pretty printing

	        # Write the updated data back to the file
	        with open(file_path, "w", encoding="utf-8") as file:
	            file.write(serialized)

	    except FileNotFoundError as e:
	        raise FileNotFoundError(f"The file at path {file_path} was not found.") from e
	    except json.JSONDecodeError as e:
	        raise ValueError(
	            f"The file at path {file_path} does not contain valid JSON."
	        ) from e
	    except IOError as e:
	        raise IOError(f"Failed to write to the file at {file_path}.") from e


	def generate_follow_up_questions(
	    rag: RAGPipeline, response: str, query: str, study_name: str
	) -> List[str]:
	    """
	    Generate up to three follow-up questions for a query/response pair.

	    The study type and its key variables are chosen from `study_name`, added
	    to the context, and `structured_follow_up_prompt` is sent through
	    `rag.query`. Each non-empty line of the reply is treated as one question.

	    Args:
	        rag (RAGPipeline): The RAGPipeline object used for generating follow-up questions.
	        response (str): The response to the initial query.
	        query (str): The initial query.
	        study_name (str): The name of the study.
	    Returns:
	        List[str]: At most three questions, each prefixed with "✨ " and
	            ending with a question mark.
	    Raises:
	        Nothing is raised explicitly; errors from `rag.query` propagate.
	    """

	    # Determine the study type based on the study_name
	    if "Vaccine Coverage" in study_name:
	        study_type = "Vaccine Coverage"
	        key_variables = list(VaccineCoverageVariables.__annotations__.keys())
	    elif "Ebola Virus" in study_name:
	        study_type = "Ebola Virus"
	        key_variables = [
	            "SAMPLE_SIZE",
	            "PLASMA_TYPE",
	            "DOSAGE",
	            "FREQUENCY",
	            "SIDE_EFFECTS",
	            "VIRAL_LOAD_CHANGE",
	            "SURVIVAL_RATE",
	        ]
	    elif "Gene Xpert" in study_name:
	        study_type = "Gene Xpert"
	        key_variables = [
	            "OBJECTIVE",
	            "OUTCOME_MEASURES",
	            "SENSITIVITY",
	            "SPECIFICITY",
	            "COST_COMPARISON",
	            "TURNAROUND_TIME",
	        ]
	    else:
	        study_type = "General"
	        key_variables = list(StudyCharacteristics.__annotations__.keys())

	    # Add key variables to the context
	    context = f"Study type: {study_type}\nKey variables to consider: {', '.join(key_variables)}\n\n{response}"

	    follow_up_response = rag.query(
	        structured_follow_up_prompt.format(
	            context_str=context,
	            query_str=query,
	            response_str=response,
	            study_type=study_type,
	        )
	    )

	    cleaned_questions = []
	    for line in follow_up_response.response.strip().split("\n"):
	        # Strip a leading list number ("1. ", "2. ", ...) only when the text
	        # before the first ". " really is a number. Splitting on ". "
	        # unconditionally would truncate unnumbered questions that merely
	        # contain an abbreviation such as "vs. " or "Dr. ".
	        prefix, separator, remainder = line.partition(". ")
	        if separator and prefix.strip().isdigit():
	            cleaned_q = remainder.strip()
	        else:
	            cleaned_q = line.strip()

	        if not cleaned_q:
	            continue

	        # Ensure the question ends with a question mark
	        if not cleaned_q.endswith("?"):
	            cleaned_q += "?"
	        cleaned_questions.append(f"✨ {cleaned_q}")

	    # Only the first three questions are returned
	    return cleaned_questions[:3]


	def add_study_files_to_chromadb(file_path: str, collection_name: str) -> None:
	    """
	    Reads the study files data from a JSON file and adds it to the specified ChromaDB collection.

	    Each top-level key of the JSON object becomes a document ID; its value is
	    stored as the ``file_path`` metadata field, and the document text is left
	    empty. A missing file is reported with a printed message and an empty
	    JSON object is a no-op; invalid JSON is not caught here and propagates
	    as ``json.JSONDecodeError``.

	    :param file_path: Path to the JSON file containing study files data.
	    :param collection_name: Name of the ChromaDB collection to store the data.
	    """
	    # Load study files data from JSON file (explicit UTF-8 so decoding does
	    # not depend on the platform's locale encoding)
	    try:
	        with open(file_path, "r", encoding="utf-8") as f:
	            study_files_data = json.load(f)
	    except FileNotFoundError:
	        print(f"File '{file_path}' not found.")
	        return

	    if not study_files_data:
	        return

	    # Get or create the collection in ChromaDB
	    collection = chromadb_client.get_or_create_collection(collection_name)

	    # Build the parallel lists for a single batch insert. The loop variable
	    # is deliberately NOT named `file_path`, so the function parameter is
	    # not shadowed.
	    ids = list(study_files_data)  # Document IDs (study names)
	    documents = [""] * len(ids)  # Optional text, left empty as it is not used
	    metadatas = [
	        {"file_path": study_path} for study_path in study_files_data.values()
	    ]

	    # Add documents to the collection in batch
	    # NOTE(review): how `add` treats IDs already present in the collection
	    # depends on the chromadb version - confirm whether `upsert` is wanted
	    # when this function is called repeatedly.
	    collection.add(ids=ids, documents=documents, metadatas=metadatas)

	    print("All study files have been successfully added to ChromaDB.")


	def create_directory(directory_path):
	    """
	    Ensure that a directory exists, creating it (and any parents) if needed.
	    An already existing directory is treated as success.

	    Args:
	        directory_path (str): Path of the directory to create

	    Returns:
	        bool: True if the directory was created or already exists, False if
	            creation failed (the reason is printed).
	    """
	    succeeded = False
	    try:
	        # exist_ok=True makes an existing directory a non-error
	        os.makedirs(directory_path, exist_ok=True)
	    except PermissionError:
	        print(f"Permission denied: Cannot create directory {directory_path}")
	    except Exception as err:
	        print(f"An unexpected error occurred: {err}")
	    else:
	        succeeded = True
	    return succeeded


	if __name__ == "__main__":
	    # Example run: load the study-file mapping from study_files.json into
	    # the "study_files_collection" ChromaDB collection.
	    add_study_files_to_chromadb(
	        file_path="study_files.json",
	        collection_name="study_files_collection",
	    )