Spaces:

kishoregajjala
/

Mental-Health-Chatbot

Build error

App Files Files Community

Mental-Health-Chatbot / rag_pipeline_vectordb.py

kishoregajjala

Upload 8 files

db7706f verified 11 months ago

raw

history blame contribute delete

3.47 kB

	from langchain_community.document_loaders import PyMuPDFLoader
	from langchain_community.document_loaders import TextLoader
	from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
	from langchain.storage import InMemoryStore
	from langchain_community.document_loaders import TextLoader
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain.retrievers import ParentDocumentRetriever
	from langchain_community.vectorstores import Chroma
	from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
	from langchain_community.document_loaders.csv_loader import CSVLoader
	import chromadb
	from chromadb.utils import embedding_functions
	import os

	# Reference : https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0


	embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

	persist_directory="Data/chroma"
	chroma_client = chromadb.PersistentClient(path=persist_directory)


	# https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever
	parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)

	# This text splitter is used to create the child documents
	# It should create documents smaller than the parent
	child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

	def get_file_paths_recursively(folder_path):
	file_paths = []
	for root, directories, files in os.walk(folder_path):
	for file in files:
	file_path = os.path.join(root, file)
	file_paths.append(file_path)
	return file_paths

	def vdb_csv_loader(file_paths):
	for i in range(len(file_paths)):
	loader = CSVLoader(file_path=file_paths[i], encoding="latin-1")
	db = Chroma.from_documents(documents=loader.load(), embedding=embedding_function, collection_name= "mental_health_csv_collection", persist_directory=persist_directory) # pars to imclude (docs, emb_fun, col_name, direct_path)

	###
	def generate_csv_vector_db() -> None:

	# Get the directory path of the current script
	#script_dir = os.path.dirname(os.path.abspath(__file__))
	#folder_path = os.path.join(script_dir, 'Data/csv')
	folder_path = "Data/csv"
	file_paths = get_file_paths_recursively(folder_path)

	#loaded all the files
	vdb_csv_loader(file_paths)

	###
	pdf_collection = Chroma(collection_name="mental_health_pdf_collection", embedding_function=embedding_function, persist_directory=persist_directory)
	def vdb_pdf_loader(file_paths):
	for i in range(len(file_paths)):
	loader = PyMuPDFLoader(file_path=file_paths[i])
	documents = loader.load()

	store = InMemoryStore()
	rag_retriever = ParentDocumentRetriever(
	vectorstore=pdf_collection,
	docstore=store,
	child_splitter=child_splitter,
	parent_splitter=parent_splitter,
	)
	rag_retriever.add_documents(documents)


	def generate_pdf_vector_db() -> None:

	# Get the directory path of the current script
	#script_dir = os.path.dirname(os.path.abspath(__file__))
	#folder_path = os.path.join(script_dir, '/Data/pdf')
	folder_path = "Data/pdf"
	file_paths = get_file_paths_recursively(folder_path)
	vdb_pdf_loader(file_paths)


	def vectordb_load():
	# call csv loader
	generate_csv_vector_db()

	# call PDF loader
	generate_pdf_vector_db()

	# call vector db load
	vectordb_load()