Spaces:

GIZ
/

audit_assistant

Running on CPU Upgrade

App Files Files Community

audit_assistant / auditqa /process_chunks.py

ppsingh

Update auditqa/process_chunks.py

ee5bae7 verified 3 months ago

raw

history blame

4.26 kB

	import configparser
	import glob
	import json
	import logging
	import os

	from langchain.docstore.document import Document
	from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
	from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
	from langchain_community.vectorstores import Qdrant
	from qdrant_client import QdrantClient
	from torch import cuda
	from transformers import AutoTokenizer

	from auditqa.reports import files, report_list

	# Module-level settings: the torch device handed to the embedding model,
	# and the folder that holds one sub-directory per report.
	if cuda.is_available():
	    device = 'cuda'
	else:
	    device = 'cpu'
	path_to_data = "./reports/"


	##---------------------functions -------------------------------------------##
	def getconfig(configfile_path:str):
	    """
	    Read a .cfg file into a ConfigParser.

	    configfile_path: file path of .cfg file

	    Returns the populated ``configparser.ConfigParser``, or ``None`` (after
	    logging a warning) when the file cannot be opened or cannot be parsed.
	    """
	    config = configparser.ConfigParser()

	    try:
	        # Context manager so the file handle is closed even if parsing fails.
	        with open(configfile_path) as config_file:
	            config.read_file(config_file)
	    except OSError:
	        # Missing or unreadable file: keep the best-effort contract of
	        # warning and returning None instead of raising.
	        logging.warning("config file not found")
	        return None
	    except (configparser.Error, UnicodeDecodeError) as err:
	        logging.warning("config file %s could not be parsed: %s", configfile_path, err)
	        return None
	    return config

	def open_file(filepath):
	    """
	    Load and return the parsed content of the JSON file at ``filepath``.

	    The file is decoded as UTF-8 explicitly so the result does not depend
	    on the platform's default locale encoding.

	    Raises OSError if the file cannot be opened and json.JSONDecodeError
	    (a ValueError) if its content is not valid JSON.
	    """
	    with open(filepath, encoding="utf-8") as file:
	        return json.load(file)

	def load_chunks():
	    """
	    this method reads through the files and report_list to create the vector database

	    For every report name listed in ``files`` (category -> subtype -> names)
	    it loads ``<path_to_data>/<name>/<name>.chunks.json``, wraps each chunk
	    in a ``Document`` with filtering metadata, and embeds the combined set
	    into one in-memory Qdrant collection named "allreports".

	    A report whose chunk file cannot be read or parsed is reported on stdout
	    and skipped.

	    Returns:
	        dict mapping "allreports" to the Qdrant vector store built from
	        all loaded chunks.

	    Raises:
	        RuntimeError: if "./model_params.cfg" could not be loaded.
	    """
	    config = getconfig("./model_params.cfg")
	    if config is None:
	        # Fail here with a clear message rather than with an AttributeError
	        # on config.get() after all the chunk files have been read.
	        raise RuntimeError("could not load configuration from ./model_params.cfg")

	    # 'source' == category and 'subtype' are used in the UI for document
	    # selection and later for filtering the database.
	    all_documents = {}
	    # iterate through 'source'
	    for category in files.keys():
	        print("documents splitting in source:",category)
	        category_docs = []
	        # iterate through 'subtype' within the source
	        # example source/category == 'District', has subtypes which is district names
	        for subtype in files[category].keys():
	            print("document splitting for subtype:",subtype)
	            for file in files[category][subtype]:
	                chunks_path = path_to_data + file + "/" + file + ".chunks.json"
	                try:
	                    doc_processed = open_file(chunks_path)
	                except (OSError, ValueError) as e:
	                    print("Exception: ", e)
	                    # Skip this report: without the continue the code below
	                    # would reuse the previous report's chunks (or hit a
	                    # NameError on the very first file).
	                    continue
	                print("chunks in subtype:",subtype, "are:",len(doc_processed))

	                # add metadata information
	                category_docs.extend(
	                    Document(
	                        page_content=doc['content'],
	                        metadata={
	                            "source": category,
	                            "subtype": subtype,
	                            # last four characters of the report name;
	                            # presumably the report year - confirm naming scheme
	                            "year": file[-4:],
	                            "filename": file,
	                            "page": doc['metadata']['page'],
	                            "headings": doc['metadata']['headings'],
	                        },
	                    )
	                    for doc in doc_processed
	                )
	        all_documents[category] = category_docs

	    for key, docs_processed in all_documents.items():
	        print("length of chunks in source:",key, "are:",len(docs_processed))

	    # Combined list of every chunk across all categories; the comprehension
	    # is fully evaluated before the new key is inserted.
	    all_documents['allreports'] = [
	        item for docs_processed in all_documents.values() for item in docs_processed
	    ]

	    # define embedding model
	    embeddings = HuggingFaceEmbeddings(
	        model_kwargs = {'device': device},
	        encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE')))},
	        model_name=config.get('retriever','MODEL')
	    )

	    # Only the combined collection is embedded; per-category lists are
	    # kept in all_documents but not indexed.
	    collection = "allreports"
	    print("emebddings for:",collection)
	    qdrant_collections = {
	        collection: Qdrant.from_documents(
	            all_documents[collection],
	            embeddings,
	            location=":memory:",
	            collection_name=collection,
	        )
	    }
	    print(qdrant_collections)
	    print("vector embeddings done")
	    return qdrant_collections