import hashlib
import json
import os

import pandas as pd

from .gpt_processor import (
    EmbeddingGenerator,
    KeywordsGenerator,
    Summarizer,
    TopicsGenerator,
    Translator,
)
from .pdf_processor import PDFProcessor

processors = {
    "pdf": PDFProcessor,
}


class WorkFlowController:
    def __init__(self, file_src, uid) -> None:
        # file_src is expected to be a list of uploaded file objects that
        # expose a `.name` attribute holding a local file path.
        # self.file_paths = self.__get_file_name(file_src)
        self.file_paths = [x.name for x in file_src]
        self.uid = uid
        print(self.file_paths)

        self.files_info = {}

        for file_path in self.file_paths:
            file_name = os.path.basename(file_path)
            file_format = os.path.splitext(file_path)[1].lstrip(".").lower()
            if file_format not in processors:
                raise ValueError(f"Unsupported file format: {file_format}")
            self.file_processor = processors[file_format]
            file = self.file_processor(file_path).file_info
            file = self.__process_file(file)
            self.files_info[file_name] = file

        self.__dump_to_json()
        self.__dump_to_csv()

    def __get_summary(self, file: dict):
        # Summarize the full document text with the GPT-backed summarizer.
        summarizer = Summarizer()
        file["summarized_content"] = summarizer.summarize(file["file_full_content"])
        return file

    def __get_keywords(self, file: dict):
        # Extract keywords from the full document text.
        keywords_generator = KeywordsGenerator()
        file["keywords"] = keywords_generator.extract_keywords(
            file["file_full_content"]
        )
        return file

    def __get_topics(self, file: dict):
        # Extract topics from the full document text.
        topics_generator = TopicsGenerator()
        file["topics"] = topics_generator.extract_topics(file["file_full_content"])
        return file

    def __get_embedding(self, file):
        # Compute an embedding for each page. `file_content` is a dict keyed
        # by page number starting at 1, so iterate over its values directly
        # instead of enumerating and offsetting the index by one.
        embedding_generator = EmbeddingGenerator()
        for page in file["file_content"].values():
            page["page_embedding"] = embedding_generator.get_embedding(
                page["page_content"]
            )
        return file

    def __translate_to_chinese(self, file: dict):
        # Translate each page to Chinese, rebuilding the full-document text
        # from the translated pages as we go.
        translator = Translator()
        file["file_full_content"] = ""
        for page_num, page in file["file_content"].items():
            print(f"Translating page: {page_num}")
            page["page_content"] = translator.translate_to_chinese(
                page["page_content"]
            )
            file["file_full_content"] += page["page_content"]
        return file

    def __process_file(self, file: dict):
        # Pipeline for a single file: translate (if needed), embed, summarize.
        if not file["is_chinese"]:
            print("Translating to Chinese...")
            file = self.__translate_to_chinese(file)
        print("Getting embedding...")
        file = self.__get_embedding(file)
        print("Getting summary...")
        file = self.__get_summary(file)
        return file

    def __dump_to_json(self):
        self.json_result_path = os.path.join(
            os.getcwd(), f"{self.uid}_knowledge_base.json"
        )
        print("Dumping to json, the path is: " + self.json_result_path)
        with open(self.json_result_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps Chinese text readable in the dump.
            json.dump(self.files_info, f, indent=4, ensure_ascii=False)

    def __construct_knowledge_base_dataframe(self):
        # Flatten the per-file, per-page info into one DataFrame row per page.
        rows = []
        for content in self.files_info.values():
            for page_details in content["file_content"].values():
                rows.append(
                    {
                        "file_name": content["file_name"],
                        "page_num": page_details["page_num"],
                        "page_content": page_details["page_content"],
                        "page_embedding": page_details["page_embedding"],
                    }
                )
        columns = ["file_name", "page_num", "page_content", "page_embedding"]
        return pd.DataFrame(rows, columns=columns)

    def __dump_to_csv(self):
        df = self.__construct_knowledge_base_dataframe()
        self.csv_result_path = os.path.join(
            os.getcwd(), f"{self.uid}_knowledge_base.csv"
        )
        df.to_csv(self.csv_result_path, index=False)
        print("Dumping to csv, the path is: " + self.csv_result_path)

    def __get_file_name(self, file_src):
        # Currently unused (see the commented-out call in __init__): returns
        # an MD5 digest over the contents of all uploaded files, which could
        # serve as a stable identifier for the batch.
        file_paths = [x.name for x in file_src]
        file_paths.sort(key=os.path.basename)

        md5_hash = hashlib.md5()
        for file_path in file_paths:
            with open(file_path, "rb") as f:
                while chunk := f.read(8192):
                    md5_hash.update(chunk)
        return md5_hash.hexdigest()
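
# Usage sketch (an assumption, not part of this module): `uploaded_files` is a
# list of Gradio-style file objects exposing a `.name` path (e.g. from
# gr.Files), and "user123" is an arbitrary uid used only to name the outputs.
#
#     controller = WorkFlowController(file_src=uploaded_files, uid="user123")
#     print(controller.json_result_path)  # ./user123_knowledge_base.json
#     print(controller.csv_result_path)   # ./user123_knowledge_base.csv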