File size: 5,024 Bytes
a2f42ca
f807e7d
a2f42ca
 
f807e7d
a2f42ca
 
 
 
f807e7d
 
 
 
 
 
 
a2f42ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f807e7d
 
a2f42ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f807e7d
a2f42ca
 
 
 
 
f807e7d
a2f42ca
 
 
f807e7d
a2f42ca
 
 
 
 
f807e7d
a2f42ca
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import os
import json
import logging
import hashlib

import pandas as pd

from .gpt_processor import (EmbeddingGenerator, KeywordsGenerator, Summarizer,
                            TopicsGenerator, Translator)
from .pdf_processor import PDFProcessor

processors = {
    'pdf': PDFProcessor,
}

class WorkFlowController():
    def __init__(self, file_src) -> None:
        # check if the file_path is list
        # self.file_paths = self.__get_file_name(file_src)
        self.file_paths = [x.name for x in file_src]

        print(self.file_paths)

        self.files_info = {}

        for file_path in self.file_paths:
            file_name = file_path.split('/')[-1]
            file_format = file_path.split('.')[-1]
            self.file_processor = processors[file_format]
            file = self.file_processor(file_path).file_info
            file = self.__process_file(file)
            self.files_info[file_name] = file

        self.__dump_to_json()
        self.__dump_to_csv()


    def __get_summary(self, file: dict):
        # get summary from file content
        
        summarizer = Summarizer()
        file['summarized_content'] = summarizer.summarize(file['file_full_content'])
        return file

    def __get_keywords(self, file: dict):
        # get keywords from file content
        keywords_generator = KeywordsGenerator()
        file['keywords'] = keywords_generator.extract_keywords(file['file_full_content'])
        return file

    def __get_topics(self, file: dict):
        # get topics from file content
        topics_generator = TopicsGenerator()
        file['topics'] = topics_generator.extract_topics(file['file_full_content'])
        return file

    def __get_embedding(self, file):
        # get embedding from file content
        # return embedding
        embedding_generator = EmbeddingGenerator()

        for i, _ in enumerate(file['file_content']):
            # use i+1 to meet the index of file_content
            file['file_content'][i+1]['page_embedding'] = embedding_generator.get_embedding(file['file_content'][i+1]['page_content'])
        return file
        

    def __translate_to_chinese(self, file: dict):
        # translate file content to chinese
        translator = Translator()
        # reset the file full content
        file['file_full_content'] = ''

        for i, _ in enumerate(file['file_content']):
            # use i+1 to meet the index of file_content
            file['file_content'][i+1]['page_content'] = translator.translate_to_chinese(file['file_content'][i+1]['page_content'])
            file['file_full_content'] = file['file_full_content'] + file['file_content'][i+1]['page_content']
        return file
        
    def __process_file(self, file: dict):
        # process file content
        # return processed data
        if not file['is_chinese']:
            file = self.__translate_to_chinese(file)
        file = self.__get_embedding(file)
        file = self.__get_summary(file)
        # file = self.__get_keywords(file)
        # file = self.__get_topics(file)
        return file

    def __dump_to_json(self):
        with open(os.path.join(os.getcwd(), 'knowledge_base.json'), 'w', encoding='utf-8') as f:
            print("Dumping to json, the path is: " + os.path.join(os.getcwd(), 'knowledge_base.json'))
            self.result_path = os.path.join(os.getcwd(), 'knowledge_base.json')
            json.dump(self.files_info, f, indent=4, ensure_ascii=False)

    def __construct_knowledge_base_dataframe(self):

        rows = []
        for file_path, content in self.files_info.items():
            file_full_content = content["file_full_content"]
            for page_num, page_details in content["file_content"].items():
                row = {
                    "file_name": content["file_name"],
                    "page_num": page_details["page_num"],
                    "page_content": page_details["page_content"],
                    "page_embedding": page_details["page_embedding"],
                    "file_full_content": file_full_content,
                }
                rows.append(row)

        columns = ["file_name", "page_num", "page_content", "page_embedding", "file_full_content"]
        df = pd.DataFrame(rows, columns=columns)
        return df

    def __dump_to_csv(self):
        df = self.__construct_knowledge_base_dataframe()
        df.to_csv(os.path.join(os.getcwd(), 'knowledge_base.csv'), index=False)
        print("Dumping to csv, the path is: " + os.path.join(os.getcwd(), 'knowledge_base.csv'))
        self.csv_result_path = os.path.join(os.getcwd(), 'knowledge_base.csv')

    def __get_file_name(self, file_src):
        file_paths = [x.name for x in file_src]
        file_paths.sort(key=lambda x: os.path.basename(x))

        md5_hash = hashlib.md5()
        for file_path in file_paths:
            with open(file_path, "rb") as f:
                while chunk := f.read(8192):
                    md5_hash.update(chunk)

        return md5_hash.hexdigest()