File size: 5,489 Bytes
a2f42ca
f807e7d
a2f42ca
 
f807e7d
a2f42ca
 
26f62c4
 
 
 
 
 
 
f807e7d
 
 
26f62c4
f807e7d
 
26f62c4
 
c88c1d9
a2f42ca
 
 
c88c1d9
a2f42ca
 
 
 
 
 
26f62c4
 
a2f42ca
 
 
 
 
 
 
 
 
 
26f62c4
a2f42ca
26f62c4
a2f42ca
 
 
 
 
26f62c4
 
 
a2f42ca
 
 
 
 
26f62c4
a2f42ca
 
 
 
 
 
 
26f62c4
a2f42ca
26f62c4
 
 
 
 
a2f42ca
 
 
 
 
 
26f62c4
a2f42ca
26f62c4
a2f42ca
c88c1d9
26f62c4
 
 
 
 
 
 
 
a2f42ca
26f62c4
a2f42ca
f807e7d
 
26f62c4
c88c1d9
a2f42ca
c88c1d9
a2f42ca
c88c1d9
a2f42ca
 
 
 
26f62c4
66b707b
 
 
26f62c4
 
 
c88c1d9
26f62c4
66b707b
 
 
a2f42ca
 
 
 
 
 
 
 
 
 
 
 
 
 
26f62c4
 
 
 
 
 
a2f42ca
 
f807e7d
a2f42ca
 
66b707b
 
 
26f62c4
 
c88c1d9
26f62c4
66b707b
 
 
f807e7d
a2f42ca
 
 
f807e7d
a2f42ca
 
 
 
 
f807e7d
26f62c4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import os
import json
import logging
import hashlib

import pandas as pd

from .gpt_processor import (
    EmbeddingGenerator,
    KeywordsGenerator,
    Summarizer,
    TopicsGenerator,
    Translator,
)
from .pdf_processor import PDFProcessor

# Dispatch table: maps a file extension (as produced by splitting the file
# path on ".") to the processor class that can parse that format.  Looking up
# an unsupported extension raises KeyError.
processors = {
    "pdf": PDFProcessor,
}


class WorkFlowController:
    """Orchestrate turning a batch of uploaded files into a knowledge base.

    For every input file the matching processor (see module-level
    ``processors``) extracts the raw content; the content is then translated
    to Chinese when needed, embedded page by page, and summarized.  The
    aggregated result is written to ``<uid>_knowledge_base.json`` and
    ``<uid>_knowledge_base.csv`` in the current working directory, and the
    paths are exposed as ``json_result_path`` / ``csv_result_path``.

    NOTE(review): ``file_content`` is assumed to be a dict keyed by page
    number with ``page_num``/``page_content`` entries per page, as produced
    by the processors — confirm against ``PDFProcessor.file_info``.
    """

    def __init__(self, file_src, uid) -> None:
        """Process every file in *file_src* on behalf of user *uid*.

        Args:
            file_src: iterable of objects exposing a ``.name`` attribute that
                holds the file path (e.g. upload handles).
            uid: identifier used to name the JSON/CSV output artifacts.

        Raises:
            KeyError: if a file has an extension with no registered processor.
        """
        self.file_paths = [x.name for x in file_src]
        self.uid = uid

        print(self.file_paths)

        self.files_info = {}

        for file_path in self.file_paths:
            file_name = file_path.split("/")[-1]
            # Lower-case the extension so e.g. "report.PDF" is handled too;
            # a genuinely unsupported extension still raises KeyError.
            file_format = file_path.split(".")[-1].lower()
            self.file_processor = processors[file_format]
            file = self.file_processor(file_path).file_info
            file = self.__process_file(file)
            self.files_info[file_name] = file

        self.__dump_to_json()
        self.__dump_to_csv()

    def __get_summary(self, file: dict):
        """Attach a summary of ``file_full_content`` as ``summarized_content``."""
        summarizer = Summarizer()
        file["summarized_content"] = summarizer.summarize(file["file_full_content"])
        return file

    def __get_keywords(self, file: dict):
        """Attach keywords extracted from ``file_full_content`` as ``keywords``."""
        keywords_generator = KeywordsGenerator()
        file["keywords"] = keywords_generator.extract_keywords(
            file["file_full_content"]
        )
        return file

    def __get_topics(self, file: dict):
        """Attach topics extracted from ``file_full_content`` as ``topics``."""
        topics_generator = TopicsGenerator()
        file["topics"] = topics_generator.extract_topics(file["file_full_content"])
        return file

    def __get_embedding(self, file: dict):
        """Compute a ``page_embedding`` for every page in ``file_content``."""
        embedding_generator = EmbeddingGenerator()

        # file_content is keyed by page number; iterate the keys directly
        # instead of reconstructing them via enumerate()+1.
        for page_num in file["file_content"]:
            page = file["file_content"][page_num]
            page["page_embedding"] = embedding_generator.get_embedding(
                page["page_content"]
            )
        return file

    def __translate_to_chinese(self, file: dict):
        """Translate every page to Chinese and rebuild ``file_full_content``."""
        translator = Translator()
        # The full content must be rebuilt from the translated pages.
        file["file_full_content"] = ""

        for page_num in file["file_content"]:
            print("Translating page: " + str(page_num))
            page = file["file_content"][page_num]
            page["page_content"] = translator.translate_to_chinese(
                page["page_content"]
            )
            file["file_full_content"] = (
                file["file_full_content"] + page["page_content"]
            )
        return file

    def __process_file(self, file: dict):
        """Run the full pipeline (translate → embed → summarize) on one file."""
        if not file["is_chinese"]:
            print("Translating to chinese...")
            file = self.__translate_to_chinese(file)
        print("Getting embedding...")
        file = self.__get_embedding(file)
        print("Getting summary...")
        file = self.__get_summary(file)
        return file

    def __dump_to_json(self):
        """Write ``files_info`` to ``<cwd>/<uid>_knowledge_base.json`` (UTF-8)."""
        # Build the path once and expose it before writing.
        self.json_result_path = os.path.join(
            os.getcwd(), f"{self.uid}_knowledge_base.json"
        )
        print("Dumping to json, the path is: " + self.json_result_path)
        with open(self.json_result_path, "w", encoding="utf-8") as f:
            json.dump(self.files_info, f, indent=4, ensure_ascii=False)

    def __construct_knowledge_base_dataframe(self):
        """Flatten ``files_info`` into one DataFrame row per page."""
        rows = []
        for content in self.files_info.values():
            for page_details in content["file_content"].values():
                rows.append(
                    {
                        "file_name": content["file_name"],
                        "page_num": page_details["page_num"],
                        "page_content": page_details["page_content"],
                        "page_embedding": page_details["page_embedding"],
                    }
                )

        columns = [
            "file_name",
            "page_num",
            "page_content",
            "page_embedding",
        ]
        return pd.DataFrame(rows, columns=columns)

    def __dump_to_csv(self):
        """Write the per-page knowledge base to ``<cwd>/<uid>_knowledge_base.csv``."""
        df = self.__construct_knowledge_base_dataframe()
        # Build the path once and reuse it for the write, the log and the attribute.
        self.csv_result_path = os.path.join(
            os.getcwd(), f"{self.uid}_knowledge_base.csv"
        )
        df.to_csv(self.csv_result_path, index=False)
        print("Dumping to csv, the path is: " + self.csv_result_path)

    def __get_file_name(self, file_src):
        """Return an MD5 digest over the contents of all files in *file_src*.

        Files are hashed in basename order so the digest is independent of the
        upload order.  Currently unused by __init__ (kept for callers that
        want a content-addressed name for the batch).
        """
        file_paths = [x.name for x in file_src]
        file_paths.sort(key=lambda x: os.path.basename(x))

        md5_hash = hashlib.md5()
        for file_path in file_paths:
            with open(file_path, "rb") as f:
                # Hash in 8 KiB chunks to keep memory flat for large files.
                while chunk := f.read(8192):
                    md5_hash.update(chunk)

        return md5_hash.hexdigest()