Chenyu committed on
Commit
f807e7d
1 Parent(s): 175c5c3

Add prod app

Browse files
app.py CHANGED
@@ -1,7 +1,132 @@
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
 
 
 
1
+ import json
2
+ import time
3
+ import random
4
+
5
  import gradio as gr
6
+ import pandas as pd
7
+
8
+ from utils.gpt_processor import QuestionAnswerer
9
+
10
# Process-wide chat state. These are plain module globals, so every
# conversation handled by this process reads and writes the same values.
qa_processor = QuestionAnswerer()
current_file = None  # file name most recently selected through bot()
context = None  # full text of that file; stays None until one is selected

# Load the document database once at import time; bot() looks entries up by
# file name and reads their 'keywords', 'summarized_content' and
# 'file_full_content' fields.
with open("final_result.json", mode='r', encoding='UTF-8') as db_file:
    db = json.load(db_file)
16
+
17
def read_examples():
    """Build the example prompts from the keywords listed in ``examples.csv``.

    The file is resolved against the current working directory and must have
    a ``word`` column. Rows whose ``word`` cell is empty are skipped;
    otherwise pandas reads them as NaN and they would surface as a
    "「nan」" prompt.

    Returns:
        list[str]: One "我想了解有關於「<keyword>」的文件" prompt per keyword,
        in file order.
    """
    keywords = pd.read_csv('examples.csv')['word'].dropna()
    return [f"我想了解有關於「{keyword}」的文件" for keyword in keywords]
20
+
21
def user(message, history):
    """Record a newly submitted message as a pending chat turn.

    Args:
        message: Text the user just submitted.
        history: Existing chat history, a list of ``[user, bot]`` pairs.

    Returns:
        tuple: An empty string (the new textbox value) and a new history
        list ending with ``[message, None]``. The list passed in is not
        modified.
    """
    pending_turn = [message, None]
    extended_history = history + [pending_turn]
    return "", extended_history
24
+
25
def bot(history):
    """Fill in the assistant reply for the newest user message.

    The reply depends on what the message looks like:

    * It contains "我想了解": the text between 「 and 」 is taken as a keyword
      and every file in ``db`` whose ``keywords`` contain it is listed.
    * It contains ".pdf" or ".docx": the message is treated as a file name.
      If ``db`` has that file, its stored summary is shown and its full text
      is remembered in the module globals ``current_file`` / ``context``.
    * Anything else: the question is answered from the remembered context by
      ``qa_processor``, or the user is asked to pick a file first when no
      context has been selected yet.

    Args:
        history: Chat history as a list of ``[user_message, bot_message]``
            pairs; the last pair carries the message to answer.

    Returns:
        list: For a keyword search, a new history holding only this
        exchange. In every other case the list that was passed in, with its
        last pair replaced by ``[user_message, reply]``.
    """
    global current_file
    global context

    user_message = history[-1][0]

    if "我想了解" in user_message:
        keyword = _extract_keyword(user_message)
        # An empty keyword is treated as "nothing found"; a message without
        # brackets used to raise IndexError here.
        if keyword:
            file_list = [
                name for name, info in db.items()
                if keyword in (info.get('keywords') or ())
            ]
        else:
            file_list = []

        if not file_list:
            bot_message = "Sorry, I can't find any documents about this topic. Please try again."
        else:
            bot_message = "\n".join(["以下是我所找到的文件:", *file_list])
            bot_message += "\n\n" + "請複製貼上想要了解的文件,我會給你該文件的摘要"
        # NOTE(review): this branch has always returned a history containing
        # only the current exchange (earlier turns are dropped) -- confirm
        # that resetting the conversation on a new search is intended.
        return [[user_message, bot_message]]

    if ".pdf" in user_message or ".docx" in user_message:
        # File names are expected to be pasted in, so tolerate stray
        # surrounding whitespace.
        file_name = user_message.strip()
        entry = db.get(file_name)
        if entry is None:
            # Look the file up BEFORE touching the globals: indexing db
            # first raised KeyError for unknown names, so this reply could
            # never be produced.
            bot_message = "Sorry, I can't find this file. Please try again."
        else:
            current_file = file_name
            context = entry['file_full_content']
            bot_message = f"文件 {file_name} 的摘要如下:"
            bot_message += "\n\n" + entry['summarized_content']
            bot_message += "\n\n" + "可以透過詢問來了解更多這個文件的內容"
        history[-1] = [user_message, bot_message]
        return history

    if context is None:
        bot_message = "請輸入一個文件名稱或是點選下方的範例"
    else:
        bot_message = qa_processor.answer_question(context, user_message)
    history[-1] = [user_message, bot_message]
    return history


def _extract_keyword(user_message):
    """Return the text enclosed in 「」 in *user_message*, or "" if there is none."""
    _, opener, tail = user_message.partition("「")
    if not opener:
        return ""
    return tail.partition("」")[0]
95
+
96
# NOTE(review): the widget nesting below was reconstructed from a view of the
# file that had lost its indentation -- confirm the layout against the real
# app.py before relying on it.
with gr.Blocks() as demo:
    # State holders; nothing in this block reads or updates them.
    history = gr.State([])
    user_question = gr.State("")

    # Page title.
    with gr.Row():
        gr.HTML('Junyi Academy Chatbot')

    with gr.Row(equal_height=True):
        with gr.Column(scale=5):
            # Conversation display.
            with gr.Row():
                chatbot = gr.Chatbot()

            # Input row: the message box and the clear button.
            with gr.Row():
                with gr.Column(scale=12):
                    user_input = gr.Textbox(
                        show_label=False,
                        placeholder="Enter text",
                        container=False,
                    )
                with gr.Column(min_width=70, scale=1):
                    clear_btn = gr.Button("Clear")

    # Submitting the textbox first runs `user` (appends the message to the
    # chat and empties the textbox), then `bot` (fills in the reply).
    response = user_input.submit(
        user,
        [user_input, chatbot],
        [user_input, chatbot],
        queue=False,
    ).then(bot, chatbot, chatbot)
    # After the reply, mark the textbox interactive.
    response.then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
    # "Clear" sets the chatbot value to None.
    clear_btn.click(lambda: None, None, chatbot, queue=False)

    # Example prompts that populate the textbox.
    examples = gr.Examples(examples=read_examples(), inputs=[user_input])

if __name__ == "__main__":
    demo.launch(share=True)
final_result.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ openai
2
+ tiktoken
3
+ opencc
4
+ docx2txt
5
+ PyPDF2
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (170 Bytes). View file
 
utils/__pycache__/gpt_processor.cpython-39.pyc ADDED
Binary file (8.69 kB). View file
 
utils/__pycache__/pdf_processor.cpython-39.pyc ADDED
Binary file (809 Bytes). View file
 
utils/docx_processor.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unicodedata
2
+ import re
3
+ import logging
4
+
5
+ import docx2txt
6
+
7
+ from gpt_processor import Translator
8
+
9
class DOCXProcessor:
    """Read a DOCX file and expose its normalised text through ``file_info``.

    Construction does all the work: the document is read with ``docx2txt``,
    whitespace is collapsed, and text that contains no CJK ideograph is
    passed through ``Translator.translate_to_chinese``.

    ``file_info`` keys:
        file_name: Last ``/``-separated component of ``file_path``.
        file_format: Always ``'DOCX'``.
        file_full_content: Processed text; ``''`` if the file could not be
            read.
        is_chinese: Whether the extracted text contained a CJK ideograph;
            ``False`` if the file could not be read.
    """

    def __init__(self, file_path: str) -> None:
        """Process *file_path* immediately; failures are logged, not raised."""
        self.file_path = file_path
        self.file_info = {
            'file_name': self.file_path.split('/')[-1],
            'file_format': 'DOCX',
            'file_full_content': '',
            # Present from the start so consumers can read the flag even
            # when extraction fails part-way.
            'is_chinese': False,
        }
        self.__build_info()

    def __build_info(self) -> None:
        """Extract, normalise and (if needed) translate the document text."""
        try:
            text = docx2txt.process(self.file_path) or ''
            text = unicodedata.normalize("NFKD", text)
            # Flatten line breaks, then collapse runs of spaces.
            text = text.replace('\n', ' ').replace('\r', '')
            text = re.sub(' +', ' ', text)

            is_chinese = self.__is_chinese(text)
            self.file_info['is_chinese'] = is_chinese
            # Nothing to translate when the text is blank.
            if text.strip() and not is_chinese:
                text = Translator().translate_to_chinese(text)
            self.file_info['file_full_content'] = text
        except FileNotFoundError:
            logging.error("File not found: %s", self.file_path)
        except Exception as e:
            # Best-effort boundary: keep the defaults in file_info and record
            # the failure with its traceback.
            logging.exception("An error occurred: %s", e)

    def __is_chinese(self, text: str) -> bool:
        """Return True if *text* has any character in U+4E00..U+9FFF."""
        return any('\u4e00' <= char <= '\u9fff' for char in text)
utils/pdf_processor.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import unicodedata
3
+ import re
4
+ import logging
5
+
6
+ from datamodel.data_model import PDFRawData
7
+ from .gpt_processor import Translator
8
+
9
class PDFProcessor:
    """Read a PDF page by page and expose its text through ``file_info``.

    Construction does all the work: each page is extracted with ``PyPDF2``,
    whitespace is collapsed, and pages that contain no CJK ideograph are
    passed through ``Translator.translate_to_chinese``.

    ``file_info`` keys:
        file_name: Last ``/``-separated component of ``file_path``.
        file_format: Always ``'PDF'``.
        total_pages: Number of pages reported by the reader.
        file_content: ``{page_num: {'page_num': ..., 'page_content': ...}}``
            with 1-based page numbers.
        file_full_content: Concatenation of every processed page's content.
        is_chinese: ``True`` if any page contained a CJK ideograph.
    """

    def __init__(self, file_path: str) -> None:
        """Process *file_path* immediately; failures are logged, not raised."""
        self.file_path = file_path
        self.file_info = {
            'file_name': self.file_path.split('/')[-1],
            'file_format': 'PDF',
            'total_pages': 0,
            'file_content': {},
            'file_full_content': '',
            # Present from the start so consumers can read the flag even
            # when the file is missing, unreadable or has no pages.
            'is_chinese': False,
        }
        self.__build_info()

    def __build_info(self) -> None:
        """Extract, normalise and (if needed) translate every page."""
        translator = None  # created on first use and shared by all pages
        try:
            with open(self.file_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                self.file_info['total_pages'] = len(pdf_reader.pages)
                for page_num, page in enumerate(pdf_reader.pages, start=1):
                    logging.info(f"Processing page {page_num}...")
                    text = page.extract_text() or ''
                    text = unicodedata.normalize("NFKD", text)
                    # Flatten line breaks, then collapse runs of spaces.
                    text = text.replace('\n', ' ').replace('\r', '')
                    text = re.sub(' +', ' ', text)

                    if self.__is_chinese(text):
                        # The file-level flag is sticky: a later page without
                        # Chinese text (e.g. a blank one) must not reset it.
                        self.file_info['is_chinese'] = True
                    elif text.strip():
                        if translator is None:
                            translator = Translator()
                        text = translator.translate_to_chinese(text)

                    self.file_info['file_content'][page_num] = {
                        'page_num': page_num,
                        'page_content': text,
                    }
                    # Appended page by page so that pages handled before a
                    # failure are still kept.
                    self.file_info['file_full_content'] += text
        except FileNotFoundError:
            logging.error("File not found: %s", self.file_path)
        except Exception as e:
            # Best-effort boundary: keep whatever was collected so far and
            # record the failure with its traceback.
            logging.exception("An error occurred: %s", e)

    def __is_chinese(self, text: str) -> bool:
        """Return True if *text* has any character in U+4E00..U+9FFF."""
        return any('\u4e00' <= char <= '\u9fff' for char in text)
utils/work_flow_controller.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from .pdf_processor import PDFProcessor
4
+ from .gpt_processor import Translator, EmbeddingGenerator, KeywordsGenerator, TopicsGenerator
5
+
6
# Maps a file extension (without the dot) to the class that parses that
# format; WorkFlowController looks the processor up here.
processors = dict(pdf=PDFProcessor)
9
+
10
class WorkFlowController():
    """Run the matching file processor on a file and persist the result."""

    def __init__(self, file_path: str, file_name: str) -> None:
        """Parse *file_path* with the processor registered for its extension.

        Args:
            file_path: Path of the file to process; the text after the last
                ``.`` selects the processor (case-insensitively).
            file_name: Base name (without ``.json``) used by
                ``dump_to_json`` for the output file.

        Raises:
            KeyError: If no processor is registered for the extension.
        """
        self.file_name = file_name
        # Lower-cased so "REPORT.PDF" resolves to the same processor as
        # "report.pdf".
        file_format = file_path.rsplit('.', 1)[-1].lower()
        try:
            self.file_processor = processors[file_format]
        except KeyError:
            raise KeyError(f"Unsupported file format: {file_format!r}") from None
        self.file_info = self.file_processor(file_path).file_info

    def process_file(self):
        """Translate the stored full text to Chinese when it is not Chinese.

        Does nothing when ``file_info`` is flagged as Chinese or holds no
        text. The translation targets ``file_info['file_full_content']``;
        the earlier code indexed ``file_info[1]``, a key that none of the
        processors in this package create, so it always raised ``KeyError``.
        """
        if self.file_info.get('is_chinese', False):
            return
        content = self.file_info.get('file_full_content', '')
        if not content:
            return
        # NOTE(review): the processors appear to translate non-Chinese text
        # while building file_info -- confirm this extra pass is still wanted.
        translator = Translator()
        self.file_info['file_full_content'] = translator.translate_to_chinese(content)

    def dump_to_json(self) -> None:
        """Write ``file_info`` to ``<file_name>.json`` as indented UTF-8 JSON."""
        with open(f'{self.file_name}.json', 'w', encoding='utf-8') as f:
            json.dump(self.file_info, f, indent=4, ensure_ascii=False)
29
+
30
+
31
+