anishde committed on
Commit
b7af658
•
1 Parent(s): 25885e2

Upload app.py

Files changed (1)
  1. app.py +317 -0
app.py ADDED
@@ -0,0 +1,317 @@
+ from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader, PDFMinerLoader
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain import HuggingFaceHub
+ from langchain.chains.summarize import load_summarize_chain
+ from langchain.chains.llm_summarization_checker.base import LLMSummarizationCheckerChain
+ from langchain.prompts import PromptTemplate
+ import os
+ import gradio as gr
+ import shutil
+ import re
+ import tempfile
+ from pathlib import Path
+ 
+ api_token = os.environ["api"]
+ os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
+ 
+ # Colab-only variant, kept for reference:
+ # api = userdata.get('api')
+ # api_token = api
+ # os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
+ 
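+ # Note: this Space expects a secret named "api" that holds a Hugging Face API token.
+ # It is exported as HUGGINGFACEHUB_API_TOKEN and also passed explicitly to
+ # HuggingFaceHub below via huggingfacehub_api_token.
+ 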
+ temp_dir = "/content/sample_data"
+ 
+ # file_path_dummy = "/content/2401.10231.pdf"
+ # if file_path_dummy.lower().endswith(".pdf"):
+ #     loader = TextLoader(file_path_dummy)
+ #     document = loader.load()
+ #     print(document)
+ 
+ def data_ingestion(file_path):
+     """Load a PDF, TXT, or DOCX file and split it into chunks for summarization."""
+     if not os.path.exists(file_path):
+         raise ValueError(f"File path {file_path} does not exist.")
+ 
+     path = Path(file_path)
+     file_ext = path.suffix
+ 
+     # Pick a loader based on the file extension.
+     if file_path.lower().endswith(".pdf"):
+         loader = PDFMinerLoader(file_path)
+     elif file_path.lower().endswith(".txt"):
+         loader = TextLoader(file_path)
+     else:
+         loader = Docx2txtLoader(file_path)
+ 
+     document = loader.load()
+ 
+     length = len(document[0].page_content)
+ 
+     # RecursiveCharacterTextSplitter expects an integer chunk_size; each chunk is
+     # roughly 3% of the document, so the text splits into a few dozen chunks.
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=int(0.03 * length), chunk_overlap=0)
+     split_docs = text_splitter.split_documents(document)
+ 
+     # Defined here but not used by the summarization chains below.
+     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
+ 
+     llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
+                          model_kwargs={"temperature": 1, "max_length": 10000},
+                          huggingfacehub_api_token=api_token)
+ 
+     return split_docs
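+ 
+ # Illustrative usage (hypothetical path, not part of the app flow):
+ #   split_docs = data_ingestion("/content/sample_data/example.pdf")
+ #   print(len(split_docs), "chunks")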
+ 
+ # Earlier splitting experiments, kept for reference:
+ # text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
+ #     chunk_size=2000, chunk_overlap=0
+ # )
+ # split_docs = text_splitter.split_documents(document)
+ 
+ # documents = split_text_into_batches(str(document), 400)
+ # len(documents)
+ # documents[0]
+ #
+ # from langchain.text_splitter import CharacterTextSplitter
+ # text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
+ # documents = text_splitter.split_documents(document)
+ # Embeddings
+ 
+ # from langchain.chains.question_answering import load_qa_chain
+ 
+ ########## CHAIN 1: general study material
+ 
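+ # chain1, chain2, and chain3 below are all "refine" summarization chains over
+ # mistralai/Mixtral-8x7B-Instruct-v0.1; only the question_prompt wording differs
+ # by document type (study material, legal document, research paper).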
+ def chain1():
+     """Refine-style summarization chain for general study material."""
+     prompt_template = """Please provide a summary of the given study material. Summarize the key concepts, findings, and important details.
+     Pay special attention to any definitions, theories, or conclusions presented in the text.
+     Your summary should be concise yet comprehensive, capturing the main points of the study material.
+     Your job is to write a summary of the document such that every summary of the text is of 2 sentences.
+     Here is the content of the section:
+     "{text}"
+ 
+     SUMMARY:"""
+     prompt = PromptTemplate.from_template(prompt_template)
+ 
+     refine_template = (
+         "Your job is to produce a final summary.\n"
+         # "We have provided an existing summary up to a certain point: {existing_answer}\n"
+         "We have the opportunity to refine the existing summary "
+         "(only if needed) with some more context below.\n"
+         "------------\n"
+         "{text}\n"
+         "------------\n"
+         "Given the new context, refine the original summary in English. "
+         "If the context isn't useful, return the original summary."
+     )
+ 
+     refine_prompt = PromptTemplate.from_template(refine_template)
+     chain1 = load_summarize_chain(
+         llm=HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
+                            model_kwargs={"temperature": 1, "max_length": 10000},
+                            huggingfacehub_api_token=api_token),
+         chain_type="refine",
+         question_prompt=prompt,
+         # refine_prompt=refine_prompt,
+         return_intermediate_steps=False,
+         input_key="input_documents",
+         output_key="output_text",
+     )
+     return chain1
+ 
+ # result = chain({"input_documents": split_docs}, return_only_outputs=True)
+ 
+ ########## CHAIN 2: legal documents
+ 
+ def chain2():
+     """Refine-style summarization chain for legal documents."""
+     prompt_template = """Summarize the provided legal document. Highlight key legal arguments, decisions, and any crucial precedents mentioned.
+     Include a concise overview of the case or legal matter, focusing on the most significant details.
+     Ensure the summary captures the essential legal aspects and implications of the document.
+     Your job is to write a summary of the document such that every summary of the text is of 2 sentences.
+     Here is the content of the section:
+     "{text}"
+ 
+     SUMMARY:"""
+     prompt = PromptTemplate.from_template(prompt_template)
+ 
+     refine_template = (
+         "Your job is to produce a final summary.\n"
+         # "We have provided an existing summary up to a certain point: {existing_answer}\n"
+         "We have the opportunity to refine the existing summary "
+         "(only if needed) with some more context below.\n"
+         "------------\n"
+         "{text}\n"
+         "------------\n"
+         "Given the new context, refine the original summary in English. "
+         "If the context isn't useful, return the original summary."
+     )
+ 
+     refine_prompt = PromptTemplate.from_template(refine_template)
+     chain2 = load_summarize_chain(
+         llm=HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
+                            model_kwargs={"temperature": 1, "max_length": 10000},
+                            huggingfacehub_api_token=api_token),
+         chain_type="refine",
+         question_prompt=prompt,
+         # refine_prompt=refine_prompt,
+         return_intermediate_steps=False,
+         input_key="input_documents",
+         output_key="output_text",
+     )
+     return chain2
+ 
+ # result = chain({"input_documents": split_docs}, return_only_outputs=True)
+ 
+ ########## CHAIN 3: arXiv research papers
+ 
+ def chain3():
+     """Refine-style summarization chain for arXiv research papers."""
+     prompt_template = """You are being given a markdown document with headers; this is part of a larger arXiv paper.
+     Provide a summary of the given research paper. Capture the main research question, methodology, key findings, and any novel contributions made by the paper.
+     Emphasize the significance of the research and its potential implications.
+     The summary should be succinct while conveying the essential information presented in the research paper.
+     Your job is to write a summary of the document such that every summary of the text is of 2 sentences.
+     Here is the content of the section:
+     "{text}"
+ 
+     SUMMARY:"""
+     prompt = PromptTemplate.from_template(prompt_template)
+ 
+     refine_template = ("""You are presented with a collection of text snippets. Each snippet is a summary of a specific section from an academic paper published on arXiv. Your objective is to synthesize these snippets into a coherent, concise summary of the entire paper.
+ 
+     DOCUMENT SNIPPETS:
+     "{text}"
+ 
+     INSTRUCTIONS: Craft a concise summary below, capturing the essence of the paper based on the provided snippets.
+     It is also important that you highlight the key contributions of the paper, and 3 key takeaways from the paper.
+     Lastly, provide a list of 5 questions that you would ask the author of the paper if you had the chance. Remove all newline characters (\\n).
+     SUMMARY:
+     """)
+ 
+     refine_prompt = PromptTemplate.from_template(refine_template)
+     chain3 = load_summarize_chain(
+         llm=HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
+                            model_kwargs={"temperature": 1, "max_length": 10000},
+                            huggingfacehub_api_token=api_token),
+         chain_type="refine",
+         question_prompt=prompt,
+         # refine_prompt=refine_prompt,
+         return_intermediate_steps=False,
+         input_key="input_documents",
+         output_key="output_text",
+     )
+     return chain3
+ 
+ # result = chain({"input_documents": split_docs}, return_only_outputs=True)
+ # chain.run(document)
+ # print(result["output_text"])
+ 
+ def chain_function(checkbox_values):
+     """Return the summarization chain matching the selected document type."""
+     if "Research Paper" in checkbox_values:
+         output = chain3()
+     elif "Legal Document" in checkbox_values:
+         output = chain2()
+     elif "Study Material" in checkbox_values:
+         output = chain1()
+     else:
+         output = "Please select a document type to run."
+     return output
+ 
+ def result(chain, split_docs):
+     """Run the chain on each chunk and concatenate the per-chunk summaries."""
+     summaries = []
+     for doc in split_docs:
+         res = chain({"input_documents": [doc]})
+         # res = chain({"input_documents": [doc]}, return_only_outputs=True)
+         summaries.append(res["output_text"])
+     text_concat = ""
+     for i in summaries:
+         text_concat += i
+     # output = re.sub(r'\n', " ", text_concat)
+     return text_concat
+ 
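+ # End-to-end sketch (illustrative only; the path and selection are hypothetical):
+ #   docs = data_ingestion("/content/sample_data/notes.pdf")
+ #   chain = chain_function(["Study Material"])
+ #   print(result(chain, docs))
+ 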
+ title = """<p style="font-family:Century Gothic; text-align:center; font-size: 100px">S I M P L I F Y</p>"""
+ 
+ # description = r"""<p style="font-family: Century Gothic; text-align:center; font-size: 100px">S I M P L I F Y</p>
+ # """
+ 
+ # def process_file(file_obj):
+ #     destination_path = "/content/sample_data"  # Replace with your desired path
+ #     shutil.copy(file_obj, destination_path)    # Save file to the specified path
+ #     return os.path.join(destination_path, file_obj)
+ def process_file(list_file_obj):
+     """Return the local path of the first uploaded file."""
+     # list_file_path = [x.name for x in list_file_obj if x is not None]
+     # file_content = file_obj.data
+     # with tempfile.TemporaryFile() as temp_file:
+     #     temp_file.write(file_content)
+     #     temp_file_path = temp_file.name
+     return list_file_obj[0].name
+ 
+ def inference(checkbox_values, uploaded_file):
+     """Gradio callback: ingest the uploaded file and summarize it with the selected chain."""
+     file_path = process_file(uploaded_file)
+     split_docs = data_ingestion(file_path)
+     chain = chain_function(checkbox_values)
+     if isinstance(chain, str):
+         # No document type selected; chain_function returned a message instead of a chain.
+         return chain
+     summary = result(chain, split_docs)
+     return summary
+ 
+ with gr.Blocks(theme="monochrome") as demo:
+     gr.Markdown(title)
+ 
+     with gr.Row():
+         with gr.Column():
+             checkbox_values = gr.CheckboxGroup(["Research Paper", "Legal Document", "Study Material"], label="Choose the document type")
+             uploaded_file = gr.Files(height=100, file_count="multiple", file_types=["text", ".docx", "pdf"], interactive=True, label="Upload your file.")
+             btn = gr.Button("Submit")  # Placed below the inputs so the column stacks vertically
+         with gr.Column():
+             txt = gr.Textbox(
+                 show_label=False,
+                 # placeholder="Simplify."
+             )
+ 
+     btn.click(
+         fn=inference,
+         inputs=[checkbox_values, uploaded_file],
+         outputs=[txt],
+         queue=False
+     )
+ 
+ if __name__ == "__main__":
+     demo.queue()
+     demo.launch(debug=True)