anishde committed on
Commit
398274c
•
1 Parent(s): 4865e8f

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +308 -0
  2. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re
import shutil
import tempfile
from pathlib import Path

import cache
import gradio as gr
from langchain import HuggingFaceHub
from langchain.chains.llm_summarization_checker.base import LLMSummarizationCheckerChain
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PDFMinerLoader
from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# google.colab is only importable inside a Colab runtime. Guard the import so
# the app still starts elsewhere; `userdata` is None when it is unavailable.
try:
    from google.colab import userdata
except ImportError:
    userdata = None
18
+
19
# The Hugging Face API token is read from the `api` environment variable.
# A missing variable raises KeyError at import time.
api_token = os.environ['api']
# Export the token under the correctly spelled key. The original wrote to
# "HUGGINFACEHUB_API_TOKEN" (missing "G"), which nothing reads.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token

# Colab alternative (needs google.colab.userdata):
# api = userdata.get('api')
# api_token = api
# os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token

# NOTE(review): not referenced by any visible function - confirm before removing.
temp_dir = "/content/sample_data"
28
+
29
+ # file_path_dummy = "/content/2401.10231.pdf"
30
+ # if file_path_dummy.lower().endswith(".pdf") :
31
+ # loader = TextLoader(file_path_dummy)
32
+ # document= loader.load()
33
+ # print(document)
34
+
35
def data_ingestion(file_path):
    """Load a document from disk and split it into chunks.

    The loader is chosen from the (case-insensitive) file extension:
    ``.pdf`` -> ``PDFMinerLoader``, ``.txt`` -> ``TextLoader``, anything
    else -> ``Docx2txtLoader``.  The chunk size is 3% of the character
    count of the first loaded document, with no overlap between chunks.

    Args:
        file_path: Path of the file to load.

    Returns:
        The list of documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``, or an empty
        list when the loader yields no documents.

    Raises:
        ValueError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise ValueError(f"File path {file_path} does not exist.")

    lowered = os.fspath(file_path).lower()
    if lowered.endswith(".pdf"):
        loader = PDFMinerLoader(file_path)
    elif lowered.endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        loader = Docx2txtLoader(file_path)

    document = loader.load()
    if not document:
        # Nothing was extracted, so there is nothing to split.
        return []

    length = len(document[0].page_content)
    # The splitter expects a positive integer size; 0.03 * length is a float
    # and collapses to 0 for empty text, so truncate and clamp to at least 1.
    chunk_size = max(1, int(0.03 * length))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=0
    )
    return text_splitter.split_documents(document)
73
+
74
+ # text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
75
+ # chunk_size=2000, chunk_overlap=0
76
+ # )
77
+ # split_docs = text_splitter.split_documents(document)
78
+
79
+ # documents=split_text_into_batches(str(document),400)
80
+ # len(documents)
81
+ # documents[0]
82
+ # #
83
+ # from langchain.text_splitter import CharacterTextSplitter
84
+ # text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
85
+ # documents = text_splitter.split_documents(document)
86
+ # Embeddings
87
+
88
+ # from langchain.chains.question_answering import load_qa_chain
89
+
90
+ ########## CHAIN 1 norm text
91
+
92
def chain1():
    """Build the summarisation chain selected for "Study Material".

    Creates a ``HuggingFaceHub`` client for
    ``mistralai/Mixtral-8x7B-Instruct-v0.1`` and wraps it in a LangChain
    "refine" summarize chain whose question prompt asks for a two-sentence
    summary of each section.  No custom ``refine_prompt`` is passed, so the
    refine step uses whatever ``load_summarize_chain`` provides by default.

    Returns:
        The chain returned by ``load_summarize_chain``; it takes
        ``{"input_documents": [...]}`` and yields its summary under the
        ``"output_text"`` key.
    """
    prompt_template = """Your job is to write a summary of the document such that every summary of the text is of 2 sentences
here is the content of the section:
"{text}"

SUMMARY:"""
    prompt = PromptTemplate.from_template(prompt_template)

    llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 1, "max_length": 10000},
        huggingfacehub_api_token=api_token,
    )
    return load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
124
+
125
+ # result = chain({"input_documents":split_docs}, return_only_outputs=True)
126
+
127
+ ########## CHAIN 2 research paper
128
+
129
def chain2():
    """Build the summarisation chain selected for "Legal Document".

    Creates a ``HuggingFaceHub`` client for
    ``mistralai/Mixtral-8x7B-Instruct-v0.1`` and wraps it in a LangChain
    "refine" summarize chain whose question prompt asks for a two-sentence
    summary of each section.  No custom ``refine_prompt`` is passed, so the
    refine step uses whatever ``load_summarize_chain`` provides by default.

    Returns:
        The chain returned by ``load_summarize_chain``; it takes
        ``{"input_documents": [...]}`` and yields its summary under the
        ``"output_text"`` key.
    """
    prompt_template = """Your job is to write a summary of the document such that every summary of the text is of 2 sentences
here is the content of the section:
"{text}"

SUMMARY:"""
    prompt = PromptTemplate.from_template(prompt_template)

    llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 1, "max_length": 10000},
        huggingfacehub_api_token=api_token,
    )
    return load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
161
+
162
+ # result = chain({"input_documents":split_docs}, return_only_outputs=True)
163
+
164
+ ########## CHAIN 3 arxiv_paper_1
165
+
166
def chain3():
    """Build the summarisation chain selected for "Research Paper".

    The question prompt presents each chunk as a markdown section of an
    arXiv paper and asks for a two-sentence summary.  The chain is a
    LangChain "refine" summarize chain backed by
    ``mistralai/Mixtral-8x7B-Instruct-v0.1`` on the Hugging Face Hub.

    Returns:
        The chain returned by ``load_summarize_chain``; it takes
        ``{"input_documents": [...]}`` and yields its summary under the
        ``"output_text"`` key.
    """
    section_template = """You are being given a markdown document with headers, this is part of a larger arxiv paper. Your job is to write a summary of the document such that every summary of the text is of 2 sentences
here is the content of the section:
"{text}"

SUMMARY:"""
    section_prompt = PromptTemplate.from_template(section_template)

    synthesis_template = """You are presented with a collection of text snippets. Each snippet is a summary of a specific section from an academic paper published on arXiv. Your objective is to synthesize these snippets into a coherent, concise summary of the entire paper.

DOCUMENT SNIPPETS:
"{text}"

INSTRUCTIONS: Craft a concise summary below, capturing the essence of the paper based on the provided snippets.
It is also important that you highlight the key contributions of the paper, and 3 key takeaways from the paper.
Lastly you should provide a list of 5 questions that you would ask the author of the paper if you had the chance. Remove all the backslash n (\n)
SUMMARY:
"""
    # NOTE: this prompt is built but deliberately not handed to the chain;
    # the refine_prompt argument below is still disabled.
    refine_prompt = PromptTemplate.from_template(synthesis_template)

    summarize_chain = load_summarize_chain(
        llm=HuggingFaceHub(
            repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
            model_kwargs={"temperature": 1, "max_length": 10000},
            huggingfacehub_api_token=api_token,
        ),
        chain_type="refine",
        question_prompt=section_prompt,
        # refine_prompt=refine_prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
    return summarize_chain
199
+ # result = chain({"input_documents":split_docs}, return_only_outputs=True)
200
+ # chain.run(document)
201
+ # print(result["output_text"])
202
+
203
def chain_function(checkbox_values):
    """Pick the summarisation chain matching the selected document type.

    The options are checked in priority order: "Research Paper", then
    "Legal Document", then "Study Material".  The first one present in
    ``checkbox_values`` decides which chain is built.

    Args:
        checkbox_values: Collection of selected document-type labels.

    Returns:
        The chain built by ``chain3()``, ``chain2()`` or ``chain1()``, or
        the string "Please select a document type to run." when none of
        the known labels is selected.
    """
    if "Research Paper" in checkbox_values:
        return chain3()
    if "Legal Document" in checkbox_values:
        return chain2()
    if "Study Material" in checkbox_values:
        return chain1()
    return "Please select a document type to run."
213
+
214
def result(chain, split_docs):
    """Summarise every chunk with ``chain`` and concatenate the summaries.

    Each chunk is sent through the chain on its own, as a one-element
    ``input_documents`` list.  The ``"output_text"`` values are joined in
    order with no separator.

    Args:
        chain: A callable taking ``{"input_documents": [doc]}`` and returning
            a mapping with an ``"output_text"`` entry.  ``chain_function``
            returns a plain message string when no document type is
            selected; that message is returned unchanged instead of being
            called.
        split_docs: Iterable of document chunks to summarise.

    Returns:
        The concatenated summaries, or the message itself when ``chain`` is
        not callable.
    """
    if not callable(chain):
        # Show the "please select a type" message rather than raising
        # TypeError by calling a string.
        return str(chain)

    return "".join(
        chain({"input_documents": [doc]})["output_text"] for doc in split_docs
    )
225
+
226
# HTML heading rendered at the top of the page through gr.Markdown.
title = (
    '<p style="font-family:Century Gothic; text-align:center; font-size: 100px">'
    'S I M P L I F Y</p>'
)
227
+
228
+ # description = r"""<p style="font-family: Century Gothic; text-align:center; font-size: 100px">S I M P L I F Y</p>
229
+ # """
230
+
231
+ # article = r"""
232
+ # If PhotoMaker is helpful, please help to ⭐ the <a href='https://github.com/TencentARC/PhotoMaker' target='_blank'>Github Repo</a>. Thanks!
233
+ # [![GitHub Stars](https://img.shields.io/github/stars/TencentARC/PhotoMaker?style=social)](https://github.com/TencentARC/PhotoMaker)
234
+ # ---
235
+ # 📝 **Citation**
236
+ # <br>
237
+ # If our work is useful for your research, please consider citing:
238
+ # ```bibtex
239
+ # @article{li2023photomaker,
240
+ # title={PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding},
241
+ # author={Li, Zhen and Cao, Mingdeng and Wang, Xintao and Qi, Zhongang and Cheng, Ming-Ming and Shan, Ying},
242
+ # booktitle={arXiv preprint arxiv:2312.04461},
243
+ # year={2023}
244
+ # }
245
+ # ```
246
+ # 📋 **License**
247
+ # <br>
248
+ # Apache-2.0 LICENSE. Please refer to the [LICENSE file](https://huggingface.co/TencentARC/PhotoMaker/blob/main/LICENSE) for details.
249
+ # 📧 **Contact**
250
+ # <br>
251
+ # If you have any questions, please feel free to reach me out at <b>zhenli1031@gmail.com</b>.
252
+ # """
253
+
254
+ # tips = r"""
255
+ # ### Usage tips of PhotoMaker
256
+ # 1. Upload more photos of the person to be customized to **improve ID fidelty**. If the input is Asian face(s), maybe consider adding 'asian' before the class word, e.g., `asian woman img`
257
+ # 2. When stylizing, does the generated face look too realistic? Adjust the **Style strength** to 30-50, the larger the number, the less ID fidelty, but the stylization ability will be better.
258
+ # 3. If you want to generate realistic photos, you could try switching to our other gradio application [PhotoMaker](https://huggingface.co/spaces/TencentARC/PhotoMaker).
259
+ # 4. For **faster** speed, reduce the number of generated images and sampling steps. However, please note that reducing the sampling steps may compromise the ID fidelity.
260
+ # """
261
+
262
+ # def process_file(file_obj):
263
+ # destination_path = "/content/sample_data" # Replace with your desired path
264
+ # shutil.copy(file_obj, destination_path) # Save file to specified path
265
+ # return os.path.join(destination_path, file_obj)
266
def process_file(list_file_obj):
    """Return the filesystem path of the first uploaded file.

    Only the first entry is used; any further uploads are ignored.

    Args:
        list_file_obj: Sequence of uploads from the file component.  Each
            entry is either an object exposing the path as ``.name`` or a
            plain path string.

    Returns:
        The path of the first upload.

    Raises:
        ValueError: If nothing was uploaded (``None`` or an empty sequence).
    """
    if not list_file_obj:
        # Without this guard an empty submission fails with an opaque
        # TypeError/IndexError on the subscript below.
        raise ValueError("Please upload a file to summarise.")

    first = list_file_obj[0]
    # Fall back to the entry itself when it is already a plain path.
    return getattr(first, "name", first)
273
+
274
def inference(checkbox_values, uploaded_file):
    """Run the full upload -> split -> summarise pipeline for the UI.

    Args:
        checkbox_values: Selected document-type labels.
        uploaded_file: Upload(s) received from the file component.

    Returns:
        Whatever ``result`` returns for the chosen chain and the chunks of
        the first uploaded file.
    """
    # Resolve and split the upload before building the chain, as before.
    chunks = data_ingestion(process_file(uploaded_file))
    return result(chain_function(checkbox_values), chunks)
280
+
281
def main():
    """Build the Gradio interface and return it without launching.

    Layout: the heading, then a row with two columns.  The left column
    holds the document-type checkboxes, the file upload and the submit
    button; the right column holds the output text box.  Clicking the
    button calls ``inference`` with the checkbox selection and the uploads
    and writes the returned text into the text box.

    Returns:
        The assembled ``gr.Blocks`` app.
    """
    with gr.Blocks(theme="monochrome") as demo:
        gr.Markdown(title)

        with gr.Row():
            with gr.Column():
                checkbox_values = gr.CheckboxGroup(
                    ["Research Paper", "Legal Document", "Study Material"],
                    label="Choose the document type",
                )
                uploaded_file = gr.Files(
                    height=100,
                    file_count="multiple",
                    # Extensions need a leading dot; the original bare "pdf"
                    # was neither a category nor an extension.
                    file_types=["text", ".docx", ".pdf"],
                    interactive=True,
                    label="Upload your File.",
                )
                # Kept in the input column so it sits below the upload box.
                btn = gr.Button("Submit")
            with gr.Column():
                txt = gr.Textbox(
                    show_label=False,
                    # placeholder="Simplify."
                )

        # Event listeners must be registered inside the Blocks context.
        btn.click(
            fn=inference,
            inputs=[checkbox_values, uploaded_file],
            outputs=[txt],
            queue=False,
        )

    return demo


if __name__ == "__main__":
    # The original tested `__init__` (undefined) and launched a `demo`
    # that only existed inside main().
    main().launch(debug=True)
308
+
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ huggingface
2
+ langchain
3
+ sentence_transformers
4
+ transformers
5
+ torch
6
+ tensorflow
7
+ gradio
8
+ pdfminer.six
9
+ cache
10
+ docx2txt