enable extraction of coordinates from pdf, using sentences
Changed files:
- document_qa/document_qa_engine.py  +11 -8
- document_qa/grobid_processors.py  +19 -13
- streamlit_app.py  +17 -7
document_qa/document_qa_engine.py
CHANGED

@@ -56,7 +56,7 @@ class DocumentQAEngine:
         grobid_client = GrobidClient(
             grobid_server=self.grobid_url,
             batch_size=1000,
-            coordinates=["p"],
+            coordinates=["s"],
             sleep_time=5,
             timeout=60,
             check_server=True
@@ -104,7 +104,7 @@ class DocumentQAEngine:
         if verbose:
             print(query)
 
-        response = self._run_query(doc_id, query, context_size=context_size)
+        response, coordinates = self._run_query(doc_id, query, context_size=context_size)
         response = response['output_text'] if 'output_text' in response else response
 
         if verbose:
@@ -115,17 +115,17 @@ class DocumentQAEngine:
                 return self._parse_json(response, output_parser), response
             except Exception as oe:
                 print("Failing to parse the response", oe)
-                return None, response
+                return None, response, coordinates
         elif extraction_schema:
             try:
                 chain = create_extraction_chain(extraction_schema, self.llm)
                 parsed = chain.run(response)
-                return parsed, response
+                return parsed, response, coordinates
             except Exception as oe:
                 print("Failing to parse the response", oe)
-                return None, response
+                return None, response, coordinates
         else:
-            return None, response
+            return None, response, coordinates
 
     def query_storage(self, query: str, doc_id, context_size=4):
         documents = self._get_context(doc_id, query, context_size)
@@ -156,12 +156,13 @@ class DocumentQAEngine:
 
     def _run_query(self, doc_id, query, context_size=4):
         relevant_documents = self._get_context(doc_id, query, context_size)
+        relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else [] for doc in relevant_documents]  # filter(lambda d: d['type'] == "sentence", relevant_documents)]
         response = self.chain.run(input_documents=relevant_documents,
                                   question=query)
 
         if self.memory:
             self.memory.save_context({"input": query}, {"output": response})
-        return response
+        return response, relevant_document_coordinates
 
     def _get_context(self, doc_id, query, context_size=4):
         db = self.embeddings_dict[doc_id]
@@ -194,7 +195,8 @@ class DocumentQAEngine:
         if verbose:
             print("File", pdf_file_path)
         filename = Path(pdf_file_path).stem
-        structure = self.grobid_processor.process_structure(pdf_file_path)
+        coordinates = True if chunk_size == -1 else False
+        structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
 
         biblio = structure['biblio']
         biblio['filename'] = filename.replace(" ", "_")
@@ -215,6 +217,7 @@ class DocumentQAEngine:
             biblio_copy['type'] = passage['type']
             biblio_copy['section'] = passage['section']
             biblio_copy['subSection'] = passage['subSection']
+            biblio_copy['coordinates'] = passage['coordinates']
             metadatas.append(biblio_copy)
 
             ids.append(passage['passage_id'])
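For context on the engine change: _run_query now also returns, for each retrieved passage, the list of bounding boxes taken from the passage metadata. A minimal sketch of that shape, with a made-up coordinate string in GROBID's "page,x,y,width,height" encoding (multiple boxes joined by ";"):

    # Hypothetical passage metadata as stored by the indexing code above;
    # the sample coordinate string is illustrative only.
    metadata = {"coordinates": "1,53.8,412.1,240.3,9.4;1,53.8,424.6,198.7,9.4"}

    # Mirrors the comprehension in _run_query: one list of box strings per
    # retrieved document, empty when no coordinates were extracted.
    boxes = metadata["coordinates"].split(";") if "coordinates" in metadata else []
    print(boxes)  # ['1,53.8,412.1,240.3,9.4', '1,53.8,424.6,198.7,9.4']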
document_qa/grobid_processors.py
CHANGED

@@ -131,13 +131,13 @@ class GrobidProcessor(BaseProcessor):
         # super().__init__()
         self.grobid_client = grobid_client
 
-    def process_structure(self, input_path):
+    def process_structure(self, input_path, coordinates=False):
         pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
                                                                 input_path,
                                                                 consolidate_header=True,
                                                                 consolidate_citations=False,
-                                                                segment_sentences=False,
-                                                                tei_coordinates=False,
+                                                                segment_sentences=True,
+                                                                tei_coordinates=coordinates,
                                                                 include_raw_citations=False,
                                                                 include_raw_affiliations=False,
                                                                 generateIDs=True)
@@ -145,7 +145,7 @@ class GrobidProcessor(BaseProcessor):
         if status != 200:
             return
 
-        output_data = self.parse_grobid_xml(text)
+        output_data = self.parse_grobid_xml(text, coordinates=coordinates)
         output_data['filename'] = Path(pdf_file).stem.replace(".tei", "")
 
         return output_data
@@ -159,7 +159,7 @@ class GrobidProcessor(BaseProcessor):
 
         return doc
 
-    def parse_grobid_xml(self, text):
+    def parse_grobid_xml(self, text, coordinates=False):
         output_data = OrderedDict()
 
         doc_biblio = grobid_tei_xml.parse_document_xml(text)
@@ -188,17 +188,20 @@ class GrobidProcessor(BaseProcessor):
         # "passage_id": "title0"
         # })
 
+        passage_type = "sentence" if coordinates else "paragraph"
+
         if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
             passages.append({
                 "text": self.post_process(doc_biblio.abstract),
-                "type": "paragraph",
+                "type": passage_type,
                 "section": "<header>",
                 "subSection": "<abstract>",
-                "passage_id": "abstract0"
+                "passage_id": "abstract0",
+                "coordinates": ""
             })
 
         soup = BeautifulSoup(text, 'xml')
-        text_blocks_body = get_children_body(soup, verbose=False)
+        text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=False)
 
         passages.extend([
             {
@@ -206,10 +209,12 @@ class GrobidProcessor(BaseProcessor):
                     text.parent.name != "ref" or (
                             text.parent.name == "ref" and text.parent.attrs[
                         'type'] != 'bibr'))),
-                "type": "paragraph",
+                "type": passage_type,
                 "section": "<body>",
-                "subSection": "<paragraph>",
-                "passage_id": str(paragraph_id) + str(sentence_id)
+                "subSection": "<sentence>",
+                "passage_id": str(paragraph_id) + str(sentence_id),
+                # "coordinates": sentence['coords'].split(";") if coordinates else []
+                "coordinates": sentence['coords'] if coordinates else ""
             }
             for paragraph_id, paragraph in enumerate(text_blocks_body) for
             sentence_id, sentence in enumerate(paragraph)
@@ -223,10 +228,11 @@ class GrobidProcessor(BaseProcessor):
                     text.parent.name != "ref" or (
                             text.parent.name == "ref" and text.parent.attrs[
                         'type'] != 'bibr'))),
-                "type": "paragraph",
+                "type": passage_type,
                 "section": "<body>",
                 "subSection": "<figure>",
-                "passage_id": str(paragraph_id) + str(sentence_id)
+                "passage_id": str(paragraph_id) + str(sentence_id),
+                "coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
             }
             for paragraph_id, paragraph in enumerate(text_blocks_figures) for
             sentence_id, sentence in enumerate(paragraph)
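To illustrate what parse_grobid_xml now receives: with segment_sentences=True and coordinates enabled, GROBID wraps each sentence in an <s> element whose coords attribute carries the bounding boxes. A small sketch with a made-up TEI fragment (real service output is far larger):

    from bs4 import BeautifulSoup  # requires beautifulsoup4 with an XML parser (lxml)

    # Made-up TEI fragment in the shape GROBID returns when sentence
    # segmentation and coordinates are both enabled.
    tei = """<TEI xmlns="http://www.tei-c.org/ns/1.0"><text><body><p>
    <s coords="1,53.8,412.1,240.3,9.4">First sentence.</s>
    <s coords="1,53.8,424.6,198.7,9.4">Second sentence.</s>
    </p></body></text></TEI>"""

    soup = BeautifulSoup(tei, 'xml')
    for sentence in soup.find_all("s"):
        # sentence.attrs['coords'] is the raw string that ends up in the
        # passage's "coordinates" field above
        print(sentence.text.strip(), "->", sentence.attrs.get("coords", ""))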
streamlit_app.py
CHANGED

@@ -59,6 +59,12 @@ if 'memory' not in st.session_state:
 if 'binary' not in st.session_state:
     st.session_state['binary'] = None
 
+if 'annotations' not in st.session_state:
+    st.session_state['annotations'] = None
+
+if 'pdf' not in st.session_state:
+    st.session_state['pdf'] = None
+
 st.set_page_config(
     page_title="Scientific Document Insights Q/A",
     page_icon="📝",
@@ -290,7 +296,7 @@ with st.sidebar:
     mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
                     help="LLM will respond the question, Embedding will show the "
                          "paragraphs relevant to the question in the paper.")
-    chunk_size = st.slider("Chunks size",
+    chunk_size = st.slider("Chunks size", -1, 2000, value=250,
                            help="Size of chunks in which the document is partitioned",
                            disabled=uploaded_file is not None)
     context_size = st.slider("Context size", 3, 10, value=4,
@@ -320,8 +326,6 @@ with st.sidebar:
     st.markdown(
         """If you switch the mode to "Embedding," the system will return specific chunks from the document that are semantically related to your query. This mode helps to test why sometimes the answers are not satisfying or incomplete. """)
 
-
-
 if uploaded_file and not st.session_state.loaded_embeddings:
     if model not in st.session_state['api_keys']:
         st.error("Before uploading a document, you must enter the API key. ")
@@ -344,8 +348,8 @@ if uploaded_file and not st.session_state.loaded_embeddings:
     # timestamp = datetime.utcnow()
 
     with left_column:
-        if st.session_state['binary']:
-            pdf_viewer(st.session_state['binary'])
+        if st.session_state['annotations']:
+            pdf_viewer(input=st.session_state['binary'], annotations=st.session_state['annotations'])
 
     with right_column:
         # css = '''
@@ -389,8 +393,14 @@ with right_column:
             context_size=context_size)
     elif mode == "LLM":
         with st.spinner("Generating response..."):
-            _, text_response = st.session_state['rqa'][model].query_document(question,
-                                                                             st.session_state.doc_id, context_size=context_size)
+            _, text_response, coordinates = st.session_state['rqa'][model].query_document(question,
+                                                                                          st.session_state.doc_id,
+                                                                                          context_size=context_size)
+            st.session_state['annotations'] = [
+                {"page": coo[0], "x": coo[1], "y": coo[2], "width": coo[3], "height": coo[4], "color": "blue"}
+                for coo in [c.split(",") for coord in coordinates for c in coord]]
+            # with left_column:
+            #     pdf_viewer(input=st.session_state['binary'], annotations=st.session_state['annotations'], key=1)
 
     if not text_response:
         st.error("Something went wrong. Contact Luca Foppiano (Foppiano.Luca@nims.co.jp) to report the issue.")
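The annotation dictionaries built above are the format passed to streamlit-pdf-viewer's annotations parameter, as shown in the diff. A standalone sketch with made-up coordinates, showing what the comprehension produces:

    # `coordinates` as returned by query_document: one list of box strings
    # per retrieved passage (values here are made up).
    coordinates = [["1,53.8,412.1,240.3,9.4", "1,53.8,424.6,198.7,9.4"]]

    # Same comprehension as in the diff: flatten the per-passage lists,
    # split each "page,x,y,width,height" string, and build one annotation
    # per bounding box.
    annotations = [
        {"page": coo[0], "x": coo[1], "y": coo[2],
         "width": coo[3], "height": coo[4], "color": "blue"}
        for coo in [c.split(",") for coord in coordinates for c in coord]]

    print(annotations[0])
    # {'page': '1', 'x': '53.8', 'y': '412.1', 'width': '240.3', 'height': '9.4', 'color': 'blue'}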