use paragraphs instead of sentences
- document_qa/document_qa_engine.py +1 -1
- document_qa/grobid_processors.py +86 -37
- requirements.txt +1 -1
- streamlit_app.py +4 -3
- tests/__init__.py +0 -0
- tests/conftest.py +37 -0
- tests/resources/2312.07559.paragraphs.tei.xml +0 -0
- tests/resources/2312.07559.sentences.tei.xml +0 -0
- tests/test_grobid_processors.py +20 -0
document_qa/document_qa_engine.py
CHANGED
@@ -56,7 +56,7 @@ class DocumentQAEngine:
         grobid_client = GrobidClient(
             grobid_server=self.grobid_url,
             batch_size=1000,
-            coordinates=["
+            coordinates=["p"],
             sleep_time=5,
             timeout=60,
             check_server=True
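
Setting coordinates=["p"] asks GROBID to attach layout coordinates to paragraph (<p>) elements of the returned TEI rather than to sentence (<s>) elements. A minimal sketch of the attribute this relies on; the "page,x,y,width,height" layout is an assumption taken from how the box_to_dict helper added later in this PR parses it, and the TEI fragment is made up:

from bs4 import BeautifulSoup

# Hypothetical, heavily simplified TEI fragment; real GROBID output is namespaced and much larger.
tei = '<TEI><text><body><p coords="2,88.2,540.7,205.3,9.3">Some paragraph.</p></body></text></TEI>'
soup = BeautifulSoup(tei, 'xml')
print(soup.find('p')['coords'])  # "2,88.2,540.7,205.3,9.3" -> page, x, y, width, height
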
document_qa/grobid_processors.py
CHANGED
@@ -136,7 +136,7 @@ class GrobidProcessor(BaseProcessor):
             input_path,
             consolidate_header=True,
             consolidate_citations=False,
-            segment_sentences=
+            segment_sentences=False,
             tei_coordinates=coordinates,
             include_raw_citations=False,
             include_raw_affiliations=False,

@@ -188,7 +188,7 @@ class GrobidProcessor(BaseProcessor):
         # "passage_id": "title0"
         # })

-        passage_type = "
+        passage_type = "paragraph"

         if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
             passages.append({

@@ -201,42 +201,74 @@
             })

         soup = BeautifulSoup(text, 'xml')
-        text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=
-        [further removed lines truncated in the rendered diff view]
+        text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=True)
+
+        use_paragraphs = True
+        if not use_paragraphs:
+            passages.extend([
+                {
+                    "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
+                                                      text.parent.name != "ref" or (
+                                                          text.parent.name == "ref" and text.parent.attrs[
+                                                              'type'] != 'bibr'))),
+                    "type": passage_type,
+                    "section": "<body>",
+                    "subSection": "<paragraph>",
+                    "passage_id": str(paragraph_id),
+                    "coordinates": paragraph['coords'] if coordinates and sentence.has_attr('coords') else ""
+                }
+                for paragraph_id, paragraph in enumerate(text_blocks_body) for
+                sentence_id, sentence in enumerate(paragraph)
+            ])
+        else:
+            passages.extend([
+                {
+                    "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
+                                                      text.parent.name != "ref" or (
+                                                          text.parent.name == "ref" and text.parent.attrs[
+                                                              'type'] != 'bibr'))),
+                    "type": passage_type,
+                    "section": "<body>",
+                    "subSection": "<paragraph>",
+                    "passage_id": str(paragraph_id),
+                    "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
+                }
+                for paragraph_id, paragraph in enumerate(text_blocks_body)
+            ])

         text_blocks_figures = get_children_figures(soup, verbose=False)

-        [further removed lines truncated in the rendered diff view]
+        if not use_paragraphs:
+            passages.extend([
+                {
+                    "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
+                                                      text.parent.name != "ref" or (
+                                                          text.parent.name == "ref" and text.parent.attrs[
+                                                              'type'] != 'bibr'))),
+                    "type": passage_type,
+                    "section": "<body>",
+                    "subSection": "<figure>",
+                    "passage_id": str(paragraph_id) + str(sentence_id),
+                    "coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
+                }
+                for paragraph_id, paragraph in enumerate(text_blocks_figures) for
+                sentence_id, sentence in enumerate(paragraph)
+            ])
+        else:
+            passages.extend([
+                {
+                    "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
+                                                      text.parent.name != "ref" or (
+                                                          text.parent.name == "ref" and text.parent.attrs[
+                                                              'type'] != 'bibr'))),
+                    "type": passage_type,
+                    "section": "<body>",
+                    "subSection": "<figure>",
+                    "passage_id": str(paragraph_id),
+                    "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
+                }
+                for paragraph_id, paragraph in enumerate(text_blocks_figures)
+            ])

         return output_data
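
Note that use_paragraphs is hard-coded to True just above, so only the paragraph branches run; the sentence branches remain for the sentence-level variant. The passage text itself is built by joining a block's text nodes while dropping bibliographic reference markers (<ref type="bibr">). A standalone sketch of that filter on a made-up fragment (it uses attrs.get instead of the direct attrs['type'] lookup so a ref without a type attribute does not raise):

from bs4 import BeautifulSoup

p = BeautifulSoup('<p>Results improve <ref type="bibr">[12]</ref> over the baseline '
                  '(see <ref type="figure">Fig. 1</ref>).</p>', 'xml').p
# keep every text node unless its parent is a bibliographic <ref>
clean = ''.join(t for t in p.find_all(text=True)
                if t.parent.name != "ref" or t.parent.attrs.get('type') != 'bibr')
print(clean)  # "Results improve  over the baseline (see Fig. 1)."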

@@ -532,6 +564,21 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
     def extract_materials(self, text):
         return self.gmp.extract_materials(text)

+    @staticmethod
+    def box_to_dict(box, color=None, type=None):
+
+        if box is None or box == "" or len(box) < 5:
+            return {}
+
+        item = {"page": box[0], "x": box[1], "y": box[2], "width": box[3], "height": box[4]}
+        if color is not None:
+            item['color'] = color
+
+        if type:
+            item['type'] = type
+
+        return item
+
     @staticmethod
     def prune_overlapping_annotations(entities: list) -> list:
         # Sorting by offsets
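
box_to_dict turns one GROBID coordinate tuple into the annotation dict the UI consumes; the values stay strings because the method does not cast them. A usage sketch with a made-up coords string and type label:

from document_qa.grobid_processors import GrobidAggregationProcessor

box = "2,88.2,540.7,205.3,9.3".split(",")  # page, x, y, width, height
annotation = GrobidAggregationProcessor.box_to_dict(box, color="#ff0000", type="paragraph")
# {'page': '2', 'x': '88.2', 'y': '540.7', 'width': '205.3', 'height': '9.3',
#  'color': '#ff0000', 'type': 'paragraph'}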

@@ -742,7 +789,8 @@ def get_children_body(soup: object, use_paragraphs: object = True, verbose: obje
     child_name = "p" if use_paragraphs else "s"
     for child in soup.TEI.children:
         if child.name == 'text':
-            children.extend(
+            children.extend(
+                [subchild for subchild in child.find_all("body") for subchild in subchild.find_all(child_name)])

     if verbose:
         print(str(children))

@@ -755,7 +803,8 @@ def get_children_figures(soup: object, use_paragraphs: object = True, verbose: o
     child_name = "p" if use_paragraphs else "s"
     for child in soup.TEI.children:
         if child.name == 'text':
-            children.extend(
+            children.extend(
+                [subchild for subchilds in child.find_all("body") for subchild in subchilds.find_all("figDesc")])

     if verbose:
         print(str(children))
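
get_children_body walks TEI > text > body and collects either <p> or <s> elements, so the same document can be read at paragraph or sentence granularity. A small sketch against a toy TEI string (not one of the bundled test resources), assuming document_qa is importable from the repository root:

from bs4 import BeautifulSoup
from document_qa.grobid_processors import get_children_body

tei = '''<TEI xmlns="http://www.tei-c.org/ns/1.0"><text><body>
  <div><p><s>First sentence.</s><s>Second sentence.</s></p></div>
  <div><p><s>Third sentence.</s></p></div>
</body></text></TEI>'''
soup = BeautifulSoup(tei, 'xml')
print(len(get_children_body(soup, use_paragraphs=True)))   # 2 -> <p> blocks
print(len(get_children_body(soup, use_paragraphs=False)))  # 3 -> <s> blocks
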
requirements.txt
CHANGED
@@ -7,7 +7,7 @@ grobid_tei_xml==0.1.3
 tqdm
 pyyaml==6.0
 pytest
-streamlit==1.
+streamlit==1.29.0
 lxml
 Beautifulsoup4
 python-dotenv

streamlit_app.py
CHANGED
@@ -296,7 +296,7 @@ with st.sidebar:
     mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
                     help="LLM will respond the question, Embedding will show the "
                          "paragraphs relevant to the question in the paper.")
-    chunk_size = st.slider("Chunks size", -1, 2000, value
+    chunk_size = st.slider("Chunks size", -1, 2000, value=-1,
                            help="Size of chunks in which the document is partitioned",
                            disabled=uploaded_file is not None)
     context_size = st.slider("Context size", 3, 10, value=4,

@@ -410,8 +410,9 @@ with right_column:
                                                    st.session_state.doc_id,
                                                    context_size=context_size)
             annotations = [
-                [removed line truncated in the rendered diff view]
-                coordinates for c in coord]
+                GrobidAggregationProcessor.box_to_dict(coo) for coo in [c.split(",") for coord in
+                                                                        coordinates for c in coord]
+            ]
             gradients = generate_color_gradient(len(annotations))
             for i, color in enumerate(gradients):
                 annotations[i]['color'] = color
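
In the app, coordinates is a nested list of coords strings (presumably one inner list per retrieved passage), so the comprehension flattens it and hands each split tuple to box_to_dict. A sketch with made-up values:

from document_qa.grobid_processors import GrobidAggregationProcessor

coordinates = [["1,100.0,200.0,300.0,10.0", "1,100.0,215.0,280.0,10.0"],
               ["2,90.0,500.0,310.0,10.0"]]
annotations = [GrobidAggregationProcessor.box_to_dict(coo)
               for coo in [c.split(",") for coord in coordinates for c in coord]]
print(len(annotations))  # 3, one dict per coords string
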
tests/__init__.py
ADDED
File without changes
tests/conftest.py
ADDED
@@ -0,0 +1,37 @@
+import logging
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+from _pytest._py.path import LocalPath
+
+# derived from https://github.com/elifesciences/sciencebeam-trainer-delft/tree/develop/tests
+
+LOGGER = logging.getLogger(__name__)
+
+
+@pytest.fixture(scope='session', autouse=True)
+def setup_logging():
+    logging.root.handlers = []
+    logging.basicConfig(level='INFO')
+    logging.getLogger('tests').setLevel('DEBUG')
+    # logging.getLogger('sciencebeam_trainer_delft').setLevel('DEBUG')
+
+
+def _backport_assert_called(mock: MagicMock):
+    assert mock.called
+
+
+@pytest.fixture(scope='session', autouse=True)
+def patch_magicmock():
+    try:
+        MagicMock.assert_called
+    except AttributeError:
+        MagicMock.assert_called = _backport_assert_called
+
+
+@pytest.fixture
+def temp_dir(tmpdir: LocalPath):
+    # convert to standard Path
+    return Path(str(tmpdir))
+
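
The temp_dir fixture converts pytest's legacy tmpdir (a LocalPath) into a standard pathlib.Path. A hypothetical test that consumes it, not part of this PR:

def test_writes_to_temp_dir(temp_dir):
    out_file = temp_dir / "output.txt"
    out_file.write_text("hello")
    assert out_file.read_text() == "hello"
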
tests/resources/2312.07559.paragraphs.tei.xml
ADDED
The diff for this file is too large to render.
tests/resources/2312.07559.sentences.tei.xml
ADDED
The diff for this file is too large to render.
tests/test_grobid_processors.py
ADDED
@@ -0,0 +1,20 @@
+from bs4 import BeautifulSoup
+from document_qa.grobid_processors import get_children_body
+
+
+def test_get_children_paragraphs():
+    with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_children_body(soup, use_paragraphs=True)
+
+    assert len(children) == 70
+
+
+def test_get_children_sentences():
+    with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_children_body(soup, use_paragraphs=False)
+
+    assert len(children) == 327