Spaces:

lfoppiano
/

document-qa

Running

App Files Files Community

Luca Foppiano committed on Jan 30

Commit

b042214

•

2 Parent(s): 8d140dd a7d9efc

Merge pull request #26 from lfoppiano/add-pdf-viewer

Browse files

Files changed (10) hide show

document_qa/document_qa_engine.py +105 -46
document_qa/grobid_processors.py +148 -64
requirements.txt +3 -1
streamlit_app.py +90 -30
tests/__init__.py +0 -0
tests/conftest.py +37 -0
tests/resources/2312.07559.paragraphs.tei.xml +0 -0
tests/resources/2312.07559.sentences.tei.xml +0 -0
tests/test_document_qa_engine.py +71 -0
tests/test_grobid_processors.py +46 -0

document_qa/document_qa_engine.py CHANGED Viewed

@@ -3,18 +3,87 @@ import os
 from pathlib import Path
 from typing import Union, Any
-from document_qa.grobid_processors import GrobidProcessor
 from grobid_client.grobid_client import GrobidClient
-from langchain.chains import create_extraction_chain, ConversationChain, ConversationalRetrievalChain
 from langchain.chains.question_answering import load_qa_chain, stuff_prompt, refine_prompts, map_reduce_prompt, \
     map_rerank_prompt
 from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
 from langchain.retrievers import MultiQueryRetriever
 from langchain.schema import Document
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from tqdm import tqdm
 class DocumentQAEngine:
@@ -44,6 +113,7 @@ class DocumentQAEngine:
         self.llm = llm
         self.memory = memory
         self.chain = load_qa_chain(llm, chain_type=qa_chain_type)
         if embeddings_root_path is not None:
             self.embeddings_root_path = embeddings_root_path
@@ -57,7 +127,7 @@ class DocumentQAEngine:
             grobid_client = GrobidClient(
                 grobid_server=self.grobid_url,
                 batch_size=1000,
-                coordinates=["p"],
                 sleep_time=5,
                 timeout=60,
                 check_server=True
@@ -105,7 +175,7 @@ class DocumentQAEngine:
         if verbose:
             print(query)
-        response = self._run_query(doc_id, query, context_size=context_size)
         response = response['output_text'] if 'output_text' in response else response
         if verbose:
@@ -116,17 +186,17 @@ class DocumentQAEngine:
                 return self._parse_json(response, output_parser), response
             except Exception as oe:
                 print("Failing to parse the response", oe)
-                return None, response
         elif extraction_schema:
             try:
                 chain = create_extraction_chain(extraction_schema, self.llm)
                 parsed = chain.run(response)
-                return parsed, response
             except Exception as oe:
                 print("Failing to parse the response", oe)
-                return None, response
         else:
-            return None, response
     def query_storage(self, query: str, doc_id, context_size=4):
         documents = self._get_context(doc_id, query, context_size)
@@ -157,12 +227,15 @@ class DocumentQAEngine:
     def _run_query(self, doc_id, query, context_size=4):
         relevant_documents = self._get_context(doc_id, query, context_size)
         response = self.chain.run(input_documents=relevant_documents,
                                   question=query)
         if self.memory:
             self.memory.save_context({"input": query}, {"output": response})
-        return response
     def _get_context(self, doc_id, query, context_size=4):
         db = self.embeddings_dict[doc_id]
@@ -188,14 +261,15 @@ class DocumentQAEngine:
         relevant_documents = multi_query_retriever.get_relevant_documents(query)
         return relevant_documents
-    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, include=(), verbose=False):
         """
         Extract text from documents using Grobid, if chunk_size is < 0 it keeps each paragraph separately
         """
         if verbose:
             print("File", pdf_file_path)
         filename = Path(pdf_file_path).stem
-        structure = self.grobid_processor.process_structure(pdf_file_path)
         biblio = structure['biblio']
         biblio['filename'] = filename.replace(" ", "_")
@@ -207,48 +281,33 @@ class DocumentQAEngine:
         metadatas = []
         ids = []
-        if chunk_size < 0:
-            for passage in structure['passages']:
-                biblio_copy = copy.copy(biblio)
-                if len(str.strip(passage['text'])) > 0:
-                    texts.append(passage['text'])
-                    biblio_copy['type'] = passage['type']
-                    biblio_copy['section'] = passage['section']
-                    biblio_copy['subSection'] = passage['subSection']
-                    metadatas.append(biblio_copy)
-                    ids.append(passage['passage_id'])
-        else:
-            document_text = " ".join([passage['text'] for passage in structure['passages']])
-            # text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
-            text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-                chunk_size=chunk_size,
-                chunk_overlap=chunk_size * perc_overlap
-            )
-            texts = text_splitter.split_text(document_text)
-            metadatas = [biblio for _ in range(len(texts))]
-            ids = [id for id, t in enumerate(texts)]
-        if "biblio" in include:
-            biblio_metadata = copy.copy(biblio)
-            biblio_metadata['type'] = "biblio"
-            biblio_metadata['section'] = "header"
-            for key in ['title', 'authors', 'publication_year']:
-                if key in biblio_metadata:
-                    texts.append("{}: {}".format(key, biblio_metadata[key]))
-                    metadatas.append(biblio_metadata)
-                    ids.append(key)
         return texts, metadatas, ids
-    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1, include_biblio=False):
-        include = ["biblio"] if include_biblio else []
         texts, metadata, ids = self.get_text_from_document(
             pdf_path,
             chunk_size=chunk_size,
-            perc_overlap=perc_overlap,
-            include=include)
         if doc_id:
             hash = doc_id
         else:

 from pathlib import Path
 from typing import Union, Any
+import tiktoken
 from grobid_client.grobid_client import GrobidClient
+from langchain.chains import create_extraction_chain
 from langchain.chains.question_answering import load_qa_chain, stuff_prompt, refine_prompts, map_reduce_prompt, \
     map_rerank_prompt
 from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
 from langchain.retrievers import MultiQueryRetriever
 from langchain.schema import Document
 from langchain.vectorstores import Chroma
 from tqdm import tqdm
+from document_qa.grobid_processors import GrobidProcessor
class TextMerger:
    """Merge consecutive passages into chunks of roughly ``chunk_size`` tokens.

    Token counts are obtained from a tiktoken encoding, selected either by
    model name or by encoding name.
    """

    def __init__(self, model_name=None, encoding_name="gpt2"):
        """Select the tokenizer: by ``model_name`` when given, else by ``encoding_name``."""
        if model_name is not None:
            self.enc = tiktoken.encoding_for_model(model_name)
        else:
            self.enc = tiktoken.get_encoding(encoding_name)

    def encode(self, text, allowed_special=None, disallowed_special="all"):
        """Tokenize ``text`` with the configured encoding.

        ``allowed_special`` defaults to an empty set; a fresh set is created on
        every call so no mutable object is shared between calls.
        """
        if allowed_special is None:
            allowed_special = set()
        return self.enc.encode(
            text,
            allowed_special=allowed_special,
            disallowed_special=disallowed_special,
        )

    def merge_passages(self, passages, chunk_size, tolerance=0.2):
        """Group consecutive passages into chunks of about ``chunk_size`` tokens.

        Each passage is a dict with at least the keys ``text`` and
        ``coordinates`` (both strings). Passages are accumulated in order:

        * when the accumulated token count exceeds
          ``chunk_size * (1 + tolerance)`` and more than one passage is
          accumulated, everything but the last passage is emitted and the last
          passage starts the next chunk;
        * otherwise, as soon as the count reaches ``chunk_size`` (or a single
          passage alone exceeds the upper bound), the accumulated passages are
          emitted as one chunk.

        Returns a list of dicts with the keys ``text`` (texts joined by a
        space), ``coordinates`` (coordinates joined by ``;``), ``type``,
        ``section`` and ``subSection``.
        """
        upper_bound = chunk_size + chunk_size * tolerance

        merged_texts = []
        merged_coordinates = []
        current_texts = []
        current_coordinates = []

        for passage in passages:
            current_texts.append(passage['text'])
            current_coordinates.append(passage['coordinates'])

            # The joined text is re-tokenized because the token count of a
            # concatenation is not necessarily the sum of the parts' counts.
            token_count = len(self.encode(" ".join(current_texts)))

            if token_count > upper_bound and len(current_texts) > 1:
                # The newest passage overflowed the chunk: emit what came
                # before it and let it open the next chunk.
                merged_texts.append(current_texts[:-1])
                merged_coordinates.append(current_coordinates[:-1])
                current_texts = current_texts[-1:]
                current_coordinates = current_coordinates[-1:]
            elif token_count > upper_bound or token_count >= chunk_size:
                # Either a single oversized passage, or the chunk is full.
                # ">=" also covers a count exactly equal to the upper bound,
                # which would otherwise match neither branch.
                merged_texts.append(current_texts)
                merged_coordinates.append(current_coordinates)
                current_texts = []
                current_coordinates = []

        # Flush whatever is left after the last passage.
        if current_texts:
            merged_texts.append(current_texts)
            merged_coordinates.append(current_coordinates)

        return [
            {
                "text": " ".join(chunk_texts),
                "coordinates": ";".join(chunk_coordinates),
                "type": "aggregated chunks",
                "section": "mixed",
                "subSection": "mixed"
            }
            for chunk_texts, chunk_coordinates in zip(merged_texts, merged_coordinates)
        ]
 class DocumentQAEngine:
         self.llm = llm
         self.memory = memory
         self.chain = load_qa_chain(llm, chain_type=qa_chain_type)
+        self.text_merger = TextMerger()
         if embeddings_root_path is not None:
             self.embeddings_root_path = embeddings_root_path
             grobid_client = GrobidClient(
                 grobid_server=self.grobid_url,
                 batch_size=1000,
+                coordinates=["p", "title", "persName"],
                 sleep_time=5,
                 timeout=60,
                 check_server=True
         if verbose:
             print(query)
+        response, coordinates = self._run_query(doc_id, query, context_size=context_size)
         response = response['output_text'] if 'output_text' in response else response
         if verbose:
                 return self._parse_json(response, output_parser), response
             except Exception as oe:
                 print("Failing to parse the response", oe)
+                return None, response, coordinates
         elif extraction_schema:
             try:
                 chain = create_extraction_chain(extraction_schema, self.llm)
                 parsed = chain.run(response)
+                return parsed, response, coordinates
             except Exception as oe:
                 print("Failing to parse the response", oe)
+                return None, response, coordinates
         else:
+            return None, response, coordinates
     def query_storage(self, query: str, doc_id, context_size=4):
         documents = self._get_context(doc_id, query, context_size)
     def _run_query(self, doc_id, query, context_size=4):
         relevant_documents = self._get_context(doc_id, query, context_size)
+        relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
+                                         for doc in
+                                         relevant_documents]  # filter(lambda d: d['type'] == "sentence", relevant_documents)]
         response = self.chain.run(input_documents=relevant_documents,
                                   question=query)
         if self.memory:
             self.memory.save_context({"input": query}, {"output": response})
+        return response, relevant_document_coordinates
     def _get_context(self, doc_id, query, context_size=4):
         db = self.embeddings_dict[doc_id]
         relevant_documents = multi_query_retriever.get_relevant_documents(query)
         return relevant_documents
+    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
         """
         Extract text from documents using Grobid, if chunk_size is < 0 it keeps each paragraph separately
         """
         if verbose:
             print("File", pdf_file_path)
         filename = Path(pdf_file_path).stem
+        coordinates = True  # if chunk_size == -1 else False
+        structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
         biblio = structure['biblio']
         biblio['filename'] = filename.replace(" ", "_")
         metadatas = []
         ids = []
+        if chunk_size > 0:
+            new_passages = self.text_merger.merge_passages(structure['passages'], chunk_size=chunk_size)
+        else:
+            new_passages = structure['passages']
+        for passage in new_passages:
+            biblio_copy = copy.copy(biblio)
+            if len(str.strip(passage['text'])) > 0:
+                texts.append(passage['text'])
+                biblio_copy['type'] = passage['type']
+                biblio_copy['section'] = passage['section']
+                biblio_copy['subSection'] = passage['subSection']
+                biblio_copy['coordinates'] = passage['coordinates']
+                metadatas.append(biblio_copy)
+                # ids.append(passage['passage_id'])
+            ids = [id for id, t in enumerate(new_passages)]
         return texts, metadatas, ids
+    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1):
         texts, metadata, ids = self.get_text_from_document(
             pdf_path,
             chunk_size=chunk_size,
+            perc_overlap=perc_overlap)
         if doc_id:
             hash = doc_id
         else:

document_qa/grobid_processors.py CHANGED Viewed

@@ -131,13 +131,13 @@ class GrobidProcessor(BaseProcessor):
         # super().__init__()
         self.grobid_client = grobid_client
-    def process_structure(self, input_path):
         pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
                                                                 input_path,
                                                                 consolidate_header=True,
                                                                 consolidate_citations=False,
                                                                 segment_sentences=False,
-                                                                tei_coordinates=False,
                                                                 include_raw_citations=False,
                                                                 include_raw_affiliations=False,
                                                                 generateIDs=True)
@@ -145,7 +145,7 @@ class GrobidProcessor(BaseProcessor):
         if status != 200:
             return
-        output_data = self.parse_grobid_xml(text)
         output_data['filename'] = Path(pdf_file).stem.replace(".tei", "")
         return output_data
@@ -159,7 +159,7 @@ class GrobidProcessor(BaseProcessor):
         return doc
-    def parse_grobid_xml(self, text):
         output_data = OrderedDict()
         doc_biblio = grobid_tei_xml.parse_document_xml(text)
@@ -176,61 +176,115 @@ class GrobidProcessor(BaseProcessor):
             pass
         output_data['biblio'] = biblio
         passages = []
         output_data['passages'] = passages
-        # if biblio['title'] is not None and len(biblio['title']) > 0:
-        #     passages.append({
-        #         "text": self.post_process(biblio['title']),
-        #         "type": "paragraph",
-        #         "section": "<header>",
-        #         "subSection": "<title>",
-        #         "passage_id": "title0"
-        #     })
-        if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
-            passages.append({
-                "text": self.post_process(doc_biblio.abstract),
-                "type": "paragraph",
-                "section": "<header>",
-                "subSection": "<abstract>",
-                "passage_id": "abstract0"
-            })
         soup = BeautifulSoup(text, 'xml')
-        text_blocks_body = get_children_body(soup, verbose=False)
-        passages.extend([
-            {
-                "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
-                                                  text.parent.name != "ref" or (
-                                                          text.parent.name == "ref" and text.parent.attrs[
-                                                      'type'] != 'bibr'))),
-                "type": "paragraph",
-                "section": "<body>",
-                "subSection": "<paragraph>",
-                "passage_id": str(paragraph_id) + str(sentence_id)
-            }
-            for paragraph_id, paragraph in enumerate(text_blocks_body) for
-            sentence_id, sentence in enumerate(paragraph)
-        ])
-        text_blocks_figures = get_children_figures(soup, verbose=False)
-        passages.extend([
-            {
-                "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
-                                                  text.parent.name != "ref" or (
-                                                          text.parent.name == "ref" and text.parent.attrs[
-                                                      'type'] != 'bibr'))),
-                "type": "paragraph",
-                "section": "<body>",
-                "subSection": "<figure>",
-                "passage_id": str(paragraph_id) + str(sentence_id)
-            }
-            for paragraph_id, paragraph in enumerate(text_blocks_figures) for
-            sentence_id, sentence in enumerate(paragraph)
-        ])
         return output_data
@@ -526,6 +580,21 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
     def extract_materials(self, text):
         return self.gmp.extract_materials(text)
     @staticmethod
     def prune_overlapping_annotations(entities: list) -> list:
         # Sorting by offsets
@@ -731,25 +800,40 @@ def get_children_list_grobid(soup: object, use_paragraphs: object = True, verbos
     return children
-def get_children_body(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
-    children = []
-    child_name = "p" if use_paragraphs else "s"
     for child in soup.TEI.children:
         if child.name == 'text':
-            children.extend([subchild.find_all(child_name) for subchild in child.find_all("body")])
     if verbose:
-        print(str(children))
-    return children
-def get_children_figures(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
     children = []
-    child_name = "p" if use_paragraphs else "s"
     for child in soup.TEI.children:
         if child.name == 'text':
-            children.extend([subchild.find_all("figDesc") for subchild in child.find_all("body")])
     if verbose:
         print(str(children))

         # super().__init__()
         self.grobid_client = grobid_client
+    def process_structure(self, input_path, coordinates=False):
         pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
                                                                 input_path,
                                                                 consolidate_header=True,
                                                                 consolidate_citations=False,
                                                                 segment_sentences=False,
+                                                                tei_coordinates=coordinates,
                                                                 include_raw_citations=False,
                                                                 include_raw_affiliations=False,
                                                                 generateIDs=True)
         if status != 200:
             return
+        output_data = self.parse_grobid_xml(text, coordinates=coordinates)
         output_data['filename'] = Path(pdf_file).stem.replace(".tei", "")
         return output_data
         return doc
+    def parse_grobid_xml(self, text, coordinates=False):
         output_data = OrderedDict()
         doc_biblio = grobid_tei_xml.parse_document_xml(text)
             pass
         output_data['biblio'] = biblio
         passages = []
         output_data['passages'] = passages
+        passage_type = "paragraph"
         soup = BeautifulSoup(text, 'xml')
+        blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)
+        passages.append({
+            "text": f"authors: {biblio['authors']}",
+            "type": passage_type,
+            "section": "<header>",
+            "subSection": "<title>",
+            "passage_id": "htitle",
+            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+                            blocks_header['authors']])
+        })
+        passages.append({
+            "text": self.post_process(" ".join([node.text for node in blocks_header['title']])),
+            "type": passage_type,
+            "section": "<header>",
+            "subSection": "<title>",
+            "passage_id": "htitle",
+            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+                                     blocks_header['title']])
+        })
+        passages.append({
+            "text": self.post_process(
+                ''.join(node.text for node in blocks_header['abstract'] for text in node.find_all(text=True) if
+                        text.parent.name != "ref" or (
+                                text.parent.name == "ref" and text.parent.attrs[
+                            'type'] != 'bibr'))),
+            "type": passage_type,
+            "section": "<header>",
+            "subSection": "<abstract>",
+            "passage_id": "habstract",
+            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+                                     blocks_header['abstract']])
+        })
+        text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)
+        use_paragraphs = True
+        if not use_paragraphs:
+            passages.extend([
+                {
+                    "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
+                                                      text.parent.name != "ref" or (
+                                                              text.parent.name == "ref" and text.parent.attrs[
+                                                          'type'] != 'bibr'))),
+                    "type": passage_type,
+                    "section": "<body>",
+                    "subSection": "<paragraph>",
+                    "passage_id": str(paragraph_id),
+                    "coordinates": paragraph['coords'] if coordinates and sentence.has_attr('coords') else ""
+                }
+                for paragraph_id, paragraph in enumerate(text_blocks_body) for
+                sentence_id, sentence in enumerate(paragraph)
+            ])
+        else:
+            passages.extend([
+                {
+                    "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
+                                                      text.parent.name != "ref" or (
+                                                              text.parent.name == "ref" and text.parent.attrs[
+                                                          'type'] != 'bibr'))),
+                    "type": passage_type,
+                    "section": "<body>",
+                    "subSection": "<paragraph>",
+                    "passage_id": str(paragraph_id),
+                    "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
+                }
+                for paragraph_id, paragraph in enumerate(text_blocks_body)
+            ])
+        text_blocks_figures = get_xml_nodes_figures(soup, verbose=False)
+        if not use_paragraphs:
+            passages.extend([
+                {
+                    "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
+                                                      text.parent.name != "ref" or (
+                                                              text.parent.name == "ref" and text.parent.attrs[
+                                                          'type'] != 'bibr'))),
+                    "type": passage_type,
+                    "section": "<body>",
+                    "subSection": "<figure>",
+                    "passage_id": str(paragraph_id) + str(sentence_id),
+                    "coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
+                }
+                for paragraph_id, paragraph in enumerate(text_blocks_figures) for
+                sentence_id, sentence in enumerate(paragraph)
+            ])
+        else:
+            passages.extend([
+                {
+                    "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
+                                                      text.parent.name != "ref" or (
+                                                              text.parent.name == "ref" and text.parent.attrs[
+                                                          'type'] != 'bibr'))),
+                    "type": passage_type,
+                    "section": "<body>",
+                    "subSection": "<figure>",
+                    "passage_id": str(paragraph_id),
+                    "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
+                }
+                for paragraph_id, paragraph in enumerate(text_blocks_figures)
+            ])
         return output_data
     def extract_materials(self, text):
         return self.gmp.extract_materials(text)
+    @staticmethod
+    def box_to_dict(box, color=None, type=None):
+        if box is None or box == "" or len(box) < 5:
+            return {}
+        item = {"page": box[0], "x": box[1], "y": box[2], "width": box[3], "height": box[4]}
+        if color is not None:
+            item['color'] = color
+        if type:
+            item['type'] = type
+        return item
     @staticmethod
     def prune_overlapping_annotations(entities: list) -> list:
         # Sorting by offsets
     return children
def get_xml_nodes_header(soup: object, use_paragraphs: bool = True) -> dict:
    """Collect the header nodes (authors, abstract, title) of a parsed TEI document.

    :param soup: parsed document exposing ``teiHeader`` (BeautifulSoup-style
        attribute navigation, where a missing tag is ``None``).
    :param use_paragraphs: take ``<p>`` nodes from the abstract when True,
        ``<s>`` nodes otherwise.
    :return: dict with the keys ``authors`` (``persName`` nodes), ``abstract``
        (``<p>``/``<s>`` nodes inside every ``abstract`` element) and ``title``
        (a one-element list, or an empty list when no title is present).
        All three lists are empty when the document has no ``teiHeader``.
    """
    sub_tag = "p" if use_paragraphs else "s"

    header = soup.teiHeader
    if header is None:
        return {"authors": [], "abstract": [], "title": []}

    # A missing fileDesc or title must not leak a None entry into the list:
    # callers read `.text` on every returned node.
    file_desc = header.fileDesc
    title = file_desc.title if file_desc is not None else None

    return {
        "authors": list(header.find_all("persName")),
        "abstract": [
            node
            for abstract in header.find_all("abstract")
            for node in abstract.find_all(sub_tag)
        ],
        "title": [title] if title is not None else [],
    }
def get_xml_nodes_body(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
    """Return the paragraph or sentence nodes found in the body of a TEI document.

    :param soup: parsed document exposing ``TEI`` (BeautifulSoup-style
        attribute navigation, where a missing tag is ``None``).
    :param use_paragraphs: collect ``<p>`` nodes when True, ``<s>`` nodes otherwise.
    :param verbose: print the collected nodes before returning.
    :return: flat list of matching nodes from every ``<body>`` under the
        ``<text>`` children of the TEI root; empty when there is no TEI root.
    """
    tag_name = "p" if use_paragraphs else "s"
    nodes = []

    root = soup.TEI
    if root is not None:
        for child in root.children:
            if child.name != 'text':
                continue
            for body in child.find_all("body"):
                nodes.extend(body.find_all(tag_name))

    if verbose:
        print(str(nodes))

    return nodes
+def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
     children = []
     for child in soup.TEI.children:
         if child.name == 'text':
+            children.extend(
+                [subchild for subchilds in child.find_all("body") for subchild in subchilds.find_all("figDesc")])
     if verbose:
         print(str(children))

requirements.txt CHANGED Viewed

@@ -19,7 +19,9 @@ chromadb==0.4.19
 tiktoken==0.4.0
 openai==0.27.7
 langchain==0.0.350
 typing-inspect==0.9.0
 typing_extensions==4.8.0
 pydantic==2.4.2
-sentence_transformers==2.2.2

 tiktoken==0.4.0
 openai==0.27.7
 langchain==0.0.350
+langchain-core==0.1.0
 typing-inspect==0.9.0
 typing_extensions==4.8.0
 pydantic==2.4.2
+sentence_transformers==2.2.2
+streamlit-pdf-viewer

streamlit_app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import base64
 import os
 import re
 from hashlib import blake2b
@@ -8,6 +7,7 @@ import dotenv
 from grobid_quantities.quantities import QuantitiesAPI
 from langchain.llms.huggingface_hub import HuggingFaceHub
 from langchain.memory import ConversationBufferWindowMemory
 dotenv.load_dotenv(override=True)
@@ -70,6 +70,18 @@ if 'memory' not in st.session_state:
 if 'binary' not in st.session_state:
     st.session_state['binary'] = None
 st.set_page_config(
     page_title="Scientific Document Insights Q/A",
     page_icon="📝",
@@ -216,7 +228,9 @@ with st.sidebar:
     st.session_state['model'] = model = st.selectbox(
         "Model:",
         options=OPENAI_MODELS + list(OPEN_MODELS.keys()),
-        index=4,
         placeholder="Select model",
         help="Select the LLM model:",
         disabled=st.session_state['doc_id'] is not None or st.session_state['uploaded']
@@ -291,21 +305,44 @@ question = st.chat_input(
 with st.sidebar:
     st.header("Settings")
-    mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
-                    help="LLM will respond the question, Embedding will show the "
-                         "paragraphs relevant to the question in the paper.")
-    chunk_size = st.slider("Chunks size", 100, 2000, value=250,
-                           help="Size of chunks in which the document is partitioned",
                            disabled=uploaded_file is not None)
-    context_size = st.slider("Context size", 3, 10, value=4,
-                             help="Number of chunks to consider when answering a question",
-                             disabled=not uploaded_file)
     st.session_state['ner_processing'] = st.checkbox("Identify materials and properties.")
     st.markdown(
         'The LLM responses undergo post-processing to extract <span style="color:orange">physical quantities, measurements</span>, and <span style="color:green">materials</span> mentions.',
         unsafe_allow_html=True)
     st.divider()
     st.header("Documentation")
@@ -324,13 +361,6 @@ with st.sidebar:
     st.markdown(
         """If you switch the mode to "Embedding," the system will return specific chunks from the document that are semantically related to your query. This mode helps to test why sometimes the answers are not satisfying or incomplete. """)
-@st.cache_resource
-def get_pdf_display(binary):
-    base64_pdf = base64.b64encode(binary).decode('utf-8')
-    return F'<embed src="data:application/pdf;base64,{base64_pdf}" width="100%" height="700" type="application/pdf"></embed>'
 if uploaded_file and not st.session_state.loaded_embeddings:
     if model not in st.session_state['api_keys']:
         st.error("Before uploading a document, you must enter the API key. ")
@@ -345,16 +375,31 @@ if uploaded_file and not st.session_state.loaded_embeddings:
             st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
                                                                                                         chunk_size=chunk_size,
-                                                                                                        perc_overlap=0.1,
-                                                                                                        include_biblio=True)
             st.session_state['loaded_embeddings'] = True
             st.session_state.messages = []
     # timestamp = datetime.utcnow()
-with left_column:
-    if st.session_state['binary']:
-        left_column.markdown(get_pdf_display(st.session_state['binary']), unsafe_allow_html=True)
 with right_column:
     # css = '''
@@ -398,8 +443,18 @@ with right_column:
                                                                              context_size=context_size)
         elif mode == "LLM":
             with st.spinner("Generating response..."):
-                _, text_response = st.session_state['rqa'][model].query_document(question, st.session_state.doc_id,
-                                                                                 context_size=context_size)
         if not text_response:
             st.error("Something went wrong. Contact Luca Foppiano (Foppiano.Luca@nims.co.jp) to report the issue.")
@@ -418,11 +473,16 @@ with right_column:
                 st.write(text_response)
             st.session_state.messages.append({"role": "assistant", "mode": mode, "content": text_response})
-        # if len(st.session_state.messages) > 1:
-        #     last_answer = st.session_state.messages[len(st.session_state.messages)-1]
-        #     if last_answer['role'] == "assistant":
-        #         last_question = st.session_state.messages[len(st.session_state.messages)-2]
-        #         st.session_state.memory.save_context({"input": last_question['content']}, {"output": last_answer['content']})
     elif st.session_state.loaded_embeddings and st.session_state.doc_id:
         play_old_messages()

 import os
 import re
 from hashlib import blake2b
 from grobid_quantities.quantities import QuantitiesAPI
 from langchain.llms.huggingface_hub import HuggingFaceHub
 from langchain.memory import ConversationBufferWindowMemory
+from streamlit_pdf_viewer import pdf_viewer
 dotenv.load_dotenv(override=True)
 if 'binary' not in st.session_state:
     st.session_state['binary'] = None
+if 'annotations' not in st.session_state:
+    st.session_state['annotations'] = None
+if 'should_show_annotations' not in st.session_state:
+    st.session_state['should_show_annotations'] = True
+if 'pdf' not in st.session_state:
+    st.session_state['pdf'] = None
+if 'pdf_rendering' not in st.session_state:
+    st.session_state['pdf_rendering'] = None
 st.set_page_config(
     page_title="Scientific Document Insights Q/A",
     page_icon="📝",
     st.session_state['model'] = model = st.selectbox(
         "Model:",
         options=OPENAI_MODELS + list(OPEN_MODELS.keys()),
+        index=(OPENAI_MODELS + list(OPEN_MODELS.keys())).index(
+            "zephyr-7b-beta") if "DEFAULT_MODEL" not in os.environ or not os.environ["DEFAULT_MODEL"] else (
+                OPENAI_MODELS + list(OPEN_MODELS.keys())).index(os.environ["DEFAULT_MODEL"]),
         placeholder="Select model",
         help="Select the LLM model:",
         disabled=st.session_state['doc_id'] is not None or st.session_state['uploaded']
 with st.sidebar:
     st.header("Settings")
+    mode = st.radio(
+        "Query mode",
+        ("LLM", "Embeddings"),
+        disabled=not uploaded_file,
+        index=0,
+        horizontal=True,
+        help="LLM will respond the question, Embedding will show the "
+             "paragraphs relevant to the question in the paper."
+    )
+    # Add a checkbox for showing annotations
+    # st.session_state['show_annotations'] = st.checkbox("Show annotations", value=True)
+    # st.session_state['should_show_annotations'] = st.checkbox("Show annotations", value=True)
+    chunk_size = st.slider("Text chunks size", -1, 2000, value=-1,
+                           help="Size of chunks in which split the document. -1: use paragraphs, > 0 paragraphs are aggregated.",
                            disabled=uploaded_file is not None)
+    if chunk_size == -1:
+        context_size = st.slider("Context size (paragraphs)", 3, 20, value=10,
+                                 help="Number of paragraphs to consider when answering a question",
+                                 disabled=not uploaded_file)
+    else:
+        context_size = st.slider("Context size (chunks)", 3, 10, value=4,
+                                 help="Number of chunks to consider when answering a question",
+                                 disabled=not uploaded_file)
     st.session_state['ner_processing'] = st.checkbox("Identify materials and properties.")
     st.markdown(
         'The LLM responses undergo post-processing to extract <span style="color:orange">physical quantities, measurements</span>, and <span style="color:green">materials</span> mentions.',
         unsafe_allow_html=True)
+    st.session_state['pdf_rendering'] = st.radio(
+        "PDF rendering mode",
+        {"PDF.JS", "Native browser engine"},
+        index=1,
+        disabled=not uploaded_file,
+    )
     st.divider()
     st.header("Documentation")
     st.markdown(
         """If you switch the mode to "Embedding," the system will return specific chunks from the document that are semantically related to your query. This mode helps to test why sometimes the answers are not satisfying or incomplete. """)
 if uploaded_file and not st.session_state.loaded_embeddings:
     if model not in st.session_state['api_keys']:
         st.error("Before uploading a document, you must enter the API key. ")
             st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
                                                                                                         chunk_size=chunk_size,
+                                                                                                        perc_overlap=0.1)
             st.session_state['loaded_embeddings'] = True
             st.session_state.messages = []
     # timestamp = datetime.utcnow()
+def rgb_to_hex(rgb):
+    return "#{:02x}{:02x}{:02x}".format(*rgb)
+def generate_color_gradient(num_elements):
+    # Define warm and cold colors in RGB format
+    warm_color = (255, 165, 0)  # Orange
+    cold_color = (0, 0, 255)  # Blue
+    # Generate a linear gradient of colors
+    color_gradient = [
+        rgb_to_hex(tuple(int(warm * (1 - i / num_elements) + cold * (i / num_elements)) for warm, cold in
+                         zip(warm_color, cold_color)))
+        for i in range(num_elements)
+    ]
+    return color_gradient
 with right_column:
     # css = '''
                                                                              context_size=context_size)
         elif mode == "LLM":
             with st.spinner("Generating response..."):
+                _, text_response, coordinates = st.session_state['rqa'][model].query_document(question,
+                                                                                              st.session_state.doc_id,
+                                                                                              context_size=context_size)
+                annotations = [[GrobidAggregationProcessor.box_to_dict([cs for cs in c.split(",")]) for c in coord_doc]
+                               for coord_doc in coordinates]
+                gradients = generate_color_gradient(len(annotations))
+                for i, color in enumerate(gradients):
+                    for annotation in annotations[i]:
+                        annotation['color'] = color
+                st.session_state['annotations'] = [annotation for annotation_doc in annotations for annotation in
+                                                   annotation_doc]
         if not text_response:
             st.error("Something went wrong. Contact Luca Foppiano (Foppiano.Luca@nims.co.jp) to report the issue.")
                 st.write(text_response)
             st.session_state.messages.append({"role": "assistant", "mode": mode, "content": text_response})
     elif st.session_state.loaded_embeddings and st.session_state.doc_id:
         play_old_messages()
+with left_column:
+    if st.session_state['binary']:
+        pdf_viewer(
+            input=st.session_state['binary'],
+            width=600,
+            height=800,
+            annotation_outline_size=2,
+            annotations=st.session_state['annotations'],
+            rendering='unwrap' if st.session_state['pdf_rendering'] == 'PDF.JS' else 'legacy_embed'
+        )

tests/__init__.py ADDED Viewed

File without changes

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import logging
+from pathlib import Path
+from unittest.mock import MagicMock
+import pytest
+from _pytest._py.path import LocalPath
+# derived from https://github.com/elifesciences/sciencebeam-trainer-delft/tree/develop/tests
+LOGGER = logging.getLogger(__name__)
+@pytest.fixture(scope='session', autouse=True)
+def setup_logging():
+    """Reset logging once per test session: INFO on the root, DEBUG for ``tests``."""
+    # Drop any handlers already attached to the root logger; basicConfig() is a
+    # no-op when the root logger already has handlers.
+    logging.root.handlers = []
+    logging.basicConfig(level='INFO')
+    logging.getLogger('tests').setLevel('DEBUG')
+    # logging.getLogger('sciencebeam_trainer_delft').setLevel('DEBUG')
+def _backport_assert_called(mock: MagicMock):
+    """Raise ``AssertionError`` unless ``mock.called`` is truthy."""
+    assert mock.called
+@pytest.fixture(scope='session', autouse=True)
+def patch_magicmock():
+    """Install ``_backport_assert_called`` on ``MagicMock`` if ``assert_called`` is missing."""
+    try:
+        # Bare attribute access: only probes that the method exists, never calls it.
+        MagicMock.assert_called
+    except AttributeError:
+        MagicMock.assert_called = _backport_assert_called
+@pytest.fixture
+def temp_dir(tmpdir: LocalPath):
+    """Return the value of the ``tmpdir`` fixture as a ``pathlib.Path``."""
+    # convert to standard Path
+    return Path(str(tmpdir))

tests/resources/2312.07559.paragraphs.tei.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/resources/2312.07559.sentences.tei.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/test_document_qa_engine.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from document_qa.document_qa_engine import TextMerger
+def test_merge_passages_small_chunk():
+    merger = TextMerger()
+    passages = [
+        {
+            'text': "The quick brown fox jumps over the tree",
+            'coordinates': '1'
+        },
+        {
+            'text': "and went straight into the mouth of a bear.",
+            'coordinates': '2'
+        },
+        {
+            'text': "The color of the colors is a color with colors",
+            'coordinates': '3'
+        },
+        {
+            'text': "the main colors are not the colorw we show",
+            'coordinates': '4'
+        }
+    ]
+    new_passages = merger.merge_passages(passages, chunk_size=10, tolerance=0)
+    assert len(new_passages) == 4
+    assert new_passages[0]['coordinates'] == "1"
+    assert new_passages[0]['text'] == "The quick brown fox jumps over the tree"
+    assert new_passages[1]['coordinates'] == "2"
+    assert new_passages[1]['text'] == "and went straight into the mouth of a bear."
+    assert new_passages[2]['coordinates'] == "3"
+    assert new_passages[2]['text'] == "The color of the colors is a color with colors"
+    assert new_passages[3]['coordinates'] == "4"
+    assert new_passages[3]['text'] == "the main colors are not the colorw we show"
+def test_merge_passages_big_chunk():
+    merger = TextMerger()
+    passages = [
+        {
+            'text': "The quick brown fox jumps over the tree",
+            'coordinates': '1'
+        },
+        {
+            'text': "and went straight into the mouth of a bear.",
+            'coordinates': '2'
+        },
+        {
+            'text': "The color of the colors is a color with colors",
+            'coordinates': '3'
+        },
+        {
+            'text': "the main colors are not the colorw we show",
+            'coordinates': '4'
+        }
+    ]
+    new_passages = merger.merge_passages(passages, chunk_size=20, tolerance=0)
+    assert len(new_passages) == 2
+    assert new_passages[0]['coordinates'] == "1;2"
+    assert new_passages[0][
+               'text'] == "The quick brown fox jumps over the tree and went straight into the mouth of a bear."
+    assert new_passages[1]['coordinates'] == "3;4"
+    assert new_passages[1][
+               'text'] == "The color of the colors is a color with colors the main colors are not the colorw we show"

tests/test_grobid_processors.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from bs4 import BeautifulSoup
+from document_qa.grobid_processors import get_xml_nodes_body, get_xml_nodes_figures, get_xml_nodes_header
+def test_get_xml_nodes_body_paragraphs():
+    with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+    nodes = get_xml_nodes_body(soup, use_paragraphs=True)
+    assert len(nodes) == 70
+def test_get_xml_nodes_body_sentences():
+    with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+    children = get_xml_nodes_body(soup, use_paragraphs=False)
+    assert len(children) == 327
+def test_get_xml_nodes_figures():
+    with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+    children = get_xml_nodes_figures(soup)
+    assert len(children) == 13
+def test_get_xml_nodes_header_paragraphs():
+    with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+    children = get_xml_nodes_header(soup)
+    assert len(children) == 8
+def test_get_xml_nodes_header_sentences():
+    with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+    children = get_xml_nodes_header(soup, use_paragraphs=False)
+    assert len(children) == 15