Spaces:
Running
Running
fix import, and reformat
Browse files
- document_qa/document_qa_engine.py +1 -1
- document_qa/grobid_processors.py +5 -2
- streamlit_app.py +18 -10
document_qa/document_qa_engine.py
CHANGED
@@ -12,7 +12,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
12 |
from langchain.vectorstores import Chroma
|
13 |
from tqdm import tqdm
|
14 |
|
15 |
-
from grobid_processors import GrobidProcessor
|
16 |
|
17 |
|
18 |
class DocumentQAEngine:
|
|
|
12 |
from langchain.vectorstores import Chroma
|
13 |
from tqdm import tqdm
|
14 |
|
15 |
+
from document_qa.grobid_processors import GrobidProcessor
|
16 |
|
17 |
|
18 |
class DocumentQAEngine:
|
document_qa/grobid_processors.py
CHANGED
@@ -413,7 +413,8 @@ class GrobidMaterialsProcessor(BaseProcessor):
|
|
413 |
|
414 |
def extract_materials(self, text):
|
415 |
preprocessed_text = text.strip()
|
416 |
-
status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
|
|
|
417 |
|
418 |
if status != 200:
|
419 |
result = {}
|
@@ -679,6 +680,7 @@ class XmlProcessor(BaseProcessor):
|
|
679 |
|
680 |
return output_data
|
681 |
|
|
|
682 |
def get_children_list_supermat(soup, use_paragraphs=False, verbose=False):
|
683 |
children = []
|
684 |
|
@@ -697,6 +699,7 @@ def get_children_list_supermat(soup, use_paragraphs=False, verbose=False):
|
|
697 |
|
698 |
return children
|
699 |
|
|
|
700 |
def get_children_list_grobid(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
|
701 |
children = []
|
702 |
|
@@ -739,4 +742,4 @@ def get_children_figures(soup: object, use_paragraphs: object = True, verbose: o
|
|
739 |
if verbose:
|
740 |
print(str(children))
|
741 |
|
742 |
-
return children
|
|
|
413 |
|
414 |
def extract_materials(self, text):
|
415 |
preprocessed_text = text.strip()
|
416 |
+
status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
|
417 |
+
"processText_disable_linking")
|
418 |
|
419 |
if status != 200:
|
420 |
result = {}
|
|
|
680 |
|
681 |
return output_data
|
682 |
|
683 |
+
|
684 |
def get_children_list_supermat(soup, use_paragraphs=False, verbose=False):
|
685 |
children = []
|
686 |
|
|
|
699 |
|
700 |
return children
|
701 |
|
702 |
+
|
703 |
def get_children_list_grobid(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
|
704 |
children = []
|
705 |
|
|
|
742 |
if verbose:
|
743 |
print(str(children))
|
744 |
|
745 |
+
return children
|
streamlit_app.py
CHANGED
@@ -42,6 +42,7 @@ if 'git_rev' not in st.session_state:
|
|
42 |
if "messages" not in st.session_state:
|
43 |
st.session_state.messages = []
|
44 |
|
|
|
45 |
def new_file():
|
46 |
st.session_state['loaded_embeddings'] = None
|
47 |
st.session_state['doc_id'] = None
|
@@ -69,6 +70,7 @@ def init_qa(model):
|
|
69 |
|
70 |
return DocumentQAEngine(chat, embeddings, grobid_url=os.environ['GROBID_URL'])
|
71 |
|
|
|
72 |
@st.cache_resource
|
73 |
def init_ner():
|
74 |
quantities_client = QuantitiesAPI(os.environ['GROBID_QUANTITIES_URL'], check_server=True)
|
@@ -89,14 +91,16 @@ def init_ner():
|
|
89 |
materials_client.set_config(config_materials)
|
90 |
|
91 |
gqa = GrobidAggregationProcessor(None,
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
|
96 |
return gqa
|
97 |
|
|
|
98 |
gqa = init_ner()
|
99 |
|
|
|
100 |
def get_file_hash(fname):
|
101 |
hash_md5 = blake2b()
|
102 |
with open(fname, "rb") as f:
|
@@ -122,7 +126,7 @@ def play_old_messages():
|
|
122 |
is_api_key_provided = st.session_state['api_key']
|
123 |
|
124 |
model = st.sidebar.radio("Model (cannot be changed after selection or upload)",
|
125 |
-
("chatgpt-3.5-turbo", "mistral-7b-instruct-v0.1")
|
126 |
index=1,
|
127 |
captions=[
|
128 |
"ChatGPT 3.5 Turbo + Ada-002-text (embeddings)",
|
@@ -134,13 +138,15 @@ model = st.sidebar.radio("Model (cannot be changed after selection or upload)",
|
|
134 |
|
135 |
if not st.session_state['api_key']:
|
136 |
if model == 'mistral-7b-instruct-v0.1' or model == 'llama-2-70b-chat':
|
137 |
-
api_key = st.sidebar.text_input('Huggingface API Key',
|
|
|
138 |
if api_key:
|
139 |
st.session_state['api_key'] = is_api_key_provided = True
|
140 |
os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key
|
141 |
st.session_state['rqa'] = init_qa(model)
|
142 |
elif model == 'chatgpt-3.5-turbo':
|
143 |
-
api_key = st.sidebar.text_input('OpenAI API Key',
|
|
|
144 |
if api_key:
|
145 |
st.session_state['api_key'] = is_api_key_provided = True
|
146 |
os.environ['OPENAI_API_KEY'] = api_key
|
@@ -177,10 +183,12 @@ with st.sidebar:
|
|
177 |
st.markdown(
|
178 |
"""After entering your API Key (Open AI or Huggingface). Upload a scientific article as PDF document. You will see a spinner or loading indicator while the processing is in progress. Once the spinner stops, you can proceed to ask your questions.""")
|
179 |
|
180 |
-
st.markdown(
|
|
|
|
|
181 |
if st.session_state['git_rev'] != "unknown":
|
182 |
st.markdown("**Revision number**: [" + st.session_state[
|
183 |
-
|
184 |
|
185 |
st.header("Query mode (Advanced use)")
|
186 |
st.markdown(
|
@@ -219,11 +227,11 @@ if st.session_state.loaded_embeddings and question and len(question) > 0 and st.
|
|
219 |
if mode == "Embeddings":
|
220 |
with st.spinner("Generating LLM response..."):
|
221 |
text_response = st.session_state['rqa'].query_storage(question, st.session_state.doc_id,
|
222 |
-
|
223 |
elif mode == "LLM":
|
224 |
with st.spinner("Generating response..."):
|
225 |
_, text_response = st.session_state['rqa'].query_document(question, st.session_state.doc_id,
|
226 |
-
|
227 |
|
228 |
if not text_response:
|
229 |
st.error("Something went wrong. Contact Luca Foppiano (Foppiano.Luca@nims.co.jp) to report the issue.")
|
|
|
42 |
if "messages" not in st.session_state:
|
43 |
st.session_state.messages = []
|
44 |
|
45 |
+
|
46 |
def new_file():
|
47 |
st.session_state['loaded_embeddings'] = None
|
48 |
st.session_state['doc_id'] = None
|
|
|
70 |
|
71 |
return DocumentQAEngine(chat, embeddings, grobid_url=os.environ['GROBID_URL'])
|
72 |
|
73 |
+
|
74 |
@st.cache_resource
|
75 |
def init_ner():
|
76 |
quantities_client = QuantitiesAPI(os.environ['GROBID_QUANTITIES_URL'], check_server=True)
|
|
|
91 |
materials_client.set_config(config_materials)
|
92 |
|
93 |
gqa = GrobidAggregationProcessor(None,
|
94 |
+
grobid_quantities_client=quantities_client,
|
95 |
+
grobid_superconductors_client=materials_client
|
96 |
+
)
|
97 |
|
98 |
return gqa
|
99 |
|
100 |
+
|
101 |
gqa = init_ner()
|
102 |
|
103 |
+
|
104 |
def get_file_hash(fname):
|
105 |
hash_md5 = blake2b()
|
106 |
with open(fname, "rb") as f:
|
|
|
126 |
is_api_key_provided = st.session_state['api_key']
|
127 |
|
128 |
model = st.sidebar.radio("Model (cannot be changed after selection or upload)",
|
129 |
+
("chatgpt-3.5-turbo", "mistral-7b-instruct-v0.1"), # , "llama-2-70b-chat"),
|
130 |
index=1,
|
131 |
captions=[
|
132 |
"ChatGPT 3.5 Turbo + Ada-002-text (embeddings)",
|
|
|
138 |
|
139 |
if not st.session_state['api_key']:
|
140 |
if model == 'mistral-7b-instruct-v0.1' or model == 'llama-2-70b-chat':
|
141 |
+
api_key = st.sidebar.text_input('Huggingface API Key',
|
142 |
+
type="password") # if 'HUGGINGFACEHUB_API_TOKEN' not in os.environ else os.environ['HUGGINGFACEHUB_API_TOKEN']
|
143 |
if api_key:
|
144 |
st.session_state['api_key'] = is_api_key_provided = True
|
145 |
os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key
|
146 |
st.session_state['rqa'] = init_qa(model)
|
147 |
elif model == 'chatgpt-3.5-turbo':
|
148 |
+
api_key = st.sidebar.text_input('OpenAI API Key',
|
149 |
+
type="password") # if 'OPENAI_API_KEY' not in os.environ else os.environ['OPENAI_API_KEY']
|
150 |
if api_key:
|
151 |
st.session_state['api_key'] = is_api_key_provided = True
|
152 |
os.environ['OPENAI_API_KEY'] = api_key
|
|
|
183 |
st.markdown(
|
184 |
"""After entering your API Key (Open AI or Huggingface). Upload a scientific article as PDF document. You will see a spinner or loading indicator while the processing is in progress. Once the spinner stops, you can proceed to ask your questions.""")
|
185 |
|
186 |
+
st.markdown(
|
187 |
+
'**NER on LLM responses**: The responses from the LLMs are post-processed to extract <span style="color:orange">physical quantities, measurements</span> and <span style="color:green">materials</span> mentions.',
|
188 |
+
unsafe_allow_html=True)
|
189 |
if st.session_state['git_rev'] != "unknown":
|
190 |
st.markdown("**Revision number**: [" + st.session_state[
|
191 |
+
'git_rev'] + "](https://github.com/lfoppiano/document-qa/commit/" + st.session_state['git_rev'] + ")")
|
192 |
|
193 |
st.header("Query mode (Advanced use)")
|
194 |
st.markdown(
|
|
|
227 |
if mode == "Embeddings":
|
228 |
with st.spinner("Generating LLM response..."):
|
229 |
text_response = st.session_state['rqa'].query_storage(question, st.session_state.doc_id,
|
230 |
+
context_size=context_size)
|
231 |
elif mode == "LLM":
|
232 |
with st.spinner("Generating response..."):
|
233 |
_, text_response = st.session_state['rqa'].query_document(question, st.session_state.doc_id,
|
234 |
+
context_size=context_size)
|
235 |
|
236 |
if not text_response:
|
237 |
st.error("Something went wrong. Contact Luca Foppiano (Foppiano.Luca@nims.co.jp) to report the issue.")
|