ivan-savchuk committed on
Commit
b951bdb
β€’
1 Parent(s): 2992997

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import json
3
+ import time
4
+ import faiss
5
+ from sentence_transformers import SentenceTransformer
6
+ from sentence_transformers.cross_encoder import CrossEncoder
7
+
8
+
9
class DocumentSearch:
    """Semantic document search over a pre-built FAISS index.

    Combines three pre-trained artifacts:
      * a FAISS index for fast candidate retrieval,
      * an SBERT bi-encoder to embed the query,
      * an SBERT cross-encoder to re-rank the candidates.
    """

    def __init__(self, labels_path: str, encoder_path: str,
                 index_path: str, cross_encoder_path: str):
        """Load documents, bi-encoder, FAISS index, and cross-encoder.

        Args:
            labels_path: JSON file holding [text, url] pairs, indexable by
                the integer ids stored in the FAISS index.
            encoder_path: path/name of the SentenceTransformer bi-encoder.
            index_path: serialized FAISS index file.
            cross_encoder_path: path/name of the CrossEncoder re-ranker.
        """
        # docs and corresponding urls, aligned with the FAISS vector ids
        with open(labels_path, 'r', encoding='utf-8') as json_file:
            self.docs = json.load(json_file)
        # sbert bi-encoder used to embed incoming queries
        self.encoder = SentenceTransformer(encoder_path)
        # pre-built faiss index over the document embeddings
        self.index = faiss.read_index(index_path)
        # sbert cross-encoder used for re-ranking
        self.cross_encoder = CrossEncoder(cross_encoder_path)

    def search(self, query: str, k: int) -> list:
        """Return the top-k documents for *query*, re-ranked by the cross-encoder.

        Over-retrieves 10*k candidates from the FAISS index, scores each
        (query, document) pair with the cross-encoder, and keeps the best k.

        Returns:
            List of dicts {'doc': str, 'url': str, 'score': float},
            sorted by descending score. Empty list if nothing was retrieved.
        """
        # vector representation of the text query
        query_vector = self.encoder.encode([query])
        # over-retrieve so the re-ranker has a candidate pool to choose from
        _, indices = self.index.search(query_vector, k * 10)
        # FAISS pads with -1 when the index holds fewer than k*10 vectors;
        # -1 would silently wrap to the last document, so drop those ids
        answers = [self.docs[i] for i in indices[0] if i >= 0]
        if not answers:
            return []
        # prepare (query, document) pairs for the cross-encoder
        model_inputs = [[query, pair[0]] for pair in answers]
        urls = [pair[1] for pair in answers]
        # similarity score between the query and each candidate document
        scores = self.cross_encoder.predict(model_inputs, batch_size=1)
        results = [
            {'doc': doc[1], 'url': url, 'score': score}
            for doc, url, score in zip(model_inputs, urls, scores)
        ]
        # best matches first, truncated to the requested k
        return sorted(results, key=lambda x: x['score'], reverse=True)[:k]
46
+
47
+
48
# Artifact locations: fine-tuned bi-encoder, FAISS index,
# fine-tuned cross-encoder, and the [text, url] document pairs.
enc_path = "msmarco-distilbert-dot-v5-tuned-full-v1"
idx_path = "idx_vectors.index"
cross_enc_path = "cross-encoder-ms-marco-MiniLM-L-12-v2-tuned_mediqa-v1"
docs_path = "docs.json"


@st.cache_resource
def _load_search_engine() -> DocumentSearch:
    """Build the DocumentSearch instance once per server process.

    Streamlit re-executes this script on every widget interaction;
    caching the resource avoids reloading both transformer models and
    the FAISS index on each rerun.
    """
    return DocumentSearch(
        labels_path=docs_path,
        encoder_path=enc_path,
        index_path=idx_path,
        cross_encoder_path=cross_enc_path,
    )


# get instance of DocumentSearch class (cached across reruns)
surfer = _load_search_engine()
59
+
60
+
61
if __name__ == "__main__":
    # Streamlit UI: page title, query box, and a search button.
    st.title('Medical Search')
    query = st.text_input("Enter any query about our data",
                          placeholder="Type query here...")
    # run the search only when the button is pressed
    if st.button("Search"):
        # time the retrieval so users see how long the search took
        start_time = time.time()
        results = surfer.search(query, k=5)
        elapsed_time = round(time.time() - start_time, 2)

        # container groups the answer list under one header line
        with st.container():
            # echo the query together with the elapsed search time
            st.write(f"**Results Related to:** {query} ({elapsed_time} sec.)")
            for rank, answer in enumerate(results, start=1):
                st.subheader(f"Answer {rank}")
                # 150-character preview plus a link to the full source
                preview = answer["doc"][:150] + "..."
                url = answer["url"]
                st.markdown(f"{preview}\n[**Read More**]({url})\n")