Spaces:

nickmuchi
/

Earnings-Call-Analysis-Whisperer

Running

App Files Files Community

nickmuchi committed on Jan 4, 2023

Commit

e429024

1 Parent(s): 446f9c9

Update functions.py

Browse files

Files changed (1) hide show

functions.py +56 -0

functions.py CHANGED Viewed

@@ -52,6 +52,62 @@ def load_sbert(model_name):
     return sbert
 @st.experimental_singleton(suppress_st_warning=True)
 def get_spacy():
     '''Load the en_core_web_lg pipeline and return it'''
     nlp = en_core_web_lg.load()

     # Return the pipeline that was just loaded; the previous `return sbert`
     # handed back an unrelated name instead of `nlp`.
     return nlp
+@st.experimental_memo(suppress_st_warning=True)
+def embed_text(query, corpus, embedding_model, top_k=2):
+    '''Embed text and generate semantic search scores
+
+    The query and every passage are encoded with `sbert`, the closest
+    passages are retrieved with `util.semantic_search`, and those hits are
+    then re-scored with `cross_encoder`. Both `sbert` and `cross_encoder`
+    are not parameters; they are expected to exist in the enclosing module.
+
+    Args:
+        query: question text to search for.
+        corpus: sequence of passage strings to search over.
+        embedding_model: model name; selects how the inputs are wrapped
+            before encoding ('intfloat/e5-base' and 'hkunlp/instructor-base'
+            get special handling, anything else is encoded as-is).
+        top_k: number of passages retrieved before re-ranking (default 2,
+            the value that was previously hard-coded).
+
+    Returns:
+        The hit dicts produced by `util.semantic_search` for the query
+        (each carries a 'corpus_id' indexing into `corpus`), with an added
+        'cross-score' entry, sorted by 'cross-score' in descending order.
+        An empty list when `corpus` is empty.
+    '''
+    # Nothing to search over: skip both encoders.
+    if len(corpus) == 0:
+        return []
+    # If model is e5 then apply prefixes to query and passage;
+    # instructor models take [instruction, text, 0] triples instead.
+    if embedding_model == 'intfloat/e5-base':
+        search_input = 'query: ' + query
+        passages_emb = ['passage: ' + sentence for sentence in corpus]
+    elif embedding_model == 'hkunlp/instructor-base':
+        search_input = [['Represent the Financial question for retrieving supporting documents; Input: ', query, 0]]
+        passages_emb = [['Represent the Financial document for retrieval; Input: ', sentence, 0] for sentence in corpus]
+    else:
+        search_input = query
+        passages_emb = corpus
+    # Embed corpus and question, then move both to CPU for the search
+    corpus_embedding = sbert.encode(passages_emb, convert_to_tensor=True).cpu()
+    question_embedding = sbert.encode(search_input, convert_to_tensor=True).cpu()
+    # Calculate similarity scores and rank
+    hits = util.semantic_search(question_embedding, corpus_embedding, top_k=top_k)
+    hits = hits[0]  # Get the hits for the first query
+    # ##### Re-Ranking #####
+    # Score all retrieved passages with the cross_encoder. The pairs are
+    # built from the raw `query` and `corpus` text: the earlier code indexed
+    # an undefined `passages` name here, and the model-specific wrapping
+    # above is only meant for the bi-encoder.
+    # NOTE(review): for e5 this means the cross-encoder no longer sees the
+    # 'query: ' / 'passage: ' prefixes -- confirm that is the desired input.
+    cross_inp = [[query, corpus[hit['corpus_id']]] for hit in hits]
+    cross_scores = cross_encoder.predict(cross_inp)
+    # Attach each cross-encoder score to its hit
+    for hit, score in zip(hits, cross_scores):
+        hit['cross-score'] = score
+    # Sort results by the cross-encoder scores, best first
+    return sorted(hits, key=lambda hit: hit['cross-score'], reverse=True)
 @st.experimental_singleton(suppress_st_warning=True)
 def get_spacy():
     nlp = en_core_web_lg.load()