Spaces:

nickmuchi
/

Earnings-Call-Analysis-Whisperer

Running

App Files Files Community

nickmuchi committed on May 12, 2023

Commit

47afd47

•

1 Parent(s): 4070bba

Update functions.py

Browse files

Files changed (1) hide show

functions.py +179 -171

functions.py CHANGED Viewed

@@ -139,6 +139,15 @@ def load_models():
     return sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer, sbert
 @st.cache_data
 def get_yt_audio(url):
@@ -161,6 +170,14 @@ def load_whisper_api(audio):
     return transcript
 def inference(link, upload, _asr_model):
     '''Convert Youtube video or Audio upload to text'''
@@ -257,19 +274,53 @@ def inference(link, upload, _asr_model):
         return results['text'], title
 @st.cache_data
-def process_corpus(corpus, title, embedding_model, chunk_size=1000, overlap=50):
-    '''Process text for Semantic Search'''
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=overlap)
-    texts = text_splitter.split_text(corpus)
-    embeddings = gen_embeddings(embedding_model)
-    vectorstore = FAISS.from_texts(texts, embeddings, metadatas=[{"source": i} for i in range(len(texts))])
-    return vectorstore
 @st.cache_data
 def chunk_and_preprocess_text(text,thresh=500):
@@ -296,114 +347,6 @@ def chunk_and_preprocess_text(text,thresh=500):
         chunks[chunk_id] = " ".join(chunks[chunk_id])
     return chunks
-@st.cache_resource
-def gen_embeddings(embedding_model):
-    '''Generate embeddings for given model'''
-    if 'hkunlp' in embedding_model:
-        embeddings = HuggingFaceInstructEmbeddings(model_name=embedding_model,
-                                           query_instruction='Represent the Financial question for retrieving supporting paragraphs: ',
-                                           embed_instruction='Represent the Financial paragraph for retrieval: ')
-    else:
-        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
-    return embeddings
-def embed_text(query,embedding_model,_docsearch):
-    '''Embed text and generate semantic search scores'''
-    # llm = OpenAI(temperature=0)
-    chat_llm = ChatOpenAI(streaming=True,
-                          model_name = 'gpt-4',
-                          callbacks=[StdOutCallbackHandler()],
-                          verbose=True,
-                          temperature=0
-                         )
-    # chain = RetrievalQA.from_chain_type(llm=chat_llm, chain_type="stuff",
-    #                              retriever=_docsearch.as_retriever(),
-    #                              return_source_documents=True)
-    chain = ConversationalRetrievalChain.from_llm(chat_llm,
-                                                  retriever= _docsearch.as_retriever(search_kwargs={"k": 3}),
-                                                  get_chat_history=lambda h : h,
-                                                  memory = memory,
-                                                  return_source_documents=True)
-    chain.combine_docs_chain.llm_chain.prompt.messages[0] = load_prompt()
-    answer = chain({"question": query})
-    return answer
-@st.cache_data
-def gen_sentiment(text):
-    '''Generate sentiment of given text'''
-    return sent_pipe(text)[0]['label']
-@st.cache_data
-def gen_annotated_text(df):
-    '''Generate annotated text'''
-    tag_list=[]
-    for row in df.itertuples():
-        label = row[2]
-        text = row[1]
-        if label == 'Positive':
-            tag_list.append((text,label,'#8fce00'))
-        elif label == 'Negative':
-            tag_list.append((text,label,'#f44336'))
-        else:
-            tag_list.append((text,label,'#000000'))
-    return tag_list
-@st.cache_data
-def generate_eval(raw_text, N, chunk):
-    # Generate N questions from context of chunk chars
-    # IN: text, N questions, chunk size to draw question from in the doc
-    # OUT: eval set as JSON list
-    # raw_text = ','.join(raw_text)
-    st.info("`Generating sample questions ...`")
-    n = len(raw_text)
-    starting_indices = [random.randint(0, n-chunk) for _ in range(N)]
-    sub_sequences = [raw_text[i:i+chunk] for i in starting_indices]
-    chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
-    eval_set = []
-    for i, b in enumerate(sub_sequences):
-        try:
-            qa = chain.run(b)
-            eval_set.append(qa)
-            st.write("Creating Question:",i+1)
-        except Exception as e:
-            st.warning('Error generating question %s.' % str(i+1), icon="⚠️")
-            #st.write(e)
-    eval_set_full = list(itertools.chain.from_iterable(eval_set))
-    return eval_set_full
-@st.cache_resource
-def get_spacy():
-    nlp = en_core_web_lg.load()
-    return nlp
-@st.cache_data
-def sentiment_pipe(earnings_text):
-    '''Determine the sentiment of the text'''
-    earnings_sentences = chunk_long_text(earnings_text,150,1,1)
-    earnings_sentiment = sent_pipe(earnings_sentences)
-    return earnings_sentiment, earnings_sentences
 @st.cache_data
 def summarize_text(text_to_summarize,max_len,min_len):
@@ -416,56 +359,7 @@ def summarize_text(text_to_summarize,max_len,min_len):
            early_stopping=True)
     summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])
-    return summarized_text
-@st.cache_data
-def clean_text(text):
-    '''Clean all text'''
-    text = text.encode("ascii", "ignore").decode()  # unicode
-    text = re.sub(r"https*\S+", " ", text)  # url
-    text = re.sub(r"@\S+", " ", text)  # mentions
-    text = re.sub(r"#\S+", " ", text)  # hastags
-    text = re.sub(r"\s{2,}", " ", text)  # over spaces
-    return text
-@st.cache_data
-def chunk_long_text(text,threshold,window_size=3,stride=2):
-    '''Preprocess text and chunk for sentiment analysis'''
-    #Convert cleaned text into sentences
-    sentences = sent_tokenize(text)
-    out = []
-    #Limit the length of each sentence to a threshold
-    for chunk in sentences:
-        if len(chunk.split()) < threshold:
-            out.append(chunk)
-        else:
-            words = chunk.split()
-            num = int(len(words)/threshold)
-            for i in range(0,num*threshold+1,threshold):
-                out.append(' '.join(words[i:threshold+i]))
-    passages = []
-    #Combine sentences into a window of size window_size
-    for paragraph in [out]:
-        for start_idx in range(0, len(paragraph), stride):
-            end_idx = min(start_idx+window_size, len(paragraph))
-            passages.append(" ".join(paragraph[start_idx:end_idx]))
-    return passages
-def summary_downloader(raw_text):
-	b64 = base64.b64encode(raw_text.encode()).decode()
-	new_filename = "new_text_file_{}_.txt".format(time_str)
-	st.markdown("#### Download Summary as a File ###")
-	href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
-	st.markdown(href,unsafe_allow_html=True)
 @st.cache_data
 def get_all_entities_per_sentence(text):
@@ -489,7 +383,7 @@ def get_all_entities_per_sentence(text):
         entities_all_sentences.append(entities_this_sentence)
     return entities_all_sentences
 @st.cache_data
 def get_all_entities(text):
     all_entities_per_sentence = get_all_entities_per_sentence(text)
@@ -569,6 +463,124 @@ def highlight_entities(article_content,summary_output):
     soup = BeautifulSoup(summary_output, features="html.parser")
     return HTML_WRAPPER.format(soup)
 def display_df_as_table(model,top_k,score='score'):
@@ -909,7 +921,3 @@ def save_network_html(kb, filename="network.html"):
     )
     net.set_edge_smooth('dynamic')
     net.show(filename)
-nlp = get_spacy()
-sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer, sbert  = load_models()

     return sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer, sbert
+@st.cache_resource
+def get_spacy():
+    nlp = en_core_web_lg.load()
+    return nlp
+nlp = get_spacy()
+sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer, sbert  = load_models()
 @st.cache_data
 def get_yt_audio(url):
     return transcript
+@st.cache_data
+def load_asr_model(model_name):
+    '''Load the open source  whisper model in cases where the API is not working'''
+    model = whisper.load_model(model_name)
+    return model
 def inference(link, upload, _asr_model):
     '''Convert Youtube video or Audio upload to text'''
         return results['text'], title
 @st.cache_data
+def clean_text(text):
+    '''Clean all text after inference'''
+    text = text.encode("ascii", "ignore").decode()  # unicode
+    text = re.sub(r"https*\S+", " ", text)  # url
+    text = re.sub(r"@\S+", " ", text)  # mentions
+    text = re.sub(r"#\S+", " ", text)  # hastags
+    text = re.sub(r"\s{2,}", " ", text)  # over spaces
+    return text
+@st.cache_data
+def chunk_long_text(text,threshold,window_size=3,stride=2):
+    '''Preprocess text and chunk for sentiment analysis'''
+    #Convert cleaned text into sentences
+    sentences = sent_tokenize(text)
+    out = []
+    #Limit the length of each sentence to a threshold
+    for chunk in sentences:
+        if len(chunk.split()) < threshold:
+            out.append(chunk)
+        else:
+            words = chunk.split()
+            num = int(len(words)/threshold)
+            for i in range(0,num*threshold+1,threshold):
+                out.append(' '.join(words[i:threshold+i]))
+    passages = []
+    #Combine sentences into a window of size window_size
+    for paragraph in [out]:
+        for start_idx in range(0, len(paragraph), stride):
+            end_idx = min(start_idx+window_size, len(paragraph))
+            passages.append(" ".join(paragraph[start_idx:end_idx]))
+    return passages
+@st.cache_data
+def sentiment_pipe(earnings_text):
+    '''Determine the sentiment of the text'''
+    earnings_sentences = chunk_long_text(earnings_text,150,1,1)
+    earnings_sentiment = sent_pipe(earnings_sentences)
+    return earnings_sentiment, earnings_sentences
 @st.cache_data
 def chunk_and_preprocess_text(text,thresh=500):
         chunks[chunk_id] = " ".join(chunks[chunk_id])
     return chunks
 @st.cache_data
 def summarize_text(text_to_summarize,max_len,min_len):
            early_stopping=True)
     summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])
+    return summarized_text
 @st.cache_data
 def get_all_entities_per_sentence(text):
         entities_all_sentences.append(entities_this_sentence)
     return entities_all_sentences
 @st.cache_data
 def get_all_entities(text):
     all_entities_per_sentence = get_all_entities_per_sentence(text)
     soup = BeautifulSoup(summary_output, features="html.parser")
     return HTML_WRAPPER.format(soup)
+def summary_downloader(raw_text):
+    '''Download the summary generated'''
+	b64 = base64.b64encode(raw_text.encode()).decode()
+	new_filename = "new_text_file_{}_.txt".format(time_str)
+	st.markdown("#### Download Summary as a File ###")
+	href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
+	st.markdown(href,unsafe_allow_html=True)
+@st.cache_data
+def generate_eval(raw_text, N, chunk):
+    # Generate N questions from context of chunk chars
+    # IN: text, N questions, chunk size to draw question from in the doc
+    # OUT: eval set as JSON list
+    # raw_text = ','.join(raw_text)
+    st.info("`Generating sample questions ...`")
+    n = len(raw_text)
+    starting_indices = [random.randint(0, n-chunk) for _ in range(N)]
+    sub_sequences = [raw_text[i:i+chunk] for i in starting_indices]
+    chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
+    eval_set = []
+    for i, b in enumerate(sub_sequences):
+        try:
+            qa = chain.run(b)
+            eval_set.append(qa)
+            st.write("Creating Question:",i+1)
+        except Exception as e:
+            st.warning('Error generating question %s.' % str(i+1), icon="⚠️")
+            #st.write(e)
+    eval_set_full = list(itertools.chain.from_iterable(eval_set))
+    return eval_set_full
+@st.cache_resource
+def gen_embeddings(embedding_model):
+    '''Generate embeddings for given model'''
+    if 'hkunlp' in embedding_model:
+        embeddings = HuggingFaceInstructEmbeddings(model_name=embedding_model,
+                                           query_instruction='Represent the Financial question for retrieving supporting paragraphs: ',
+                                           embed_instruction='Represent the Financial paragraph for retrieval: ')
+    else:
+        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+    return embeddings
+@st.cache_data
+def process_corpus(corpus, title, embedding_model, chunk_size=1000, overlap=50):
+    '''Process text for Semantic Search'''
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=overlap)
+    texts = text_splitter.split_text(corpus)
+    embeddings = gen_embeddings(embedding_model)
+    vectorstore = FAISS.from_texts(texts, embeddings, metadatas=[{"source": i} for i in range(len(texts))])
+    return vectorstore
+def embed_text(query,_docsearch):
+    '''Embed text and generate semantic search scores'''
+    # llm = OpenAI(temperature=0)
+    chat_llm = ChatOpenAI(streaming=True,
+                          model_name = 'gpt-4',
+                          callbacks=[StdOutCallbackHandler()],
+                          verbose=True,
+                          temperature=0
+                         )
+    # chain = RetrievalQA.from_chain_type(llm=chat_llm, chain_type="stuff",
+    #                              retriever=_docsearch.as_retriever(),
+    #                              return_source_documents=True)
+    chain = ConversationalRetrievalChain.from_llm(chat_llm,
+                                                  retriever= _docsearch.as_retriever(search_kwargs={"k": 3}),
+                                                  get_chat_history=lambda h : h,
+                                                  memory = memory,
+                                                  return_source_documents=True)
+    chain.combine_docs_chain.llm_chain.prompt.messages[0] = load_prompt()
+    answer = chain({"question": query})
+    return answer
+@st.cache_data
+def gen_sentiment(text):
+    '''Generate sentiment of given text'''
+    return sent_pipe(text)[0]['label']
+@st.cache_data
+def gen_annotated_text(df):
+    '''Generate annotated text'''
+    tag_list=[]
+    for row in df.itertuples():
+        label = row[2]
+        text = row[1]
+        if label == 'Positive':
+            tag_list.append((text,label,'#8fce00'))
+        elif label == 'Negative':
+            tag_list.append((text,label,'#f44336'))
+        else:
+            tag_list.append((text,label,'#000000'))
+    return tag_list
 def display_df_as_table(model,top_k,score='score'):
     )
     net.set_edge_smooth('dynamic')
     net.show(filename)