nickmuchi committed on
Commit
32730b3
·
1 Parent(s): 8499c35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -251
app.py CHANGED
@@ -10,6 +10,7 @@ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassifica
10
  from sentence_transformers import SentenceTransformer, CrossEncoder, util
11
  import streamlit as st
12
  import en_core_web_lg
 
13
 
14
  nltk.download('punkt')
15
 
@@ -50,259 +51,8 @@ upload_wav = st.file_uploader("Upload a .wav sound file ",key="upload")
50
  auth_token = os.environ.get("auth_token")
51
 
52
  progress_bar = st.sidebar.progress(0)
53
-
54
- @st.experimental_singleton(suppress_st_warning=True)
55
- def load_models():
56
- asr_model = whisper.load_model("small")
57
- q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
58
- ner_model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
59
- q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
60
- ner_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
61
- sent_pipe = pipeline("text-classification",model=q_model, tokenizer=q_tokenizer)
62
- sum_pipe = pipeline("summarization",model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
63
- ner_pip = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
64
- sbert = SentenceTransformer("all-mpnet-base-v2")
65
- cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
66
-
67
- return asr_model, sent_pipe, sum_pipe, ner_pipe, sbert, cross_encoder
68
-
69
- @st.experimental_singleton(suppress_st_warning=True)
70
- def get_spacy():
71
- nlp = en_core_web_lg.load()
72
- return nlp
73
 
74
  nlp = get_spacy()
75
  asr_model, sent_pipe, sum_pipe, ner_pipe, sbert, cross_encoder = load_models()
76
 
77
- @st.experimental_memo(suppress_st_warning=True)
78
- def inference(link, upload):
79
- '''Convert Youtube video or Audio upload to text'''
80
-
81
- if validators.url(link):
82
-
83
- yt = YouTube(link)
84
- title = yt.title
85
- path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
86
- options = whisper.DecodingOptions(without_timestamps=True)
87
- results = asr_model.transcribe(path)
88
-
89
- return results, yt.title
90
-
91
- elif upload:
92
- results = asr_model.transcribe(upload)
93
-
94
- return results, "Transcribed Earnings Audio"
95
-
96
- @st.experimental_memo(suppress_st_warning=True)
97
- def sentiment_pipe(earnings_text):
98
- '''Determine the sentiment of the text'''
99
-
100
- earnings_sentences = sent_tokenize(earnings_text)
101
- earnings_sentiment = sent_pipe(earnings_sentences)
102
-
103
- return earnings_sentiment, earnings_sentences
104
-
105
- @st.experimental_memo(suppress_st_warning=True)
106
- def preprocess_plain_text(text,window_size=3):
107
- '''Preprocess text for semantic search'''
108
-
109
- text = text.encode("ascii", "ignore").decode() # unicode
110
- text = re.sub(r"https*\S+", " ", text) # url
111
- text = re.sub(r"@\S+", " ", text) # mentions
112
- text = re.sub(r"#\S+", " ", text) # hastags
113
- text = re.sub(r"\s{2,}", " ", text) # over spaces
114
- #text = re.sub("[^.,!?%$A-Za-z0-9]+", " ", text) # special characters except .,!?
115
-
116
- #break into lines and remove leading and trailing space on each
117
- lines = [line.strip() for line in text.splitlines()]
118
-
119
- # #break multi-headlines into a line each
120
- chunks = [phrase.strip() for line in lines for phrase in line.split(" ")]
121
-
122
- # # drop blank lines
123
- text = '\n'.join(chunk for chunk in chunks if chunk)
124
-
125
- ## We split this article into paragraphs and then every paragraph into sentences
126
- paragraphs = []
127
- for paragraph in text.replace('\n',' ').split("\n\n"):
128
- if len(paragraph.strip()) > 0:
129
- paragraphs.append(sent_tokenize(paragraph.strip()))
130
-
131
- #We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size
132
- #Smaller value: Context from other sentences might get lost
133
- #Lager values: More context from the paragraph remains, but results are longer
134
- window_size = window_size
135
- passages = []
136
- for paragraph in paragraphs:
137
- for start_idx in range(0, len(paragraph), window_size):
138
- end_idx = min(start_idx+window_size, len(paragraph))
139
- passages.append(" ".join(paragraph[start_idx:end_idx]))
140
-
141
- print(f"Sentences: {sum([len(p) for p in paragraphs])}")
142
- print(f"Passages: {len(passages)}")
143
-
144
- return passages
145
-
146
- @st.experimental_memo(suppress_st_warning=True)
147
- def chunk_clean_text(text):
148
-
149
- """Chunk text longer than 500 tokens"""
150
-
151
- article = nlp(text)
152
- sentences = [i.text for i in list(article.sents)]
153
-
154
- current_chunk = 0
155
- chunks = []
156
-
157
- for sentence in sentences:
158
- if len(chunks) == current_chunk + 1:
159
- if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
160
- chunks[current_chunk].extend(sentence.split(" "))
161
- else:
162
- current_chunk += 1
163
- chunks.append(sentence.split(" "))
164
- else:
165
- chunks.append(sentence.split(" "))
166
-
167
- for chunk_id in range(len(chunks)):
168
- chunks[chunk_id] = " ".join(chunks[chunk_id])
169
-
170
- return chunks
171
-
172
- def summary_downloader(raw_text):
173
-
174
- b64 = base64.b64encode(raw_text.encode()).decode()
175
- new_filename = "new_text_file_{}_.txt".format(time_str)
176
- st.markdown("#### Download Summary as a File ###")
177
- href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
178
- st.markdown(href,unsafe_allow_html=True)
179
-
180
- def get_all_entities_per_sentence(text):
181
- doc = nlp(''.join(text))
182
-
183
- sentences = list(doc.sents)
184
-
185
- entities_all_sentences = []
186
- for sentence in sentences:
187
- entities_this_sentence = []
188
-
189
- # SPACY ENTITIES
190
- for entity in sentence.ents:
191
- entities_this_sentence.append(str(entity))
192
-
193
- # FLAIR ENTITIES (CURRENTLY NOT USED)
194
- # sentence_entities = Sentence(str(sentence))
195
- # tagger.predict(sentence_entities)
196
- # for entity in sentence_entities.get_spans('ner'):
197
- # entities_this_sentence.append(entity.text)
198
-
199
- # XLM ENTITIES
200
- entities_xlm = [entity["word"] for entity in ner_model(str(sentence))]
201
- for entity in entities_xlm:
202
- entities_this_sentence.append(str(entity))
203
-
204
- entities_all_sentences.append(entities_this_sentence)
205
-
206
- return entities_all_sentences
207
-
208
- def get_all_entities(text):
209
- all_entities_per_sentence = get_all_entities_per_sentence(text)
210
- return list(itertools.chain.from_iterable(all_entities_per_sentence))
211
-
212
- def get_and_compare_entities(article_content,summary_output):
213
-
214
- all_entities_per_sentence = get_all_entities_per_sentence(article_content)
215
- entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
216
-
217
- all_entities_per_sentence = get_all_entities_per_sentence(summary_output)
218
- entities_summary = list(itertools.chain.from_iterable(all_entities_per_sentence))
219
-
220
- matched_entities = []
221
- unmatched_entities = []
222
- for entity in entities_summary:
223
- if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
224
- matched_entities.append(entity)
225
- elif any(
226
- np.inner(sentence_embedding_model.encode(entity, show_progress_bar=False),
227
- sentence_embedding_model.encode(art_entity, show_progress_bar=False)) > 0.9 for
228
- art_entity in entities_article):
229
- matched_entities.append(entity)
230
- else:
231
- unmatched_entities.append(entity)
232
-
233
- matched_entities = list(dict.fromkeys(matched_entities))
234
- unmatched_entities = list(dict.fromkeys(unmatched_entities))
235
-
236
- matched_entities_to_remove = []
237
- unmatched_entities_to_remove = []
238
-
239
- for entity in matched_entities:
240
- for substring_entity in matched_entities:
241
- if entity != substring_entity and entity.lower() in substring_entity.lower():
242
- matched_entities_to_remove.append(entity)
243
-
244
- for entity in unmatched_entities:
245
- for substring_entity in unmatched_entities:
246
- if entity != substring_entity and entity.lower() in substring_entity.lower():
247
- unmatched_entities_to_remove.append(entity)
248
-
249
- matched_entities_to_remove = list(dict.fromkeys(matched_entities_to_remove))
250
- unmatched_entities_to_remove = list(dict.fromkeys(unmatched_entities_to_remove))
251
-
252
- for entity in matched_entities_to_remove:
253
- matched_entities.remove(entity)
254
- for entity in unmatched_entities_to_remove:
255
- unmatched_entities.remove(entity)
256
-
257
- return matched_entities, unmatched_entities
258
-
259
- def highlight_entities(article_content,summary_output):
260
-
261
- markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
262
- markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
263
- markdown_end = "</mark>"
264
-
265
- matched_entities, unmatched_entities = get_and_compare_entities(article_content,summary_output)
266
-
267
- print(summary_output)
268
-
269
- for entity in matched_entities:
270
- summary_output = re.sub(f'({entity})(?![^rgb\(]*\))',markdown_start_green + entity + markdown_end,summary_output)
271
-
272
- for entity in unmatched_entities:
273
- summary_output = re.sub(f'({entity})(?![^rgb\(]*\))',markdown_start_red + entity + markdown_end,summary_output)
274
-
275
- print("")
276
- print(summary_output)
277
-
278
- print("")
279
- print(summary_output)
280
-
281
- soup = BeautifulSoup(summary_output, features="html.parser")
282
-
283
- return HTML_WRAPPER.format(soup)
284
-
285
- nlp = get_spacy()
286
-
287
- def display_df_as_table(model,top_k,score='score'):
288
- '''Display the df with text and scores as a table'''
289
-
290
- df = pd.DataFrame([(hit[score],passages[hit['corpus_id']]) for hit in model[0:top_k]],columns=['Score','Text'])
291
- df['Score'] = round(df['Score'],2)
292
-
293
- return df
294
-
295
- def make_spans(text,results):
296
- results_list = []
297
- for i in range(len(results)):
298
- results_list.append(results[i]['label'])
299
- facts_spans = []
300
- facts_spans = list(zip(sent_tokenizer(text),results_list))
301
- return facts_spans
302
-
303
- ##Fiscal Sentiment by Sentence
304
- def fin_ext(text):
305
- results = remote_clx(sent_tokenizer(text))
306
- return make_spans(text,results)
307
-
308
  progress_bar.empty()
 
10
  from sentence_transformers import SentenceTransformer, CrossEncoder, util
11
  import streamlit as st
12
  import en_core_web_lg
13
+ from funtions import *
14
 
15
  nltk.download('punkt')
16
 
 
51
  auth_token = os.environ.get("auth_token")
52
 
53
  progress_bar = st.sidebar.progress(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  nlp = get_spacy()
56
  asr_model, sent_pipe, sum_pipe, ner_pipe, sbert, cross_encoder = load_models()
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  progress_bar.empty()