Update app.py
Browse files
@@ -10,6 +10,7 @@ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassifica
10 |
from sentence_transformers import SentenceTransformer, CrossEncoder, util
11 |
import streamlit as st
12 |
import en_core_web_lg
13 |
14 |
15 |
@@ -50,259 +51,8 @@ upload_wav = st.file_uploader("Upload a .wav sound file ",key="upload")
50 |
# Token read from the `auth_token` environment variable; None when the variable is unset.
auth_token = os.environ.get("auth_token")
51 |
52 |
# Progress bar placed in the Streamlit sidebar, starting at 0.
progress_bar = st.sidebar.progress(0)
53 |
54 |
55 |
def load_models():
    """Load every model and pipeline used by the app.

    Returns:
        tuple: ``(asr_model, sent_pipe, sum_pipe, ner_pipe, sbert, cross_encoder)``

        * ``asr_model`` -- Whisper "small" speech-to-text model.
        * ``sent_pipe`` -- text-classification pipeline built on the quantized
          ``nickmuchi/quantized-optimum-finbert-tone`` checkpoint.
        * ``sum_pipe`` -- summarization pipeline (``facebook/bart-large-cnn``).
        * ``ner_pipe`` -- NER pipeline built on
          ``xlm-roberta-large-finetuned-conll03-english`` with grouped entities.
        * ``sbert`` -- ``all-mpnet-base-v2`` sentence-embedding model.
        * ``cross_encoder`` -- ``cross-encoder/ms-marco-MiniLM-L-12-v2``.
    """
    sentiment_checkpoint = "nickmuchi/quantized-optimum-finbert-tone"
    ner_checkpoint = "xlm-roberta-large-finetuned-conll03-english"
    summary_checkpoint = "facebook/bart-large-cnn"

    asr_model = whisper.load_model("small")

    q_model = ORTModelForSequenceClassification.from_pretrained(sentiment_checkpoint)
    q_tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
    sent_pipe = pipeline("text-classification", model=q_model, tokenizer=q_tokenizer)

    sum_pipe = pipeline("summarization", model=summary_checkpoint, tokenizer=summary_checkpoint)

    ner_model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)
    ner_tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
    # Build the NER pipeline from the model/tokenizer loaded just above and
    # bind it to the name that is actually returned. The previous code passed
    # the undefined names `model`/`tokenizer` and assigned to `ner_pip`, so
    # the function raised NameError before it could return.
    ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)

    sbert = SentenceTransformer("all-mpnet-base-v2")
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

    return asr_model, sent_pipe, sum_pipe, ner_pipe, sbert, cross_encoder
68 |
69 |
70 |
def get_spacy():
    """Load the ``en_core_web_lg`` spaCy pipeline and return it."""
    return en_core_web_lg.load()
73 |
74 |
# Load the spaCy pipeline at import time; functions in this file read it as the global `nlp`.
nlp = get_spacy()
75 |
# Unpack the models and pipelines returned by load_models() into module-level names.
asr_model, sent_pipe, sum_pipe, ner_pipe, sbert, cross_encoder = load_models()
76 |
77 |
78 |
def inference(link, upload):
    '''Transcribe a YouTube video or an uploaded audio file with Whisper.

    Args:
        link: URL of a YouTube video. Used when it passes ``validators.url``.
        upload: Audio to transcribe when ``link`` is not a valid URL. Either a
            value Whisper accepts directly (for example a file path) or a
            file-like object exposing ``read()``/``getvalue()``, such as the
            object returned by ``st.file_uploader``.

    Returns:
        ``(results, title)`` where ``results`` is whatever
        ``asr_model.transcribe`` returns and ``title`` is the video title, or
        the fixed string "Transcribed Earnings Audio" for uploads.
        ``None`` when ``link`` is not a URL and ``upload`` is falsy.
    '''
    import os
    import tempfile

    if validators.url(link):
        yt = YouTube(link)
        title = yt.title  # fetched once and reused for the return value
        path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
        return asr_model.transcribe(path), title

    if upload:
        if not hasattr(upload, "read"):
            # Already something Whisper understands (e.g. a path string).
            results = asr_model.transcribe(upload)
        else:
            # Whisper's transcribe() takes a path or an array, not a file
            # object, so spill the uploaded bytes to a temporary file first.
            data = upload.getvalue() if hasattr(upload, "getvalue") else upload.read()
            suffix = os.path.splitext(getattr(upload, "name", "") or "")[1] or ".wav"
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                tmp.write(data)
                tmp_path = tmp.name
            try:
                results = asr_model.transcribe(tmp_path)
            finally:
                os.remove(tmp_path)

        return results, "Transcribed Earnings Audio"

    # Neither a valid link nor an upload: keep the previous "no result" value.
    return None
95 |
96 |
97 |
def sentiment_pipe(earnings_text):
    '''Score the sentiment of every sentence in the text.

    Returns a pair: the output of ``sent_pipe`` for the sentence list, and the
    sentence list itself as produced by ``sent_tokenize``.
    '''
    sentences = sent_tokenize(earnings_text)
    return sent_pipe(sentences), sentences
104 |
105 |
106 |
# Clean `text` and group it into passages of up to `window_size` items for semantic search.
# Returns the list of passages (each passage is the items of one window joined by a space).
def preprocess_plain_text(text,window_size=3):
106 |
'''Preprocess text for semantic search'''
107 |
108 |
text = text.encode("ascii", "ignore").decode() # drop every non-ASCII character
109 |
text = re.sub(r"https*\S+", " ", text) # url
110 |
text = re.sub(r"@\S+", " ", text) # mentions
111 |
text = re.sub(r"#\S+", " ", text) # hashtags
112 |
text = re.sub(r"\s{2,}", " ", text) # runs of 2+ whitespace characters (including consecutive newlines) become one space
113 |
#text = re.sub("[^.,!?%$A-Za-z0-9]+", " ", text) # special characters except .,!?
114 |
115 |
#break into lines and remove leading and trailing space on each
116 |
lines = [line.strip() for line in text.splitlines()]
117 |
118 |
# break each line into pieces on the separator passed to split() and strip each piece
119 |
chunks = [phrase.strip() for line in lines for phrase in line.split(" ")]
120 |
121 |
# drop empty pieces and re-join the rest, one per line
122 |
text = '\n'.join(chunk for chunk in chunks if chunk)
123 |
124 |
## We split this article into paragraphs and then every paragraph into sentences
# NOTE(review): every "\n" is replaced by a space before split("\n\n"), so the split
# cannot find a separator and the loop below sees the whole text as a single paragraph.
125 |
paragraphs = []
126 |
for paragraph in text.replace('\n',' ').split("\n\n"):
127 |
if len(paragraph.strip()) > 0:
# NOTE(review): no statement is visible under this `if` in this listing, and nothing
# else appends to `paragraphs`; the line that fills it appears to be missing - confirm
# against the real file.
128 |
129 |
130 |
#We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size
131 |
#Smaller value: Context from other sentences might get lost
132 |
#Larger values: More context from the paragraph remains, but results are longer
133 |
# self-assignment; has no effect
window_size = window_size
134 |
passages = []
135 |
for paragraph in paragraphs:
136 |
# walk each paragraph in steps of window_size; the last window may be shorter
for start_idx in range(0, len(paragraph), window_size):
137 |
end_idx = min(start_idx+window_size, len(paragraph))
138 |
passages.append(" ".join(paragraph[start_idx:end_idx]))
139 |
140 |
# report the total item count over all paragraphs and the number of passages built
print(f"Sentences: {sum([len(p) for p in paragraphs])}")
141 |
print(f"Passages: {len(passages)}")
142 |
143 |
return passages
145 |
146 |
147 |
def chunk_clean_text(text):
    """Split *text* into chunks of at most about 500 space-separated words.

    The text is segmented into sentences with the global spaCy pipeline
    ``nlp``. Sentences are packed greedily: a sentence is added to the current
    chunk while the chunk stays at or below 500 words, otherwise it starts a
    new chunk. A sentence is never split, so a single sentence longer than
    500 words forms an oversized chunk of its own.

    Args:
        text: The text to chunk.

    Returns:
        list[str]: The chunks, each a space-joined run of whole sentences.
        Empty when spaCy finds no sentences.
    """
    max_words = 500
    chunks = []

    for sentence in nlp(text).sents:
        words = sentence.text.split(" ")
        if chunks and len(chunks[-1]) + len(words) <= max_words:
            # Still room in the current chunk.
            chunks[-1].extend(words)
        else:
            # First sentence, or the current chunk would overflow.
            chunks.append(words)

    return [" ".join(chunk) for chunk in chunks]
171 |
172 |
# Build a download link for `raw_text` as a base64 data URI and show a heading for it.
def summary_downloader(raw_text):
173 |
174 |
# base64-encode the text so it can be embedded in a data: URI
b64 = base64.b64encode(raw_text.encode()).decode()
175 |
# NOTE(review): `time_str` is not defined anywhere in this listing; presumably a
# module-level timestamp string - confirm.
new_filename = "new_text_file_{}_.txt".format(time_str)
176 |
st.markdown("#### Download Summary as a File ###")
177 |
# NOTE(review): in this listing `href` is built but never rendered or returned;
# the statement that uses it appears to be missing - confirm against the real file.
href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
178 |
179 |
180 |
# Collect named entities sentence by sentence and return `entities_all_sentences`.
# Callers in this file flatten the result with itertools.chain.from_iterable, so it is
# expected to be a list of per-sentence lists.
def get_all_entities_per_sentence(text):
181 |
# `text` is joined with '' first, so it may be a string or an iterable of strings
doc = nlp(''.join(text))
182 |
183 |
sentences = list(doc.sents)
184 |
185 |
entities_all_sentences = []
186 |
for sentence in sentences:
187 |
entities_this_sentence = []
188 |
189 |
190 |
# entities spaCy found inside this sentence
# NOTE(review): the body of this loop is not visible in this listing; presumably it
# adds each entity to `entities_this_sentence` - confirm.
for entity in sentence.ents:
191 |
192 |
193 |
194 |
# sentence_entities = Sentence(str(sentence))
195 |
# tagger.predict(sentence_entities)
196 |
# for entity in sentence_entities.get_spans('ner'):
197 |
# entities_this_sentence.append(entity.text)
198 |
199 |
200 |
# second pass: take the "word" field of every result from the NER model
# NOTE(review): `ner_model` is only a local variable of load_models() in this listing;
# the module-level NER pipeline is named `ner_pipe` - confirm which one is intended.
entities_xlm = [entity["word"] for entity in ner_model(str(sentence))]
201 |
# NOTE(review): loop body not visible in this listing, and nothing visible appends
# `entities_this_sentence` to `entities_all_sentences` - confirm against the real file.
for entity in entities_xlm:
202 |
203 |
204 |
205 |
206 |
return entities_all_sentences
207 |
208 |
def get_all_entities(text):
    """Return all entities of *text* as one flat list.

    Concatenates, in order, the per-sentence entity lists produced by
    ``get_all_entities_per_sentence``.
    """
    flat_entities = []
    for sentence_entities in get_all_entities_per_sentence(text):
        flat_entities.extend(sentence_entities)
    return flat_entities
211 |
212 |
# Compare the entities found in `summary_output` with those found in `article_content`.
# Returns the pair (matched_entities, unmatched_entities).
def get_and_compare_entities(article_content,summary_output):
213 |
214 |
# flat list of entities from the article
all_entities_per_sentence = get_all_entities_per_sentence(article_content)
215 |
entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
216 |
217 |
# flat list of entities from the summary
all_entities_per_sentence = get_all_entities_per_sentence(summary_output)
218 |
entities_summary = list(itertools.chain.from_iterable(all_entities_per_sentence))
219 |
220 |
matched_entities = []
221 |
unmatched_entities = []
222 |
for entity in entities_summary:
223 |
# first test: the summary entity is a case-insensitive substring of some article entity
# NOTE(review): no statements are visible under this `if` / `elif` in this listing;
# presumably they append to matched_entities / unmatched_entities - confirm.
if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
224 |
225 |
# second test: inner product of the two embeddings exceeds 0.9 for some article entity
# NOTE(review): `sentence_embedding_model` is not defined in this listing (load_models
# returns the sentence encoder as `sbert`) - confirm the intended name.
elif any(
226 |
np.inner(sentence_embedding_model.encode(entity, show_progress_bar=False),
227 |
sentence_embedding_model.encode(art_entity, show_progress_bar=False)) > 0.9 for
228 |
art_entity in entities_article):
229 |
230 |
231 |
232 |
233 |
# remove duplicates while keeping first-seen order
matched_entities = list(dict.fromkeys(matched_entities))
234 |
unmatched_entities = list(dict.fromkeys(unmatched_entities))
235 |
236 |
matched_entities_to_remove = []
237 |
unmatched_entities_to_remove = []
238 |
239 |
# find entities that are a case-insensitive substring of a different entity in the same list
# NOTE(review): the bodies of these `if` statements are not visible in this listing;
# presumably they record `entity` in the matching *_to_remove list - confirm.
for entity in matched_entities:
240 |
for substring_entity in matched_entities:
241 |
if entity != substring_entity and entity.lower() in substring_entity.lower():
242 |
243 |
244 |
for entity in unmatched_entities:
245 |
for substring_entity in unmatched_entities:
246 |
if entity != substring_entity and entity.lower() in substring_entity.lower():
247 |
248 |
249 |
matched_entities_to_remove = list(dict.fromkeys(matched_entities_to_remove))
250 |
unmatched_entities_to_remove = list(dict.fromkeys(unmatched_entities_to_remove))
251 |
252 |
# NOTE(review): loop bodies not visible in this listing; presumably each entity is
# removed from matched_entities / unmatched_entities respectively - confirm.
for entity in matched_entities_to_remove:
253 |
254 |
for entity in unmatched_entities_to_remove:
255 |
256 |
257 |
return matched_entities, unmatched_entities
258 |
259 |
def highlight_entities(article_content,summary_output):
    """Return the summary as HTML with its entities colour-highlighted.

    Entities reported as matched by ``get_and_compare_entities`` are wrapped
    in a green ``<mark>``, unmatched ones in a red ``<mark>``. The marked-up
    text is parsed with BeautifulSoup and inserted into ``HTML_WRAPPER``.

    Args:
        article_content: Source text the summary was produced from.
        summary_output: Summary text to annotate.

    Returns:
        str: ``HTML_WRAPPER`` formatted with the annotated summary.
    """
    markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
    markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
    markdown_end = "</mark>"

    matched_entities, unmatched_entities = get_and_compare_entities(article_content,summary_output)

    def _mark(text, entity, start_tag):
        # Wrap every occurrence of `entity` in `text` with start_tag ... </mark>.
        # The entity is escaped so characters such as ".", "+" or "(" are
        # matched literally instead of being read as regex syntax. The
        # negative lookahead (unchanged) skips an occurrence when a ")"
        # follows with none of the characters r, g, b or "(" in between.
        pattern = rf'({re.escape(entity)})(?![^rgb\(]*\))'
        replacement = start_tag + entity + markdown_end
        # A callable replacement is inserted verbatim, so backslashes or
        # "\1"-style sequences inside an entity are not treated as templates.
        return re.sub(pattern, lambda _match: replacement, text)

    for entity in matched_entities:
        summary_output = _mark(summary_output, entity, markdown_start_green)

    for entity in unmatched_entities:
        summary_output = _mark(summary_output, entity, markdown_start_red)

    soup = BeautifulSoup(summary_output, features="html.parser")

    return HTML_WRAPPER.format(soup)
284 |
285 |
# NOTE(review): second call to get_spacy(); `nlp` is already assigned the same way earlier in this listing.
nlp = get_spacy()
286 |
287 |
def display_df_as_table(model,top_k,score='score'):
    '''Build a two-column table (Score, Text) from the first top_k hits.

    Each hit contributes the value stored under the ``score`` key and the
    entry of the global ``passages`` indexed by the hit's ``corpus_id``.
    Scores are rounded to two decimals.
    '''
    rows = []
    for hit in model[0:top_k]:
        rows.append((hit[score], passages[hit['corpus_id']]))

    table = pd.DataFrame(rows, columns=['Score','Text'])
    table['Score'] = round(table['Score'],2)
    return table
294 |
295 |
# Pair the pieces returned by sent_tokenizer(text) with the entries of `results_list`
# and return the pairs as a list of tuples.
def make_spans(text,results):
296 |
results_list = []
297 |
# NOTE(review): the body of this loop is not visible in this listing, so `results_list`
# is never filled here; presumably one value per entry of `results` is appended - confirm.
for i in range(len(results)):
298 |
299 |
# this empty list is replaced by the assignment on the next statement
facts_spans = []
300 |
# NOTE(review): other code in this file calls `sent_tokenize`; `sent_tokenizer` is not
# defined in this listing - confirm the name.
facts_spans = list(zip(sent_tokenizer(text),results_list))
301 |
return facts_spans
302 |
303 |
##Fiscal Sentiment by Sentence
304 |
def fin_ext(text):
    """Run ``remote_clx`` over the tokenized text and pair the output with it.

    The pieces returned by ``sent_tokenizer(text)`` are passed to
    ``remote_clx``; the result is combined with ``text`` by ``make_spans``.
    """
    pieces = sent_tokenizer(text)
    results = remote_clx(pieces)
    return make_spans(text,results)
307 |
308 |
10 |
from sentence_transformers import SentenceTransformer, CrossEncoder, util
11 |
import streamlit as st
12 |
import en_core_web_lg
13 |
from funtions import *
14 |
15 |
16 |
51 |
# Token read from the `auth_token` environment variable; None when the variable is unset.
auth_token = os.environ.get("auth_token")
52 |
53 |
# Progress bar placed in the Streamlit sidebar, starting at 0.
progress_bar = st.sidebar.progress(0)
54 |
55 |
# get_spacy() and load_models() are not defined in this part of the listing; presumably
# they come from the `from funtions import *` line above.
# NOTE(review): the module name is spelled `funtions` - confirm it matches the file on disk.
nlp = get_spacy()
56 |
asr_model, sent_pipe, sum_pipe, ner_pipe, sbert, cross_encoder = load_models()
57 |
58 |