nickmuchi committed on
Commit
a6a7c7a
·
1 Parent(s): 2b5cb3b

Delete Home.py

Browse files
Files changed (1) hide show
  1. Home.py +0 -126
Home.py DELETED
@@ -1,126 +0,0 @@
1
- import whisper
2
- import os
3
- from pytube import YouTube
4
- import pandas as pd
5
- import plotly_express as px
6
- import nltk
7
- import plotly.graph_objects as go
8
- from optimum.onnxruntime import ORTModelForSequenceClassification
9
- from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
10
- from sentence_transformers import SentenceTransformer, CrossEncoder, util
11
- import streamlit as st
12
-
13
- nltk.download('punkt')
14
-
15
- from nltk import sent_tokenize
16
-
17
-
18
# Configure the Streamlit page (browser-tab title and icon).
st.set_page_config(
    page_title="Home",
    page_icon="📞",
)

# Value of the `auth_token` environment variable, or None when it is unset.
auth_token = os.environ.get("auth_token")
24
-
25
@st.experimental_singleton()
def load_models():
    '''Instantiate every model the app needs.

    The function is wrapped in `st.experimental_singleton`, so the heavy
    model objects are built by this function only when the decorator's cache
    does not already hold them.

    Returns:
        A 4-tuple `(asr_model, q_model, q_tokenizer, cross_encoder)`:
        the Whisper "small" speech-to-text model, the quantized FinBERT tone
        classifier with its tokenizer, and the MS MARCO cross-encoder.
    '''
    finbert_repo = "nickmuchi/quantized-optimum-finbert-tone"

    # Tuple elements are evaluated left to right, i.e. in the original load order.
    return (
        whisper.load_model("small"),
        ORTModelForSequenceClassification.from_pretrained(finbert_repo),
        AutoTokenizer.from_pretrained(finbert_repo),
        CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2'),
    )


# Module-level handles used by the functions below.
asr_model, q_model, q_tokenizer, cross_encoder = load_models()
35
-
36
def _is_http_url(link):
    '''Return True when *link* is a string with an http(s) scheme and a host.'''
    # Local import: the module has no file-level import providing URL checks.
    from urllib.parse import urlparse

    if not isinstance(link, str):
        return False
    try:
        parts = urlparse(link)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 host).
        return False
    return parts.scheme in ("http", "https") and bool(parts.netloc)


@st.experimental_memo(suppress_st_warning=True)
def inference(link, upload):
    '''Convert Youtube video or Audio upload to text

    Args:
        link: URL of a YouTube video. Takes precedence over `upload` when it
            is a well-formed http(s) URL.
        upload: Audio input handed directly to `asr_model.transcribe`; used
            only when `link` is not a valid URL.

    Returns:
        A `(results, title)` tuple, where `results` is whatever
        `asr_model.transcribe` returns and `title` is the video title (or the
        fixed string "Transcribed Earnings Audio" for uploads). Returns None
        when neither a valid link nor an upload is supplied.
    '''
    if _is_http_url(link):
        yt = YouTube(link)
        title = yt.title
        # Download the first audio-only stream to a fixed local file name.
        path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
        results = asr_model.transcribe(path)
        return results, title

    if upload:
        results = asr_model.transcribe(upload)
        return results, "Transcribed Earnings Audio"

    # Nothing to transcribe; kept as None for backward compatibility.
    return None
54
-
55
@st.experimental_memo(suppress_st_warning=True)
def sentiment_pipe(earnings_text):
    '''Classify the sentiment of every sentence in `earnings_text`.

    The text is split into sentences with `sent_tokenize` and each sentence
    is passed through a text-classification pipeline built from the
    module-level `q_model` / `q_tokenizer`.

    Returns:
        The pipeline output for the list of sentences.
    '''
    classifier = pipeline(
        "text-classification",
        model=q_model,
        tokenizer=q_tokenizer,
    )
    sentences = sent_tokenize(earnings_text)
    return classifier(sentences)
64
-
65
-
66
def preprocess_plain_text(text,window_size=3):
    '''Preprocess text for semantic search

    Non-ASCII characters are dropped, URLs, @mentions and #hashtags are
    blanked out, whitespace is normalised, and the cleaned text is split into
    sentences. Consecutive sentences are then joined `window_size` at a time.

    Args:
        text: Raw input text.
        window_size: Number of sentences combined into one passage. Smaller
            values may lose context from neighbouring sentences; larger
            values keep more context but produce longer passages.

    Returns:
        A list of passage strings.
    '''
    # Local import: `re` is used here but never imported at file level.
    import re

    text = text.encode("ascii", "ignore").decode()  # drop non-ASCII characters
    text = re.sub(r"https*\S+", " ", text)  # urls
    text = re.sub(r"@\S+", " ", text)  # mentions
    text = re.sub(r"#\S+", " ", text)  # hashtags
    text = re.sub(r"\s{2,}", " ", text)  # runs of whitespace

    # Strip every line, drop the blank ones and join the rest with single
    # spaces. The original split/join/replace sequence yields this same
    # string, so the whole text is always treated as one paragraph.
    stripped_lines = (line.strip() for line in text.splitlines())
    flat_text = " ".join(line for line in stripped_lines if line)

    sentences, passages = [], []
    if flat_text:
        sentences = sent_tokenize(flat_text)
        # Slicing clamps at the end of the list, so the last passage may be shorter.
        passages = [
            " ".join(sentences[start:start + window_size])
            for start in range(0, len(sentences), window_size)
        ]

    print(f"Sentences: {len(sentences)}")
    print(f"Passages: {len(passages)}")

    return passages
105
-
106
def display_df_as_table(model,top_k,score='score',corpus=None):
    '''Display the df with text and scores as a table

    Args:
        model: Sequence of hit dicts, each holding a `'corpus_id'` index and
            a value under the key named by `score`.
        top_k: Number of leading hits to include.
        score: Key of the score value inside each hit.
        corpus: Sequence of passages indexed by `'corpus_id'`. When omitted,
            the function falls back to a module-level `passages` variable,
            which it historically relied on.

    Returns:
        A DataFrame with columns `Score` (rounded to 2 decimals) and `Text`.
    '''
    # Resolve the global lazily so callers that pass `corpus` never need it.
    texts = passages if corpus is None else corpus

    rows = [(hit[score], texts[hit['corpus_id']]) for hit in model[:top_k]]
    df = pd.DataFrame(rows, columns=['Score', 'Text'])
    df['Score'] = df['Score'].round(2)

    return df
113
-
114
def make_spans(text,results):
    '''Pair each sentence of `text` with its predicted label.

    Args:
        text: The text that was classified; it is re-split with
            `sent_tokenize` so sentences line up with `results`.
        results: Sequence of dicts, one per sentence, each with a `'label'` key.

    Returns:
        A list of `(sentence, label)` tuples. `zip` stops at the shorter of
        the two sequences.
    '''
    labels = [result['label'] for result in results]
    return list(zip(sent_tokenize(text), labels))
121
-
122
##Fiscal Sentiment by Sentence
def fin_ext(text):
    '''Return `(sentence, label)` pairs for every sentence in `text`.

    Classification is delegated to `sentiment_pipe`, which tokenizes the
    text into sentences and runs the classification pipeline on them; the
    labels are then matched back to the sentences by `make_spans`.
    '''
    results = sentiment_pipe(text)
    return make_spans(text, results)
126
-