# Page header captured together with the file (not part of the program):
# jmaciejowski's picture | Update app.py | 91e4ae1 verified | raw | history blame | 5.47 kB
# Gradio Application Interface
import functools
import os
import re

import gensim
import gradio as gr
import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from transformers import pipeline
@functools.lru_cache(maxsize=None)
def summarizer_func():
    """Return the pipeline used to summarise article text.

    The pipeline is built on the first call and the same instance is handed
    back on every later call, so the model is not reloaded for each request.
    No task name is passed to ``pipeline``; the task is therefore whatever
    transformers resolves for the given model.

    Returns:
        The transformers pipeline configured with the
        ``Majon911/pegasus_multi_news_ep1`` model, the
        ``google/pegasus-xsum`` tokenizer, an output length between 100 and
        200, and input truncation enabled.
    """
    return pipeline(
        model="Majon911/pegasus_multi_news_ep1",
        tokenizer="google/pegasus-xsum",
        min_length=100,
        max_length=200,
        truncation=True,
    )
@functools.lru_cache(maxsize=None)
def sentiment_func():
    """Return the text-classification pipeline used for sentiment analysis.

    The pipeline is built on the first call and the same instance is handed
    back on every later call, so the model is not reloaded for each request.

    Returns:
        The transformers ``"text-classification"`` pipeline configured with
        the ``kbaumgartner/DeBERTa_Finetuned_Financial_News`` model and the
        ``microsoft/deberta-v3-base`` tokenizer.
    """
    return pipeline(
        "text-classification",
        model="kbaumgartner/DeBERTa_Finetuned_Financial_News",
        tokenizer="microsoft/deberta-v3-base",
    )
def source_outlet(choise):
    """Scrape the latest finance headlines for the selected news outlet.

    Only ``'CNBC'`` is implemented. Any other value (including the
    ``"Reuters"`` placeholder) yields an empty table with the same columns
    instead of failing on an undefined variable.

    Args:
        choise: Name of the outlet to scrape. The parameter spelling is kept
            as-is so existing keyword callers keep working.

    Returns:
        A pandas DataFrame holding at most five rows with the columns
        ``headline``, ``url``, ``text``, ``summary``, ``sentiment`` and
        ``topic``. The last four columns are filled with empty strings.

    Raises:
        requests.RequestException: If the CNBC page cannot be fetched, the
            request times out, or the server answers with an error status.
    """
    # Defined up front so every branch below leaves it in a usable state.
    headlines = {}

    if choise == 'CNBC':
        # A timeout keeps a stalled connection from hanging the caller.
        response = requests.get("https://www.cnbc.com/finance/", timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        for anchor in soup.find_all('a', class_='Card-title'):
            link = anchor.get('href')
            if link:  # ignore cards that carry no link
                # Keyed by headline text: a repeated headline keeps one row.
                headlines[anchor.text.strip()] = link

    articles = pd.DataFrame({
        'headline': list(headlines.keys()),
        'url': list(headlines.values()),
    })
    return articles.head(5).assign(text='', summary='', sentiment='', topic='')
def sentiment_translation(curr_sentiment):
    """Translate a raw classifier label into a readable sentiment name.

    Args:
        curr_sentiment: Label string such as ``"LABEL_0"``.

    Returns:
        ``"NEGATIVE"``, ``"NEUTRAL"`` or ``"POSITIVE"`` for ``"LABEL_0"``,
        ``"LABEL_1"`` and ``"LABEL_2"`` respectively. Any other value is
        returned unchanged rather than raising on an unassigned variable.
    """
    label_names = {
        "LABEL_0": "NEGATIVE",
        "LABEL_1": "NEUTRAL",
        "LABEL_2": "POSITIVE",
    }
    return label_names.get(curr_sentiment, curr_sentiment)
def preprocess(text):
    """Turn raw text into a list of lemmatised tokens.

    The text is lower-cased, every run of digits and non-word characters is
    collapsed into a single space, and the result is split on whitespace.
    Words that are English stopwords or have three characters or fewer are
    discarded (the length check is applied before lemmatisation); the
    remaining words are lemmatised with ``WordNetLemmatizer``.

    Args:
        text: The text to tokenise.

    Returns:
        List of lemmatised token strings, in their original order.
    """
    # Lower-case first, then strip digits and special characters.
    cleaned = re.sub("(\\d|\\W)+", " ", text.lower())

    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = []
    for word in cleaned.split():
        if word in english_stopwords or len(word) <= 3:
            continue
        tokens.append(lemmatizer.lemmatize(word))
    return tokens
def lda_topic_modeling(text):
    """Return the most probable LDA topic for ``text``.

    The LDA model and its dictionary are loaded from the relative
    ``lda_gensim_5t`` directory on every call, the text is tokenised with
    ``preprocess`` and converted to a bag of words, and the topic with the
    highest probability is picked.

    Args:
        text: Raw article text.

    Returns:
        A ``(topic_name, probability)`` tuple. ``topic_name`` is
        ``"Unknown Topic"`` when the winning topic id has no entry in the
        name table, and the result is ``("No Topic Found", 0.0)`` when the
        model reports no topics at all.
    """
    topic_names = {
        '0': "Corporate Valuation & Performance",
        '1': "Quarterly Financial Reports",
        '2': "Stock Market & Investment Funds",
        '3': "Corporate Affairs & Products",
        '4': "Investment Research"
    }

    lda_model = gensim.models.LdaModel.load("lda_gensim_5t/lda_model5.gensim")
    dictionary = gensim.corpora.Dictionary.load("lda_gensim_5t/dictionary5.gensim")

    bag_of_words = dictionary.doc2bow(preprocess(text))
    scored_topics = lda_model.get_document_topics(bag_of_words, minimum_probability=0.0)

    # Nothing to rank: hand back the placeholder with zero probability.
    if not scored_topics:
        return ("No Topic Found", 0.0)

    # Highest probability wins; on a tie the earliest entry is kept.
    best_topic, best_probability = max(scored_topics, key=lambda pair: pair[1])
    # The name table is keyed by strings, hence the str() conversion.
    return (topic_names.get(str(best_topic), "Unknown Topic"), best_probability)
def gradio_stocknews(source_ch, art_number):
    """Summarise and classify one of the latest articles of a news outlet.

    Args:
        source_ch: Outlet name, forwarded to ``source_outlet``.
        art_number: 1-based position of the article among the scraped
            headlines.

    Returns:
        A 5-tuple ``(headline, url, summary, sentiment, topic)`` of strings.
        When no text can be extracted from the article page, ``summary``
        carries an explanatory message and ``sentiment`` and ``topic`` are
        empty strings.

    Raises:
        ValueError: If ``art_number`` does not match a scraped article.
        requests.RequestException: If the article page cannot be fetched,
            the request times out, or the server answers with an error
            status.
    """
    # Identifying the articles
    articles = source_outlet(source_ch)
    row = art_number - 1
    if row not in articles.index:
        raise ValueError(
            f"Article number {art_number} is not available; "
            f"{len(articles)} article(s) were found."
        )
    headline = articles.loc[row, 'headline']
    url = articles.loc[row, 'url']

    # Scraping text for the chosen article
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page = BeautifulSoup(response.content, 'html.parser')
    body = page.find('div', class_='ArticleBody-articleBody')
    # find() returns None when the page has no such container.
    article_text = body.get_text() if body is not None else ''
    if not article_text.strip():
        return headline, url, "Article text could not be extracted from the page.", "", ""

    # Models are requested only once there is text to process.
    summarizer = summarizer_func()
    pipe_sentiment = sentiment_func()

    summary = summarizer(article_text)[0]['generated_text']
    sentiment = sentiment_translation(pipe_sentiment(article_text)[0]['label'])
    # Only the human-readable topic name is kept; the probability is dropped.
    topic = lda_topic_modeling(article_text)[0]
    return headline, url, summary, sentiment, topic
def main():
    """Prepare runtime resources and launch the Gradio dashboard.

    Changes the working directory to the directory containing this file (the
    poster image and the LDA files are referenced by relative path),
    downloads the NLTK ``stopwords`` and ``wordnet`` corpora, then builds and
    launches the interface.
    """
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    nltk.download('stopwords')
    nltk.download('wordnet')

    poster_path = "NLP_A2_Poster.jpg"

    def _stocknews_with_poster(source_ch, art_number):
        # The interface declares six outputs while gradio_stocknews returns
        # five values; the static poster is supplied as the sixth so the
        # number of returned values matches the number of components.
        return (*gradio_stocknews(source_ch, art_number), poster_path)

    iface = gr.Interface(
        fn=_stocknews_with_poster,
        inputs=[
            gr.Dropdown(choices=["CNBC"], label="Select Source"),
            gr.Dropdown(choices=[1, 2, 3, 4, 5], label="Select Article Number"),
        ],
        outputs=[
            gr.Textbox(lines=1, label="Article Title"),
            gr.Textbox(lines=1, label="Article Link"),
            gr.Textbox(lines=1, label="Article Summary"),
            gr.Textbox(lines=1, label="Article Sentiment"),
            gr.Textbox(lines=1, label="Article Topic"),
            gr.Image(poster_path, label="Download Model Info/Shorter version in files!"),
        ],
        title="Latest 5 Stock News Dashboard",
        description="Click the button to refresh the news summary.",
    )
    iface.launch()
# Start the dashboard only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()