Spaces:

r1391819
/

financial-researcher

Runtime error

File size: 11,531 Bytes

004fea0

# Import baseline dependencies
import csv
import time
from datetime import date

import numpy as np
import pandas as pd
import pandas_datareader as data
import requests
import streamlit as st
from bs4 import BeautifulSoup
from plotly import graph_objs as go
from prophet import Prophet
from prophet.plot import plot_plotly
# summarisation (Pegasus) and sentiment analysis (BERT) models
from transformers import (BertForSequenceClassification, BertTokenizer,
                          PegasusTokenizer, TFPegasusForConditionalGeneration,
                          pipeline)

# Configure the Streamlit page to use the wide layout for every page of the app.
# NOTE(review): Streamlit expects set_page_config to run before other st.*
# commands, so keep this ahead of the rest of the script — confirm.
st.set_page_config(layout='wide')


@st.cache(allow_output_mutation=True, show_spinner=False)
# Setup summarisation model
def get_summarisation_model():
    """Load the financial-news Pegasus summariser together with its tokenizer.

    Both objects are created from the same Hugging Face checkpoint name.

    Returns:
        tuple: ``(model, tokenizer)`` — the model comes first.
    """
    checkpoint = "human-centered-summarization/financial-summarization-pegasus"

    # Tokenizer first, then the TensorFlow Pegasus weights.
    tokenizer = PegasusTokenizer.from_pretrained(checkpoint)
    model = TFPegasusForConditionalGeneration.from_pretrained(checkpoint)

    return model, tokenizer


@st.cache(allow_output_mutation=True, show_spinner=False)
# Setup sentiment analysis model
def get_sentiment_pepeline():
    """Build the FinancialBERT sentiment-analysis pipeline.

    The (misspelt) function name is kept as-is because the start-up code
    calls it under this name.

    Returns:
        A ``transformers`` "sentiment-analysis" pipeline wrapping the
        3-label FinancialBERT classifier and its tokenizer.
    """
    checkpoint = "ahmedrachid/FinancialBERT-Sentiment-Analysis"

    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    classifier = BertForSequenceClassification.from_pretrained(
        checkpoint, num_labels=3)

    return pipeline("sentiment-analysis",
                    model=classifier, tokenizer=tokenizer)


@st.cache(show_spinner=False, suppress_st_warning=True)
# Get all links from Google News
def search_urls(ticker, num, date):
    """Search Google News for Yahoo Finance articles about a ticker.

    Args:
        ticker: Symbol to look for; sent as the ``as_epq`` and ``as_occt``
            query parameters.
        num: Number of search results requested per page.
        date: Timeframe label. ``"Past week"`` and ``"Past day"`` add an
            ``as_qdr`` filter; any other value leaves the timeframe open.

    Returns:
        list: ``href`` values of the result anchors carrying the ``WlydOe``
        class. Empty when the page contains no such anchors (the caller
        treats that as "blocked, try again later").

    Raises:
        requests.RequestException: If the request fails or exceeds the
            timeout. Propagated rather than swallowed so a transient
            failure is not stored by the surrounding cache.
    """
    # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages

    # Request headers and parameters
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
    }

    params = {
        "as_sitesearch": "finance.yahoo.com",  # we only want results from Yahoo Finance
        "hl": "en",  # language of the interface
        "gl": "us",  # country of the search
        "tbm": "nws",  # news results
        "lr": "lang_en"  # language filter
    }

    # base URL
    url = "https://www.google.com/search"

    # search query
    params["as_epq"] = ticker
    # NOTE(review): as_occt receives the ticker itself; check whether a
    # location keyword was intended here.
    params["as_occt"] = ticker
    # number of search results per page
    params["num"] = num

    # articles timeframe
    #  d = past 24h, h = past hour, w = past week, m = past month
    if date == "Past week":
        params["as_qdr"] = "w"
    elif date == "Past day":
        params["as_qdr"] = "d"

    # A timeout keeps a stalled connection from freezing the app forever.
    r = requests.get(url, headers=headers, params=params,
                     cookies={'CONSENT': 'YES+'}, timeout=10)
    # Pause after the request to avoid hammering the search endpoint.
    time.sleep(5)
    st.write("Searched URL:")
    st.write(r.url)  # debugging
    soup = BeautifulSoup(r.text, "html.parser")
    # href=True skips anchors without an href, which would otherwise raise
    # KeyError in the comprehension below.
    atags = soup.find_all("a", "WlydOe", href=True)
    hrefs = [link["href"] for link in atags]

    return hrefs


@st.cache(show_spinner=False)
# Extract title, date, and content of the article from all given URLs
def search_scrape(urls, max_words=350):
    """Download each article page and extract its title, date and text.

    Args:
        urls: Iterable of article URLs.
        max_words: Maximum number of space-separated words kept per article.
            The text is truncated to avoid input-length problems in the
            summarisation model. Defaults to 350.

    Returns:
        tuple: ``(titles, post_dates, articles)`` — three lists with exactly
        one entry per URL, in the same order as ``urls``. A missing title or
        date is reported as ``"N/A"``; a missing body yields ``""``.
    """
    titles = []
    post_dates = []
    articles = []

    for url in urls:
        try:
            html = requests.get(url, timeout=10).text
        except requests.RequestException:
            # A dead or slow link must not abort the whole batch. Parsing an
            # empty page produces the same "N/A"/"" placeholders as a page
            # with missing elements, so the three lists stay aligned.
            html = ""
        # Pause between downloads to avoid hammering the site.
        time.sleep(5)
        soup = BeautifulSoup(html, "html.parser")

        # title (placeholder when the header element is missing)
        title_tag = soup.find("header", "caas-title-wrapper")
        titles.append(title_tag.text if title_tag is not None else "N/A")

        # posting date of the article (placeholder when missing)
        date_tag = soup.find("time", "caas-attr-meta-time")
        post_dates.append(date_tag.text if date_tag is not None else "N/A")

        # article content: text of every "caas-body" container, joined and
        # cut down to the first ``max_words`` words
        bodies = soup.find_all("div", "caas-body")
        full_text = " ".join(body.text for body in bodies)
        articles.append(" ".join(full_text.split(" ")[:max_words]))

    return titles, post_dates, articles


@st.cache(show_spinner=False)
# Summarise all given articles using a fine-tuned Pegasus Transformers model
def summarise_articles(sum_model, sum_tokenizer, articles):
    """Summarise every article with the supplied summarisation model.

    Args:
        sum_model: Model exposing ``generate(input_ids, ...)``.
        sum_tokenizer: Callable tokenizer; its result must expose
            ``input_ids`` and it must provide ``decode``.
        articles: Iterable of article texts.

    Returns:
        list: One decoded summary string per article, in input order.
    """
    summaries = []
    for article in articles:

        # source
        # https://huggingface.co/human-centered-summarization/financial-summarization-pegasus
        input_ids = sum_tokenizer(
            article, return_tensors="tf").input_ids
        # The keyword is ``num_beams``; the earlier ``num_beans`` spelling
        # meant the 5-beam search was never actually requested.
        output = sum_model.generate(
            input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = sum_tokenizer.decode(
            output[0], skip_special_tokens=True)
        summaries.append(summary)

    return summaries


@st.cache(show_spinner=False)
# Join all data into rows
def create_output_array(titles, post_dates, summarised_articles, sentiment_scores, raw_urls):
    """Combine the per-article results into table rows.

    One row is produced per entry of ``summarised_articles``; the other
    sequences are read at the same index.

    Returns:
        list: Rows of ``[title, date, summary, label, confidence, url]``
        where ``label`` is the capitalised sentiment label and
        ``confidence`` the score rendered as a whole-number percentage.
    """
    return [
        [
            titles[position],
            post_dates[position],
            summary,
            sentiment_scores[position]["label"].capitalize(),
            "{:.0%}".format(sentiment_scores[position]["score"]),
            raw_urls[position],
        ]
        for position, summary in enumerate(summarised_articles)
    ]


@st.cache(show_spinner=False)
# Convert dataframe to .csv file
def convert_df(df):
    """Serialise a dataframe to CSV and return it as UTF-8 encoded bytes."""
    csv_text = df.to_csv()
    return csv_text.encode("utf-8")

# ------------------------------------------------------------------------------


@st.cache(show_spinner=False)
# Load data from Yahoo Finance
def load_data(ticker, start, end):
    """Fetch price history for ``ticker`` between ``start`` and ``end``.

    The data comes from the pandas-datareader "yahoo" source; the index is
    reset into a regular column before returning.

    NOTE(review): the "yahoo" reader has reportedly been unreliable in some
    pandas-datareader releases — confirm it still works in the deployed
    environment.

    Returns:
        The fetched dataframe with a fresh integer index.
    """
    history = data.DataReader(ticker, "yahoo", start, end)
    return history.reset_index()


@st.cache(show_spinner=False)
# Predict stock trend for N years using Prophet
def predict(df, period):
    """Fit a Prophet model on the closing prices and forecast ahead.

    Args:
        df: Dataframe providing ``Date`` and ``Close`` columns.
        period: Number of future periods handed to
            ``make_future_dataframe`` (the caller passes years * 365).

    Returns:
        tuple: ``(model, forecast)`` — the fitted Prophet model and the
        dataframe returned by ``model.predict``.
    """
    # Prophet expects the columns to be called "ds" (date) and "y" (value).
    training = df[["Date", "Close"]].rename(
        columns={"Date": "ds", "Close": "y"})

    model = Prophet()
    model.fit(training)

    horizon = model.make_future_dataframe(periods=period)
    return model, model.predict(horizon)


def main_page():
    """Render the "Financial News Analysis" page.

    Collects a ticker, an article count and a timeframe. When the search
    button is pressed it scrapes the matching articles, summarises them,
    scores their sentiment, and shows the result as a table with a CSV
    download button.

    Relies on the module-level ``sum_model``, ``sum_tokenizer`` and
    ``sentiment_pipeline`` objects created by the start-up section.
    """
    # Sidebar description
    st.sidebar.markdown("## Financial News Analysis")
    st.sidebar.write(
        "Scrape, auto summarise and calculate sentiment for stock and crypto news.")

    # User input
    ticker = st.text_input("Ticker:", "TSLA")
    num = st.number_input("Number of articles:", 5, 15, 10)
    timeline = st.selectbox("Timeline:", ["Past week", "Past day"])
    search = st.button("Search")

    st.info("Please do not spam the search button")
    st.markdown("---")

    # Nothing more to do until the button is pressed
    if not search:
        return

    with st.spinner("Processing articles, please wait..."):
        # Links of all articles matching the query
        raw_urls = search_urls(ticker, num, timeline)

        # No links usually means the search was blocked: report and stop
        if not raw_urls:
            st.error("Please wait a few minutes before trying again")
            return

        # Scrape title, posting date and article content from all the URLs
        titles, post_dates, articles = search_scrape(raw_urls)

        # Summarise all articles
        summarised_articles = summarise_articles(
            sum_model, sum_tokenizer, articles)

        # Sentiment of every summary
        # source
        # https://huggingface.co/ahmedrachid/FinancialBERT-Sentiment-Analysis
        sentiment_scores = sentiment_pipeline(summarised_articles)

        # Assemble and display the results table
        rows = create_output_array(
            titles, post_dates, summarised_articles, sentiment_scores, raw_urls)
        df = pd.DataFrame(
            rows,
            columns=["Title", "Date", "Summary", "Label", "Confidence", "URL"])
        st.dataframe(df)

        # Offer the same table as a CSV download
        csv_file = convert_df(df)
        st.download_button(
            "Save data to CSV", csv_file, "assetsummaries.csv", "text/csv", key="download-csv")


def page2():
    """Render the "Stock Trend Forecasting" page.

    Loads price history from 2010-01-01 up to today for the chosen ticker,
    shows summary statistics and a closing-price chart with 100- and
    200-row moving averages, and, when the "Predict" button is pressed,
    plots a Prophet forecast for the selected number of years.
    """
    # Sidebar description
    st.sidebar.markdown("## Stock Trend Forecasting")
    st.sidebar.write(
        "A simple dashboard for stock trend forecasting and analysis.")

    # Date range of the historical data
    start = "2010-01-01"
    end = date.today().strftime("%Y-%m-%d")

    # Ticker selection and data download
    ticker = st.text_input("Ticker:", "AAPL")
    df = load_data(ticker, start, end)

    # Forecast horizon: whole years converted to a number of days
    n_years = st.number_input("Years of prediction:", 1, 4, 1)
    period = n_years * 365

    # Start prediction button
    init = st.button("Predict")

    st.markdown("---")

    # The adjusted close is not used anywhere below
    df = df.drop(columns=["Adj Close"])

    # Exploratory analysis
    st.subheader("Exploratory analysis")
    st.write(df.describe())

    # Raw closing data with 100 and 200 days MA (for simple analysis)
    st.subheader("Closing data, MA100 and MA200")

    closing = df["Close"]
    ma100 = closing.rolling(100).mean()
    ma200 = closing.rolling(200).mean()

    history_fig = go.Figure()
    history_fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 50, "pad": 4})
    history_fig.add_trace(
        go.Scatter(x=df["Date"], y=df["Close"], name="stock_close"))
    history_fig.add_trace(go.Scatter(x=df["Date"], y=ma100, name="ma100"))
    history_fig.add_trace(go.Scatter(x=df["Date"], y=ma200, name="ma200"))
    history_fig.layout.update(xaxis_rangeslider_visible=True)
    st.plotly_chart(history_fig, use_container_width=True)

    # Forecasting only starts once the button is pressed
    if not init:
        return

    with st.spinner("Please wait..."):
        model, forecast = predict(df, period)

        st.markdown("---")
        st.subheader("Forecast data")
        st.write(forecast.tail())

        st.subheader(f"Forecast plot for {n_years} years")

        forecast_fig = plot_plotly(model, forecast)
        forecast_fig.update_layout(
            margin={"l": 0, "r": 0, "b": 0, "t": 0, "pad": 4})
        st.plotly_chart(forecast_fig, use_container_width=True)

        st.subheader("Forecast components")
        components_fig = model.plot_components(forecast)
        st.write(components_fig)


if __name__ == "__main__":

    # Load both NLP models up front. main_page() reads these three names
    # from module scope, so they must keep exactly these names.
    with st.spinner("Loading all models..."):
        sum_model, sum_tokenizer = get_summarisation_model()
        sentiment_pipeline = get_sentiment_pepeline()

    # Sidebar title followed by the page selector
    st.sidebar.markdown("# Financial Researcher")

    # Page title -> function that renders the page
    pages = {
        "Financial News Analysis": main_page,
        "Stock Trend Forecasting": page2
    }
    chosen_title = st.sidebar.selectbox("Select a page", pages.keys())

    st.sidebar.markdown("---")

    # Render whichever page the user picked
    render_page = pages[chosen_title]
    render_page()