import streamlit as st
import joblib
import pandas as pd
import string
import re
import nltk
# Fetch the NLTK stop-word corpus only when it is not already installed, so
# repeated executions of this script do not contact the download server
# every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer


# Serialized estimator used for prediction at the bottom of the script.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# model files that come from a trusted source.
model = joblib.load("ridge_classifier.pkl")
# Data set whose "text" column is used below to fit the TF-IDF vectorizer.
data = pd.read_csv("data_modified.csv")

# Stemmer shared by every clean_text call.
ps = nltk.PorterStemmer()
# Kept as a frozenset: clean_text tests each token for membership, and a
# hash lookup avoids scanning the whole stop-word list per token.
stopwords = frozenset(nltk.corpus.stopwords.words('english'))


def clean_text(text):
    """Turn raw text into a list of stemmed tokens.

    Characters found in ``string.punctuation`` are dropped and the remaining
    characters are lower-cased. The result is split on runs of non-word
    characters, tokens present in the module-level ``stopwords`` collection
    are discarded, and every remaining token is stemmed with the
    module-level ``ps`` stemmer.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one stemmed string per kept token. Empty strings
        produced by the split (for example from leading or trailing
        whitespace) are not filtered out.
    """
    no_punct = "".join(
        char.lower() for char in text if char not in string.punctuation
    )
    # Raw string literal: "\W" inside a plain string is an invalid escape
    # sequence, which newer Python versions report with a warning.
    tokens = re.split(r'\W+', no_punct)
    return [ps.stem(word) for word in tokens if word not in stopwords]


# TF-IDF vectorizer that tokenizes documents with clean_text; its vocabulary
# is learned from the "text" column of the loaded data.
# NOTE(review): this fit runs every time the script is executed; if the
# hosting framework re-runs the script per interaction, consider caching the
# fitted vectorizer -- confirm against the Streamlit version in use.
vectoriz = TfidfVectorizer(analyzer=clean_text)
vectoriz.fit(data["text"])
# fit() hands back the estimator itself, so both names refer to the same
# fitted object.
vectorizer = vectoriz


def count_punct(text):
    """Return the percentage of non-space characters that are punctuation.

    Punctuation means any character in ``string.punctuation``. Only the
    space character ``" "`` is excluded from the denominator; other
    whitespace such as tabs still counts as a character.

    Args:
        text: The string to inspect.

    Returns:
        The punctuation share as a percentage. The ratio is rounded to
        three decimals before being scaled by 100. Returns ``0.0`` when the
        text has no non-space characters (empty or spaces only), instead of
        raising ``ZeroDivisionError``.
    """
    non_space = len(text) - text.count(" ")
    if non_space == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    punct_count = sum(1 for char in text if char in string.punctuation)
    return round(punct_count / non_space, 3) * 100


st.title("Sentiment analysis classification")

text = st.text_input("Type the text here")
if st.button("Predict"):
    # Number of non-space characters: used as a model feature and as the
    # check that the user actually typed something.
    body_len = len(text) - text.count(" ")
    if body_len == 0:
        # With zero non-space characters the punctuation ratio is undefined
        # (division by zero), so ask for input instead of crashing.
        st.warning("Please enter some text before predicting.")
    else:
        # TF-IDF features for the single input document.
        trans = vectorizer.transform([text])
        punct = count_punct(text)
        df = pd.DataFrame({"body_len": [body_len], "punc%": [punct]})
        # Feature row layout: body_len, punc%, then one column per TF-IDF
        # term. Both frames have a single row with the default index, so
        # they line up without re-indexing.
        # NOTE(review): the combined frame mixes string and integer column
        # labels; some scikit-learn releases reject mixed label types --
        # confirm against the version the model was trained with.
        test_vect = pd.concat([df, pd.DataFrame(trans.toarray())], axis=1)
        prediction = model.predict(test_vect)
        st.write(prediction[0])