Spaces:
Runtime error
Runtime error
File size: 11,196 Bytes
c818471 e151b1f c818471 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 |
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 12 08:28:35 2021
@author: rejid4996
"""
# packages
import os
import re
import time
import base64
import pickle
import numpy as np
import pandas as pd
import streamlit as st
from io import BytesIO
import preprocessor as p
from textblob.classifiers import NaiveBayesClassifier
# custom function to clean the dataset (combining tweet-preprocessor and regular expressions)
def clean_tweets(df):
    """Clean an iterable of raw text entries for classification.

    Each entry is passed through ``p.clean`` (the ``preprocessor`` module),
    lower-cased, stripped of punctuation, and finally has separator
    characters replaced by a single space.

    Args:
        df: Iterable of text entries, e.g. the text column of a DataFrame.
            Entries are presumably ``str``; verify against the caller.

    Returns:
        list: one cleaned string per input entry, in input order.
    """
    # Raw strings keep the regex escapes away from Python's own string-escape
    # processing (the non-raw spelling triggers "invalid escape sequence"
    # warnings on current interpreters).
    # Punctuation that is deleted outright.
    replace_no_space = re.compile(r"""[.;:!'?,"|()\[\]%$><{}]""")
    # Separators that become a space. The pattern is kept verbatim; because
    # '<', '>' and ':' are already removed by the first pass, only the '-' and
    # '/' alternatives can still match here.
    replace_with_space = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")

    return [
        replace_with_space.sub(
            " ",
            # convert all tweets to lower case before removing punctuation
            replace_no_space.sub("", p.clean(line).lower()),
        )
        for line in df
    ]
def to_excel(df):
    """Serialise a DataFrame into an in-memory Excel (.xlsx) workbook.

    Args:
        df: pandas DataFrame to export; it is written to a sheet named
            ``Sheet1``.

    Returns:
        bytes: the raw contents of the generated workbook.
    """
    output = BytesIO()
    # The context manager finalises the workbook when the block exits. It
    # replaces the explicit ``writer.save()`` call, which no longer exists in
    # pandas 2.x and raised AttributeError there.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    return output.getvalue()
def get_table_download_link(df):
    """Build an HTML anchor that lets the user download *df* as an Excel file.

    in:  dataframe
    out: href string (the workbook is embedded as a base64 data URI)
    """
    workbook_bytes = to_excel(df)
    # b64encode yields bytes (b'abc'); decode turns them into text (abc)
    encoded = base64.b64encode(workbook_bytes).decode()
    return f'<a href="data:application/octet-stream;base64,{encoded}" download="classified_data.xlsx">Download file</a>'
def download_model(model):
    """Render a link in the Streamlit page that downloads *model* as a pickle.

    The model is pickled, base64-encoded and embedded in the link's data URI;
    the suggested file name is ``myClassifier.pkl``.
    """
    encoded_model = base64.b64encode(pickle.dumps(model)).decode()
    st.markdown(
        f'<a href="data:file/output_model;base64,{encoded_model}" download="myClassifier.pkl">Download Model .pkl File</a>',
        unsafe_allow_html=True,
    )
def _train_page():
    """Train a Naive Bayes classifier from an uploaded, labelled Excel sheet."""
    uploaded_file = st.file_uploader("*Upload your file, make sure you have a column for text that has to be classified and the label", type="xlsx")
    if not uploaded_file:
        return

    df = pd.read_excel(uploaded_file)
    columns = tuple(df.columns.to_list())
    option1 = st.sidebar.selectbox('Select the text column', columns)
    option2 = st.sidebar.selectbox('Select the label column', columns)

    # clean training data
    df[option1] = clean_tweets(df[option1])

    # Enter the label names
    label1 = st.sidebar.text_input("Enter the label for '0' value")
    label2 = st.sidebar.text_input("Enter the label for '1' value")

    # replace the 0/1 values with the label names entered above
    df[option2] = df[option2].map({0: label1, 1: label2})

    gcr_config = st.sidebar.slider(label="choose the training size, longer the size longer the training time",
                                   min_value=100,
                                   max_value=10000,
                                   step=10)

    # subsetting based on classes: at most half of the requested size each
    half = gcr_config // 2
    df1 = df[df[option2] == label1][0:half]
    df2 = df[df[option2] == label2][0:half]
    df_new = pd.concat([df1, df2]).reset_index(drop=True)

    # (text, label) pairs in the format the classifier expects; the list is
    # already capped at ``gcr_config`` entries by the subsetting above.
    training_list = list(zip(df_new[option1], df_new[option2]))

    # run classification
    if st.sidebar.button(label='Start Training'):
        # Train using Naive Bayes
        start = time.time()  # start time
        cl = NaiveBayesClassifier(training_list)
        # NOTE: accuracy is measured on the training examples themselves, so
        # it is an optimistic figure rather than a held-out score.
        accuracy = cl.accuracy(training_list) * 100
        st.success(f"Congratulations!!! Model trained successfully with an accuracy of {accuracy}%")
        st.write(f"Total Time taken for Training :{(time.time() - start) / 60} minutes")

        # download the model
        download_model(cl)


def _test_page():
    """Try an uploaded model on one sentence and optionally update it."""
    uploaded_file = st.file_uploader("*Upload your model file, make sure its in the right format (currently pickle file)", type="pkl")
    if not uploaded_file:
        return

    # SECURITY: unpickling can execute arbitrary code. Only upload model files
    # that come from a trusted source (e.g. produced by this app).
    model = pickle.load(uploaded_file)
    st.success("Congratulations!!! Model upload successfull")
    if not model:
        return

    test_sentence = st.text_input("Enter the testing sentence")
    if not test_sentence:
        return

    # f-strings are used for the predictions so that a non-string label does
    # not raise TypeError the way ``"text" + label`` would.
    st.info(f"Model Prediction is : {model.classify(test_sentence)}")
    # NOTE: the bare "\n" literals are intentional; Streamlit's "magic"
    # feature renders bare expressions when it is enabled.
    "\n"
    st.write("### π² Help me train the model better. How is the prediction?")
    "\n"
    correct = st.checkbox("Correct")
    wrong = st.checkbox("Incorrect")

    if correct:
        st.success("Great!!! I am happy for you")
        st.write("If you would like please try out for more examples")

    if not wrong:
        return

    st.write("### π² Dont worry!!! Lets add this new data to the model and retrain. ")
    label = st.text_input("Could you write the actual label, please note the label name should be the same while you trained")
    if not label:
        return

    # Feed the corrected example back into the model and classify again.
    model.update([(test_sentence, label)])

    st.write("### π² Lets classify and see whether model had learned from this example ")
    st.write("Sentence : " + test_sentence)
    st.info(f"New Model Prediction is : {model.classify(test_sentence)}")

    sec_wrong3 = st.checkbox("It's Correct")
    sec_wrong1 = st.checkbox("Still Incorrect")
    sec_wrong2 = st.checkbox("I will go ahead and change the data in excel and retrain the model")

    if sec_wrong1:
        st.write("### π² Lets try training with some sentences of this sort")
        new_sentence = st.text_input("Enter the training sentence")
        new_label = st.text_input("Enter the training label")
        st.write("Lets try one last time ")

        if st.button(label='Retrain again!'):
            model.update([(new_sentence, new_label)])
            st.write("Sentence : " + new_sentence)
            st.info(f"New Model Prediction is : {model.classify(new_sentence)}")

            # download the model
            download_model(model)

    if sec_wrong2:
        st.info("Great!!! Fingers Crossed")
        st.write("### π² Please return to your excel file and add more sentences and Train the model again")

    if sec_wrong3:
        st.info("Wow!!! Awesome")
        st.write("Now lets download the updated model")

        # download the model
        download_model(model)


def _predict_page():
    """Classify every row of an uploaded Excel sheet with an uploaded model."""
    uploaded_file3 = st.file_uploader("*Upload your model file, make sure its in the right format (currently pickle file)", type="pkl")
    if not uploaded_file3:
        # Without a model there is nothing to predict with, so the data
        # uploader is only offered once a model has been loaded.
        return

    # SECURITY: unpickling can execute arbitrary code. Only upload model files
    # that come from a trusted source (e.g. produced by this app).
    model1 = pickle.load(uploaded_file3)
    st.success("Congratulations!!! Model uploaded successfully")

    uploaded_file1 = st.file_uploader("*Upload your new data which you have to predict", type="xlsx")
    if not uploaded_file1:
        return

    st.success("Congratulations!!! Data uploaded successfully")
    df_valid = pd.read_excel(uploaded_file1)
    option3 = st.selectbox(
        'Select the text column which needs to be predicted',
        tuple(df_valid.columns.to_list()))

    if st.button(label='Predict for new data'):
        start1 = time.time()  # start time
        df_valid['predicted'] = df_valid[option3].apply(model1.classify)

        st.write("### π² Prediction Successfull !!!")
        st.write("Total No. of sentences: " + str(len(df_valid)))
        st.write(f"Total Time taken for Prediction :{(time.time() - start1) / 60} minutes")

        st.markdown(get_table_download_link(df_valid), unsafe_allow_html=True)


def main():
    """NLP App with Streamlit.

    Draws the page header and sidebar, then shows the page for the mode
    selected in the sidebar: training, testing, or batch prediction.
    """
    from PIL import Image

    wallpaper = Image.open('file.jpg')
    wallpaper = wallpaper.resize((700,350))

    st.sidebar.title("Text Classification App 1.0")
    st.sidebar.success("Please reach out to https://www.linkedin.com/in/deepak-john-reji/ for more queries")
    st.sidebar.subheader("Classifier using Textblob ")

    st.info("For more contents subscribe to my Youtube Channel https://www.youtube.com/channel/UCgOwsx5injeaB_TKGsVD5GQ")
    st.image(wallpaper)

    options = ("Train the model", "Test the model", "Predict for a new data")
    a = st.sidebar.empty()
    value = a.radio("what do you wanna do", options, 0)

    # The selected mode is kept in ``value`` and never reassigned; the page
    # helpers use their own local names.
    if value == "Train the model":
        _train_page()
    elif value == "Test the model":
        _test_page()
    elif value == "Predict for a new data":
        _predict_page()
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|