import os
import streamlit as st
# EDA Pkgs
import pandas as pd
import numpy as np
# Viz Pkgs
import matplotlib
matplotlib.use('Agg')
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# Module-level LogisticRegression instance created at import time.
# NOTE(review): nothing in the visible part of this file reads `regressor`
# (main() builds its own estimator) -- presumably a leftover; confirm that no
# external importer relies on it before removing.
regressor = LogisticRegression()
def _list_data_files(folder_path):
    """Return the sorted names of the regular files directly inside *folder_path*.

    Sub-directories are skipped because they cannot be read as a dataset.
    A missing or unreadable folder yields an empty list instead of raising.
    """
    try:
        entries = os.listdir(folder_path)
    except OSError:
        return []
    return sorted(
        name for name in entries
        if os.path.isfile(os.path.join(folder_path, name))
    )


def _file_selector(folder_path='./dataset'):
    """Let the user pick a dataset file and return its path.

    Returns ``None`` (after showing a warning) when *folder_path* contains no
    files, so the caller can stop instead of crashing on an empty selection.
    """
    filenames = _list_data_files(folder_path)
    if not filenames:
        st.warning("No dataset files found in {}".format(folder_path))
        return None
    selected_filename = st.selectbox("Select A file", filenames)
    return os.path.join(folder_path, selected_filename)


def _show_dataset_overview(df):
    """Render the exploratory widgets: raw data, columns, shape, dtypes, summary."""
    # Show Dataset
    if st.checkbox("Show Dataset"):
        st.write(df.astype(str))

    # Show Columns
    if st.button("Column Names"):
        st.write(df.columns)

    # Show Shape
    if st.checkbox("Shape of Dataset"):
        data_dim = st.radio("Show Dimension By ", ("Rows", "Columns"))
        if data_dim == 'Rows':
            st.text("Number of Rows")
            st.write(df.shape[0])
        elif data_dim == 'Columns':
            st.text("Number of Columns")
            st.write(df.shape[1])
        else:
            st.write(df.shape)

    # Select Columns
    if st.checkbox("Select Columns To Show"):
        selected_columns = st.multiselect("Select", df.columns.tolist())
        st.dataframe(df[selected_columns])

    # Show Values (the last column is treated as the target/class)
    if st.button("Value Counts"):
        st.text("Value Counts By Target/Class")
        st.write(df.iloc[:, -1].value_counts())

    # Show Datatypes
    if st.button("Data Types"):
        st.text(df.dtypes)

    # Show Summary
    if st.checkbox("Summary"):
        st.write(df.describe().T)


def _show_visualizations(df):
    """Render the plotting widgets (correlation, pie, value counts, custom plot).

    Every matplotlib plot is drawn on its own figure and handed to
    ``st.pyplot(fig)``; the figure is closed afterwards so repeated reruns of
    the script do not accumulate open figures.
    """
    # Imported here because the file only imports the top-level `matplotlib`
    # package; the backend has already been selected at module import time.
    import matplotlib.pyplot as plt

    def render(fig):
        st.pyplot(fig)
        plt.close(fig)

    st.subheader("Data Visualization")

    # Correlation heatmap (Seaborn)
    if st.checkbox("Correlation Plot[Seaborn]"):
        # Correlation is only defined for numeric columns; text columns
        # would make DataFrame.corr() fail on recent pandas versions.
        numeric_df = df.select_dtypes(include="number")
        if numeric_df.shape[1] == 0:
            st.warning("The dataset has no numeric columns to correlate")
        else:
            fig, ax = plt.subplots()
            sns.heatmap(numeric_df.corr(), annot=True, ax=ax)
            render(fig)

    # Pie Chart of the last (target) column
    if st.checkbox("Pie Plot"):
        if st.button("Generate Pie Plot"):
            st.success("Generating A Pie Plot")
            fig, ax = plt.subplots()
            df.iloc[:, -1].value_counts().plot.pie(autopct="%1.1f%%", ax=ax)
            render(fig)

    # Count Plot
    if st.checkbox("Plot of Value Counts"):
        st.text("Value Counts By Target")
        all_columns_names = df.columns.tolist()
        primary_col = st.selectbox("Primary Columm to GroupBy", all_columns_names)
        selected_columns_names = st.multiselect("Select Columns", all_columns_names)
        if st.button("Plot"):
            st.text("Generate Plot")
            if selected_columns_names:
                vc_plot = df.groupby(primary_col)[selected_columns_names].count()
            else:
                vc_plot = df.iloc[:, -1].value_counts()
            fig, ax = plt.subplots()
            vc_plot.plot(kind="bar", ax=ax)
            render(fig)

    # Customizable Plot
    st.subheader("Customizable Plot")
    all_columns_names = df.columns.tolist()
    type_of_plot = st.selectbox(
        "Select Type of Plot", ["area", "bar", "line", "hist", "box", "kde"])
    selected_columns_names = st.multiselect("Select Columns To Plot", all_columns_names)
    if st.button("Generate Plot"):
        if not selected_columns_names:
            st.warning("Select at least one column to plot")
            return
        st.success("Generating Customizable Plot of {} for {}".format(
            type_of_plot, selected_columns_names))
        cust_data = df[selected_columns_names]

        # Plot By Streamlit
        if type_of_plot == 'area':
            st.area_chart(cust_data)
        elif type_of_plot == 'bar':
            st.bar_chart(cust_data)
        elif type_of_plot == 'line':
            st.line_chart(cust_data)
        # Custom Plot (pandas/matplotlib)
        else:
            fig, ax = plt.subplots()
            try:
                cust_data.plot(kind=type_of_plot, ax=ax)
            except (TypeError, ValueError) as exc:
                # e.g. asking for a histogram of text-only columns
                plt.close(fig)
                st.error("Could not draw a {} plot for {}: {}".format(
                    type_of_plot, selected_columns_names, exc))
            else:
                render(fig)


def _prepare_model_frame(df):
    """Return a copy of *df* with missing values imputed and text label-encoded.

    Object columns are filled with their most frequent value and then replaced
    by integer codes from ``LabelEncoder``; all other columns are filled with
    their mean. Each imputer is skipped when it has no columns to work on.
    """
    prepared = df.copy()
    object_cols = prepared.columns[prepared.dtypes == 'object'].tolist()
    other_cols = prepared.columns[prepared.dtypes != 'object'].tolist()

    if object_cols:
        cat_imp = SimpleImputer(strategy="most_frequent")
        prepared[object_cols] = cat_imp.fit_transform(prepared[object_cols])
    if other_cols:
        imp = SimpleImputer(missing_values=np.nan, strategy="mean")
        prepared[other_cols] = imp.fit_transform(prepared[other_cols])

    # Whole-column assignment so each encoded column really becomes numeric.
    for col in object_cols:
        prepared[col] = LabelEncoder().fit_transform(prepared[col])
    return prepared


def _fit_and_score(df, feature_names, target, classifier):
    """Train *classifier* and return ``(train_accuracy, test_accuracy)``.

    Only the columns in *feature_names* plus *target* are prepared. The data
    is split 80/20 with a fixed seed, and the min-max scaler is fitted on the
    training part only so no information from the test rows leaks into it.
    """
    prepared = _prepare_model_frame(df[feature_names + [target]])
    X = prepared[feature_names]
    y = prepared[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=feature_names, index=X_train.index)
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=feature_names, index=X_test.index)

    classifier.fit(X_train, y_train)
    accuracy_train = accuracy_score(y_train, classifier.predict(X_train))
    accuracy_test = accuracy_score(y_test, classifier.predict(X_test))
    return accuracy_train, accuracy_test


def _train_and_evaluate(df):
    """Render the model controls, train the chosen classifier, show accuracy.

    The feature multiselect restricts the model inputs when anything is
    selected (the target is always excluded); with nothing selected every
    non-target column is used. The target defaults to the last column, which
    is the column the rest of the app treats as the class.
    """
    st.subheader("Model, Deployment, and Evaluation")

    columns = df.columns.tolist()
    if len(columns) < 2:
        st.warning("At least two columns are needed to train a model")
        return

    features = st.multiselect('select features and target variable', columns)
    chosen_target = st.sidebar.selectbox(
        "Please choose target column", columns, index=len(columns) - 1)

    if features:
        feature_names = [col for col in features if col != chosen_target]
    else:
        feature_names = [col for col in columns if col != chosen_target]
    if not feature_names:
        st.warning("Select at least one feature other than the target column")
        return

    # Classification is the only supported algorithm type.
    st.sidebar.selectbox("Algorithm type", ("Classification",))
    chosen_classifier = st.sidebar.selectbox(
        "Please choose a classifier", ('Logistic Regression', 'Naive Bayes'))
    if chosen_classifier == 'Logistic Regression':
        # Default of 100 matches scikit-learn's own default, i.e. the value
        # that was effectively used before the slider was wired in.
        max_iter = st.sidebar.slider('max iterations', 1, 100, 100)
        alg = LogisticRegression(max_iter=max_iter)
    else:
        alg = GaussianNB()

    try:
        accuracy_train, accuracy_test = _fit_and_score(
            df, feature_names, chosen_target, alg)
    except (TypeError, ValueError) as exc:
        # e.g. a continuous target, too few rows, or an unencodable column
        st.error("Could not train {} on target '{}': {}".format(
            chosen_classifier, chosen_target, exc))
        return

    st.write('### Accuracy Train: ' + str(round(accuracy_train, 3)) +
             ' -- Accuracy Test: ' + str(round(accuracy_test, 3)))


def _show_about():
    """Render the static "about" section in the sidebar."""
    st.sidebar.header("About App")
    st.sidebar.info("A Simple ML App for predicting Room Occupancy")
    st.sidebar.header("Developer")
    st.sidebar.info("Nasim Obeid")
    st.sidebar.text("Built with Streamlit")


def main():
    """Run the Room Occupancy Predictor Streamlit app.

    Lets the user pick a file from ``./dataset``, explore and plot it, then
    train a Logistic Regression or Naive Bayes classifier on it and report
    train/test accuracy. Problems with the input (no files, unreadable file,
    untrainable target) are reported in the page instead of raising.
    """
    html_temp = "\nRoom Occupancy Predictor 💡\n"
    st.markdown(html_temp, unsafe_allow_html=True)

    filename = _file_selector()
    if filename is None:
        _show_about()
        return
    st.info("You Selected {}".format(filename))

    # Read Data
    try:
        df = pd.read_csv(filename)
    except (pd.errors.ParserError, pd.errors.EmptyDataError,
            UnicodeDecodeError, OSError) as exc:
        st.error("Could not read {}: {}".format(filename, exc))
        _show_about()
        return

    _show_dataset_overview(df)
    _show_visualizations(df)
    _train_and_evaluate(df)

    if st.button("Thanks"):
        st.balloons()

    _show_about()
# Script entry point: launch the app only when this file is executed directly.
if __name__ == "__main__":
    main()