MatthiasPi committed
Commit a3171a2
Parent(s): 09bf6bf
commit the whole project
- .gitignore +1 -0
- README.md +21 -0
- algos/classification/logistic.py +31 -0
- algos/classification/nnclassifier.py +29 -0
- algos/classification/svmclassifier.py +30 -0
- algos/clustering/dbscan.py +26 -0
- algos/clustering/kmeans.py +24 -0
- algos/clustering/kproto.py +44 -0
- algos/others/others_page.py +4 -0
- algos/regression/elasticnet.py +43 -0
- algos/regression/linR.py +41 -0
- algos/regression/ridge.py +37 -0
- analysis/exploration.py +20 -0
- analysis/preprocessing.py +113 -0
- app.py +49 -0
- carott.png +0 -0
- requirements.txt +10 -0
- utilities/components.py +118 -0
- utilities/land.py +35 -0
- utilities/standard_template.py +58 -0
- utilities/template_helpers.py +9 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+**/__pycache__/
README.md
ADDED
@@ -0,0 +1,21 @@
+# AIViz
+Software Engineering Project ESILV DIA 1
+
+Web application allowing the user to perform Machine Learning on their own datasets. Results are then displayed through dynamic visualizations, and can be downloaded.
+
+To run the app locally:
+
+- Make sure to have Python 3.10+
+- Install the app dependencies:
+
+```
+pip install -r requirements.txt
+```
+
+- Run the app with Streamlit:
+
+```
+streamlit run app.py
+```
+
+AIViz is also accessible <a href="https://clementcornet-aiviz-app-n0g5vp.streamlit.app/">online</a>.
algos/classification/logistic.py
ADDED
@@ -0,0 +1,31 @@
+import streamlit as st
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from types import NoneType
+
+def process(data):
+    if type(data[0]) == NoneType or type(data[1]) == NoneType: # if either training or testing dataset is still missing
+        st.info('Please Upload Data')
+        return None
+
+    if 'object' in list(data[0].dtypes) or 'object' in list(data[1].dtypes):
+        st.info('Please Upload Numerical Data.')
+        return None
+    #st.write(data[0].dtypes)
+
+    x_train = data[0].iloc[:,:-1]  # features: every column but the last
+    y_train = data[0].iloc[:,-1]   # target: last column of the training set
+    #st.write(x_train.shape)
+    x_test = data[1].iloc[:,:x_train.shape[1]]
+    #st.dataframe(data[1])
+    #st.write(x_test.shape)
+
+    if len(x_train.columns) != len(x_test.columns):
+        st.info('Training and testing datasets have a different number of columns, cannot perform classification.')
+        return None
+
+    clf = LogisticRegression(random_state=0).fit(x_train, y_train)
+    #clf.fit(x_train, y_train)
+    pred = clf.predict(x_test)
+    x_test[data[0].columns[-1]] = pred  # append predictions under the target column name
+    return x_test
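All three classification modules in this commit share the same contract: `data` is a `(train, test)` tuple of DataFrames, the training frame's last column is the target, and the test frame carries the same feature columns. A minimal sketch of calling one of them (the toy frames and column names here are invented for illustration):

```python
import pandas as pd
import algos.classification.logistic as logistic

# Hypothetical training set: two numeric features, target class in the last column.
train = pd.DataFrame({
    'x1': [0.1, 0.4, 0.9, 1.2],
    'x2': [1.0, 0.8, 0.2, 0.1],
    'label': [0, 0, 1, 1],
})
# Test set: same feature columns, no target column.
test = pd.DataFrame({'x1': [0.2, 1.1], 'x2': [0.9, 0.15]})

# process() returns the test frame with a predicted 'label' column appended.
result = logistic.process((train, test))
print(result)
```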
algos/classification/nnclassifier.py
ADDED
@@ -0,0 +1,29 @@
+import streamlit as st
+from sklearn.neural_network import MLPClassifier
+import pandas as pd
+from types import NoneType
+
+def process(data):
+    if type(data[0]) == NoneType or type(data[1]) == NoneType: # if either training or testing dataset is still missing
+        st.info('Please Upload Data')
+        return None
+    if 'object' in list(data[0].dtypes) or 'object' in list(data[1].dtypes):
+        st.info('Please Upload Numerical Data.')
+        return None
+    x_train = data[0].iloc[:,:-1]  # features: every column but the last
+    y_train = data[0].iloc[:,-1]   # target: last column of the training set
+    #st.write(x_train.shape)
+    x_test = data[1].iloc[:,:x_train.shape[1]]
+    #st.dataframe(data[1])
+    #st.write(x_test.shape)
+
+    if len(x_train.columns) != len(x_test.columns):
+        st.info('Training and testing datasets have a different number of columns, cannot perform classification.')
+        return None
+
+    clf = MLPClassifier(random_state=1, max_iter=300).fit(x_train, y_train)
+    pred = clf.predict(x_test)
+    #x_test[data[0].columns[-1]] = pred
+    x_test[data[0].columns[-1]] = pred  # append predictions under the target column name
+    #st.dataframe(x_test)
+    return x_test
algos/classification/svmclassifier.py
ADDED
@@ -0,0 +1,30 @@
+import streamlit as st
+import numpy as np
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.svm import SVC
+from types import NoneType
+
+def process(data):
+    if type(data[0]) == NoneType or type(data[1]) == NoneType: # if either training or testing dataset is still missing
+        st.info('Please Upload Data')
+        return None
+    if 'object' in list(data[0].dtypes) or 'object' in list(data[1].dtypes):
+        st.info('Please Upload Numerical Data.')
+        return None
+    x_train = data[0].iloc[:,:-1]  # features: every column but the last
+    y_train = data[0].iloc[:,-1]   # target: last column of the training set
+    #st.write(x_train.shape)
+    x_test = data[1].iloc[:,:x_train.shape[1]]
+    #st.dataframe(data[1])
+    #st.write(x_test.shape)
+
+    if len(x_train.columns) != len(x_test.columns):
+        st.info('Training and testing datasets have a different number of columns, cannot perform classification.')
+        return None
+
+    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))  # scale features, then fit the SVM
+    clf.fit(x_train, y_train)
+    pred = clf.predict(x_test)
+    x_test[data[0].columns[-1]] = pred
+    return x_test
algos/clustering/dbscan.py
ADDED
@@ -0,0 +1,26 @@
+import pandas as pd
+import streamlit as st
+from sklearn.cluster import DBSCAN
+from sklearn.preprocessing import StandardScaler
+import numpy as np
+
+
+def process(data):
+
+    if 'object' in list(data[0].dtypes):
+        st.info('This Algorithm can only process numerical data')
+        return None
+
+    scaler = StandardScaler()
+    df = data[0].copy()
+
+    for c in data[0].columns:
+        df[c] = scaler.fit_transform(data[0][[c]])  # standardize each column
+
+    max_distance = st.slider("""Maximum distance between two samples for one to be considered
+        as in the neighborhood of the other:""",0.01,5.0)
+    dbscan = DBSCAN(eps=max_distance)
+    res = dbscan.fit_predict(df)
+    df = data[0]
+    df['cluster'] = res  # label the original (unscaled) data with cluster ids
+    return df
algos/clustering/kmeans.py
ADDED
@@ -0,0 +1,24 @@
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+import streamlit as st
+from sklearn.cluster import KMeans
+
+
+
+def process(data):
+
+    if 'object' in list(data[0].dtypes):
+        st.info('This Algorithm can only process numerical data')
+        return None
+
+    scaler = StandardScaler()
+    df = data[0].copy()
+
+    for c in data[0].columns:
+        df[c] = scaler.fit_transform(data[0][[c]])  # standardize each column
+    k = st.slider('Number of Clusters :',2,9)
+    kmeans = KMeans(n_clusters=k)
+    res = kmeans.fit_predict(df)
+    df = data[0]
+    df['cluster'] = res  # label the original (unscaled) data with cluster ids
+    return df
algos/clustering/kproto.py
ADDED
@@ -0,0 +1,44 @@
+from sklearn.preprocessing import StandardScaler
+from kmodes.kprototypes import KPrototypes
+from kmodes.kprototypes import euclidean_dissim
+import streamlit as st
+import algos.clustering.kmeans
+
+def process(data):
+
+
+    """Process K-Prototypes"""
+    df = data[0]
+    if 'object' not in list(df.dtypes):
+        return algos.clustering.kmeans.process(data)  # purely numerical data: fall back to K-Means
+
+    k = st.slider('Number of Clusters :',2,9)
+
+    numerical_columns = df.select_dtypes('number').columns
+    categorical_columns = df.select_dtypes('object').columns
+    categorical_indexes = []
+
+    # Scaling
+    scaler = StandardScaler()
+    for c in categorical_columns:
+        categorical_indexes.append(df.columns.get_loc(c))
+    if len(numerical_columns) == 0 or len(categorical_columns) == 0:
+        return
+    # create a copy of our data to be scaled
+    df_scale = df.copy()
+    # standard scale numerical features
+    for c in numerical_columns:
+        df_scale[c] = scaler.fit_transform(df[[c]])
+
+    # Process Data
+    kproto = KPrototypes(n_clusters=k,
+                         num_dissim=euclidean_dissim,
+                         random_state=0)
+
+    kproto.fit_predict(df_scale, categorical=categorical_indexes)
+
+    # add clusters to dataframe
+    df = data[0]
+    df["cluster"] = kproto.labels_
+
+    return df
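For reference, a standalone sketch of the K-Prototypes call above, outside Streamlit. The tiny mixed-type frame is invented for illustration, and this assumes the kmodes package pinned in requirements.txt accepts a DataFrame directly:

```python
import pandas as pd
from kmodes.kprototypes import KPrototypes

# Hypothetical mixed-type frame: one numerical and one categorical column.
df = pd.DataFrame({
    'income': [12.0, 15.5, 90.0, 88.2],
    'city':   ['Paris', 'Lyon', 'Paris', 'Nice'],
})

# K-Prototypes is told which columns are categorical by positional index,
# exactly as kproto.py computes categorical_indexes above.
cat_idx = [df.columns.get_loc(c) for c in df.select_dtypes('object').columns]

kproto = KPrototypes(n_clusters=2, random_state=0)
labels = kproto.fit_predict(df, categorical=cat_idx)
print(labels)  # one cluster id per row
```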
algos/others/others_page.py
ADDED
@@ -0,0 +1,4 @@
+import streamlit as st
+
+def render():
+    st.title("Other Algorithms")
algos/regression/elasticnet.py
ADDED
@@ -0,0 +1,43 @@
+from sklearn.linear_model import ElasticNet
+import streamlit as st
+from types import NoneType
+
+def process(data):
+    if len(data) == 0:
+        st.info('Please Upload Data')
+        return None
+    if type(data[0]) == NoneType or type(data[1]) == NoneType: # if either training or testing dataset is still missing
+        st.info('Please Upload Data')
+        return None
+    if 'object' in list(data[0].dtypes) or 'object' in list(data[1].dtypes):
+        st.info('Please Upload Numerical Data.')
+        return None
+    x_train = data[0].iloc[:,:-1]  # features: every column but the last
+    y_train = data[0].iloc[:,-1]   # target: last column of the training set
+    #st.write(x_train.shape)
+    x_test = data[1].iloc[:,:x_train.shape[1]]
+    #st.dataframe(data[1])
+    #st.write(x_test.shape)
+
+    if len(x_train.columns) != len(x_test.columns):
+        st.info('Training and testing datasets have a different number of columns, cannot perform regression.')
+        return None
+
+    clf = ElasticNet().fit(x_train, y_train)
+    pred = clf.predict(x_test)
+
+
+    cols = x_train.columns
+
+    #st.write(clf.coef_)
+
+    st.latex(f" {data[0].columns[-1]} = ")
+    coeffs = ['{:.4f}'.format(float(c)) for c in clf.coef_]
+    #st.write(coeffs)
+    eq = ' + '.join([coef + ' × ' + str(col) for coef, col in zip(coeffs, cols)])
+    st.markdown(f" $$ {clf.intercept_} + {eq} $$")
+
+    st.latex(f" R^2 = {clf.score(x_train, y_train)} ")
+
+    x_test[data[0].columns[-1]] = pred
+    return x_test
algos/regression/linR.py
ADDED
@@ -0,0 +1,41 @@
+from sklearn.linear_model import LinearRegression
+import streamlit as st
+from types import NoneType
+
+def process(data):
+    if len(data) == 0:
+        st.info('Please Upload Data')
+        return None
+    if type(data[0]) == NoneType or type(data[1]) == NoneType: # if either training or testing dataset is still missing
+        st.info('Please Upload Data')
+        return None
+    if 'object' in list(data[0].dtypes) or 'object' in list(data[1].dtypes):
+        st.info('Please Upload Numerical Data.')
+        return None
+    x_train = data[0].iloc[:,:-1]  # features: every column but the last
+    y_train = data[0].iloc[:,-1]   # target: last column of the training set
+    #st.write(x_train.shape)
+    x_test = data[1].iloc[:,:x_train.shape[1]]
+    #st.dataframe(data[1])
+    #st.write(x_test.shape)
+
+    if len(x_train.columns) != len(x_test.columns):
+        st.info('Training and testing datasets have a different number of columns, cannot perform regression.')
+        return None
+
+    reg = LinearRegression().fit(x_train, y_train)
+
+
+    cols = x_train.columns
+    #st.write(list(zip(reg.coef_,cols)))
+    st.latex(f" {data[0].columns[-1]} = ")
+    coeffs = ['{:.4f}'.format(float(c)) for c in reg.coef_]
+
+    eq = ' + '.join([coef + ' × ' + str(col) for coef, col in zip(coeffs, cols)])
+    st.markdown(f" $$ {reg.intercept_} + {eq} $$")
+
+    st.latex(f" R^2 = {reg.score(x_train, y_train)} ")
+
+    pred = reg.predict(x_test)
+    x_test[data[0].columns[-1]] = pred
+    return x_test
algos/regression/ridge.py
ADDED
@@ -0,0 +1,37 @@
+from sklearn.linear_model import Ridge
+import streamlit as st
+from types import NoneType
+
+def process(data):
+    if len(data) == 0:
+        st.info('Please Upload Data')
+        return None
+    if type(data[0]) == NoneType or type(data[1]) == NoneType: # if either training or testing dataset is still missing
+        st.info('Please Upload Data')
+        return None
+    if 'object' in list(data[0].dtypes) or 'object' in list(data[1].dtypes):
+        st.info('Please Upload Numerical Data.')
+        return None
+    x_train = data[0].iloc[:,:-1]  # features: every column but the last
+    y_train = data[0].iloc[:,-1]   # target: last column of the training set
+    #st.write(x_train.shape)
+    x_test = data[1].iloc[:,:x_train.shape[1]]
+    #st.dataframe(data[1])
+    #st.write(x_test.shape)
+
+    if len(x_train.columns) != len(x_test.columns):
+        st.info('Training and testing datasets have a different number of columns, cannot perform regression.')
+        return None
+
+    clf = Ridge(alpha=1.0).fit(x_train, y_train)
+    pred = clf.predict(x_test)
+    #st.write(clf.coef_)
+
+    cols = x_train.columns
+    st.latex(f" {data[0].columns[-1]} = ")
+    coeffs = ['{:.4f}'.format(float(c)) for c in clf.coef_]
+    eq = ' + '.join([coef + ' × ' + str(col) for coef, col in zip(coeffs, cols)])
+    st.markdown(f" $$ {clf.intercept_} + {eq} $$")
+    st.latex(f" R^2 = {clf.score(x_train, y_train)} ")
+    x_test[data[0].columns[-1]] = pred
+    return x_test
analysis/exploration.py
ADDED
@@ -0,0 +1,20 @@
+import streamlit as st
+from utilities.template_helpers import upload_data
+import pandas as pd
+from types import NoneType
+from pandas_profiling import ProfileReport
+from streamlit_pandas_profiling import st_profile_report
+import sys
+
+def render():
+    st.title("DATA EXPLORATION")
+    col1, col2 = st.columns([2,5])
+    df = None
+    with col1.container():
+        df = upload_data()
+        if type(df) is NoneType:
+            return
+        st.dataframe(df.describe())
+    with col2.container():
+        pr = ProfileReport(df)
+        st_profile_report(pr)
analysis/preprocessing.py
ADDED
@@ -0,0 +1,113 @@
+import streamlit as st
+from utilities.template_helpers import upload_data
+from types import NoneType
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+
+
+def render():
+    st.title("PREPROCESSING")
+    # dropna
+    # fillna
+    # select columns
+    # scaling
+
+    col1, col2, col3 = st.columns([1,1,1])
+
+    df = None
+    with col1.container():
+        df = upload_data()
+        if type(df) is NoneType:
+            return
+        if df.shape == (0,0):
+            return
+        info = pd.DataFrame()
+        info['dtypes'] = pd.DataFrame(df.dtypes)
+        info['null'] = df.isna().sum()
+
+        tab1, tab2 = st.tabs(['Dataframe','Info'])
+        with tab1:
+            st.dataframe(df, use_container_width=True, height=300)
+        with tab2:
+            st.dataframe(info,use_container_width=True,height=300)
+
+    with col2.container():
+        ### DROP NA ###
+        st.write('\n\n')
+        st.markdown('#### Drop Null Values')
+        st.write('Drop any row containing null values')
+        drop_null = st.checkbox('Drop')
+        if drop_null:
+            df.dropna(inplace=True)
+
+        ### FILL NA ####
+        st.write("\n\n")
+        st.markdown('#### Fill Null Values')
+        st.write("""Replace null values with the mean of the column for numerical variables,
+            and the mode for categorical variables""")
+        fill_null = st.checkbox('Fill')
+        if fill_null:
+            for col in df.columns:
+                val = 0
+                if df[col].dtype == 'object':
+                    val = df[col].mode()[0]  # mode() returns a Series; take its first value
+                else:
+                    val = df[col].mean()
+                df[col] = df[col].fillna(val)  # fillna returns a copy; assign it back
+
+        ### SCALING ###
+        st.write('\n\n')
+        st.markdown("#### Scaling")
+        st.write("Standardize numerical features by removing the mean and scaling to unit variance.")
+        scale = st.checkbox('Scale')
+        if scale:
+            numerical_columns = df.select_dtypes('number').columns
+            categorical_columns = df.select_dtypes('object').columns
+            categorical_indexes = []
+
+            # Scaling
+            scaler = StandardScaler()
+            for c in categorical_columns:
+                categorical_indexes.append(df.columns.get_loc(c))
+            # create a copy of our data to be scaled
+            df_scale = df.copy()
+            # standard scale numerical features
+            for c in numerical_columns:
+                df_scale[c] = scaler.fit_transform(df[[c]])
+            df = df_scale
+
+
+    with col3.container():
+        ### SELECT COLUMNS
+        st.write("\n\n")
+        st.markdown("#### Choose columns")
+        cols = st.multiselect('Select columns to use',options=list(df.columns),default=list(df.columns))
+        #select_cols = st.button('Use selected columns')
+        #if select_cols:
+        df = df[cols]
+
+        st.write("\n\n")
+        st.markdown("#### Encode Categorical Values")
+        enc = st.checkbox('Encode')
+        if enc:  # map each distinct category to an integer code, column by column
+            df.loc[:,df.dtypes == 'object'] = df.loc[:,df.dtypes == 'object'].apply(
+                lambda x: x.replace(x.unique(),list(range(1,1+len(x.unique())))))
+
+        st.write('\n\n')
+        st.markdown("#### Download Preprocessed data")
+        st.download_button("Download Results",
+            df.to_csv(index=False),
+            "preprocessed.csv",
+            "text/csv",
+            key="download-csv")
+        #st.dataframe(df)
+
+
+
+
+#def res_session():
+#    st.session_state['drop_na'] = False
+#    st.session_state['fill_na'] = False
+#    st.session_state['scale'] = False
+#    st.session_state['']
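The Encode checkbox above relies on a one-liner that maps each distinct category to an integer code. A standalone sketch of that mapping on an invented toy column, for clarity:

```python
import pandas as pd

s = pd.Series(['red', 'green', 'red', 'blue'])

# Replace each unique value with an integer code starting at 1, in order of
# first appearance -- the same trick the Encode checkbox applies per column.
encoded = s.replace(s.unique(), list(range(1, 1 + len(s.unique()))))
print(encoded.tolist())  # [1, 2, 1, 3]
```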
app.py
ADDED
@@ -0,0 +1,49 @@
+import streamlit as st
+from utilities.standard_template import Page, get_info
+from utilities.land import land_page
+import analysis.preprocessing
+import analysis.exploration
+import warnings
+
+import algos.others.others_page
+
+warnings.filterwarnings("ignore")
+
+# PAGE CONFIGURATION, CHANGE NAME AND ICON
+
+st.set_page_config(layout="wide",page_title='AIViz',page_icon='carott.png')
+hide_streamlit_style = """
+            <style>
+            #MainMenu {visibility: hidden;}
+            footer {visibility: hidden;}
+            </style>
+            """
+#st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+
+with st.sidebar:
+    #st.image('carott.png')
+    choice = st.selectbox('Choose Algorithm Category',[
+        " --- Choose --- ",
+        "Clustering",
+        "Classification",
+        "Regression",
+        "Data Exploration",
+        "Data Preprocessing",
+        #"Others"
+    ])
+    get_info(choice)
+
+if choice in ['Clustering', 'Classification', 'Regression']:
+    Page(choice).render()
+
+elif choice == 'Data Preprocessing':
+    analysis.preprocessing.render()
+
+elif choice == 'Data Exploration':
+    analysis.exploration.render()
+
+elif choice == 'Others':
+    algos.others.others_page.render()
+
+else:
+    land_page()
carott.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+numpy
+pandas
+kmodes
+scikit-learn
+streamlit
+extra_streamlit_components
+plotly
+prince
+pandas-profiling
+streamlit-pandas-profiling
utilities/components.py
ADDED
@@ -0,0 +1,118 @@
+import streamlit as st
+from utilities.template_helpers import upload_data
+from types import NoneType
+import extra_streamlit_components as stx
+
+import prince
+import plotly.express as px
+import pandas as pd
+
+import algos.clustering.kmeans
+import algos.clustering.dbscan
+import algos.clustering.kproto
+
+import algos.classification.nnclassifier
+import algos.classification.logistic
+import algos.classification.svmclassifier
+
+import algos.regression.linR
+import algos.regression.ridge
+import algos.regression.elasticnet
+
+
+
+def get_data(category, algo_name=None):
+    if category in ['Classification','Regression']:
+        train = upload_data('Training Data')
+        test = upload_data('Testing Data')
+        return train, test
+    else:
+        df = upload_data()
+        if type(df) != NoneType:
+            return (df,)
+
+
+def choose_algo(category):
+    if category == 'Clustering':
+        algo = stx.tab_bar(data=[
+            stx.TabBarItemData(id='K-Means',title='K-Means',description='Partitional Clustering Algorithm'),
+            stx.TabBarItemData(id='DBSCAN',title='DBSCAN',description='Density Based Clustering Algorithm'),
+            stx.TabBarItemData(id='K-Prototype',title='K-Prototype',description='Partitional over Mixed Data')]
+        )
+        if algo == 'K-Means':
+            return algos.clustering.kmeans.process
+        if algo == 'DBSCAN':
+            return algos.clustering.dbscan.process
+        if algo == 'K-Prototype':
+            return algos.clustering.kproto.process
+    elif category == 'Classification':
+        algo = stx.tab_bar(data=[
+            stx.TabBarItemData(id='NN',title='Neural Network',description='Multi-Layer Perceptron classifier'),
+            stx.TabBarItemData(id='SVM',title='Support Vector Classifier',
+                description='Classification using Support Vector Machines'),
+            stx.TabBarItemData(id='logR',title='Logistic Regression',description='Logistic Regression Classifier')]
+        )
+        if algo == 'NN':
+            return algos.classification.nnclassifier.process
+        if algo == 'SVM':
+            return algos.classification.svmclassifier.process
+        if algo == 'logR':
+            return algos.classification.logistic.process
+    elif category == 'Regression':
+        algo = stx.tab_bar(data=[
+            stx.TabBarItemData(id='linR',title='Linear Regression',description='Linear Regression'),
+            stx.TabBarItemData(id='ridge',title='Ridge',
+                description='Ridge Regression'),
+            stx.TabBarItemData(id='elastic',title='Elastic Net Regression',description='Elastic Net Regression')]
+        )
+        if algo == 'linR':
+            return algos.regression.linR.process
+        if algo == 'ridge':
+            return algos.regression.ridge.process
+        if algo == 'elastic':
+            return algos.regression.elasticnet.process
+
+
+def get_plot(df, title):
+
+    if title == 'Regression':
+        return None # Do not plot regression, display its coefficients
+
+    reduce_algo = None
+    pca = None
+
+    # Better title for the graph
+    viz_thing = 'Clusters'
+    if title == 'Classification':
+        viz_thing = 'Classes'
+
+    # name of column to represent as color on the graph (target class)
+    if type(df) == NoneType:
+        return None
+    if len(df) == 0:
+        return None
+    target_class = df.columns[-1]
+
+    if df.shape == (0,0):
+        return None
+
+    if 'object' in list(df.dtypes):
+        reduce_algo = 'FAMD'
+        pca = prince.FAMD(n_components=3)
+    else:
+        reduce_algo = 'Principal Component Analysis'
+        pca = prince.PCA(n_components=3)
+    reduced = pca.fit(df.iloc[:,:-1]).row_coordinates(df.iloc[:,:-1])
+    reduced.columns = ['X','Y','Z']
+    reduced[target_class] = df[target_class].astype(str)
+    # Explained inertia of each axis
+    labs = {
+        "X" : f"Component 0 - ({round(100*pca.explained_inertia_[0],2)}% inertia)",
+        "Y" : f"Component 1 - ({round(100*pca.explained_inertia_[1],2)}% inertia)",
+        "Z" : f"Component 2 - ({round(100*pca.explained_inertia_[2],2)}% inertia)",
+    }
+    tot_inertia = f"{round(100*pca.explained_inertia_.sum(),2)}"
+    st.write(f'{reduce_algo} Visualization of {viz_thing} ({tot_inertia}%) :')
+    fig = px.scatter_3d(reduced,x='X',y='Y',z='Z',color=target_class,labels=labs)
+    fig.update_layout(margin=dict(l=0, r=0, b=0, t=0),showlegend=False,height=300)
+    return fig
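get_plot reduces the feature columns to three components before handing X/Y/Z coordinates to plotly. A standalone sketch of that projection step, using the same prince fit/row_coordinates calls as above (the toy frame is invented; the last column plays the role of the cluster/class label):

```python
import pandas as pd
import prince

# Hypothetical all-numeric frame; 'cluster' stands in for the label column.
df = pd.DataFrame({
    'a': [1.0, 2.0, 3.0, 4.0, 5.0],
    'b': [4.0, 3.0, 2.0, 1.0, 0.0],
    'c': [1.5, 2.5, 3.5, 4.5, 5.5],
    'd': [0.5, 0.1, 0.9, 0.3, 0.7],
    'cluster': [0, 0, 1, 1, 0],
})

# Project the features (all but the last column) onto three principal
# components, as get_plot does before building the 3D scatter.
pca = prince.PCA(n_components=3)
reduced = pca.fit(df.iloc[:, :-1]).row_coordinates(df.iloc[:, :-1])
reduced.columns = ['X', 'Y', 'Z']
print(reduced)  # one (X, Y, Z) coordinate per row
```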
utilities/land.py
ADDED
@@ -0,0 +1,35 @@
+import streamlit as st
+
+
+
+def land_page():
+    _,center,_ = st.columns([2,3,2])
+    center.markdown("<h1 style='text-align: center;'>AIViz</h1>", unsafe_allow_html=True)
+    center.write("""Machine Learning. For everyone. Now. AIViz is a platform built to let everyone perform Machine
+        Learning easily on their own data.""")
+
+    center.image('carott.png')
+
+    center.markdown("<h3 style='text-align: center;'>Use your own data</h3>", unsafe_allow_html=True)
+
+    center.write("You can use your own data with AIViz. All it takes is a click.")
+
+    center.markdown("<h3 style='text-align: center;'>Understand your Data</h3>", unsafe_allow_html=True)
+
+    center.write("""AIViz provides a Data Exploration tool that lets you explore all your variables. You can
+        easily visualize and understand univariate and bivariate behavior of your data.""")
+
+    center.markdown("<h3 style='text-align: center;'>Preprocessing</h3>", unsafe_allow_html=True)
+
+    center.write("""You can prepare your data for Machine Learning in just a few clicks. You can decide how
+        to handle missing values, choose which columns to use, scale your data...""")
+
+    center.markdown("<h3 style='text-align: center;'>Machine Learning</h3>", unsafe_allow_html=True)
+
+    st.latex("""The \ smartest \ carott \ of \ the \ World \\newline \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
+        \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
+        \ \ \ \ \ \ \ \ \ \ \ \ \ - \ us.""")
+
+    center.write("""The core of AIViz is Machine Learning. Now that you have uploaded and preprocessed
+        your data, you can apply Artificial Intelligence algorithms to it. We provide several
+        different algorithms, for Clustering, Classification or Regression.""")
utilities/standard_template.py
ADDED
@@ -0,0 +1,58 @@
+import streamlit as st
+from utilities.components import get_data, choose_algo, get_plot
+from types import NoneType
+import pandas as pd
+
+
+def get_info(category):
+    infos = {
+        " --- Choose --- ":'We provide several different types of algorithms, such as Clustering or Classification',
+        "Clustering":'Unsupervised, creates clusters of similar individuals',
+        "Classification":"""Supervised, assigns individuals to a class using
+            training data. The last column will be used as the target class.""",
+        "Regression":"Supervised, predicts a numerical value for a column, using training data",
+        "Data Exploration":"Univariate and bivariate data analysis",
+        "Data Preprocessing":"Prepare data for Machine Learning",
+        "Others":'Other algorithms, such as linear regression'
+    }
+    st.info(infos[category])
+
+class Page:
+    def __init__(self, title) -> None:
+        self.title = title
+        self.data = None
+        self.algo = None
+        self.plot = None
+        self.results = None
+
+    def render(self):
+        st.title(self.title.upper())
+        col1, col2 = st.columns([2,5])
+
+        ##### CHOOSE DATA #####
+        with col1.container():
+            data = get_data(self.title)
+            if type(data) == tuple:
+                if self.title == 'Clustering' and type(data[0]) is not NoneType:
+                    st.dataframe(data[0], use_container_width=True,height=280)
+                self.data = data
+
+
+        with col2.container():
+            ##### CHOOSE ALGORITHM #####
+            self.algo = choose_algo(self.title)
+            if self.algo is not None and self.data is not None:
+                self.results = pd.DataFrame(self.algo(self.data))
+                self.plot = get_plot(self.results, self.title)
+
+            ##### PLOT RESULTS #####
+            if self.plot is not None:
+                st.plotly_chart(self.plot)
+
+        ##### DOWNLOAD RESULTS #####
+        if self.results is not None:
+            col1.download_button("Download Results",
+                self.results.to_csv(index=False),
+                "results.csv",
+                "text/csv",
+                key="download-csv")
utilities/template_helpers.py
ADDED
@@ -0,0 +1,9 @@
+import pandas as pd
+import streamlit as st
+
+
+def upload_data(descr='Upload Data'):
+    up = st.file_uploader(descr)
+    if up:
+        df = pd.read_csv(up).dropna()
+        return df