Upload 12 files
- .streamlit/config.toml +7 -0
- app.py +166 -53
- auto_optimizer.py +361 -317
- best_tts.py +2 -2
- eda.py +325 -0
- feature_selections.py +6 -6
- grid_search_cv.py +284 -0
- models.py +2 -0
- requirements.txt +5 -4
.streamlit/config.toml ADDED

@@ -0,0 +1,7 @@
+
+[theme]
+primaryColor="#F63366"
+backgroundColor="#002148"
+secondaryBackgroundColor="#576c86"
+textColor="white"
+font="serif"
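Streamlit loads `.streamlit/config.toml` from the app's working directory automatically, so the theme above needs no code changes. A hypothetical check (not part of this commit) to confirm the values were picked up:

import streamlit as st

st.write("Primary color:", st.get_option("theme.primaryColor"))  # expected: "#F63366"
st.write("Font:", st.get_option("theme.font"))                   # expected: "serif"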
app.py CHANGED

@@ -8,6 +8,7 @@ import evaluationer,models, null_value_handling
 import auto_optimizer
 from sklearn.experimental import enable_iterative_imputer
 from sklearn.impute import SimpleImputer, IterativeImputer
+import eda
 # st.set_page_config(layout="wide")
 
 st.set_page_config(
@@ -21,7 +22,23 @@ st.set_page_config(
 }
 )
 
-
+
+
+# Set the background image
+background_image = """
+<style>
+[data-testid="stAppViewContainer"] > .main {
+    background-image: url("https://w.wallhaven.cc/full/jx/wallhaven-jx7w25.png");
+    background-size: 100vw 100vh;  /* cover 100% of the viewport width and height */
+    background-position: center;
+    background-repeat: no-repeat;
+}
+</style>
+"""
+
+st.markdown(background_image, unsafe_allow_html=True)
+
+
 
 # Title with Rainbow Transition Effect and Neon Glow
 html_code = """
@@ -67,23 +84,74 @@ html_code = """
 """
 
 st.markdown(html_code, unsafe_allow_html=True)
+st.divider()
+
+st.markdown(
+    """
+    <style>
+    .success-message {
+        font-family: Arial, sans-serif;
+        font-size: 24px;
+        color: green;
+        text-align: left;
+    }
+    .unsuccess-message {
+        font-family: Arial, sans-serif;
+        font-size: 24px;
+        color: red;
+        text-align: left;
+    }
+    .prompt-message {
+        font-family: Arial, sans-serif;
+        font-size: 24px;
+        color: #333;
+        text-align: center;
+    }
+    .success-message2 {
+        font-family: Arial, sans-serif;
+        font-size: 18px;
+        color: white;
+        text-align: left;
+    }
+    .message-box {
+        text-align: center;
+        background-color: white;
+        padding: 5px;
+        border-radius: 10px;
+        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+        font-size: 24px;
+        color: #333;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True
+)
 
 
+# st.markdown('<p class="success-message">Train File uploaded successfully. ✅</p>', unsafe_allow_html=True)
 # file uploader
 csv_upload = st.sidebar.file_uploader("Input CSV File for ML modelling", type=['csv'])
+
+sep = st.sidebar.text_input("Input Separator")
+if (len(sep) ==0):
+    sep = ","
 csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction",type = ["csv"])
+
 test = pd.DataFrame()
 if csv_upload is not None:
     # read the uploaded file into dataframe
-    df = pd.read_csv(csv_upload)
+    df = pd.read_csv(csv_upload,sep = sep)
 
     # saving the dataframe to a CSV file
     df.to_csv('csv_upload.csv', index=False)
-    st.…
-
+    st.markdown('<p class="success-message">Train File uploaded successfully. ✅</p>', unsafe_allow_html=True)
+
     if csv_upload2 is not None:
-        test = pd.read_csv(csv_upload2)
-
+        test = pd.read_csv(csv_upload2,sep = sep)
+        st.markdown('<p class="success-message">Test File uploaded successfully. ✅</p>', unsafe_allow_html=True)
+        st.divider()
+        id_col = st.selectbox("Select Column for Submission i.e, ID",test.columns)
+        st.divider()
         submission_id = test[id_col]
         # st.write("Train File upl",submission_id)
 
@@ -93,8 +161,10 @@ if csv_upload is not None:
     if len(test) >0:
         # saving the test dataframe to a CSV file
         test.to_csv('csv_upload_test.csv', index=False)
-
+
 
+    st.markdown('<p class="message-box">Display Data</p>', unsafe_allow_html=True)
+    st.write("")
     display_train_data = st.radio("Display Train Data",["Yes","No"],index = 1)
     if display_train_data == "Yes":
         st.dataframe(df.head())
@@ -104,29 +174,40 @@ if csv_upload is not None:
     if display_test_data == "Yes":
         st.dataframe(test.head())
 
-
-
-
-
+    st.divider()
+    st.markdown('<div class="message-box success">Select Supervision Category</div>', unsafe_allow_html=True)
+    if st.radio("",["Supervised","Un-Supervised"],index =0) == "Supervised":
+        st.divider()
+
+        st.write('<p class="success-message2">Select Target column</p>', unsafe_allow_html=True)
+        selected_column = st.selectbox('', df.columns, index=(len(df.columns)-1))
 
         # Display the selected column
         st.write('You selected:', selected_column)
-
+        st.divider()
+
+        st.markdown('<div class="message-box success ">Perform EDA</div>', unsafe_allow_html=True)
+        st.write("")
+        if st.checkbox("Proceed to perform EDA"):
+            eda.eda_analysis(df)
+        st.write('<p class="success-message">EDA Performed proceed for Pre-processing</p>', unsafe_allow_html=True)
+        st.divider()
         y = df[selected_column]
 
         if y.dtype == "O":
-            st.…
-
+            st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ Target Column is Object Type ⚠️⚠️⚠️</p>', unsafe_allow_html=True)
+
+            if st.checkbox("Proceed for Label Encoding "):
                 from sklearn.preprocessing import LabelEncoder
                 le = LabelEncoder()
                 y= pd.Series(le.fit_transform(y))
-            st.…
-
-        if st.radio("Display Target Column",["Yes","No"],index =1) == "Yes":
+                st.markdown('<p class="success-message">Label Encoding Completed ✅</p>', unsafe_allow_html=True)
+        if st.checkbox("Display Target Column"):
             st.dataframe(y.head())
 
-
-
+        st.divider()
+        st.markdown('<div class="message-box success">Target column Transformation</div>', unsafe_allow_html=True)
+        select_target_trans = st.radio("",["Yes","No"],index = 1)
         if select_target_trans == "Yes":
             selected_transformation = st.selectbox("Select Transformation method",["Log Transformation","Power Transformation"])
             if selected_transformation == "Log Transformation":
@@ -155,36 +236,52 @@ if csv_upload is not None:
 
         if st.radio("Display Target Column after Transformation",["Yes","No"],index =1) == "Yes":
             st.dataframe(y.head())
-
+
+
 
         X = df.drop(columns = selected_column)
 
         if st.radio("Display X-Train Data",["Yes","No"],index =1) == "Yes":
             st.dataframe(X.head())
-
+        st.divider()
+
+        # st.checkbox()
+        st.markdown('<div class="message-box success">Check for duplicate Values</div>', unsafe_allow_html=True)
+        if st.radio(" ",["Yes","No"],index = 1) == "Yes":
             len_duplicates = len(X[X.duplicated()])
             if len_duplicates >0:
                 st.write(f"There are {len_duplicates} duplicate values in Train")
+                if st.checkbox("Show Duplicate values"):
+                    st.dataframe(X[X.duplicated()])
                 if st.selectbox("Drop Duplicate values",["Yes","No"],index = 1) == "Yes":
                     X = X.drop_duplicates()
                     st.write("Duplicate values removed ✅")
             else:
                 st.write("There are no duplicate values in Train")
+        st.divider()
         # dropping not important columns
-
+        st.markdown('<div class="message-box success">Drop Unimportant Columns</div>', unsafe_allow_html=True)
+        if st.radio(" ",["Yes","No"],index = 1) == "Yes":
             selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
             X = X.drop(columns = selected_drop_column)
             if len(test) >0:
                 test = test.drop(columns = selected_drop_column)
-            st.write("Un-Important column(s)…
+            st.write("Un-Important column(s) Deleted ✅")
             st.dataframe(X.head())
 
+        st.divider()
         num_cols = X.select_dtypes(exclude = "O").columns
         cat_cols = X.select_dtypes(include = "O").columns
         st.write("Numerical Columns in Train Data: ", tuple(num_cols))
         st.write("Categorical Columns in Train Data: ", tuple(cat_cols))
-
-
+        if st.sidebar.button("Clear Evaluation DataFrame"):
+            evaluationer.reg_evaluation_df = evaluationer.reg_evaluation_df.drop(index =evaluationer.reg_evaluation_df.index)
+            evaluationer.classification_evaluation_df = evaluationer.classification_evaluation_df.drop(index =evaluationer.classification_evaluation_df.index)
+        st.divider()
+        # markdown
+        st.markdown('<div class="message-box success">Select method for ML modelling</div>', unsafe_allow_html = True)
+        if st.radio(" ", ["Manual","Auto Optimized"],index = 0) == "Auto Optimized":
+            st.divider()
             ml_cat_ao = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)
 
             if ml_cat_ao =="Regression":
@@ -192,7 +289,7 @@ if csv_upload is not None:
                 st.write("Select ML algorithm")
                 reg_model_name = st.selectbox("select model",models.Regression_models.index)
                 reg_model = models.Regression_models.loc[reg_model_name].values[0]
-                auto_optimizer.Auto_optimizer(X,y,eva,reg_model)
+                auto_optimizer.Auto_optimizer(X,y,eva,reg_model,reg_model_name)
 
             elif ml_cat_ao =="Classification":
                 eva = "class"
@@ -201,10 +298,12 @@ if csv_upload is not None:
                 class_model = models.Classification_models.loc[class_model_name].values[0]
                 auto_optimizer.Auto_optimizer(X,y,eva,class_model)
 
-
+
         else:
+            st.divider()
             if X.isnull().sum().sum() >0 :
-
+
+                st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ There are missing values in Train Data ⚠️⚠️⚠️</p>', unsafe_allow_html=True)
 
                 if st.selectbox("Drop null values or Impute",["Drop Null Values","Impute Null Values"],index = 1) == "Drop Null Values":
 
@@ -241,7 +340,9 @@ if csv_upload is not None:
 
 
                 clean_num_nvh_df_cat = pd.DataFrame()
+
                 if X[cat_cols].isnull().sum().sum() >0:
+                    st.divider()
                     st.write("Categorical Columns with Percentage of Null Values: ")
                     cat_cols_nvh = X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0].index
                     st.dataframe(round(X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0]/len(X)*100,2))
@@ -270,33 +371,41 @@ if csv_upload is not None:
                 null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
                 st.write("X Data after Null value handling", X.head())
 
-
-
-
-
-            st.…
-
+            new_df = pd.concat([X,y[X.index]],axis = 1)
+
+            csv = new_df.to_csv(index = False)
+
+            st.markdown('<p class="success-message">Null Values Handled Successfully. ✅</p>', unsafe_allow_html=True)
+            if st.checkbox("Download Null Value Handled DataFrame as CSV File ? "):
+                st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
+            st.divider()
             ord_enc_cols = []
 
             if len(cat_cols) == 0:
                 st.write("No Categorical Columns in Train")
             else:
-                st.…
+                st.markdown('<div class="message-box success">Features Encoding</div>', unsafe_allow_html=True)
+                st.markdown('<p class="unsuccess-message">There are Object type Features in Train Data ⚠️</p>', unsafe_allow_html=True)
+                st.markdown('<p class="success-message2">Select Columns for Ordinal Encoding</p>', unsafe_allow_html=True)
+
                 for column in cat_cols:
 
                     selected = st.checkbox(column)
                     if selected:
                         st.write(f"No. of Unique value in {column} column are", X[column].nunique())
                         ord_enc_cols.append(column)
+            st.divider()
             ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
             ohe_enc_cols = list(ohe_enc_cols)
             if len(ord_enc_cols)>0:
                 st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
             if len(ohe_enc_cols)>0:
                 st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
-
+            st.divider()
+            st.markdown('<div class="message-box success">Proceed for Encoding</div>', unsafe_allow_html=True)
             if len(ord_enc_cols)>0:
-
+
+                if st.checkbox("Proceed for Ordinal Encoding"):
                     ordinal_order_vals = []
 
                     for column in ord_enc_cols:
@@ -317,7 +426,7 @@ if csv_upload is not None:
                     st.write("Ordinal Encoding Completed ✅")
 
             if len(ohe_enc_cols)>0:
-                if st.…
+                if st.checkbox("Proceed for OneHotEncoding "): # import one hot encoder
                     from sklearn.preprocessing import OneHotEncoder
                     ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
                     pd.options.mode.chained_assignment = None
@@ -331,39 +440,43 @@ if csv_upload is not None:
 
                     st.write("DataFrame after One Hot Encoding",X.head())
                     st.write("OneHot Encoding Completed ✅")
-
+            st.divider()
             new_df = pd.concat([X,y],axis = 1)
 
             csv = new_df.to_csv(index = False)
-            if st.…
+            if st.checkbox("Download Encoded DataFrame as CSV File ? "):
                 st.download_button(label="Download Ordinal Encoded CSV File",data=csv,file_name='Encoded_DataFrame.csv',mime='text/csv')
 
-
-
-
-
-
-            … "KFoldCV, Default (CV = 5)"], index = 0)== f"Train_Test_split, Default (Random_state = {random_state},Test_size = {test_size})":
+            st.divider()
+            st.markdown('<div class="message-box success">Modelling</div>', unsafe_allow_html=True)
+            st.write("")
+            st.markdown('<p class="success-message">Select Train Validation Split Method</p>', unsafe_allow_html=True)
+            if st.radio("",["Train_Test_split","KFoldCV, Default (CV = 5)"], index = 0)== "Train_Test_split":
                 ttsmethod = "Train_Test_split"
             else:
                 ttsmethod = "KFoldCV"
             st.write('You selected:', ttsmethod)
             if ttsmethod == "Train_Test_split":
+                random_state = st.number_input("Enter Random_state",max_value=100,min_value=1,value=42)
+                test_size = st.number_input("Enter test_size",max_value=0.99, min_value = 0.01,value =0.2)
                 X_train,X_Val,y_train,y_val = tts(X,y[X.index],random_state = random_state,test_size = test_size)
-                st.write('X-Training Data shape:', (X_train.info()))
 
                 st.write('X-Training Data shape:', X_train.shape)
                 st.write('X-Validation Data shape:', X_Val.shape)
-
-
-
+            st.divider()
+            st.markdown('<p class="success-message2">Select Machine Learning Category</p>', unsafe_allow_html=True)
+            ml_cat = st.radio("___",options=["Regression","Classification"],index =0)
+            st.divider()
             if ml_cat =="Regression":
-
+                st.markdown('<p class="success-message2">Select Error Evaluation Method</p>', unsafe_allow_html=True)
+                method_name_selector = st.selectbox(" ",evaluationer.method_df.index,index = 0)
+
+                st.divider()
 
                 method = evaluationer.method_df.loc[method_name_selector].values[0]
                 reg_algorithm = []
                 selected_options = []
-
+                st.markdown('<div class="message-box success">Select ML Model(s)</div>', unsafe_allow_html=True)
                 for option in models.Regression_models.index:
                     selected = st.checkbox(option)
                     if selected:
@@ -450,7 +563,7 @@ if csv_upload is not None:
 
                 cla_algorithm = []
                 selected_options = []
-
+                st.markdown('<div class="message-box success">Select ML Model(s)</div>', unsafe_allow_html=True)
                 for option in models.Classification_models.index:
                     selected = st.checkbox(option)
                     if selected:
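The new `sep` text input defaults to "," when left empty and is forwarded to both `pd.read_csv` calls. An alternative worth noting (a sketch under the assumption that uploads fit in memory, not what this commit does) is letting pandas sniff the delimiter when none is supplied:

import pandas as pd

def read_uploaded_csv(file_obj, sep=None):
    # With sep=None, pandas uses csv.Sniffer to detect the delimiter;
    # this requires the slower python parsing engine.
    if sep:
        return pd.read_csv(file_obj, sep=sep)
    return pd.read_csv(file_obj, sep=None, engine="python")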
auto_optimizer.py CHANGED

@@ -1,317 +1,361 @@
-import pandas as pd
-import numpy as np
-import streamlit as st
-from sklearn.impute import KNNImputer,SimpleImputer,IterativeImputer
-import best_tts, evaluationer,models
-from sklearn.experimental import enable_iterative_imputer
-from sklearn.model_selection import train_test_split as tts
-from collections import Counter
-
-from sklearn.metrics import root_mean_squared_error
-import seaborn as sns
-
-import …
-import …
-… (the remaining removed lines of the old 317-line file are truncated in this diff view)
+import pandas as pd
+import numpy as np
+import streamlit as st
+from sklearn.impute import KNNImputer,SimpleImputer,IterativeImputer
+import best_tts, evaluationer,models
+from sklearn.experimental import enable_iterative_imputer
+from sklearn.model_selection import train_test_split as tts
+from collections import Counter
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.metrics import root_mean_squared_error
+import seaborn as sns
+from sklearn.decomposition import PCA
+import grid_search_cv
+import matplotlib.pyplot as plt
+import outliers,best_tts
+import feature_selections
+def Auto_optimizer(X,y,eva,model,model_name,test= None):
+    if st.button("Train Regression Model"):
+        num_cols = X.select_dtypes(exclude = "O").columns
+        cat_cols = X.select_dtypes(include = "O").columns
+        st.write("Num_cols",tuple(num_cols))
+        st.write("cat_cols",tuple(cat_cols))
+
+        # check for Duplicate and drop duplicated in X
+
+        if len(X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40]) >0:
+            X = X.drop(columns = X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40].index)
+            st.write("Columns with more than 40% null values removed")
+        # st.write("csx",X)
+
+        len_null = X.isnull().sum().sum()
+
+        st.write(f"There are {len_null} null values in Train")
+
+        knn_imputed_num_X = X.copy()
+        si_mean_imputed_num_X = X.copy()
+        # st.write("sf",si_mean_imputed_num_X)
+        si_median_imputed_num_X = X.copy()
+        si_most_frequent_imputed_num_X = X.copy()
+        iter_imputed_num_X = X.copy()
+        knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
+        si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
+        si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
+        si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
+        iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
+        if len_null >0:
+
+            if X[num_cols].isnull().sum().sum() >0:
+
+                knn_imputer = KNNImputer(n_neighbors = 5)
+                knn_imputed_num_X[num_cols] = knn_imputer.fit_transform(knn_imputed_num_X[num_cols])
+                si_imputer = SimpleImputer(strategy = "mean")
+                si_mean_imputed_num_X[num_cols] = si_imputer.fit_transform(si_mean_imputed_num_X[num_cols])
+                si_imputer = SimpleImputer(strategy = "median")
+                si_median_imputed_num_X[num_cols] = si_imputer.fit_transform(si_median_imputed_num_X[num_cols])
+                si_imputer = SimpleImputer(strategy = "most_frequent")
+                si_most_frequent_imputed_num_X[num_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[num_cols])
+                iter_imputer = IterativeImputer(max_iter = 200,random_state= 42)
+                iter_imputed_num_X[num_cols] = iter_imputer.fit_transform(iter_imputed_num_X[num_cols])
+                knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
+                si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
+                si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
+                si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
+                iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
+
+            if X[cat_cols].isnull().sum().sum() >0:
+                # treating missing values in categorical columns
+                # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
+                si_imputer = SimpleImputer(strategy = "most_frequent")
+
+                knn_imputed_num_X[cat_cols] = si_imputer.fit_transform(knn_imputed_num_X[cat_cols])
+                si_imputer = SimpleImputer(strategy = "most_frequent")
+                si_mean_imputed_num_X.loc[:,cat_cols] = si_imputer.fit_transform(si_mean_imputed_num_X.loc[:,cat_cols])
+                # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
+                si_median_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_median_imputed_num_X[cat_cols])
+                si_most_frequent_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[cat_cols])
+                iter_imputed_num_X[cat_cols] = si_imputer.fit_transform(iter_imputed_num_X[cat_cols])
+
+                knn_imputed_X_cat_dropped = knn_imputed_X_cat_dropped.dropna()
+                si_mean_imputed_X_cat_dropped =si_mean_imputed_X_cat_dropped.dropna()
+                si_median_imputed_X_cat_dropped =si_median_imputed_X_cat_dropped.dropna()
+                si_most_frequent_imputed_X_cat_dropped =si_most_frequent_imputed_X_cat_dropped.dropna()
+                iter_imputed_X_cat_dropped =iter_imputed_X_cat_dropped.dropna()
+
+
+            miss_val_dropped_X = X.dropna()
+
+            # list of dataframes
+
+            list_X_after_missing_values= [knn_imputed_num_X,
+                                          si_mean_imputed_num_X,
+                                          si_median_imputed_num_X,
+                                          si_most_frequent_imputed_num_X,
+                                          iter_imputed_num_X,
+                                          knn_imputed_X_cat_dropped,
+                                          si_mean_imputed_X_cat_dropped,
+                                          si_median_imputed_X_cat_dropped,
+                                          si_most_frequent_imputed_X_cat_dropped,
+                                          iter_imputed_X_cat_dropped,
+                                          miss_val_dropped_X]
+            list_X_after_missing_values_names= ["knn_imputed_num_X",
+                                                "si_mean_imputed_num_X",
+                                                "si_median_imputed_num_X",
+                                                "si_most_frequent_imputed_num_X",
+                                                "iter_imputed_num_X",
+                                                "knn_imputed_X_cat_dropped",
+                                                "si_mean_imputed_X_cat_dropped",
+                                                "si_median_imputed_X_cat_dropped",
+                                                "si_most_frequent_imputed_X_cat_dropped",
+                                                "iter_imputed_X_cat_dropped",
+                                                "miss_val_dropped_X"]
+        # st.write("si_most_frequent_imputed_num_X",si_most_frequent_imputed_num_X,)
+        ord_enc_cols = []
+        ohe_enc_cols = []
+
+        if len(cat_cols) == 0:
+            st.write("No Categorical Columns in Train")
+        else:
+            st.write("Select Columns for Ordinal Encoding")
+            for column in cat_cols:
+                selected = st.checkbox(column)
+                if selected:
+                    st.write(f"No. of Unique value in {column} column are", X[column].nunique())
+                    ord_enc_cols.append(column)
+            ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
+            ohe_enc_cols = list(ohe_enc_cols)
+
+        if len(ord_enc_cols)>0:
+            st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
+        if len(ohe_enc_cols)>0:
+            st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
+
+        if len(ord_enc_cols)>0:
+
+            ordinal_order_vals = []
+
+            for column in ord_enc_cols:
+                unique_vals = X.dropna()[column].unique()
+                # st.write(f"No. of Unique value in {column} column are", len(unique_vals))
+
+                ordered_unique_vals = st.multiselect("Select values in order for Ordinal Encoding",unique_vals,unique_vals)
+                ordinal_order_vals.append(ordered_unique_vals)
+
+            st.write("order of values for Ordinal Encoding",tuple(ordinal_order_vals))
+
+            if len_null > 0:
+
+                for df_name, df in enumerate(list_X_after_missing_values):
+                    # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
+                    from sklearn.preprocessing import OrdinalEncoder
+                    ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
+                    df[ord_enc_cols] = ord.fit_transform(df[ord_enc_cols])
+                    # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
+            else :
+                from sklearn.preprocessing import OrdinalEncoder
+                ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
+                X[ord_enc_cols] = ord.fit_transform(X[ord_enc_cols])
+
+            st.write("Ordinal Encoding Completed ✅")
+
+        if len(ohe_enc_cols)>0:
+            if len_null > 0:
+                for df_name, df in enumerate(list_X_after_missing_values):
+                    from sklearn.preprocessing import OneHotEncoder
+                    ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
+                    pd.options.mode.chained_assignment = None
+                    df.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(df[ohe_enc_cols])
+                    df.drop(columns = ohe_enc_cols,inplace = True)
+                    pd.options.mode.chained_assignment = 'warn'
+            else:
+                from sklearn.preprocessing import OneHotEncoder
+                ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
+                pd.options.mode.chained_assignment = None
+                X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
+                X.drop(columns = ohe_enc_cols,inplace = True)
+                pd.options.mode.chained_assignment = 'warn'
+            st.write("OneHot Encoding Completed ✅")
+
+
+        if len(ohe_enc_cols)>0:
+            if len_null > 0:
+                for name,df in enumerate(list_X_after_missing_values):
+                    X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
+                    # best_tts.best_tts(df,y,model,eva)
+                    evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
+            else:
+                X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =.2 ,random_state = 42)
+                # best_tts.best_tts(X,y,model,eva)
+
+                evaluationer.evaluation(f"baseline_model",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
+
+        if len_null >0:
+            for name,df in enumerate(list_X_after_missing_values):
+                X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
+
+                evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
+
+        if eva == "class":
+            counter = Counter(y)
+            total = sum(counter.values())
+            balance_ratio = {cls: count / total for cls, count in counter.items()}
+            num_classes = len(balance_ratio)
+            ideal_ratio = 1 / num_classes
+            a = all(abs(ratio - ideal_ratio) <= 0.1 * ideal_ratio for ratio in balance_ratio.values())
+            if a == True:
+                st.write("Balanced Dataset ✅")
+                st.write("Using accuracy for Evaluation")
+                value = "test_acc"
+            else:
+                st.write("Unbalanced Dataset ❌")
+                st.write("Using F1 score for Evaluation")
+                value = "test_f1"
+
+            evaluationer.classification_evaluation_df.sort_values(by = value,inplace= True)
+            name = str(evaluationer.classification_evaluation_df.iloc[-1,0])
+            st.write("df name",evaluationer.classification_evaluation_df.iloc[-1,0])
+            if len_null >0:
+                b = list_X_after_missing_values_names.index(name)
+
+                st.write("df",list_X_after_missing_values[b])
+                X = list_X_after_missing_values[b]
+        if eva == "reg":
+            st.write("Using R2 score for Evaluation",evaluationer.reg_evaluation_df)
+            value = "test_r2"
+            evaluationer.reg_evaluation_df.sort_values(by = value,inplace= True)
+
+            name = str(evaluationer.reg_evaluation_df.iloc[-1,0])
+
+            if len_null >0:
+                b = list_X_after_missing_values_names.index(name)
+
+                st.write("df",list_X_after_missing_values[b])
+                X = list_X_after_missing_values[b]
+
+
+        # Create a figure and axes
+        num_plots = len(num_cols)
+        cols = 2  # Number of columns in the subplot grid
+        rows = (num_plots + cols - 1) // cols  # Calculate the number of rows needed
+
+        fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
+
+        # Flatten the axes array for easy iteration, and remove any excess subplots
+        axes = axes.flatten()
+        for ax in axes[num_plots:]:
+            fig.delaxes(ax)
+
+        for i, col in enumerate(num_cols):
+            sns.histplot(X[col], ax=axes[i],kde = True,color=sns.color_palette('Oranges', as_cmap=True)(0.7))
+            axes[i].set_title(col)
+
+        # Adjust layout
+        plt.tight_layout()
+
+        # Show the plot in Streamlit
+        st.pyplot(fig)
+
+        # Create a figure and axes
+        num_plots = len(num_cols)
+        cols = 3  # Number of columns in the subplot grid
+        rows = (num_plots + cols - 1) // cols  # Calculate the number of rows needed
+
+        fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
+
+        # Flatten the axes array for easy iteration, and remove any excess subplots
+        axes = axes.flatten()
+        for ax in axes[num_plots:]:
+            fig.delaxes(ax)
+
+        for i, col in enumerate(num_cols):
+            sns.boxplot(y=X[col], ax=axes[i],palette="magma")
+            axes[i].set_title(col)
+
+        # Adjust layout
+        plt.tight_layout()
+
+        # Show the plot in Streamlit
+        st.pyplot(fig)
+
+        outlier_cols = st.multiselect("De-Select columns for Detecting Outliers", num_cols,default= list(num_cols))
+
+        st.write("Checking for Outliers")
+        outliers_df_X,outlier_indexes = outliers.detect_outliers(X,list(outlier_cols))
+        st.write("Outliers in Dataframe Summary",outliers_df_X)
+        st.write("Columns for Outliers handling",tuple(outliers_df_X["columns name"]))
+
+        select_outlier_cols = st.multiselect("Select columns for Outlier Handling",tuple(outliers_df_X["columns name"]),default =tuple(outliers_df_X["columns name"]))
+        resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = "reg")
+        st.write("outlier handling with methods",resultant)
+        st.write("Best method with outlier handling",resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])
+        try :
+            st.write("Best X Data Index No.",outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0]))
+
+            st.write("Best X DataFrame after outlier handling ",outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
+            X = outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])]
+        except :
+            st.write("evaluation of baseline model is better, continuing with baseline model")
+
+
+        X_train,X_test,y_train,y_test = tts(X,y[X.index],random_state = 42,test_size = 0.2)
+        st.write("result_df",X)
+
+
+        try:
+            result_df_1 , feature_col, feature_col_name = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
+            X = X.drop(columns = feature_col[feature_col_name.index(result_df_1.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
+        except:
+            st.write("evaluation by feature selection is not better than previous")
+
+        try:
+            result,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
+            st.write("result_df",result)
+        except:
+            X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =0.2,random_state = 42)
+
+
+        st.write("checking with polynomial features")
+        poly = PolynomialFeatures(degree=(2))
+        X_train_poly = poly.fit_transform(X_train)
+        X_test_poly = poly.transform(X_test)
+        result_df_2 = evaluationer.evaluation("polynomial features degree 2",X_train_poly,X_test_poly,y_train,y_test,model,root_mean_squared_error,eva)
+        st.write("after polynomial features degree 2",evaluationer.reg_evaluation_df)
+        poly1 = PolynomialFeatures(degree=(3))
+        X_train_poly1 = poly1.fit_transform(X_train)
+        X_test_poly1 = poly1.transform(X_test)
+        evaluationer.evaluation("polynomial features degree 3",X_train_poly1,X_test_poly1,y_train,y_test,model,root_mean_squared_error,eva)
+        st.write("after polynomial features degree 3",evaluationer.reg_evaluation_df)
+
+        pca = PCA(n_components=0.95)
+        X_train_pca = pca.fit_transform(X_train)
+        X_test_pca = pca.transform(X_test)
+        evaluationer.evaluation("PCA",X_train_pca,X_test_pca,y_train,y_test,model,root_mean_squared_error,eva)
+        st.write("After PCA",evaluationer.reg_evaluation_df)
+
+        grid_search_cv.perform_grid_search(model,model_name,X_train,X_test,y_train,y_test,eva)
+        st.write("best param",evaluationer.reg_evaluation_df)
+        if st.sidebar.button("click to clear evaluation metrics"):
+            evaluationer.reg_evaluation_df = evaluationer.reg_evaluation_df.drop(index = evaluationer.reg_evaluation_df.index)
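The rewritten Auto_optimizer builds several imputed copies of X (KNN, mean, median, most-frequent, iterative imputation, plus a drop-rows variant), scores each with the same model, and keeps the best-scoring one. A condensed, self-contained sketch of that pattern, assuming an all-numeric X and a regression target (LinearRegression stands in for the user-selected model):

import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa: F401, enables IterativeImputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

def best_imputation(X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
    # Candidate imputers, mirroring the strategies tried in the diff
    candidates = {
        "knn": KNNImputer(n_neighbors=5),
        "mean": SimpleImputer(strategy="mean"),
        "median": SimpleImputer(strategy="median"),
        "iterative": IterativeImputer(max_iter=200, random_state=42),
    }
    best_score, best_X = float("-inf"), X
    for name, imputer in candidates.items():
        Xi = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
        X_tr, X_te, y_tr, y_te = train_test_split(Xi, y, test_size=0.2, random_state=42)
        score = LinearRegression().fit(X_tr, y_tr).score(X_te, y_te)  # R^2, as in the diff
        if score > best_score:
            best_score, best_X = score, Xi
    return best_X

The real module also tries dropping rows with missing categoricals and switches the metric to accuracy or F1 for classification, depending on a class-balance check.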
best_tts.py CHANGED

@@ -10,9 +10,9 @@ def best_tts(X,y,model,eva):
     if eva == "reg":
 
         test_r2_,test_r2_ts,test_r2_rs = 0,0,0
-        for k in range(10,25):
+        for k in range(10,25,3):
             i = k/100
-            for j in range(1,100):
+            for j in range(1,100,10):
                 X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size = i, random_state = j,)
 
                 model = model
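The change only coarsens the brute-force split search: range(10,25) × range(1,100) was 15 × 99 = 1,485 fits, while range(10,25,3) × range(1,100,10) is 5 × 10 = 50. A sketch of the loop best_tts appears to run (assuming regression and R² scoring):

from sklearn.model_selection import train_test_split as tts

def search_best_split(X, y, model):
    best = (float("-inf"), None, None)  # (r2, test_size, random_state)
    for k in range(10, 25, 3):       # test sizes 0.10, 0.13, 0.16, 0.19, 0.22
        for j in range(1, 100, 10):  # random states 1, 11, ..., 91
            X_tr, X_te, y_tr, y_te = tts(X, y, test_size=k / 100, random_state=j)
            r2 = model.fit(X_tr, y_tr).score(X_te, y_te)
            if r2 > best[0]:
                best = (r2, k / 100, j)
    return best

Worth noting: picking the split that maximizes the validation score optimistically biases the estimate, so the chosen random_state is effectively a tuned hyperparameter.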
eda.py
ADDED
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
import streamlit as st
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import seaborn as sns
|
6 |
+
import streamlit as st
|
7 |
+
import streamlit.components.v1 as components
|
8 |
+
import plotly.express as px
|
9 |
+
from plotly.subplots import make_subplots
|
10 |
+
import plotly.graph_objects as go
|
11 |
+
import streamlit as st
|
12 |
+
import pandas as pd
|
13 |
+
import datashader as ds
|
14 |
+
import datashader.transfer_functions as tf
|
15 |
+
from colorcet import fire
|
16 |
+
import plotly.express as px
|
17 |
+
# function to analysing EDA
|
18 |
+
def eda_analysis(df):
|
19 |
+
|
20 |
+
target_col = st.sidebar.selectbox("Select Target Column", df.columns,index = len(df.columns)-1)
|
21 |
+
y = df[target_col]
|
22 |
+
X = df.drop(columns = target_col)
|
23 |
+
num_cols = X.select_dtypes(exclude= "O").columns.tolist()
|
24 |
+
cat_cols = X.select_dtypes(include= "O").columns.tolist()
|
25 |
+
st.write("num_cols",tuple(num_cols))
|
26 |
+
st.write("cat_cols",tuple(cat_cols))
|
27 |
+
st.divider()
|
28 |
+
|
29 |
+
results = []
|
30 |
+
for column in X[num_cols].columns:
|
31 |
+
skewness = X[column].skew()
|
32 |
+
kurtosis = X[column].kurtosis()
|
33 |
+
|
34 |
+
skewness_html = f'<span style="color: {"red" if abs(skewness) > .5 else "white"}">{skewness:.2f}</span>'
|
35 |
+
kurtosis_html = f'<span style="color: {"red" if abs(kurtosis) > 3 else "white"}">{kurtosis:.2f}</span>'
|
36 |
+
|
37 |
+
results.append({
|
38 |
+
'Column': column,
|
39 |
+
'Skewness': skewness,
|
40 |
+
'Kurtosis': kurtosis,
|
41 |
+
'Skewness_': skewness_html,
|
42 |
+
'Kurtosis_': kurtosis_html
|
43 |
+
})
|
44 |
+
|
45 |
+
result_df = pd.DataFrame(results)
|
46 |
+
|
47 |
+
# Display the data types of Skewness and Kurtosis columns
|
48 |
+
# st.write("Data types of Skewness and Kurtosis columns:", result_df[["Skewness", "Kurtosis"]].dtypes)
|
49 |
+
|
50 |
+
if st.toggle("Show Skewness and Kurtosis of DataFrame columns"):
|
51 |
+
st.write("Columns with Skewness and Kurtosis:")
|
52 |
+
if st.checkbox("Filter Skewed columns"):
|
53 |
+
filtered_df = result_df[abs(result_df["Skewness"]) > 0.5]
|
54 |
+
st.write(filtered_df[['Column', 'Skewness_', 'Kurtosis_']].to_html(escape=False), unsafe_allow_html=True)
|
55 |
+
else:
|
56 |
+
st.write(result_df[['Column', 'Skewness_', 'Kurtosis_']].to_html(escape=False), unsafe_allow_html=True)
|
57 |
+
|
58 |
+
st.divider()
|
59 |
+
st.write("Plotting Numerical Columns for Visual EDA")
|
60 |
+
|
61 |
+
# Create two columns
|
62 |
+
column1, column2 = st.columns(2)
|
63 |
+
|
64 |
+
# Checkbox for plotting distribution in the first column
|
65 |
+
with column1:
|
66 |
+
plot_distribution = st.checkbox("Plot Distribution of Target Column")
|
67 |
+
|
68 |
+
# Show the second checkbox in the second column only if the first checkbox is clicked
|
69 |
+
if plot_distribution:
|
70 |
+
with column2:
|
71 |
+
show_kde = st.checkbox("Show KDE Plot")
|
72 |
+
kde = show_kde
|
73 |
+
else:
|
74 |
+
kde = False
|
75 |
+
|
76 |
+
# Plot the histogram if the first checkbox is checked
|
77 |
+
if plot_distribution:
|
78 |
+
fig, ax = plt.subplots()
|
79 |
+
sns.histplot(y, ax=ax, kde=kde)
|
80 |
+
|
81 |
+
# Show the plot in the Streamlit app
|
82 |
+
st.pyplot(fig)
|
83 |
+
|
84 |
+
column3, column4 = st.columns(2)
|
85 |
+
with column3:
|
86 |
+
plot_distribution_nc =st.checkbox("Plot Distribution of Input Numerical columns")
|
87 |
+
if plot_distribution_nc:
|
88 |
+
with column4:
|
89 |
+
show_kde_1 = st.checkbox("Show KDE Plot for Numerical Columns")
|
90 |
+
kde_1 = show_kde_1
|
91 |
+
if plot_distribution_nc:
|
92 |
+
for column in num_cols:
|
93 |
+
fig, ax = plt.subplots()
|
94 |
+
sns.histplot(df[column], ax=ax, kde=kde_1)
|
95 |
+
st.write(f"Distribution of {column}:")
|
96 |
+
st.pyplot(fig)
|
97 |
+
st.divider()
|
98 |
+
# plot count plot for categorical columns
|
99 |
+
st.write("Plotting Categorical Columns for Visual EDA")
|
100 |
+
if st.checkbox("Plot Distribution of Input Categorical columns") :
|
101 |
+
for column in cat_cols:
|
102 |
+
fig, ax = plt.subplots()
|
103 |
+
fig = px.histogram(df.fillna('Null'), x=column, color=target_col)
|
104 |
+
st.write(fig)
|
105 |
+
|
106 |
+
st.divider()
|
107 |
+
# plot correlation matrics using plotly
|
108 |
+
st.write("Plotting Correlation Matrix for Numerical Columns")
|
109 |
+
|
110 |
+
column5, column6 = st.columns(2)
|
111 |
+
with column5:
|
112 |
+
plot_distribution =st.checkbox("Plot Correlation Matrix")
|
113 |
+
if plot_distribution:
|
114 |
+
with column6:
|
115 |
+
show_value = st.checkbox("Correlation values > 0.5")
|
116 |
+
if show_value:
|
117 |
+
# Compute correlation matrix
|
118 |
+
corr_matrix = df[num_cols].corr()
|
119 |
+
|
120 |
+
# Plot correlation matrix heatmap
|
121 |
+
fig = px.imshow(corr_matrix[abs(corr_matrix)>0.5], color_continuous_scale='RdBu')
|
122 |
+
|
123 |
+
# Add annotations for values greater than 0.5
|
124 |
+
for i in range(corr_matrix.shape[0]):
|
125 |
+
for j in range(corr_matrix.shape[1]):
|
126 |
+
correlation_value = corr_matrix.iloc[i, j]
|
127 |
+
if abs(correlation_value) > 0.5: # Filter values greater than 0.5
|
128 |
+
fig.add_annotation(
|
129 |
+
x=i, y=j,
|
130 |
+
text=str(round(correlation_value, 2)),
|
131 |
+
showarrow=False
|
132 |
+
)
|
133 |
+
|
134 |
+
# Update layout
|
135 |
+
fig.update_layout(
|
136 |
+
xaxis=dict(side="top"),
|
137 |
+
width=600,
|
138 |
+
height=600,
|
139 |
+
margin=dict(l=20, r=20, t=40, b=20)
|
140 |
+
)
|
141 |
+
|
142 |
+
# Display the heatmap
|
143 |
+
st.write(fig)
|
144 |
+
if plot_distribution and not show_value:
|
145 |
+
|
146 |
+
|
147 |
+
corr_matrix = df[num_cols].corr()
|
148 |
+
fig = px.imshow(corr_matrix, color_continuous_scale='RdBu')
|
149 |
+
for i in range(corr_matrix.shape[0]):
|
150 |
+
for j in range(corr_matrix.shape[1]):
|
151 |
+
fig.add_annotation(
|
152 |
+
x=i, y=j,
|
153 |
+
text=str(round(corr_matrix.iloc[i, j], 2)),
|
154 |
+
showarrow=False
|
155 |
+
)
|
156 |
+
|
157 |
+
# Update the layout to ensure annotations are displayed properly
|
158 |
+
fig.update_layout(
|
159 |
+
xaxis=dict(side="top"),
|
160 |
+
width=600,
|
161 |
+
height=600,
|
162 |
+
margin=dict(l=20, r=20, t=40, b=20)
|
163 |
+
)
|
164 |
st.write(fig)
st.divider()
outlier_cols = st.multiselect("Select Continuous numerical columns for Outlier Plot", num_cols)

# plot a violin or box plot for each selected outlier column
if st.toggle("Toggle for Violin Plot"):
    if st.checkbox("Plot Violin Plot for Outlier Cols"):
        if st.toggle("Split by Target"):
            for col in outlier_cols:
                fig = px.violin(df, x=col, color=y)
                st.write(fig)
                st.divider()
        else:
            for col in outlier_cols:
                fig = px.violin(df, x=col)
                st.write(fig)
                st.divider()
    if st.checkbox("check outlier distribution of Target column"):
        fig = px.violin(y)
        st.write(fig)

else:
    if st.checkbox("Plot BoxPlot for Outlier Cols"):
        if st.toggle("Split by Target"):
            for col in outlier_cols:
                fig = px.box(df, x=col, color=y)
                st.write(fig)
                st.divider()
        else:
            for col in outlier_cols:
                fig = px.box(df, x=col)
                st.write(fig)
                st.divider()
    if st.checkbox("check outlier distribution of Target column"):
        fig = px.box(y)
        st.write(fig)


# plot scatter plot using px
st.divider()

if st.checkbox("Plot Scatter Plot"):
    column7, column8, column9 = st.columns(3)
    with column7:
        # Select y-axis column; explicit keys keep widget IDs unique across the chart
        # sections below, which otherwise reuse identical labels and raise
        # DuplicateWidgetID when two charts are enabled at once
        y_col = st.selectbox("Select y axis column", df.columns, key="scatter_y")
    # every column is offered for the x-axis (no categorical filtering is applied)
    categorical_columns = df.columns
    with column8:
        x_col = st.selectbox("Select x axis column", categorical_columns, key="scatter_x")
    with column9:
        hue_col = st.selectbox("Select Hue column", categorical_columns, key="scatter_hue")
    # Plot scatter plot using Plotly
    fig = px.scatter(df, x=x_col, y=y_col, color=hue_col)
    st.write(fig)

# barchart and line chart
st.divider()
if st.checkbox("Plot Bar Chart"):
    column10, column11 = st.columns(2)
    with column10:
        y_col = st.selectbox("Select y axis column", df.columns, key="bar_y")
    categorical_columns = df.columns
    with column11:
        x_col = st.selectbox("Select x axis column", categorical_columns, key="bar_x")
    fig = px.bar(df, x=x_col, y=y_col, color=x_col)
    st.write(fig)
st.divider()
if st.checkbox("Plot Line Chart"):
    column12, column13, colx = st.columns(3)
    with column12:
        y_col = st.selectbox("Select y axis column", df.columns, key="line_y")
    categorical_columns = df.columns
    with column13:
        x_col = st.selectbox("Select x axis column", categorical_columns, key="line_x")
    with colx:
        hue_col1 = st.selectbox("Select Line split column", categorical_columns, key="line_hue")
    fig = px.line(df.sort_values(by=y_col), x=x_col, y=y_col, color=hue_col1)
    st.write(fig)
st.divider()
# plot pie chart
if st.checkbox("Plot Pie Chart"):
    column14, column15 = st.columns(2)
    with column14:
        y_col = st.selectbox("Select values column", df.columns, key="pie_values")
    categorical_columns = df.columns
    with column15:
        x_col = st.selectbox("Select names column", categorical_columns, key="pie_names")
    fig = px.pie(df, values=y_col, names=x_col)
    st.write(fig)

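# Aside: st.write(fig) does render a Plotly figure, but Streamlit's dedicated call gives
# explicit control over sizing -- a minimal alternative on the same streamlit==1.34.0 API,
# should any chart above need to fill its column:
# st.plotly_chart(fig, use_container_width=True)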
st.divider()
# map plot, for data with latitude and longitude columns
if st.checkbox("Plot on Map"):
    lat_col = st.selectbox("Select Latitude Column", df.columns)
    long_col = st.selectbox("Select Longitude Column", df.columns)
    # The original left this widget unfinished (`color = st.selectbox`); completed here with
    # an assumed label -- the selection is not wired into the plot below.
    color_col = st.selectbox("Select Color Column", df.columns)

    # # Create the datashader canvas and aggregate points
    # cvs = ds.Canvas(plot_width=1000, plot_height=1000)
    # agg = cvs.points(df, x=long_col, y=lat_col)

    # # Get the coordinates for the mapbox layer
    # coords_lat, coords_lon = agg.coords[lat_col].values, agg.coords[long_col].values
    # coordinates = [
    #     [coords_lon[0], coords_lat[0]],
    #     [coords_lon[-1], coords_lat[0]],
    #     [coords_lon[-1], coords_lat[-1]],
    #     [coords_lon[0], coords_lat[-1]]
    # ]

    # # Generate the datashader image
    # img = tf.shade(agg, cmap=fire)[::-1].to_pil()

    # # Create the Plotly figure with a mapbox layer
    # fig = px.scatter_mapbox(df[:1], lat=lat_col, lon=long_col, zoom=10)  # adjust zoom level as needed
    # fig.update_layout(mapbox_style="carto-darkmatter",
    #                   mapbox_layers=[
    #                       {
    #                           "sourcetype": "image",
    #                           "source": img,
    #                           "coordinates": coordinates
    #                       }
    #                   ])

    # # Display the figure in Streamlit
    # st.plotly_chart(fig)

    # Create a scatter mapbox plot with custom marker sizes
    if st.button("Proceed to plot map"):
        fig = px.scatter_mapbox(df, lat=lat_col, lon=long_col,
                                size_max=15,                     # max marker size
                                mapbox_style="open-street-map",  # token-free base map style
                                zoom=1,
                                title='Latitude and Longitude Plotting')

        # An access token is only required for Mapbox-hosted styles, not for
        # "open-street-map"; the placeholder below is kept from the original.
        fig.update_layout(mapbox_accesstoken='your_mapbox_access_token')
        st.write(fig)
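If the commented-out datashader path above is ever re-enabled, it also needs imports the file as shown never declares. A minimal sketch, assuming the aliases the commented code already uses and the datashader==0.16.2 / colorcet==3.1.0 pins added to requirements.txt in this commit:

import datashader as ds                      # ds.Canvas / cvs.points
import datashader.transfer_functions as tf   # tf.shade
from colorcet import fire                    # the "fire" colormap passed to tf.shade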
feature_selections.py
CHANGED
@@ -8,12 +8,10 @@ import pandas as pd
 import numpy as np
 import evaluationer
 import streamlit as st
-
+
+
 from sklearn.metrics import root_mean_squared_error
 def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
-
-    st.write("dvsdv",y_train)
-    st.write("dvfssdv",X_train)

     model = sm.OLS(y_train, sm.add_constant(X_train))
     model_fit = model.fit()
@@ -100,5 +98,7 @@ def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
     feature_cols_name = ["pval_cols","coef_cols","pval_and_coef_cols","mi_cols","corr_u_cols","corr_l_cols","vif_cols","lasso_cols"]
     st.write("feature_cols", vif_cols)
     for i,j in enumerate(feature_cols):
-        evaluationer.evaluation(f"{feature_cols_name[i]}
-    return evaluationer.reg_evaluation_df
+        evaluationer.evaluation(f"{feature_cols_name[i]}", X_train.drop(columns = j), X_test.drop(columns = j), y_train, y_test, model_reg, method = root_mean_squared_error, eva = "reg")
+    return evaluationer.reg_evaluation_df, feature_cols, feature_cols_name
+
+
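With the evaluation call completed and the return value widened, a call site now unpacks three objects: the evaluation table, the per-strategy lists of columns to drop, and their labels. A hypothetical sketch (the Ridge model and the already-split data are stand-ins, not part of this commit):

import streamlit as st
from sklearn.linear_model import Ridge
import feature_selections

results_df, feature_cols, feature_cols_name = feature_selections.feature_selection(
    X_train, X_test, y_train, y_test, model_reg=Ridge(), alpha=0.05)
st.dataframe(results_df)  # one evaluation row per feature-selection strategy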
grid_search_cv.py
ADDED
@@ -0,0 +1,284 @@
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import make_regression
import streamlit as st
import evaluationer

from sklearn.metrics import root_mean_squared_error

from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB, CategoricalNB

# hyperparameter grids for the classification models
param_grids_class = {
    "Logistic Regression": {
        'penalty': ['l1', 'l2', 'elasticnet', None],  # the string 'none' was removed in scikit-learn 1.2; use None
        'C': [0.01, 0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear', 'saga']
    },

    "SGD Classifier": {
        'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge'],  # 'log' was renamed 'log_loss'
        'penalty': ['l2', 'l1', 'elasticnet'],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [1000, 5000, 10000]
    },

    "Ridge Classifier": {
        'alpha': [0.1, 1, 10, 100]
    },

    "Random Forest Classifier": {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "AdaBoost Classifier": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },

    "Gradient Boosting Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "Hist Gradient Boosting Classifier": {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [None, 10, 20],
        'min_samples_leaf': [20, 50, 100]
    },

    "K Neighbors Classifier": {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },

    "Decision Tree Classifier": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "SVC": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'poly', 'rbf'],
        'degree': [3, 4, 5],  # only used by the 'poly' kernel
        'gamma': ['scale', 'auto']
    },

    "XGB Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "XGBRF Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "MLP Classifier": {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['tanh', 'relu'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },

    "LGBM Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [-1, 10, 20]  # -1 means no depth limit in LightGBM
    },

    "Multinomial Naive Bayes": {
        'alpha': [0.1, 0.5, 1.0]
    },

    "Categorical Naive Bayes": {
        'alpha': [0.1, 0.5, 1.0]
    }
}

# hyperparameter grids for the regression models
param_grids_reg = {
    "Linear Regression": {},  # nothing worth tuning

    "SGD Regressor": {
        'loss': ['squared_error', 'huber'],  # 'squared_loss' was renamed 'squared_error'
        'penalty': ['l2', 'l1', 'elasticnet'],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [1000, 5000, 10000]
    },

    "Ridge Regressor": {
        'alpha': [0.1, 1, 10, 100],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr']
    },

    "Lasso Regressor": {
        'alpha': [0.1, 1, 10, 100]
    },

    "ElasticNet Regressor": {
        'alpha': [0.1, 1, 10, 100],
        'l1_ratio': [0.1, 0.5, 0.9]
    },

    "Random Forest Regressor": {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "AdaBoost Regressor": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },

    "Gradient Boosting Regressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "Hist Gradient Boosting Regressor": {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [None, 10, 20],
        'min_samples_leaf': [20, 50, 100]
    },

    "K Neighbors Regressor": {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },

    "Decision Tree Regressor": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "SVR": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'poly', 'rbf'],
        'degree': [3, 4, 5],
        'gamma': ['scale', 'auto']
    },

    "XGB Regressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "XGBRF Regressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "MLP Regressor": {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['tanh', 'relu'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },

    "LGBM Regressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [-1, 10, 20]
    },

    "Gaussian Naive Bayes": {
        'var_smoothing': [1e-9, 1e-8, 1e-7]
    }
}

# Define the regressors
regressors = {
    "Linear Regression": LinearRegression(),
    "SGD Regressor": SGDRegressor(),
    "Ridge Regressor": Ridge(),
    "Lasso Regressor": Lasso(),
    "ElasticNet Regressor": ElasticNet(),
    "Random Forest Regressor": RandomForestRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
    "Hist Gradient Boosting Regressor": HistGradientBoostingRegressor(),
    "K Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "SVR": SVR(),
    "XGB Regressor": XGBRegressor(),
    "XGBRF Regressor": XGBRFRegressor(),
    "MLP Regressor": MLPRegressor(),
    "LGBM Regressor": LGBMRegressor(),
    "Gaussian Naive Bayes": GaussianNB()  # note: GaussianNB is a classifier, not a regressor
}

# Define the classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "SGD Classifier": SGDClassifier(),
    "Ridge Classifier": RidgeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "Hist Gradient Boosting Classifier": HistGradientBoostingClassifier(),
    "K Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "SVC": SVC(),
    "XGB Classifier": XGBClassifier(),
    "XGBRF Classifier": XGBRFClassifier(),
    "MLP Classifier": MLPClassifier(),
    "LGBM Classifier": LGBMClassifier(),
    "Multinomial Naive Bayes": MultinomialNB(),
    "Categorical Naive Bayes": CategoricalNB()
}

def perform_grid_search(model,model_name,X_train,X_test,y_train,y_test,eva):
    if eva == "reg":
        regressor = regressors[model_name]
        param_grid_reg = param_grids_reg[model_name]

        grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid_reg, cv=5, scoring='neg_mean_squared_error')
        grid_search.fit(X_train, y_train)
        st.write(f"Best Parameters for {model_name}: {grid_search.best_params_}")
        st.write(f"Best Score for {model_name}: {grid_search.best_score_}")
        best_model = grid_search.best_estimator_
        # evaluate the tuned estimator (the original passed the untuned `model` argument here)
        evaluationer.evaluation("best hyperparams", X_train, X_test, y_train, y_test, best_model, root_mean_squared_error, eva)
    elif eva == "class":
        classifier = classifiers[model_name]
        param_grid_class = param_grids_class[model_name]

        grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid_class, cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        st.write(f"Best Parameters for {model_name}: {grid_search.best_params_}")
        st.write(f"Best Score for {model_name}: {grid_search.best_score_}")
        best_model = grid_search.best_estimator_
        # as above, evaluate the tuned estimator rather than the untuned `model`
        evaluationer.evaluation("best hyperparams", X_train, X_test, y_train, y_test, best_model, root_mean_squared_error, eva)
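A hypothetical driver for the regression branch, reusing the make_regression and train_test_split imports the module already declares; the model-name string must be a key of both `regressors` and `param_grids_reg`:

X, y = make_regression(n_samples=200, n_features=10, noise=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
perform_grid_search(RandomForestRegressor(), "Random Forest Regressor",
                    X_train, X_test, y_train, y_test, eva="reg")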
models.py
CHANGED
@@ -23,6 +23,8 @@ from sklearn.neural_network import MLPRegressor
 from lightgbm import LGBMRegressor
 from sklearn.naive_bayes import GaussianNB

+
+
 # dictionary where keys are name of algorithm and values are algorithm for classifier
 algos_class = {
     "Logistic Regression": LogisticRegression(),
requirements.txt
CHANGED
@@ -1,10 +1,11 @@
+
 streamlit==1.34.0
 joblib==1.4.2
 numpy==1.26.4
 pandas==2.2.2
 scikit-learn==1.4.2
-
+datashader==0.16.2
+colorcet==3.1.0
+plotly==5.22.0
 matplotlib==3.9.0
-
-lightgbm==4.3.0
-statsmodels==0.14.2
+seaborn==0.13.2
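
One consistency note on the updated pins: grid_search_cv.py imports xgboost and lightgbm, and feature_selections.py calls statsmodels (sm.OLS), yet this revision drops lightgbm and statsmodels and never lists xgboost. If those code paths are exercised, the missing pins would look like this (the lightgbm/statsmodels versions are taken from the removed lines; the xgboost version is an assumption):

lightgbm==4.3.0
statsmodels==0.14.2
xgboost==2.0.3  # assumed; pin the version actually tested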