Update auto_optimizer.py
Browse files

auto_optimizer.py  (+317 −317)  CHANGED

@@ -1,317 +1,317 @@
— old version: identical to the new version below except line 16, which was blank —
import pandas as pd
import numpy as np
import streamlit as st
from collections import Counter

# enable_iterative_imputer must be imported before IterativeImputer can be used
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

import seaborn as sns
import matplotlib.pyplot as plt

import best_tts
import evaluationer
import models
import outliers
import feature_selections

def Auto_optimizer(X, y, eva, model, test=None):
    # start every run from an empty results frame (inplace, so the module-level
    # dataframe itself is cleared rather than a discarded copy)
    evaluationer.reg_evaluation_df.drop(index=evaluationer.reg_evaluation_df.index, inplace=True)
17 |
+
num_cols = X.select_dtypes(exclude = "O").columns
|
18 |
+
cat_cols = X.select_dtypes(include = "O").columns
|
19 |
+
st.write("Num_cols",tuple(num_cols))
|
20 |
+
st.write("cat_cols",tuple(cat_cols))
|
21 |
+
|
22 |
+
# check for Duplicate and drop duplicated in X
|
23 |
+
|
24 |
+
if len(X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40]) >0:
|
25 |
+
X = X.drop(columns = X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40].index)
|
26 |
+
st.write("Columns with more than 40% null values removed")
|
27 |
+
# st.write("csx",X)
|
28 |
+
|
29 |
+
len_null = X.isnull().sum().sum()
|
30 |
+
|
31 |
+
st.write(f"There are {len_null} null values in Train")

    knn_imputed_num_X = X.copy()
    si_mean_imputed_num_X = X.copy()
    si_median_imputed_num_X = X.copy()
    si_most_frequent_imputed_num_X = X.copy()
    iter_imputed_num_X = X.copy()
    knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
    si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
    si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
    si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
    iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
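
    # Two families of candidates: the *_num_X copies keep all rows and later fill
    # categorical nulls with the most frequent value; the *_cat_dropped copies
    # instead drop any row that still has a null after numerical imputation.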

    if len_null > 0:

        if X[num_cols].isnull().sum().sum() > 0:
            # numerical imputation: KNN, mean, median, most frequent, iterative
            knn_imputer = KNNImputer(n_neighbors=5)
            knn_imputed_num_X[num_cols] = knn_imputer.fit_transform(knn_imputed_num_X[num_cols])
            si_imputer = SimpleImputer(strategy="mean")
            si_mean_imputed_num_X[num_cols] = si_imputer.fit_transform(si_mean_imputed_num_X[num_cols])
            si_imputer = SimpleImputer(strategy="median")
            si_median_imputed_num_X[num_cols] = si_imputer.fit_transform(si_median_imputed_num_X[num_cols])
            si_imputer = SimpleImputer(strategy="most_frequent")
            si_most_frequent_imputed_num_X[num_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[num_cols])
            iter_imputer = IterativeImputer(max_iter=200, random_state=42)
            iter_imputed_num_X[num_cols] = iter_imputer.fit_transform(iter_imputed_num_X[num_cols])
            # refresh the *_cat_dropped copies so they carry the numerical imputations
            knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
            si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
            si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
            si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
            iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
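
            # Sketch of what KNNImputer does (hypothetical numbers): for a row with
            # a missing "age", it finds the 5 rows closest on the remaining numeric
            # features and fills the gap with the mean of their ages.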

        if X[cat_cols].isnull().sum().sum() > 0:
            # categorical imputation: fill with the most frequent value...
            si_imputer = SimpleImputer(strategy="most_frequent")
            knn_imputed_num_X[cat_cols] = si_imputer.fit_transform(knn_imputed_num_X[cat_cols])
            si_mean_imputed_num_X.loc[:, cat_cols] = si_imputer.fit_transform(si_mean_imputed_num_X.loc[:, cat_cols])
            si_median_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_median_imputed_num_X[cat_cols])
            si_most_frequent_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[cat_cols])
            iter_imputed_num_X[cat_cols] = si_imputer.fit_transform(iter_imputed_num_X[cat_cols])

            # ...or drop the rows that still contain nulls
            knn_imputed_X_cat_dropped = knn_imputed_X_cat_dropped.dropna()
            si_mean_imputed_X_cat_dropped = si_mean_imputed_X_cat_dropped.dropna()
            si_median_imputed_X_cat_dropped = si_median_imputed_X_cat_dropped.dropna()
            si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_X_cat_dropped.dropna()
            iter_imputed_X_cat_dropped = iter_imputed_X_cat_dropped.dropna()
        st.write("KNN-imputed X", knn_imputed_num_X)
        st.write("KNN-imputed X with remaining null rows dropped", knn_imputed_X_cat_dropped)

    miss_val_dropped_X = X.dropna()

    # list of candidate dataframes
    list_X_after_missing_values = [knn_imputed_num_X,
                                   si_mean_imputed_num_X,
                                   si_median_imputed_num_X,
                                   si_most_frequent_imputed_num_X,
                                   iter_imputed_num_X,
                                   knn_imputed_X_cat_dropped,
                                   si_mean_imputed_X_cat_dropped,
                                   si_median_imputed_X_cat_dropped,
                                   si_most_frequent_imputed_X_cat_dropped,
                                   iter_imputed_X_cat_dropped,
                                   miss_val_dropped_X]
    list_X_after_missing_values_names = ["knn_imputed_num_X",
                                         "si_mean_imputed_num_X",
                                         "si_median_imputed_num_X",
                                         "si_most_frequent_imputed_num_X",
                                         "iter_imputed_num_X",
                                         "knn_imputed_X_cat_dropped",
                                         "si_mean_imputed_X_cat_dropped",
                                         "si_median_imputed_X_cat_dropped",
                                         "si_most_frequent_imputed_X_cat_dropped",
                                         "iter_imputed_X_cat_dropped",
                                         "miss_val_dropped_X"]

    ord_enc_cols = []
    ohe_enc_cols = []

    if len(cat_cols) == 0:
        st.write("No Categorical Columns in Train")
    else:
        st.write("Select Columns for Ordinal Encoding")
        for column in cat_cols:
            selected = st.checkbox(column)
            if selected:
                st.write(f"No. of unique values in {column}:", X[column].nunique())
                ord_enc_cols.append(column)
        # whatever was not chosen for ordinal encoding is one-hot encoded
        ohe_enc_cols = list(set(cat_cols) - set(ord_enc_cols))

    if len(ord_enc_cols) > 0:
        st.write("Ordinal encoded columns", tuple(ord_enc_cols))
    if len(ohe_enc_cols) > 0:
        st.write("One-hot encoded columns", tuple(ohe_enc_cols))

    if len(ord_enc_cols) > 0:

        ordinal_order_vals = []

        for column in ord_enc_cols:
            unique_vals = X.dropna()[column].unique()
            # label includes the column name so each Streamlit widget stays unique
            ordered_unique_vals = st.multiselect(f"Select values of {column} in order for Ordinal Encoding",
                                                 unique_vals, unique_vals)
            ordinal_order_vals.append(ordered_unique_vals)

        st.write("Order of values for Ordinal Encoding", tuple(ordinal_order_vals))

        if len_null > 0:
            # encode every candidate dataframe
            for df in list_X_after_missing_values:
                ord_enc = OrdinalEncoder(categories=ordinal_order_vals,
                                         handle_unknown="use_encoded_value", unknown_value=-1)
                df[ord_enc_cols] = ord_enc.fit_transform(df[ord_enc_cols])
        else:
            ord_enc = OrdinalEncoder(categories=ordinal_order_vals,
                                     handle_unknown="use_encoded_value", unknown_value=-1)
            X[ord_enc_cols] = ord_enc.fit_transform(X[ord_enc_cols])

        st.write("Ordinal Encoding Completed ✅")

    if len(ohe_enc_cols) > 0:
        if len_null > 0:
            for df in list_X_after_missing_values:
                ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
                pd.options.mode.chained_assignment = None  # silence SettingWithCopyWarning while writing in place
                df.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(df[ohe_enc_cols])
                df.drop(columns=ohe_enc_cols, inplace=True)
                pd.options.mode.chained_assignment = "warn"
        else:
            ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
            pd.options.mode.chained_assignment = None
            X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
            X.drop(columns=ohe_enc_cols, inplace=True)
            pd.options.mode.chained_assignment = "warn"
        st.write("OneHot Encoding Completed ✅")

    # evaluate each candidate dataframe (or the single baseline when Train had no nulls)
    if len_null > 0:
        for idx, df in enumerate(list_X_after_missing_values):
            X_train, X_test, y_train, y_test = tts(df, y[df.index], test_size=.2, random_state=42)
            st.write(f"Nulls left in {list_X_after_missing_values_names[idx]} train split",
                     X_train.isnull().sum().sum())
            evaluationer.evaluation(list_X_after_missing_values_names[idx], X_train, X_test,
                                    y_train, y_test, model, root_mean_squared_error, eva)
    else:
        X_train, X_test, y_train, y_test = tts(X, y[X.index], test_size=.2, random_state=42)
        evaluationer.evaluation("baseline_model", X_train, X_test, y_train, y_test,
                                model, root_mean_squared_error, eva)
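
    # Assumes evaluationer.evaluation appends one scored row per call to the
    # module-level results dataframes read back below.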

    if eva == "class":
        # balance check: every class share within ±10% of a perfectly even split
        counter = Counter(y)
        total = sum(counter.values())
        balance_ratio = {cls: count / total for cls, count in counter.items()}
        num_classes = len(balance_ratio)
        ideal_ratio = 1 / num_classes
        is_balanced = all(abs(ratio - ideal_ratio) <= 0.1 * ideal_ratio for ratio in balance_ratio.values())
        # e.g. with 3 classes ideal_ratio is 1/3, so shares (0.35, 0.33, 0.32) all
        # fall within the ±0.033 tolerance and count as balanced
        if is_balanced:
            st.write("Balanced Dataset ✅")
            st.write("Using accuracy for Evaluation")
            value = "test_acc"
        else:
            st.write("Unbalanced Dataset ❌")
            st.write("Using F1 score for Evaluation")
            value = "test_f1"
        st.write("Classification results", evaluationer.classification_evaluation_df)
        evaluationer.classification_evaluation_df.sort_values(by=value, inplace=True)
        # after the ascending sort the last row holds the best score; its first column is the dataframe name
        name = str(evaluationer.classification_evaluation_df.iloc[-1, 0])
        st.write("Best dataframe", name)
        if len_null > 0:
            b = list_X_after_missing_values_names.index(name)
            st.write("Best X data index", b)
            st.write("Best X dataframe", list_X_after_missing_values[b])
            X = list_X_after_missing_values[b]

    if eva == "reg":
        st.write("Using R2 score for Evaluation", evaluationer.reg_evaluation_df)
        value = "test_r2"
        evaluationer.reg_evaluation_df.sort_values(by=value, inplace=True)
        name = str(evaluationer.reg_evaluation_df.iloc[-1, 0])
        st.write("Best dataframe", name)
        if len_null > 0:
            b = list_X_after_missing_values_names.index(name)
            st.write("Best X data index", b)
            st.write("Best X dataframe", list_X_after_missing_values[b])
            X = list_X_after_missing_values[b]

    # Create a figure and axes: histogram grid over the numerical columns
    num_plots = len(num_cols)
    cols = 2  # number of columns in the subplot grid
    rows = (num_plots + cols - 1) // cols  # number of rows needed

    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))

    # flatten the axes array for easy iteration, and remove any excess subplots
    axes = axes.flatten()
    for ax in axes[num_plots:]:
        fig.delaxes(ax)

    for i, col in enumerate(num_cols):
        sns.histplot(X[col], ax=axes[i], kde=True,
                     color=sns.color_palette("Oranges", as_cmap=True)(0.7))
        axes[i].set_title(col)

    # adjust layout and show the plot in Streamlit
    plt.tight_layout()
    st.pyplot(fig)

    # Create a figure and axes: box plot grid over the numerical columns
    num_plots = len(num_cols)
    cols = 3  # number of columns in the subplot grid
    rows = (num_plots + cols - 1) // cols  # number of rows needed

    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))

    # flatten the axes array for easy iteration, and remove any excess subplots
    axes = axes.flatten()
    for ax in axes[num_plots:]:
        fig.delaxes(ax)

    for i, col in enumerate(num_cols):
        # a single color instead of palette=, which recent seaborn only accepts with hue=
        sns.boxplot(y=X[col], ax=axes[i], color=sns.color_palette("magma")[2])
        axes[i].set_title(col)

    # adjust layout and show the plot in Streamlit
    plt.tight_layout()
    st.pyplot(fig)

    outlier_cols = st.multiselect("Select columns for detecting outliers (de-select any to skip)",
                                  num_cols, default=list(num_cols))

    st.write("Checking for Outliers")
    outliers_df_X, outlier_indexes = outliers.detect_outliers(X, list(outlier_cols))
    st.write("Outliers in Dataframe Summary", outliers_df_X)
    st.write("Columns for Outliers handling", tuple(outliers_df_X["columns name"]))

    select_outlier_cols = st.multiselect("Select columns for Outlier Handling",
                                         tuple(outliers_df_X["columns name"]),
                                         default=tuple(outliers_df_X["columns name"]))
    resultant, outlier_handled_df, outlier_handled_df_name = outliers.outlier_handling(
        X, y, model, outlier_indexes=outlier_indexes, outlier_cols=select_outlier_cols,
        method=root_mean_squared_error, test_size=0.2, random_state=42, eva="reg")
    st.write("Outlier handling with methods", resultant)
    best_outlier_method = resultant.sort_values(by="test_r2").tail(1).iloc[:, 0].values[0]
    st.write("Best method with outlier handling", best_outlier_method)
    try:
        best_idx = outlier_handled_df_name.index(best_outlier_method)
        st.write("Best X Data Index No.", best_idx)
        st.write("Best X DataFrame after outlier handling", outlier_handled_df[best_idx])
        X = outlier_handled_df[best_idx]
    except ValueError:
        st.write("Evaluation of baseline model is better; continuing with baseline model")

    X_train, X_test, y_train, y_test = tts(X, y[X.index], random_state=42, test_size=0.2)
    st.write("X after preprocessing", X)
    st.write("X_train", X_train)
    result_df_1 = feature_selections.feature_selection(X_train, X_test, y_train, y_test, model, alpha=0.05)
    st.write("Feature selection results", result_df_1)