Gaurav069 committed on
Commit 025a1cd • 1 Parent(s): a8af817

Update auto_optimizer.py

Files changed (1)
  1. auto_optimizer.py +317 -317
auto_optimizer.py CHANGED
@@ -1,317 +1,317 @@
 import pandas as pd
 import numpy as np
 import streamlit as st
 from sklearn.impute import KNNImputer,SimpleImputer,IterativeImputer
 import best_tts, evaluationer,models
 from sklearn.experimental import enable_iterative_imputer
 from sklearn.model_selection import train_test_split as tts
 from collections import Counter
 #root_mean_squared_error
 from sklearn.metrics import root_mean_squared_error
 import seaborn as sns
 import matplotlib.pyplot as plt
 import outliers,best_tts
 import feature_selections
 def Auto_optimizer(X,y,eva,model,test= None):
-    pass
+    evaluationer.reg_evaluation_df.drop(index =evaluationer.reg_evaluation_df.index)
     num_cols = X.select_dtypes(exclude = "O").columns
     cat_cols = X.select_dtypes(include = "O").columns
     st.write("Num_cols",tuple(num_cols))
     st.write("cat_cols",tuple(cat_cols))

     # check for Duplicate and drop duplicated in X

     if len(X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40]) >0:
         X = X.drop(columns = X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40].index)
         st.write("Columns with more than 40% null values removed")
     # st.write("csx",X)

     len_null = X.isnull().sum().sum()

     st.write(f"There are {len_null} null values in Train")

     knn_imputed_num_X = X.copy()
     si_mean_imputed_num_X = X.copy()
     # st.write("sf",si_mean_imputed_num_X)
     si_median_imputed_num_X = X.copy()
     si_most_frequent_imputed_num_X = X.copy()
     iter_imputed_num_X = X.copy()
     knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
     si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
     si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
     si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
     iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
     if len_null >0:

         if X[num_cols].isnull().sum().sum() >0:

             knn_imputer = KNNImputer(n_neighbors = 5)
             knn_imputed_num_X[num_cols] = knn_imputer.fit_transform(knn_imputed_num_X[num_cols])
             si_imputer = SimpleImputer(strategy = "mean")
             si_mean_imputed_num_X[num_cols] = si_imputer.fit_transform(si_mean_imputed_num_X[num_cols])
             si_imputer = SimpleImputer(strategy = "median")
             si_median_imputed_num_X[num_cols] = si_imputer.fit_transform(si_median_imputed_num_X[num_cols])
             si_imputer = SimpleImputer(strategy = "most_frequent")
             si_most_frequent_imputed_num_X[num_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[num_cols])
             iter_imputer = IterativeImputer(max_iter = 200,random_state= 42)
             iter_imputed_num_X[num_cols] = iter_imputer.fit_transform(iter_imputed_num_X[num_cols])
             knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
             si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
             si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
             si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
             iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()

         if X[cat_cols].isnull().sum().sum() >0:
             # treating missing values in categorical columns
             # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
             si_imputer = SimpleImputer(strategy = "most_frequent")

             knn_imputed_num_X[cat_cols] = si_imputer.fit_transform(knn_imputed_num_X[cat_cols])
             si_imputer = SimpleImputer(strategy = "most_frequent")
             si_mean_imputed_num_X.loc[:,cat_cols] = si_imputer.fit_transform(si_mean_imputed_num_X.loc[:,cat_cols])
             # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
             si_median_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_median_imputed_num_X[cat_cols])
             si_most_frequent_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[cat_cols])
             iter_imputed_num_X[cat_cols] = si_imputer.fit_transform(iter_imputed_num_X[cat_cols])

             knn_imputed_X_cat_dropped = knn_imputed_X_cat_dropped.dropna()
             si_mean_imputed_X_cat_dropped =si_mean_imputed_X_cat_dropped.dropna()
             si_median_imputed_X_cat_dropped =si_median_imputed_X_cat_dropped.dropna()
             si_most_frequent_imputed_X_cat_dropped =si_most_frequent_imputed_X_cat_dropped.dropna()
             iter_imputed_X_cat_dropped =iter_imputed_X_cat_dropped.dropna()
         st.write("sdds",knn_imputed_num_X)
         st.write("sddssd",knn_imputed_X_cat_dropped)

         miss_val_dropped_X = X.dropna()

         # list of dataframes

         list_X_after_missing_values= [knn_imputed_num_X,
                                       si_mean_imputed_num_X,
                                       si_median_imputed_num_X,
                                       si_most_frequent_imputed_num_X,
                                       iter_imputed_num_X,
                                       knn_imputed_X_cat_dropped,
                                       si_mean_imputed_X_cat_dropped,
                                       si_median_imputed_X_cat_dropped,
                                       si_most_frequent_imputed_X_cat_dropped,
                                       iter_imputed_X_cat_dropped,
                                       miss_val_dropped_X]
         list_X_after_missing_values_names= ["knn_imputed_num_X",
                                             "si_mean_imputed_num_X",
                                             "si_median_imputed_num_X",
                                             "si_most_frequent_imputed_num_X",
                                             "iter_imputed_num_X",
                                             "knn_imputed_X_cat_dropped",
                                             "si_mean_imputed_X_cat_dropped",
                                             "si_median_imputed_X_cat_dropped",
                                             "si_most_frequent_imputed_X_cat_dropped",
                                             "iter_imputed_X_cat_dropped",
                                             "miss_val_dropped_X"]
     # st.write("si_most_frequent_imputed_num_X",si_most_frequent_imputed_num_X,)
     ord_enc_cols = []
     ohe_enc_cols = []

     if len(cat_cols) == 0:
         st.write("No Categorical Columns in Train")
     else:
         st.write("Select Columns for Ordinal Encoding")
         for column in cat_cols:
             selected = st.checkbox(column)
             if selected:
                 st.write(f"No. of Unique value in {column} column are", X[column].nunique())
                 ord_enc_cols.append(column)
         ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
         ohe_enc_cols = list(ohe_enc_cols)

     if len(ord_enc_cols)>0:
         st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
     if len(ohe_enc_cols)>0:
         st.write("one hot encoded columns" ,tuple(ohe_enc_cols))

     if len(ord_enc_cols)>0:

         ordinal_order_vals = []

         for column in ord_enc_cols:
             unique_vals = X.dropna()[column].unique()
             # st.write(f"No. of Unique value in {column} column are", len(unique_vals))

             ordered_unique_vals = st.multiselect("Select values in order for Ordinal Encoding",unique_vals,unique_vals)
             ordinal_order_vals.append(ordered_unique_vals)

         st.write("order of values for Ordinal Encoding",tuple(ordinal_order_vals))

         if len_null > 0:

             for df_name, df in enumerate(list_X_after_missing_values):
                 # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
                 from sklearn.preprocessing import OrdinalEncoder
                 ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
                 df[ord_enc_cols] = ord.fit_transform(df[ord_enc_cols])
                 # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
         else :
             from sklearn.preprocessing import OrdinalEncoder
             ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
             X[ord_enc_cols] = ord.fit_transform(X[ord_enc_cols])

         st.write("Ordinal Encoding Completed ✅")

     if len(ohe_enc_cols)>0:
         if len_null > 0:
             for df_name, df in enumerate(list_X_after_missing_values):
                 from sklearn.preprocessing import OneHotEncoder
                 ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
                 pd.options.mode.chained_assignment = None
                 df.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(df[ohe_enc_cols])
                 df.drop(columns = ohe_enc_cols,inplace = True)
                 pd.options.mode.chained_assignment = 'warn'
         else:
             from sklearn.preprocessing import OneHotEncoder
             ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
             pd.options.mode.chained_assignment = None
             X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
             X.drop(columns = ohe_enc_cols,inplace = True)
             pd.options.mode.chained_assignment = 'warn'
         st.write("OneHot Encoding Completed ✅")


     if len(ohe_enc_cols)>0:
         if len_null > 0:
             for name,df in enumerate(list_X_after_missing_values):
                 X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
                 # best_tts.best_tts(df,y,model,eva)
                 evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
         else:
             X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =.2 ,random_state = 42)
             # best_tts.best_tts(X,y,model,eva)

             evaluationer.evaluation(f"baseline_model",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)

     if len_null >0:
         for name,df in enumerate(list_X_after_missing_values):
             X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
             st.write(f"this is test{list_X_after_missing_values_names[name]}",X_train.isnull().sum().sum())
             evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)

     if eva == "class":
         counter = Counter(y)
         total = sum(counter.values())
         balance_ratio = {cls: count / total for cls, count in counter.items()}
         num_classes = len(balance_ratio)
         ideal_ratio = 1 / num_classes
         a = all(abs(ratio - ideal_ratio) <= 0.1 * ideal_ratio for ratio in balance_ratio.values())
         if a == True:
             st.write("Balanced Dataset ✅")
             st.write("Using accuracy for Evaluation")
             value = "test_acc"
         else:
             st.write("Unbalanced Dataset ❌")
             st.write("Using F1 score for Evaluation")
             value = "test_f1"
         st.write("SFdfs",evaluationer.classification_evaluation_df)
         evaluationer.classification_evaluation_df.sort_values(by = value,inplace= True)
         name = str(evaluationer.classification_evaluation_df.iloc[-1,0])
         st.write("df name",evaluationer.classification_evaluation_df.iloc[-1,0])
         if len_null >0:
             b = list_X_after_missing_values_names.index(name)
             st.write("Sdffsf",b)
             st.write("df",list_X_after_missing_values[b])
             X = list_X_after_missing_values[b]
     if eva == "reg":
         st.write("Using R2 score for Evaluation",evaluationer.reg_evaluation_df)
         value = "test_r2"
         evaluationer.reg_evaluation_df.sort_values(by = value,inplace= True)
         st.write("adfsdf",evaluationer.reg_evaluation_df.iloc[-1,0])
         name = str(evaluationer.reg_evaluation_df.iloc[-1,0])
         st.write("Sdffsf",name)
         if len_null >0:
             b = list_X_after_missing_values_names.index(name)
             st.write("Sdffsf",b)
             st.write("df",list_X_after_missing_values[b])
             X = list_X_after_missing_values[b]


     # Create a figure and axes
     num_plots = len(num_cols)
     cols = 2 # Number of columns in the subplot grid
     rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed

     fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))

     # Flatten the axes array for easy iteration, and remove any excess subplots
     axes = axes.flatten()
     for ax in axes[num_plots:]:
         fig.delaxes(ax)

     for i, col in enumerate(num_cols):
         sns.histplot(X[col], ax=axes[i],kde = True,color=sns.color_palette('Oranges', as_cmap=True)(0.7))
         axes[i].set_title(col)

     # Adjust layout
     plt.tight_layout()

     # Show the plot in Streamlit
     st.pyplot(fig)

     # Create a figure and axes
     num_plots = len(num_cols)
     cols = 3 # Number of columns in the subplot grid
     rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed

     fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))

     # Flatten the axes array for easy iteration, and remove any excess subplots
     axes = axes.flatten()
     for ax in axes[num_plots:]:
         fig.delaxes(ax)

     for i, col in enumerate(num_cols):
         sns.boxplot(y=X[col], ax=axes[i],palette="magma")
         axes[i].set_title(col)

     # Adjust layout
     plt.tight_layout()

     # Show the plot in Streamlit
     st.pyplot(fig)

     outlier_cols = st.multiselect("De-Select columns for Detecting Outliers", num_cols,default= list(num_cols))

     st.write("Checking for Outliers")
     outliers_df_X,outlier_indexes = outliers.detect_outliers(X,list(outlier_cols))
     st.write("Outliers in Dataframe Summary",outliers_df_X)
     st.write("Columns for Outliers handling",tuple(outliers_df_X["columns name"]))

     select_outlier_cols = st.multiselect("Select columns for Outlier Handling",tuple(outliers_df_X["columns name"]),default =tuple(outliers_df_X["columns name"]))
     resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = "reg")
     st.write("outlier handling with methods",resultant)
     st.write("Best method with outlier handling",resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])
     try :
         st.write("Best X Data Index No.",outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0]))

         st.write("Best X DataFrame after outlier handling ",outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
         X = outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])]
     except :
         "evaluation of baseline model is better continuing with baseline model"

     # result_df ,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
     X_train,X_test,y_train,y_test = tts(X,y[X.index],random_state = 42,test_size = 0.2)
     st.write("result_df",X)
     st.write("fsdfs",X_train)
     result_df_1 = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
     st.write("sdchsvdgj",result_df_1)
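The one functional change in this commit replaces the `pass` stub at the top of `Auto_optimizer` with a call meant to clear the module-level results table `evaluationer.reg_evaluation_df` between runs. A caveat when reusing this pattern: `pandas.DataFrame.drop` returns a new frame, so without `inplace=True` (or reassignment) the shared table keeps its rows. A minimal sketch of both variants, using a hypothetical results table in place of `evaluationer.reg_evaluation_df`:

    import pandas as pd

    # Hypothetical stand-in for evaluationer's module-level results table;
    # the column names here are illustrative, not taken from the repo.
    reg_evaluation_df = pd.DataFrame(columns=["name", "train_r2", "test_r2"])
    reg_evaluation_df.loc[0] = ["baseline_model", 0.91, 0.87]

    # As in the committed line: drop() builds a new, empty frame and the
    # result is discarded, so the shared table still holds its row.
    reg_evaluation_df.drop(index=reg_evaluation_df.index)
    assert len(reg_evaluation_df) == 1

    # Variants that actually reset the table, in place or by rebinding.
    reg_evaluation_df.drop(index=reg_evaluation_df.index, inplace=True)
    assert reg_evaluation_df.empty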