Upload 9 files
- app.py +535 -0
- auto_optimizer.py +317 -0
- best_tts.py +110 -0
- evaluationer.py +151 -0
- feature_selections.py +104 -0
- models.py +70 -0
- null_value_handling.py +49 -0
- outliers.py +233 -0
- requirements.txt +10 -0
app.py
ADDED
@@ -0,0 +1,535 @@
# import libraries
import streamlit as st
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts
import evaluationer, models, null_value_handling
import auto_optimizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
# st.set_page_config(layout="wide")

st.set_page_config(
    page_title="LazyML App",
    page_icon="🧊",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://www.extremelycoolapp.com/help',
        'Report a bug': "https://www.extremelycoolapp.com/bug",
        'About': "# This is a header. This is an *extremely* cool app!"
    }
)

# Title with Rainbow Transition Effect and Neon Glow
html_code = """
<div class="title-container">
    <h1 class="neon-text">
        LazyML
    </h1>
</div>

<style>
@keyframes rainbow-text-animation {
    0% { color: red; }
    16.67% { color: orange; }
    33.33% { color: yellow; }
    50% { color: green; }
    66.67% { color: blue; }
    83.33% { color: indigo; }
    100% { color: violet; }
}

.title-container {
    text-align: center;
    margin: 1em 0;
    padding-bottom: 10px;
    border-bottom: 4px solid #fcdee9; /* Magenta underline */
}

.neon-text {
    font-family: Arial, sans-serif;
    font-size: 4em;
    margin: 0;
    animation: rainbow-text-animation 5s infinite linear;
    text-shadow: 0 0 5px rgba(255, 255, 255, 0.8),
                 0 0 10px rgba(255, 255, 255, 0.7),
                 0 0 20px rgba(255, 255, 255, 0.6),
                 0 0 40px rgba(255, 0, 255, 0.6),
                 0 0 80px rgba(255, 0, 255, 0.6),
                 0 0 90px rgba(255, 0, 255, 0.6),
                 0 0 100px rgba(255, 0, 255, 0.6),
                 0 0 150px rgba(255, 0, 255, 0.6);
}
</style>
"""

st.markdown(html_code, unsafe_allow_html=True)

# file uploader
csv_upload = st.sidebar.file_uploader("Input CSV File for ML modelling", type=['csv'])
csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction", type=["csv"])
test = pd.DataFrame()
if csv_upload is not None:
    # read the uploaded file into a dataframe
    df = pd.read_csv(csv_upload)

    # saving the dataframe to a CSV file
    df.to_csv('csv_upload.csv', index=False)
    st.write("Train File uploaded successfully. ✅")

    if csv_upload2 is not None:
        test = pd.read_csv(csv_upload2)
        id_col = st.selectbox("Select column for submission i.e. ID", test.columns)
        submission_id = test[id_col]
        # st.write("Train File upl", submission_id)

    if len(test) > 0:
        # saving the test dataframe to a CSV file
        test.to_csv('csv_upload_test.csv', index=False)
        st.write("Test File uploaded successfully. ✅")

    display_train_data = st.radio("Display Train Data", ["Yes", "No"], index=1)
    if display_train_data == "Yes":
        st.dataframe(df.head())

    if len(test) > 0:
        display_test_data = st.radio("Display Test Data", ["Yes", "No"], index=1)
        if display_test_data == "Yes":
            st.dataframe(test.head())

    if st.radio("Select Supervision Category", ["Supervised", "Un-Supervised"], index=0) == "Supervised":

        selected_column = st.selectbox('Select Target column', df.columns, index=(len(df.columns) - 1))

        # Display the selected column
        st.write('You selected:', selected_column)

        y = df[selected_column]

        if y.dtype == "O":
            st.write("⚠️⚠️⚠️ Target Column is Object Type ⚠️⚠️⚠️")
            if st.radio("Proceed for Label Encoding", ["Yes", "No"], index=1) == "Yes":
                from sklearn.preprocessing import LabelEncoder
                le = LabelEncoder()
                y = pd.Series(le.fit_transform(y))
                st.write("Label Encoding Completed ✅")

        if st.radio("Display Target Column", ["Yes", "No"], index=1) == "Yes":
            st.dataframe(y.head())

        select_target_trans = st.radio("Target column Transformation", ["Yes", "No"], index=1)
        if select_target_trans == "Yes":
            selected_transformation = st.selectbox("Select Transformation method", ["Log Transformation", "Power Transformation"])
            if selected_transformation == "Log Transformation":
                if y.min() <= 0:
                    st.write("Values in target column are zero or negative, please select power transformation")
                else:
                    log_selected_transformation = st.selectbox("Select Logarithmic method", ["Natural Log base(e)", "Log base 10", "Log base (2)"])
                    if log_selected_transformation == "Natural Log base(e)":
                        y = np.log(y)
                        st.write("Log base (e) Transformation Completed ✅")
                    elif log_selected_transformation == "Log base 10":
                        y = np.log10(y)
                        st.write("Log base 10 Transformation Completed ✅")
                    elif log_selected_transformation == "Log base (2)":
                        y = np.log2(y)
                        st.write("Log base 2 Transformation Completed ✅")
            elif selected_transformation == "Power Transformation":
                power_selected_transformation = st.selectbox("Select Power Transformation method", ["Square Root", "Other"])
                if power_selected_transformation == "Square Root":
                    y = np.sqrt(y)
                    st.write("Square root Transformation Completed ✅")
                elif power_selected_transformation == "Other":
                    power_value = st.number_input("Enter Power Value", value=3)
                    y = y ** (1 / power_value)
                    st.write(f"Power root of {power_value} Transformation Completed ✅")

            if st.radio("Display Target Column after Transformation", ["Yes", "No"], index=1) == "Yes":
                st.dataframe(y.head())
        # inverse of transformation is applied again before prediction download

        X = df.drop(columns=selected_column)

        if st.radio("Display X-Train Data", ["Yes", "No"], index=1) == "Yes":
            st.dataframe(X.head())
        if st.radio("Check for duplicate Values", ["Yes", "No"], index=1) == "Yes":
            len_duplicates = len(X[X.duplicated()])
            if len_duplicates > 0:
                st.write(f"There are {len_duplicates} duplicate values in Train")
                if st.selectbox("Drop Duplicate values", ["Yes", "No"], index=1) == "Yes":
                    X = X.drop_duplicates()
                    st.write("Duplicate values removed ✅")
            else:
                st.write("There are no duplicate values in Train")
        # dropping unimportant columns
        if st.radio("Drop Un-Important Column(s)", ["Yes", "No"], index=1) == "Yes":
            selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
            X = X.drop(columns=selected_drop_column)
            if len(test) > 0:
                test = test.drop(columns=selected_drop_column)
            st.write("Un-Important column(s) Deleted ✅")
            st.dataframe(X.head())

        num_cols = X.select_dtypes(exclude="O").columns
        cat_cols = X.select_dtypes(include="O").columns
        st.write("Numerical Columns in Train Data: ", tuple(num_cols))
        st.write("Categorical Columns in Train Data: ", tuple(cat_cols))

        if st.radio("Select method for ML modelling", ["Manual", "Auto Optimized"], index=0) == "Auto Optimized":
            ml_cat_ao = st.radio("Select Machine Learning Category", ["Regression", "Classification"], index=0)

            if ml_cat_ao == "Regression":
                eva = "reg"
                st.write("Select ML algorithm")
                reg_model_name = st.selectbox("select model", models.Regression_models.index)
                reg_model = models.Regression_models.loc[reg_model_name].values[0]
                auto_optimizer.Auto_optimizer(X, y, eva, reg_model)

            elif ml_cat_ao == "Classification":
                eva = "class"
                st.write("Select ML algorithm")
                class_model_name = st.selectbox("select model", models.Classification_models.index)
                class_model = models.Classification_models.loc[class_model_name].values[0]
                auto_optimizer.Auto_optimizer(X, y, eva, class_model)

        else:
            if X.isnull().sum().sum() > 0:
                st.write("⚠️⚠️⚠️ There are missing values in Train Data ⚠️⚠️⚠️")

                if st.selectbox("Drop null values or Impute", ["Drop Null Values", "Impute Null Values"], index=1) == "Drop Null Values":

                    X = X.dropna()
                    if len(test) > 0:
                        st.write("⚠️⚠️⚠️ If choosing drop values, the test dataset will also drop those rows; please choose a missing value imputation method befittingly. ⚠️⚠️⚠️")
                        test = test.dropna()

                clean_num_nvh_df = pd.DataFrame()
                if X[num_cols].isnull().sum().sum() > 0:
                    st.write("Numerical Columns with Percentage of Null Values: ")
                    num_cols_nvh = X[num_cols].isnull().sum()[X[num_cols].isnull().sum() > 0].index
                    st.dataframe(round(X[num_cols].isnull().sum()[X[num_cols].isnull().sum() > 0] / len(X) * 100, 2))
                    dict_1 = {}
                    for nvh_method in null_value_handling.null_value_handling_method_num_cols:

                        selected_nvh_num_cols = st.multiselect(f'method:- "{nvh_method}" for Numerical columns', num_cols_nvh,)
                        dict_1[nvh_method] = selected_nvh_num_cols

                        num_cols_nvh = set(num_cols_nvh) - set(selected_nvh_num_cols)
                        if len(num_cols_nvh) == 0:
                            break
                    num_nvh_df = pd.DataFrame(data=dict_1.values(), index=dict_1.keys())

                    clean_num_nvh_df = num_nvh_df.T[num_nvh_df.T.count()[num_nvh_df.T.count() > 0].index]

                    st.write("Methods for Numerical columns null value handling", clean_num_nvh_df)

                if len(test) > 0:
                    if test[num_cols].isnull().sum().sum() > 0:
                        test_num_cols_nvh = test[num_cols].isnull().sum()[test[num_cols].isnull().sum() > 0].index
                        st.write("Columns with Null Value in Test", test_num_cols_nvh)
                        test[num_cols] = IterativeImputer(max_iter=200, random_state=42).fit_transform(test[num_cols])

                clean_num_nvh_df_cat = pd.DataFrame()
                if X[cat_cols].isnull().sum().sum() > 0:
                    st.write("Categorical Columns with Percentage of Null Values: ")
                    cat_cols_nvh = X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum() > 0].index
                    st.dataframe(round(X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum() > 0] / len(X) * 100, 2))

                    dict_2 = {}
                    for nvh_method in null_value_handling.null_value_handling_method_cat_cols:
                        st.write("Null value handling method:", nvh_method)

                        selected_nvh_num_cols = st.multiselect(f'method:- "{nvh_method}" for Categorical columns', cat_cols_nvh,)
                        dict_2[nvh_method] = selected_nvh_num_cols

                        cat_cols_nvh = set(cat_cols_nvh) - set(selected_nvh_num_cols)
                        if len(cat_cols_nvh) == 0:
                            break
                    num_nvh_df_cat = pd.DataFrame(data=dict_2.values(), index=dict_2.keys())
                    clean_num_nvh_df_cat = num_nvh_df_cat.T
                    st.write("Methods for Categorical columns null value handling", [clean_num_nvh_df_cat])

                if len(test) > 0:
                    if test[cat_cols].isnull().sum().sum() > 0:
                        test_num_cols_nvh_cat = test[cat_cols].isnull().sum()[test[cat_cols].isnull().sum() > 0].index
                        st.write("Categorical Columns with Null Value in Test", test_num_cols_nvh_cat)
                        test[cat_cols] = SimpleImputer(strategy="most_frequent").fit_transform(test[cat_cols])

                null_value_handling.null_handling(X, clean_num_nvh_df, clean_num_nvh_df_cat)
                st.write("X Data after Null value handling", X.head())

                new_df = pd.concat([X, y[X.index]], axis=1)

                csv = new_df.to_csv(index=False)
                if st.radio("Download Null Value Handled DataFrame as CSV File ? ", ["Yes", "No"], index=1) == "Yes":
                    st.download_button(label="Download Null Value Handled CSV File", data=csv, file_name='NVH_DataFrame.csv', mime='text/csv')

            ord_enc_cols = []

            if len(cat_cols) == 0:
                st.write("No Categorical Columns in Train")
            else:
                st.write("Select Columns for Ordinal Encoding")
                for column in cat_cols:

                    selected = st.checkbox(column)
                    if selected:
                        st.write(f"No. of Unique value in {column} column are", X[column].nunique())
                        ord_enc_cols.append(column)
                ohe_enc_cols = set(cat_cols) - set(ord_enc_cols)
                ohe_enc_cols = list(ohe_enc_cols)
                if len(ord_enc_cols) > 0:
                    st.write("ordinal encoded columns", tuple(ord_enc_cols))
                if len(ohe_enc_cols) > 0:
                    st.write("one hot encoded columns", tuple(ohe_enc_cols))

                if len(ord_enc_cols) > 0:
                    if st.radio("proceed for ordinal encoding", ["Yes", "No"], index=1) == "Yes":
                        ordinal_order_vals = []

                        for column in ord_enc_cols:
                            unique_vals = X[column].unique()
                            # st.write(f"No. of Unique value in {column} column are", len(unique_vals))

                            ordered_unique_vals = st.multiselect("Select values in order for Ordinal Encoding", unique_vals, unique_vals)
                            ordinal_order_vals.append(ordered_unique_vals)

                        st.write("order of values for Ordinal Encoding", tuple(ordinal_order_vals))
                        # import ordinal encoder
                        from sklearn.preprocessing import OrdinalEncoder
                        ord_enc = OrdinalEncoder(categories=ordinal_order_vals, handle_unknown="use_encoded_value", unknown_value=-1)
                        X[ord_enc_cols] = ord_enc.fit_transform(X[ord_enc_cols])
                        if len(test) > 0:
                            test[ord_enc_cols] = ord_enc.transform(test[ord_enc_cols])
                        st.write("DataFrame after Ordinal Encoding", X.head())
                        st.write("Ordinal Encoding Completed ✅")

                if len(ohe_enc_cols) > 0:
                    if st.radio("proceed for OnehotEncoding ", ["Yes", "No"], index=1) == "Yes":  # import one hot encoder
                        from sklearn.preprocessing import OneHotEncoder
                        ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
                        pd.options.mode.chained_assignment = None
                        X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
                        X.drop(columns=ohe_enc_cols, inplace=True)
                        if len(test) > 0:
                            test.loc[:, ohe.get_feature_names_out()] = ohe.transform(test[ohe_enc_cols])
                            test.drop(columns=ohe_enc_cols, inplace=True)

                        pd.options.mode.chained_assignment = 'warn'

                        st.write("DataFrame after One Hot Encoding", X.head())
                        st.write("OneHot Encoding Completed ✅")

            new_df = pd.concat([X, y], axis=1)

            csv = new_df.to_csv(index=False)
            if st.radio("Download Encoded DataFrame as CSV File ? ", ["Yes", "No"], index=1) == "Yes":
                st.download_button(label="Download Ordinal Encoded CSV File", data=csv, file_name='Encoded_DataFrame.csv', mime='text/csv')

            random_state = st.number_input("Enter Random_state", max_value=100, min_value=1, value=42)
            test_size = st.number_input("Enter test_size", max_value=0.99, min_value=0.01, value=0.2)
            if st.radio("select Train Validation Split Method",
                        [f"Train_Test_split, Default (Random_state = {random_state},Test_size = {test_size})",
                         "KFoldCV, Default (CV = 5)"], index=0) == f"Train_Test_split, Default (Random_state = {random_state},Test_size = {test_size})":
                ttsmethod = "Train_Test_split"
            else:
                ttsmethod = "KFoldCV"
            st.write('You selected:', ttsmethod)
            if ttsmethod == "Train_Test_split":
                X_train, X_Val, y_train, y_val = tts(X, y[X.index], random_state=random_state, test_size=test_size)
                # st.write('X-Training Data info:', X_train.info())  # note: DataFrame.info() prints to stdout and returns None

                st.write('X-Training Data shape:', X_train.shape)
                st.write('X-Validation Data shape:', X_Val.shape)

            ml_cat = st.radio("Select Machine Learning Category", ["Regression", "Classification"], index=0)

            if ml_cat == "Regression":
                method_name_selector = st.selectbox("Select Error Evaluation Method", evaluationer.method_df.index, index=0)

                method = evaluationer.method_df.loc[method_name_selector].values[0]
                reg_algorithm = []
                selected_options = []

                for option in models.Regression_models.index:
                    selected = st.checkbox(option)
                    if selected:
                        selected_options.append(option)

                        param = models.Regression_models.loc[option][0].get_params()
                        Temp_parameter = pd.DataFrame(data=param.values(), index=param.keys())
                        Temp_parameter_transposed = Temp_parameter.T
                        parameter = pd.DataFrame(data=param.values(), index=param.keys())

                        def is_boolean(val):
                            return isinstance(val, bool)

                        # Apply the function to the DataFrame column to find boolean-valued parameters
                        bool_cols = parameter[parameter[0].apply(is_boolean)].index
                        param_transposed = parameter.T
                        # st.write("boolean parameters", param_transposed.loc[:, bool_cols])
                        # st.write("bool_cols", bool_cols)
                        remaining_cols = set(param_transposed.columns) - set(bool_cols)
                        remaining_cols = tuple(remaining_cols)
                        # st.write("remaining parameters", remaining_cols)

                        for col in remaining_cols:
                            param_transposed[col] = pd.to_numeric(param_transposed[col], errors="ignore")
                        cat_cols = param_transposed.select_dtypes(include=["O"]).T.index.to_list()
                        num_cols = set(remaining_cols) - set(cat_cols)
                        cat_cols = set(cat_cols) - set(bool_cols)
                        num_cols = tuple(num_cols)
                        # st.write("numeric parameters", num_cols)
                        for i in num_cols:
                            param_transposed[i] = st.number_input(f'input "{i}" value \n{option}', value=parameter.T[i].values[0])
                        for i in cat_cols:
                            param_transposed[i] = st.text_input(f'input "{i}" value \n{option}', value=parameter.T[i].values[0])
                        for i in bool_cols:
                            st.write("default value to insert", Temp_parameter_transposed[i].values[0])
                            param_transposed[i] = st.selectbox(f'input "{i}" value \n{option}', [False, True], index=Temp_parameter_transposed[i].values[0])

                        inv_param = param_transposed.T
                        new_param = inv_param.dropna().loc[:, 0].to_dict()
                        # st.write("updated parameters", new_param)
                        models.Regression_models.loc[option][0].set_params(**new_param)
                        a = models.Regression_models.loc[option][0].get_params()
                        reg_algorithm.append(models.Regression_models.loc[option][0])
                if st.button("Train Regression Model"):
                    for algorithm in reg_algorithm:
                        evaluationer.evaluation(f"{algorithm} baseline", X_train, X_Val, y_train, y_val, algorithm, method, "reg")
                    st.write("Regression Model Trained Successfully", evaluationer.reg_evaluation_df)
                if len(test) > 0:
                    if st.radio("Predict", ["Yes", "No"], index=1) == "Yes":

                        if len(evaluationer.reg_evaluation_df) > 0:
                            a = st.number_input("select index of best algorithm for test prediction", min_value=0, max_value=len(evaluationer.reg_evaluation_df) - 1, value=len(evaluationer.reg_evaluation_df) - 1)

                            test_prediction = evaluationer.reg_evaluation_df.loc[a, "model"].predict(test)
                            if select_target_trans == "Yes":
                                if selected_transformation == "Log Transformation":
                                    if log_selected_transformation == "Natural Log base(e)":
                                        test_prediction = np.exp(test_prediction)
                                        st.write("Natural Log base(e) Inverse Transformation Completed ✅")
                                    elif log_selected_transformation == "Log base 10":
                                        test_prediction = np.power(10, test_prediction)
                                        st.write("Log base 10 Inverse Transformation Completed ✅")
                                    elif log_selected_transformation == "Log base (2)":
                                        test_prediction = np.power(2, test_prediction)
                                        st.write("Log base 2 Inverse Transformation Completed ✅")
                                elif selected_transformation == "Power Transformation":
                                    if power_selected_transformation == "Square Root":
                                        test_prediction = np.power(test_prediction, 2)
                                        st.write("Square root Inverse Transformation Completed ✅")
                                    elif power_selected_transformation == "Other":
                                        test_prediction = test_prediction ** (power_value)
                                        st.write(f"Power root of {power_value} Inverse Transformation Completed ✅")
                            submission_file = pd.DataFrame(index=[submission_id], data=test_prediction, columns=[selected_column])
                            st.write("Sample of Prediction File", submission_file.head())
                            csv_prediction = submission_file.to_csv()
                            if st.radio("Download Prediction File as CSV File ? ", ["Yes", "No"], index=1) == "Yes":
                                st.download_button(label="Download Prediction CSV File", data=csv_prediction, file_name='prediction.csv', mime='text/csv')

            if ml_cat == "Classification":

                cla_algorithm = []
                selected_options = []

                for option in models.Classification_models.index:
                    selected = st.checkbox(option)
                    if selected:
                        selected_options.append(option)

                        param = models.Classification_models.loc[option][0].get_params()

                        parameter = pd.DataFrame(data=param.values(), index=param.keys())
                        Temp_parameter = parameter.copy()
                        Temp_parameter_transposed = (Temp_parameter.T).copy()

                        def is_boolean(val):
                            return isinstance(val, bool)

                        # Apply the function to the DataFrame column to find boolean-valued parameters
                        bool_cols = parameter[parameter[0].apply(is_boolean)].index
                        param_transposed = parameter.T
                        st.write("bool_cols", bool_cols)
                        remaining_cols = set(param_transposed.columns) - set(bool_cols)
                        remaining_cols = tuple(remaining_cols)
                        st.write("remaining parameters", remaining_cols)

                        for col in remaining_cols:
                            param_transposed[col] = pd.to_numeric(param_transposed[col], errors="ignore")
                        cat_cols = param_transposed.select_dtypes(include=["O"]).T.index.to_list()
                        num_cols = set(remaining_cols) - set(cat_cols)
                        num_cols = tuple(num_cols)
                        st.write("numeric parameters", num_cols)
                        for i in num_cols:
                            param_transposed[i] = st.number_input(f'input "{i}" value \n{option}', value=parameter.T[i].values[0])
                        for i in cat_cols:
                            param_transposed[i] = st.text_input(f'input "{i}" value \n{option}', value=parameter.T[i].values[0])
                        for i in bool_cols:
                            st.write("default value to insert", Temp_parameter_transposed[i].values[0])
                            param_transposed[i] = st.selectbox(f'input "{i}" value \n{option}', [False, True], index=Temp_parameter_transposed[i].values[0])
                        inv_param = param_transposed.T
                        new_param = inv_param.dropna().loc[:, 0].to_dict()
                        st.write("updated parameters", new_param)
                        models.Classification_models.loc[option][0].set_params(**new_param)
                        a = models.Classification_models.loc[option][0].get_params()
                        cla_algorithm.append(models.Classification_models.loc[option][0])
                # st.write("selected algorithms", cla_algorithm)
                if st.button("Train Classification Model"):
                    method = None
                    for algorithm in cla_algorithm:
                        evaluationer.evaluation(f"{algorithm} baseline", X_train, X_Val, y_train, y_val, algorithm, method, eva="class")
                    st.write("Classification Model Trained Successfully", evaluationer.classification_evaluation_df)

                if len(test) > 0:
                    if st.radio("Predict", ["Yes", "No"], index=1) == "Yes":
                        if len(evaluationer.classification_evaluation_df) > 0:
                            a = st.number_input("select index of best algorithm for test prediction", min_value=0, max_value=len(evaluationer.classification_evaluation_df) - 1, value=len(evaluationer.classification_evaluation_df) - 1)

                            test_prediction = evaluationer.classification_evaluation_df.loc[a, "model"].predict(test)
                            if select_target_trans == "Yes":
                                if selected_transformation == "Log Transformation":
                                    if log_selected_transformation == "Natural Log base(e)":
                                        test_prediction = np.exp(test_prediction)
                                        st.write("Natural Log base(e) Inverse Transformation Completed ✅")
                                    elif log_selected_transformation == "Log base 10":
                                        test_prediction = np.power(10, test_prediction)
                                        st.write("Log base 10 Inverse Transformation Completed ✅")
                                    elif log_selected_transformation == "Log base (2)":
                                        test_prediction = np.power(2, test_prediction)
                                        st.write("Log base 2 Inverse Transformation Completed ✅")
                                elif selected_transformation == "Power Transformation":
                                    if power_selected_transformation == "Square Root":
                                        test_prediction = np.power(test_prediction, 2)
                                        st.write("Square root Inverse Transformation Completed ✅")
                                    elif power_selected_transformation == "Other":
                                        test_prediction = test_prediction ** (power_value)
                                        st.write(f"Power root of {power_value} Inverse Transformation Completed ✅")

                            submission_file = pd.DataFrame(index=[submission_id], data=test_prediction, columns=[selected_column])
                            st.write("Sample of Prediction File", submission_file.head())
                            csv_prediction = submission_file.to_csv()
                            if st.radio("Download Prediction File as CSV File ? ", ["Yes", "No"], index=1) == "Yes":
                                st.download_button(label="Download Prediction CSV File", data=csv_prediction, file_name='prediction.csv', mime='text/csv')
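For reference, the target transformation and inverse-transformation pairs that app.py applies before training and again before writing predictions can be summarised in a small standalone sketch. This is illustrative only (NumPy names are real; the TRANSFORMS table and variable names are not part of the app):

import numpy as np

# Forward/inverse pairs mirroring the options offered in app.py:
# natural log <-> exp, log10 <-> 10**x, log2 <-> 2**x, square root <-> square.
TRANSFORMS = {
    "Natural Log base(e)": (np.log, np.exp),
    "Log base 10": (np.log10, lambda p: np.power(10, p)),
    "Log base (2)": (np.log2, lambda p: np.power(2, p)),
    "Square Root": (np.sqrt, lambda p: np.power(p, 2)),
}

y = np.array([1.0, 10.0, 100.0])
fwd, inv = TRANSFORMS["Log base 10"]
y_trans = fwd(y)      # the model is trained on the transformed target
pred = inv(y_trans)   # predictions are mapped back to the original scale
assert np.allclose(pred, y)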
auto_optimizer.py
ADDED
@@ -0,0 +1,317 @@
import pandas as pd
import numpy as np
import streamlit as st
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
import best_tts, evaluationer, models
from sklearn.model_selection import train_test_split as tts
from collections import Counter
from sklearn.metrics import root_mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
import outliers, best_tts
import feature_selections

def Auto_optimizer(X, y, eva, model, test=None):
    num_cols = X.select_dtypes(exclude="O").columns
    cat_cols = X.select_dtypes(include="O").columns
    st.write("Num_cols", tuple(num_cols))
    st.write("cat_cols", tuple(cat_cols))

    # drop columns with more than 40% null values

    if len(X.isnull().sum()[(X.isnull().sum() / len(X) * 100) > 40]) > 0:
        X = X.drop(columns=X.isnull().sum()[(X.isnull().sum() / len(X) * 100) > 40].index)
        st.write("Columns with more than 40% null values removed")
    # st.write("X after dropping high-null columns", X)

    len_null = X.isnull().sum().sum()

    st.write(f"There are {len_null} null values in Train")

    knn_imputed_num_X = X.copy()
    si_mean_imputed_num_X = X.copy()
    si_median_imputed_num_X = X.copy()
    si_most_frequent_imputed_num_X = X.copy()
    iter_imputed_num_X = X.copy()
    knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
    si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
    si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
    si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
    iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
    if len_null > 0:

        if X[num_cols].isnull().sum().sum() > 0:

            knn_imputer = KNNImputer(n_neighbors=5)
            knn_imputed_num_X[num_cols] = knn_imputer.fit_transform(knn_imputed_num_X[num_cols])
            si_imputer = SimpleImputer(strategy="mean")
            si_mean_imputed_num_X[num_cols] = si_imputer.fit_transform(si_mean_imputed_num_X[num_cols])
            si_imputer = SimpleImputer(strategy="median")
            si_median_imputed_num_X[num_cols] = si_imputer.fit_transform(si_median_imputed_num_X[num_cols])
            si_imputer = SimpleImputer(strategy="most_frequent")
            si_most_frequent_imputed_num_X[num_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[num_cols])
            iter_imputer = IterativeImputer(max_iter=200, random_state=42)
            iter_imputed_num_X[num_cols] = iter_imputer.fit_transform(iter_imputed_num_X[num_cols])
            knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
            si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
            si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
            si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
            iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()

        if X[cat_cols].isnull().sum().sum() > 0:
            # treating missing values in categorical columns
            si_imputer = SimpleImputer(strategy="most_frequent")

            knn_imputed_num_X[cat_cols] = si_imputer.fit_transform(knn_imputed_num_X[cat_cols])
            si_imputer = SimpleImputer(strategy="most_frequent")
            si_mean_imputed_num_X.loc[:, cat_cols] = si_imputer.fit_transform(si_mean_imputed_num_X.loc[:, cat_cols])
            si_median_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_median_imputed_num_X[cat_cols])
            si_most_frequent_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[cat_cols])
            iter_imputed_num_X[cat_cols] = si_imputer.fit_transform(iter_imputed_num_X[cat_cols])

            knn_imputed_X_cat_dropped = knn_imputed_X_cat_dropped.dropna()
            si_mean_imputed_X_cat_dropped = si_mean_imputed_X_cat_dropped.dropna()
            si_median_imputed_X_cat_dropped = si_median_imputed_X_cat_dropped.dropna()
            si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_X_cat_dropped.dropna()
            iter_imputed_X_cat_dropped = iter_imputed_X_cat_dropped.dropna()
            st.write("KNN-imputed X (categorical imputed)", knn_imputed_num_X)
            st.write("KNN-imputed X (categorical rows dropped)", knn_imputed_X_cat_dropped)

    miss_val_dropped_X = X.dropna()

    # list of candidate dataframes after missing value handling

    list_X_after_missing_values = [knn_imputed_num_X,
                                   si_mean_imputed_num_X,
                                   si_median_imputed_num_X,
                                   si_most_frequent_imputed_num_X,
                                   iter_imputed_num_X,
                                   knn_imputed_X_cat_dropped,
                                   si_mean_imputed_X_cat_dropped,
                                   si_median_imputed_X_cat_dropped,
                                   si_most_frequent_imputed_X_cat_dropped,
                                   iter_imputed_X_cat_dropped,
                                   miss_val_dropped_X]
    list_X_after_missing_values_names = ["knn_imputed_num_X",
                                         "si_mean_imputed_num_X",
                                         "si_median_imputed_num_X",
                                         "si_most_frequent_imputed_num_X",
                                         "iter_imputed_num_X",
                                         "knn_imputed_X_cat_dropped",
                                         "si_mean_imputed_X_cat_dropped",
                                         "si_median_imputed_X_cat_dropped",
                                         "si_most_frequent_imputed_X_cat_dropped",
                                         "iter_imputed_X_cat_dropped",
                                         "miss_val_dropped_X"]
    # st.write("si_most_frequent_imputed_num_X", si_most_frequent_imputed_num_X)
    ord_enc_cols = []
    ohe_enc_cols = []

    if len(cat_cols) == 0:
        st.write("No Categorical Columns in Train")
    else:
        st.write("Select Columns for Ordinal Encoding")
        for column in cat_cols:
            selected = st.checkbox(column)
            if selected:
                st.write(f"No. of Unique value in {column} column are", X[column].nunique())
                ord_enc_cols.append(column)
        ohe_enc_cols = set(cat_cols) - set(ord_enc_cols)
        ohe_enc_cols = list(ohe_enc_cols)

        if len(ord_enc_cols) > 0:
            st.write("ordinal encoded columns", tuple(ord_enc_cols))
        if len(ohe_enc_cols) > 0:
            st.write("one hot encoded columns", tuple(ohe_enc_cols))

    if len(ord_enc_cols) > 0:

        ordinal_order_vals = []

        for column in ord_enc_cols:
            unique_vals = X.dropna()[column].unique()
            # st.write(f"No. of Unique value in {column} column are", len(unique_vals))

            ordered_unique_vals = st.multiselect("Select values in order for Ordinal Encoding", unique_vals, unique_vals)
            ordinal_order_vals.append(ordered_unique_vals)

        st.write("order of values for Ordinal Encoding", tuple(ordinal_order_vals))

        if len_null > 0:

            for df_name, df in enumerate(list_X_after_missing_values):
                # st.write(f"{list_X_after_missing_values_names[df_name]}", df)
                from sklearn.preprocessing import OrdinalEncoder
                ord_enc = OrdinalEncoder(categories=ordinal_order_vals, handle_unknown="use_encoded_value", unknown_value=-1)
                df[ord_enc_cols] = ord_enc.fit_transform(df[ord_enc_cols])
                # st.write(f"{list_X_after_missing_values_names[df_name]}", df)
        else:
            from sklearn.preprocessing import OrdinalEncoder
            ord_enc = OrdinalEncoder(categories=ordinal_order_vals, handle_unknown="use_encoded_value", unknown_value=-1)
            X[ord_enc_cols] = ord_enc.fit_transform(X[ord_enc_cols])

        st.write("Ordinal Encoding Completed ✅")

    if len(ohe_enc_cols) > 0:
        if len_null > 0:
            for df_name, df in enumerate(list_X_after_missing_values):
                from sklearn.preprocessing import OneHotEncoder
                ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
                pd.options.mode.chained_assignment = None
                df.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(df[ohe_enc_cols])
                df.drop(columns=ohe_enc_cols, inplace=True)
                pd.options.mode.chained_assignment = 'warn'
        else:
            from sklearn.preprocessing import OneHotEncoder
            ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
            pd.options.mode.chained_assignment = None
            X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
            X.drop(columns=ohe_enc_cols, inplace=True)
            pd.options.mode.chained_assignment = 'warn'
        st.write("OneHot Encoding Completed ✅")

    if len(ohe_enc_cols) > 0:
        if len_null > 0:
            for name, df in enumerate(list_X_after_missing_values):
                X_train, X_test, y_train, y_test = tts(df, y[df.index], test_size=.2, random_state=42)
                # best_tts.best_tts(df, y, model, eva)
                evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}", X_train, X_test, y_train, y_test, model, root_mean_squared_error, eva)
        else:
            X_train, X_test, y_train, y_test = tts(X, y[X.index], test_size=.2, random_state=42)
            # best_tts.best_tts(X, y, model, eva)

            evaluationer.evaluation(f"baseline_model", X_train, X_test, y_train, y_test, model, root_mean_squared_error, eva)

    if len_null > 0:
        for name, df in enumerate(list_X_after_missing_values):
            X_train, X_test, y_train, y_test = tts(df, y[df.index], test_size=.2, random_state=42)
            st.write(f"Remaining null values in {list_X_after_missing_values_names[name]} train split:", X_train.isnull().sum().sum())
            evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}", X_train, X_test, y_train, y_test, model, root_mean_squared_error, eva)

    if eva == "class":
        counter = Counter(y)
        total = sum(counter.values())
        balance_ratio = {cls: count / total for cls, count in counter.items()}
        num_classes = len(balance_ratio)
        ideal_ratio = 1 / num_classes
        a = all(abs(ratio - ideal_ratio) <= 0.1 * ideal_ratio for ratio in balance_ratio.values())
        if a == True:
            st.write("Balanced Dataset ✅")
            st.write("Using accuracy for Evaluation")
            value = "test_acc"
        else:
            st.write("Unbalanced Dataset ❌")
            st.write("Using F1 score for Evaluation")
            value = "test_f1"
        st.write("Classification evaluation results", evaluationer.classification_evaluation_df)
        evaluationer.classification_evaluation_df.sort_values(by=value, inplace=True)
        name = str(evaluationer.classification_evaluation_df.iloc[-1, 0])
        st.write("Best dataframe name", evaluationer.classification_evaluation_df.iloc[-1, 0])
        if len_null > 0:
            b = list_X_after_missing_values_names.index(name)
            st.write("Best imputation index", b)
            st.write("Best dataframe", list_X_after_missing_values[b])
            X = list_X_after_missing_values[b]
    if eva == "reg":
        st.write("Using R2 score for Evaluation", evaluationer.reg_evaluation_df)
        value = "test_r2"
        evaluationer.reg_evaluation_df.sort_values(by=value, inplace=True)
        st.write("Best evaluation entry", evaluationer.reg_evaluation_df.iloc[-1, 0])
        name = str(evaluationer.reg_evaluation_df.iloc[-1, 0])
        st.write("Best dataframe name", name)
        if len_null > 0:
            b = list_X_after_missing_values_names.index(name)
            st.write("Best imputation index", b)
            st.write("Best dataframe", list_X_after_missing_values[b])
            X = list_X_after_missing_values[b]

    # Create a figure and axes
    num_plots = len(num_cols)
    cols = 2  # Number of columns in the subplot grid
    rows = (num_plots + cols - 1) // cols  # Calculate the number of rows needed

    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))

    # Flatten the axes array for easy iteration, and remove any excess subplots
    axes = axes.flatten()
    for ax in axes[num_plots:]:
        fig.delaxes(ax)

    for i, col in enumerate(num_cols):
        sns.histplot(X[col], ax=axes[i], kde=True, color=sns.color_palette('Oranges', as_cmap=True)(0.7))
        axes[i].set_title(col)

    # Adjust layout
    plt.tight_layout()

    # Show the plot in Streamlit
    st.pyplot(fig)

    # Create a figure and axes
    num_plots = len(num_cols)
    cols = 3  # Number of columns in the subplot grid
    rows = (num_plots + cols - 1) // cols  # Calculate the number of rows needed

    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))

    # Flatten the axes array for easy iteration, and remove any excess subplots
    axes = axes.flatten()
    for ax in axes[num_plots:]:
        fig.delaxes(ax)

    for i, col in enumerate(num_cols):
        sns.boxplot(y=X[col], ax=axes[i], palette="magma")
        axes[i].set_title(col)

    # Adjust layout
    plt.tight_layout()

    # Show the plot in Streamlit
    st.pyplot(fig)

    outlier_cols = st.multiselect("De-Select columns for Detecting Outliers", num_cols, default=list(num_cols))

    st.write("Checking for Outliers")
    outliers_df_X, outlier_indexes = outliers.detect_outliers(X, list(outlier_cols))
    st.write("Outliers in Dataframe Summary", outliers_df_X)
    st.write("Columns for Outliers handling", tuple(outliers_df_X["columns name"]))

    select_outlier_cols = st.multiselect("Select columns for Outlier Handling", tuple(outliers_df_X["columns name"]), default=tuple(outliers_df_X["columns name"]))
    resultant, outlier_handled_df, outlier_handled_df_name = outliers.outlier_handling(X, y, model, outlier_indexes=outlier_indexes, outlier_cols=select_outlier_cols, method=root_mean_squared_error, test_size=0.2, random_state=42, eva="reg")
    st.write("outlier handling with methods", resultant)
    st.write("Best method with outlier handling", resultant.sort_values(by="test_r2").tail(1).iloc[:, 0].values[0])
    try:
        st.write("Best X Data Index No.", outlier_handled_df_name.index(resultant.sort_values(by="test_r2").tail(1).iloc[:, 0].values[0]))

        st.write("Best X DataFrame after outlier handling ", outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by="test_r2").tail(1).iloc[:, 0].values[0])])
        X = outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by="test_r2").tail(1).iloc[:, 0].values[0])]
    except:
        st.write("Evaluation of baseline model is better; continuing with baseline model")

    # result_df, X_train_b, X_test_b, y_train_b, y_test_b = best_tts.best_tts(X, y, model, eva)
    X_train, X_test, y_train, y_test = tts(X, y[X.index], random_state=42, test_size=0.2)
    st.write("X after auto optimization", X)
    st.write("X_train sample", X_train)
    result_df_1 = feature_selections.feature_selection(X_train, X_test, y_train, y_test, model, alpha=0.05)
    st.write("Feature selection results", result_df_1)
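The balance check in Auto_optimizer treats a classification target as balanced when every class frequency sits within 10% of the ideal 1/n_classes share, then ranks candidate dataframes by accuracy or F1 accordingly. A minimal standalone sketch of that rule (the function name and example data are illustrative only):

from collections import Counter

def pick_metric(y):
    counter = Counter(y)
    total = sum(counter.values())
    ratios = {cls: cnt / total for cls, cnt in counter.items()}
    ideal = 1 / len(ratios)
    balanced = all(abs(r - ideal) <= 0.1 * ideal for r in ratios.values())
    # accuracy for balanced targets, F1 otherwise (mirrors auto_optimizer.py)
    return "test_acc" if balanced else "test_f1"

print(pick_metric([0, 1, 0, 1, 0, 1]))   # balanced -> test_acc
print(pick_metric([0, 0, 0, 0, 0, 1]))   # skewed   -> test_f1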
best_tts.py
ADDED
@@ -0,0 +1,110 @@
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import r2_score, f1_score, accuracy_score, root_mean_squared_error
import evaluationer
import pandas as pd
import numpy as np

def best_tts(X, y, model, eva):
    # def best_tts(X, y, test_size_range=range(10, 25), random_state_range=range(1, 100), stratify=None, shuffle=True, model=LinearRegression(), method=root_mean_squared_error, eva="reg"):

    if eva == "reg":

        test_r2_, test_r2_ts, test_r2_rs = 0, 0, 0
        for k in range(10, 25):
            i = k / 100
            for j in range(1, 100):
                X_train, X_test, y_train, y_test = tts(X, y[X.index], test_size=i, random_state=j,)

                model = model
                model.fit(X_train, y_train)  # model fitting
                y_pred_train = model.predict(X_train)  # model prediction for train
                y_pred_test = model.predict(X_test)  # model prediction for test

                train_r2 = r2_score(y_train, y_pred_train)  # evaluating r2 score for train

                test_r2 = r2_score(y_test, y_pred_test)  # evaluating r2 score for test
                if test_r2_ < test_r2:
                    test_r2_ = test_r2
                    test_r2_ts = i
                    test_r2_rs = j

                n_r_train, n_c_train = X_train.shape  # getting no. of rows and columns of train data
                n_r_test, n_c_test = X_test.shape  # getting no. of rows and columns of test data

                adj_r2_train = 1 - ((1 - train_r2) * (n_r_train - 1) / (n_r_train - n_c_train - 1))  # adjusted r2 score for train

                adj_r2_test = 1 - ((1 - test_r2) * (n_r_test - 1) / (n_r_test - n_c_test - 1))  # adjusted r2 score for test

                train_evaluation = root_mean_squared_error(y_train, y_pred_train)  # evaluating train error

                test_evaluation = root_mean_squared_error(y_test, y_pred_test)  # evaluating test error

        X_train, X_test, y_train, y_test = tts(X, y[X.index], test_size=test_r2_ts, random_state=test_r2_rs)
        evaluationer.evaluation("best_tts", X_train, X_test, y_train, y_test, model, root_mean_squared_error, eva)
        return evaluationer.reg_evaluation_df, X_train, X_test, y_train, y_test

    elif eva == "class":
        global test_accuracies_, test_accuracies_ts, test_accuracies_rs
        test_accuracies_, test_accuracies_ts, test_accuracies_rs = 0, 0, 0

        for k in range(10, 25):
            i = k / 100
            for j in range(1, 100):
                X_train, X_test, y_train, y_test = tts(X, y[X.index], test_size=i, random_state=j)
                model = model
                model.fit(X_train, y_train)  # model fitting
                y_pred_train = model.predict(X_train)  # model prediction for train
                y_pred_test = model.predict(X_test)  # model prediction for test
                # y_pred_proba_train = model.predict_proba(X_train)
                # y_pred_proba_test = model.predict_proba(X_test)

                unique_classes = np.unique(y_train)

                # Determine the average method
                if len(unique_classes) == 2:
                    # Binary classification
                    # print("Using 'binary' average for binary classification.")
                    average_method = 'binary'
                elif len(unique_classes) != 2:
                    # Determine the distribution of the target column
                    class_counts = np.bincount(y_train)

                    # Check if the dataset is imbalanced
                    imbalance_ratio = max(class_counts) / min(class_counts)

                    if imbalance_ratio > 1.5:
                        # Imbalanced dataset
                        # print("Using 'weighted' average due to imbalanced dataset.")
                        average_method = 'weighted'
                    else:
                        # Balanced dataset
                        # print("Using 'macro' average due to balanced dataset.")
                        average_method = 'macro'
                # F1 scores
                train_f1_scores = (f1_score(y_train, y_pred_train, average=average_method))

                test_f1_scores = (f1_score(y_test, y_pred_test, average=average_method))

                # Accuracies
                train_accuracies = (accuracy_score(y_train, y_pred_train))
                test_accuracies = (accuracy_score(y_test, y_pred_test))
                if test_accuracies_ < test_accuracies:
                    test_accuracies_, test_accuracies_ts, test_accuracies_rs = test_accuracies, i, j
        X_train, X_test, y_train, y_test = tts(X, y[X.index], test_size=test_accuracies_ts, random_state=test_accuracies_rs)
        print(f"test_size = {test_accuracies_ts}, random_state = {test_accuracies_rs}")

        evaluationer.evaluation("best_tts", X_train, X_test, y_train, y_test, model, root_mean_squared_error, eva)

        return evaluationer.classification_evaluation_df, X_train, X_test, y_train, y_test
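best_tts brute-forces train/test splits: test_size from 0.10 to 0.24 and random_state from 1 to 99, keeping whichever split gives the best hold-out score. A compact sketch of the same idea on synthetic data (make_regression and LinearRegression are stand-ins here, not the app's selected models):

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=300, n_features=5, noise=10, random_state=0)
best = (-float("inf"), None, None)  # (score, test_size, random_state)
for k in range(10, 25):
    ts = k / 100
    for rs in range(1, 100):
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=ts, random_state=rs)
        score = r2_score(y_te, LinearRegression().fit(X_tr, y_tr).predict(X_te))
        if score > best[0]:
            best = (score, ts, rs)
print(f"best R2={best[0]:.3f} at test_size={best[1]}, random_state={best[2]}")

Note that picking the split this way optimistically biases the reported hold-out score; the sketch mirrors the module's behaviour rather than endorsing it.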
evaluationer.py
ADDED
@@ -0,0 +1,151 @@
1 |
+
# importing libraries
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
from sklearn.model_selection import train_test_split
|
5 |
+
from sklearn.metrics import root_mean_squared_error,r2_score,mean_squared_error,root_mean_squared_log_error,mean_absolute_error,mean_squared_log_error
|
6 |
+
from sklearn.metrics import f1_score, accuracy_score, precision_score,recall_score, average_precision_score
|
7 |
+
# creating a class for evaluation
|
8 |
+
|
9 |
+
reg_evaluation_df = pd.DataFrame({"evaluation_df_method" :[],
|
10 |
+
"model": [],# model displays regression model
|
11 |
+
"method": [],# method display evaluation metrics used
|
12 |
+
"train_r2": [],# train r2 shows train R2 score
|
13 |
+
"test_r2": [],# test r2 shows test R2 Score
|
14 |
+
"adjusted_r2_train": [],# adjusted_r2_train shows adjusted r2 score for train
|
15 |
+
"adjusted_r2_test": [],# adjusted_r2_test shows adjusted r2 score for test
|
16 |
+
"train_evaluation": [],# train_evaluation shows train evaluation score by used method
|
17 |
+
"test_evaluation" : []# test_evaluation shows test evaluation score by used method
|
18 |
+
})
|
19 |
+
|
20 |
+
classification_evaluation_df = pd.DataFrame({"evaluation_df_method" :[],
|
21 |
+
'model': [],
|
                                                  'train_f1': [],
                                                  'test_f1': [],
                                                  'train_acc': [],
                                                  'test_acc': [],
                                                  'precision_train': [],
                                                  'precision_test': [],
                                                  'recall_train': [],
                                                  'recall_test': []
                                                  })

# function for evaluating a model and recording the results in the evaluation dataframes
def evaluation(evaluation_df_method, X_train, X_test, y_train, y_test, model, method, eva):
    # inputs: data from train_test_split, the model, the error metric and the evaluation mode ("reg" or "class")
    global y_pred_train, y_pred_test, y_pred_proba_train, y_pred_proba_test

    model.fit(X_train, y_train)            # model fitting
    y_pred_train = model.predict(X_train)  # model prediction for train
    y_pred_test = model.predict(X_test)    # model prediction for test

    if eva == "reg":

        train_r2 = r2_score(y_train, y_pred_train)  # r2 score for train
        test_r2 = r2_score(y_test, y_pred_test)     # r2 score for test

        n_r_train, n_c_train = X_train.shape  # number of rows and columns of train data
        n_r_test, n_c_test = X_test.shape     # number of rows and columns of test data

        adj_r2_train = 1 - ((1 - train_r2) * (n_r_train - 1) / (n_r_train - n_c_train - 1))  # adjusted r2 for train
        adj_r2_test = 1 - ((1 - test_r2) * (n_r_test - 1) / (n_r_test - n_c_test - 1))       # adjusted r2 for test

        train_evaluation = method(y_train, y_pred_train)  # train error
        test_evaluation = method(y_test, y_pred_test)     # test error

        if method == root_mean_squared_error:
            a = "root_mean_squared_error"
        elif method == root_mean_squared_log_error:
            a = "root_mean_squared_log_error"
        elif method == mean_absolute_error:
            a = "mean_absolute_error"
        elif method == mean_squared_error:
            a = "mean_squared_error"
        elif method == mean_squared_log_error:
            a = "mean_squared_log_error"

        # declaring global dataframes
        global reg_evaluation_df, temp_df

        # temporary dataframe, concatenated later into the main evaluation dataframe
        temp_df = pd.DataFrame({"evaluation_df_method": [evaluation_df_method],
                                "model": [model],
                                "method": [a],
                                "train_r2": [train_r2],
                                "test_r2": [test_r2],
                                "adjusted_r2_train": [adj_r2_train],
                                "adjusted_r2_test": [adj_r2_test],
                                "train_evaluation": [train_evaluation],
                                "test_evaluation": [test_evaluation]})
        reg_evaluation_df = pd.concat([reg_evaluation_df, temp_df]).reset_index(drop=True)

        # return reg_evaluation_df  # returning evaluation_df

    elif eva == "class":

        # y_pred_proba_train = model.predict_proba(X_train)
        # y_pred_proba_test = model.predict_proba(X_test)

        unique_classes = np.unique(y_train)

        # Determine the average method
        if len(unique_classes) == 2:
            # Binary classification
            print("Using 'binary' average for binary classification.")
            average_method = 'binary'
        else:
            # Determine the distribution of the target column
            class_counts = np.bincount(y_train)

            # Check if the dataset is imbalanced
            imbalance_ratio = max(class_counts) / min(class_counts)

            if imbalance_ratio > 1.5:
                # Imbalanced dataset
                print("Using 'weighted' average due to imbalanced dataset.")
                average_method = 'weighted'
            else:
                # Balanced dataset
                print("Using 'macro' average due to balanced dataset.")
                average_method = 'macro'

        # F1 scores
        train_f1_scores = f1_score(y_train, y_pred_train, average=average_method)
        test_f1_scores = f1_score(y_test, y_pred_test, average=average_method)

        # Accuracies
        train_accuracies = accuracy_score(y_train, y_pred_train)
        test_accuracies = accuracy_score(y_test, y_pred_test)

        # Precisions
        train_precisions = precision_score(y_train, y_pred_train, average=average_method)
        test_precisions = precision_score(y_test, y_pred_test, average=average_method)

        # Recalls
        train_recalls = recall_score(y_train, y_pred_train, average=average_method)
        test_recalls = recall_score(y_test, y_pred_test, average=average_method)

        # declaring global dataframes
        global classification_evaluation_df, temp_df1

        # temporary dataframe, concatenated later into the main evaluation dataframe
        temp_df1 = pd.DataFrame({"evaluation_df_method": [evaluation_df_method],
                                 'model': [model],
                                 'train_f1': [train_f1_scores],
                                 'test_f1': [test_f1_scores],
                                 'train_acc': [train_accuracies],
                                 'test_acc': [test_accuracies],
                                 'precision_train': [train_precisions],
                                 'precision_test': [test_precisions],
                                 'recall_train': [train_recalls],
                                 'recall_test': [test_recalls]})
        classification_evaluation_df = pd.concat([classification_evaluation_df, temp_df1]).reset_index(drop=True)

        return classification_evaluation_df  # returning evaluation_df


global method_df
method_df = pd.DataFrame(data=[root_mean_squared_error, root_mean_squared_log_error, mean_absolute_error, mean_squared_error, mean_squared_log_error],
                         index=["root_mean_squared_error", "root_mean_squared_log_error", "mean_absolute_error", "mean_squared_error", "mean_squared_log_error"])
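A minimal usage sketch for `evaluation` above (hypothetical data and column names; it assumes the metric imports and the empty `reg_evaluation_df` defined near the top of this module are in scope):

# hypothetical usage sketch, not part of the uploaded file
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import mean_absolute_error
import evaluationer

df = pd.DataFrame({"x1": range(100), "x2": range(100, 200), "target": range(50, 150)})
X_train, X_test, y_train, y_test = tts(df[["x1", "x2"]], df["target"],
                                       test_size=0.2, random_state=42)

# appends one row of R2 / adjusted R2 / MAE results to evaluationer.reg_evaluation_df
evaluationer.evaluation("baseline", X_train, X_test, y_train, y_test,
                        LinearRegression(), mean_absolute_error, eva="reg")
print(evaluationer.reg_evaluation_df)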
feature_selections.py
ADDED
@@ -0,0 +1,104 @@
from sklearn.feature_selection import mutual_info_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
import statsmodels.api as sm
import pandas as pd
import numpy as np
import evaluationer
import streamlit as st
from sklearn.metrics import root_mean_squared_error

def feature_selection(X_train, X_test, y_train, y_test, model_reg, alpha=0.05):

    st.write("y_train", y_train)
    st.write("X_train", X_train)

    # OLS summary: columns with insignificant p-values or near-zero coefficients
    model = sm.OLS(y_train, sm.add_constant(X_train))
    model_fit = model.fit()
    pval_cols = model_fit.pvalues[model_fit.pvalues > 0.05].index.tolist()
    coef_cols = model_fit.params[abs(model_fit.params) < 0.001].index.tolist()
    pval_and_coef_cols = list(set(coef_cols) | set(pval_cols))

    # mutual information: columns with zero MI score
    mi_scores = mutual_info_regression(X_train, y_train)
    mi = pd.DataFrame()
    mi["col_name"] = X_train.columns
    mi["mi_score"] = mi_scores
    mi_cols = mi[mi.mi_score == 0].col_name.values.tolist()

    # correlation: columns highly correlated with another column (upper / lower triangle)
    corr = X_train.corr()
    corru = pd.DataFrame(np.triu(corr), columns=corr.columns, index=corr.index)
    corr_u_cols = corru[corru[(corru > 0.5) & (corru < 1)].any()].index.tolist()

    corrl = pd.DataFrame(np.tril(corr), columns=corr.columns, index=corr.index)
    corr_l_cols = corrl[corrl[(corrl > 0.5) & (corrl < 1)].any()].index.tolist()

    # calculating VIF
    X_new_vif = sm.add_constant(X_train)
    vif = pd.DataFrame()
    vif["variables"] = X_new_vif.columns
    vif["VIF"] = [variance_inflation_factor(X_new_vif.values, i) for i in range(X_new_vif.shape[1])]
    st.write("VIF (with constant)", vif)
    # drop the constant row from the VIF table
    if len(vif[vif["variables"] == "const"]) == 1:
        vif = vif.drop(index=(vif[vif["variables"] == "const"].index[0]))
    st.write("VIF (constant dropped)", vif)
    vif_cols = vif[vif.VIF > 10].variables.tolist()

    # lasso: columns whose coefficient is shrunk to zero
    if alpha == "best":
        lasso_len = []
        alpha_i = []
        for i in range(1, 1000, 5):
            j = i / 10000
            model_lasso = Lasso(alpha=j)
            model_lasso.fit(X_train, y_train)
            col_df = pd.DataFrame({
                "col_name": X_train.columns,
                "lasso_coef": model_lasso.coef_
            })
            a = len(col_df[col_df.lasso_coef == 0])
            lasso_len.append(a)
            alpha_i.append(j)
        for i in zip(lasso_len, alpha_i):
            print(i)
        input_alpha = float(input("enter alpha"))
        model_lasso = Lasso(alpha=input_alpha)
        model_lasso.fit(X_train, y_train)
        col_df = pd.DataFrame({
            "col_name": X_train.columns,
            "lasso_coef": model_lasso.coef_
        })
        lasso_cols = col_df[col_df.lasso_coef == 0].col_name.tolist()
    else:
        model_lasso = Lasso(alpha=alpha)
        model_lasso.fit(X_train, y_train)
        col_df = pd.DataFrame({
            "col_name": X_train.columns,
            "lasso_coef": model_lasso.coef_
        })
        lasso_cols = col_df[col_df.lasso_coef == 0].col_name.tolist()

    feature_cols = [pval_cols, coef_cols, pval_and_coef_cols, mi_cols, corr_u_cols, corr_l_cols, vif_cols, lasso_cols]

    for col in feature_cols:
        try:
            st.write(f"{col}", X_train.drop(columns=col))
        except Exception:
            st.write(f"error in columns {col}")
    feature_cols_name = ["pval_cols", "coef_cols", "pval_and_coef_cols", "mi_cols", "corr_u_cols", "corr_l_cols", "vif_cols", "lasso_cols"]
    st.write("vif_cols", vif_cols)
    for i, j in enumerate(feature_cols):
        evaluationer.evaluation(f"{feature_cols_name[i]} dropped", X_train.drop(columns=j), X_test.drop(columns=j), y_train, y_test, model_reg, method=root_mean_squared_error, eva="reg")
    return evaluationer.reg_evaluation_df
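For orientation, a minimal call sketch (hypothetical data; it assumes the `evaluationer` module above is importable and is most useful inside a Streamlit session, since the function calls `st.write`):

# hypothetical usage sketch, not part of the uploaded file
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split as tts
import feature_selections

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 4)), columns=["a", "b", "c", "d"])
y = 5 + 3 * X["a"] - 2 * X["b"] + rng.normal(scale=0.1, size=200)

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)

# evaluates model_reg after dropping each candidate column set (p-values, MI, correlation, VIF, Lasso)
results = feature_selections.feature_selection(X_train, X_test, y_train, y_test,
                                               model_reg=LinearRegression(), alpha=0.05)
print(results)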
models.py
ADDED
@@ -0,0 +1,70 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import algorithms for classification
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
# import algorithms for regression
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from sklearn.naive_bayes import GaussianNB

# dictionary where keys are algorithm names and values are classifier instances
algos_class = {
    "Logistic Regression": LogisticRegression(),
    "SGD Classifier": SGDClassifier(),
    "Ridge Classifier": RidgeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "Hist Gradient Boosting Classifier": HistGradientBoostingClassifier(),
    "K Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "SVC": SVC(),
    "XGB Classifier": XGBClassifier(),
    "XGBRF Classifier": XGBRFClassifier(),
    "MLP Classifier": MLPClassifier(),
    "LGBM Classifier": LGBMClassifier(),
    "Multinomial Naive Bayes": MultinomialNB(),
    "Categorical Naive Bayes": CategoricalNB()}

# dictionary where keys are algorithm names and values are regressor instances
algos_reg = {
    "Linear Regression": LinearRegression(),
    "SGD Regressor": SGDRegressor(),
    "Ridge Regressor": Ridge(),
    "Lasso Regressor": Lasso(),
    "ElasticNet Regressor": ElasticNet(),
    "Random Forest Regressor": RandomForestRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
    "Hist Gradient Boosting Regressor": HistGradientBoostingRegressor(),
    "K Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "SVR": SVR(),
    "XGB Regressor": XGBRegressor(),
    "XGBRF Regressor": XGBRFRegressor(),
    "MLP Regressor": MLPRegressor(),
    "LGBM Regressor": LGBMRegressor(),
    "Gaussian Naive Bayes": GaussianNB()}

# dataframes where the index is the algorithm name and the single (unnamed) column holds the algorithm instance

Classification_models = pd.DataFrame(data=algos_class.values(), index=algos_class.keys())

Regression_models = pd.DataFrame(data=algos_reg.values(), index=algos_reg.keys())
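A short sketch of how these lookup tables can be consumed (hypothetical loop; the estimator object sits in the first, unnamed column of each DataFrame):

# hypothetical usage sketch, not part of the uploaded file
import models

for name, row in models.Regression_models.iterrows():
    estimator = row.iloc[0]  # the scikit-learn / xgboost / lightgbm instance
    print(name, type(estimator).__name__)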
null_value_handling.py
ADDED
@@ -0,0 +1,49 @@
import pandas as pd
import streamlit as st
# import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.model_selection import train_test_split as tts
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
import evaluationer
# import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

# helpers for null value handling with different methods for null value imputing,
# categorical column encoding and evaluation

null_value_handling_method_num_cols = ["KNN Imputed", "SI Mean Imputed", "SI Median Imputed", "SI Most Frequent Imputed", "Iter Imputed"]
null_value_handling_method_cat_cols = ["SI Most Frequent Imputed (categorical)"]

# dict of null value handling methods for numerical columns
dict1 = {"KNN Imputed": KNNImputer(n_neighbors=5),
         "SI Mean Imputed": SimpleImputer(strategy="mean"),
         "SI Median Imputed": SimpleImputer(strategy="median"),
         "SI Most Frequent Imputed": SimpleImputer(strategy="most_frequent"),
         "Iter Imputed": IterativeImputer(max_iter=200, random_state=42)}

# dict of null value handling methods for categorical columns
dict2 = {"SI Most Frequent Imputed (categorical)": SimpleImputer(strategy="most_frequent")}

# creating dataframes from dict1 and dict2
num_nvh_method_df = pd.DataFrame(data=dict1.values(), index=dict1.keys())
cat_nvh_method_df = pd.DataFrame(data=dict2.values(), index=dict2.keys())

num_imputed_dict = {"KNN Imputed": [], "SI Mean Imputed": [], "SI Median Imputed": [], "SI Most Frequent Imputed": [], "Iter Imputed": []}

cat_imputed_dict = {"SI Most Frequent Imputed (categorical)": [], "Iter Imputed": []}

num_imputed_df = pd.DataFrame(data=num_imputed_dict.values(), index=num_imputed_dict.keys())

cat_imputed_df = pd.DataFrame(data=cat_imputed_dict.values(), index=cat_imputed_dict.keys())

final_df = []

def null_handling(X, clean_num_nvh_df, clean_num_nvh_df_cat):
    num_nvh_method = clean_num_nvh_df.columns      # e.g. "KNN Imputed", "SI Mean Imputed", "SI Median Imputed"
    cat_nvh_method = clean_num_nvh_df_cat.columns
    for method in num_nvh_method:
        X[clean_num_nvh_df[method].dropna().values] = num_nvh_method_df.loc[method].values[0].fit_transform(X[clean_num_nvh_df[method].dropna().values])

    for method in cat_nvh_method:
        X[clean_num_nvh_df_cat[method].dropna().values] = cat_nvh_method_df.loc[method].values[0].fit_transform(X[clean_num_nvh_df_cat[method].dropna().values])

    final_df = X

    return final_df
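A minimal sketch of the expected inputs (hypothetical data and frame layout, inferred from the function body: each column of `clean_num_nvh_df` / `clean_num_nvh_df_cat` is a method name and its values are the columns that method should impute):

# hypothetical usage sketch, not part of the uploaded file
import numpy as np
import pandas as pd
import null_value_handling

X = pd.DataFrame({"age": [25, np.nan, 40, 31],
                  "fare": [7.5, 8.0, np.nan, 9.2],
                  "city": ["a", None, "b", "a"]})

# which numeric columns each numeric method should impute
clean_num_nvh_df = pd.DataFrame({"SI Mean Imputed": ["age"], "SI Median Imputed": ["fare"]})
# which categorical columns the categorical method should impute
clean_num_nvh_df_cat = pd.DataFrame({"SI Most Frequent Imputed (categorical)": ["city"]})

X_imputed = null_value_handling.null_handling(X, clean_num_nvh_df, clean_num_nvh_df_cat)
print(X_imputed.isna().sum())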
outliers.py
ADDED
@@ -0,0 +1,233 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import root_mean_squared_error
from scipy.stats import yeojohnson
import evaluationer
from sklearn.model_selection import train_test_split as tts

def detect_outliers(df, num_cols):
    global outlier_df, zscore_cols, outlier_indexes, iqr_cols
    outlier_df = pd.DataFrame({"method": [], "columns name": [], "upper limit": [],
                               "lower limit": [], "no of Rows": [], "percentage outlier": []})
    if type(num_cols) == list:
        if len(num_cols) != 0:
            num_cols = num_cols
        else:
            num_cols = df.select_dtypes(exclude="object").columns.tolist()
    else:
        if num_cols.tolist() != None:
            num_cols = num_cols
        else:
            num_cols = df.select_dtypes(exclude="object").columns.tolist()
    zscore_cols = []
    iqr_cols = []
    outlier_indexes = []
    # roughly symmetric columns use the z-score rule, skewed columns use the IQR rule
    for col in num_cols:
        skewness = df[col].skew()
        if -0.5 <= skewness <= 0.5:
            method = "zscore"
            zscore_cols.append(col)
        else:
            method = "iqr"
            iqr_cols.append(col)

    if len(zscore_cols) > 0:
        for col in zscore_cols:
            mean = df[col].mean()
            std = df[col].std()
            ul = mean + (3 * std)
            ll = mean - (3 * std)
            mask = (df[col] < ll) | (df[col] > ul)
            temp = df[mask]

            Zscore_index = temp.index.tolist()
            outlier_indexes.extend(Zscore_index)

            if len(temp) > 0:
                temp_df = pd.DataFrame({"method": ["ZScore"],
                                        "columns name": [col],
                                        "upper limit": [round(ul, 2)],
                                        "lower limit": [round(ll, 2)],
                                        "no of Rows": [len(temp)],
                                        "percentage outlier": [round(len(temp) * 100 / len(df), 2)]})
                outlier_df = pd.concat([outlier_df, temp_df]).reset_index(drop=True)
    else:
        print("No columns for Zscore method")

    if len(iqr_cols) > 0:
        for col in iqr_cols:
            q3 = df[col].quantile(.75)
            q1 = df[col].quantile(.25)
            IQR = q3 - q1
            ul = q3 + 1.5 * IQR
            ll = q1 - 1.5 * IQR
            mask = (df[col] < ll) | (df[col] > ul)
            temp = df[mask]

            IQR_index = temp.index.tolist()
            outlier_indexes.extend(IQR_index)

            if len(temp) > 0:
                temp_df1 = pd.DataFrame({"method": ["IQR"],
                                         "columns name": [col],
                                         "upper limit": [round(ul, 2)],
                                         "lower limit": [round(ll, 2)],
                                         "no of Rows": [len(temp)],
                                         "percentage outlier": [round((len(temp) * 100 / len(df)), 2)]})
                outlier_df = pd.concat([outlier_df, temp_df1]).reset_index(drop=True)
    else:
        print("No columns for IQR method")

    outlier_indexes = list(set(outlier_indexes))

    return outlier_df, outlier_indexes


def outlier_handling(df, y, model, outlier_indexes=[], outlier_cols=None, method=root_mean_squared_error, test_size=0.2, random_state=42, eva="reg"):
    num_col = df.select_dtypes(exclude="O").columns

    global outliers_dropped_df, log_transformed_df, sqrt_transformed_df, yeo_johnson_transformed_df, rank_transformed_df
    global std_scaler_df, winsorize_transformed_df, inverse_log_transformed_winsorize_df, inverse_sqrt_transformed_winsorize_df, minmaxscaler_df
    if eva == "reg":
        if len(outlier_indexes) == 0:
            print("no outlier indexes passed")
            outliers_dropped_df = df.copy()
        else:
            outliers_dropped_df = df.drop(index=outlier_indexes)

        if outlier_cols != None:

            if df[outlier_cols][df[outlier_cols] < 0].sum().sum() == 0:
                log_transformed_df = df.copy()
                log_transformed_df[outlier_cols] = np.log(log_transformed_df[outlier_cols] + 1e-5)
                sqrt_transformed_df = df.copy()
                sqrt_transformed_df[outlier_cols] = np.sqrt(sqrt_transformed_df[outlier_cols] + 1e-5)
                inverse_log_transformed_winsorize_df = log_transformed_df.copy()
                inverse_sqrt_transformed_winsorize_df = sqrt_transformed_df.copy()
                for column in outlier_cols:
                    inverse_log_transformed_winsorize_df[column] = np.exp(winsorize(inverse_log_transformed_winsorize_df[column], limits=[0.05, 0.05]))
                    inverse_sqrt_transformed_winsorize_df[column] = (winsorize(inverse_sqrt_transformed_winsorize_df[column], limits=[0.05, 0.05])) ** 2
            else:
                print("df has values less than zero")
            std_scaler_df = df.copy()
            std_scaler_df[outlier_cols] = StandardScaler().fit_transform(std_scaler_df[outlier_cols])

            minmaxscaler_df = df.copy()
            minmaxscaler_df[outlier_cols] = MinMaxScaler().fit_transform(minmaxscaler_df[outlier_cols])

            yeo_johnson_transformed_df = df.copy()
            for column in outlier_cols:
                try:
                    yeo_johnson_transformed_df[column], lambda_ = yeojohnson(yeo_johnson_transformed_df[column])
                except:
                    yeo_johnson_transformed_df[column] = yeo_johnson_transformed_df[column]
                    print(f"Yeo-Johnson transformation failed for column '{column}'. Original data used.")
            rank_transformed_df = df.copy()
            rank_transformed_df[outlier_cols] = rank_transformed_df[outlier_cols].rank()
            winsorize_transformed_df = df.copy()
            for column in outlier_cols:
                winsorize_transformed_df[column] = winsorize(winsorize_transformed_df[column], limits=[0.05, 0.05])

        else:

            if df[num_col][df[num_col] < 0].sum().sum() == 0:
                log_transformed_df = df.copy()
                log_transformed_df[num_col] = np.log(log_transformed_df[num_col] + 1e-5)
                sqrt_transformed_df = df.copy()
                sqrt_transformed_df[num_col] = np.sqrt(sqrt_transformed_df[num_col] + 1e-5)
                inverse_log_transformed_winsorize_df = log_transformed_df.copy()
                inverse_sqrt_transformed_winsorize_df = sqrt_transformed_df.copy()
                for column in num_col:
                    inverse_log_transformed_winsorize_df[column] = np.exp(winsorize(inverse_log_transformed_winsorize_df[column], limits=[0.05, 0.05]))
                    inverse_sqrt_transformed_winsorize_df[column] = (winsorize(inverse_sqrt_transformed_winsorize_df[column], limits=[0.05, 0.05])) ** 2
            else:
                print("df has values less than zero")

            std_scaler_df = df.copy()
            std_scaler_df[num_col] = StandardScaler().fit_transform(std_scaler_df[num_col])

            minmaxscaler_df = df.copy()
            minmaxscaler_df[num_col] = MinMaxScaler().fit_transform(minmaxscaler_df[num_col])

            yeo_johnson_transformed_df = df.copy()
            for column in num_col:
                try:
                    yeo_johnson_transformed_df[column], lambda_ = yeojohnson(yeo_johnson_transformed_df[column])
                except:
                    yeo_johnson_transformed_df[column] = yeo_johnson_transformed_df[column]
                    print(f"Yeo-Johnson transformation failed for column '{column}'. Original data used.")
            rank_transformed_df = df.copy()
            rank_transformed_df[num_col] = rank_transformed_df[num_col].rank()
            winsorize_transformed_df = df.copy()
            for column in num_col:
                winsorize_transformed_df[column] = winsorize(winsorize_transformed_df[column], limits=[0.05, 0.05])

        if (df[num_col][df[num_col] < 0].sum().sum() == 0):
            outlier_handled_df = [std_scaler_df, minmaxscaler_df, outliers_dropped_df, log_transformed_df, sqrt_transformed_df, yeo_johnson_transformed_df,
                                  rank_transformed_df, winsorize_transformed_df, inverse_log_transformed_winsorize_df, inverse_sqrt_transformed_winsorize_df]

            outlier_handled_df_name = ["std_scaler_df", "minmaxscaler_df", "outliers_dropped_df", "log_transformed_df", "sqrt_transformed_df", "yeo_johnson_transformed_df",
                                       "rank_transformed_df", "winsorize_transformed_df", "inverse_log_transformed_winsorize_df", "inverse_sqrt_transformed_winsorize_df"]
        elif df[outlier_cols][df[outlier_cols] < 0].sum().sum() == 0:
            outlier_handled_df = [std_scaler_df, minmaxscaler_df, outliers_dropped_df, log_transformed_df, sqrt_transformed_df, yeo_johnson_transformed_df,
                                  rank_transformed_df, winsorize_transformed_df, inverse_log_transformed_winsorize_df, inverse_sqrt_transformed_winsorize_df]

            outlier_handled_df_name = ["std_scaler_df", "minmaxscaler_df", "outliers_dropped_df", "log_transformed_df", "sqrt_transformed_df", "yeo_johnson_transformed_df",
                                       "rank_transformed_df", "winsorize_transformed_df", "inverse_log_transformed_winsorize_df", "inverse_sqrt_transformed_winsorize_df"]
        else:
            outlier_handled_df = [std_scaler_df, minmaxscaler_df, outliers_dropped_df, yeo_johnson_transformed_df, rank_transformed_df, winsorize_transformed_df]

            outlier_handled_df_name = ["std_scaler_df", "minmaxscaler_df", "outliers_dropped_df", "yeo_johnson_transformed_df", "rank_transformed_df", "winsorize_transformed_df"]

        for j, i in enumerate(outlier_handled_df):
            X_train, X_test, y_train, y_test = tts(i, y[i.index], test_size=test_size, random_state=random_state)
            evaluationer.evaluation(f"{outlier_handled_df_name[j]}", X_train, X_test, y_train, y_test, model, root_mean_squared_error, eva)

        return evaluationer.reg_evaluation_df, outlier_handled_df, outlier_handled_df_name

    elif eva == "class":

        std_scaler_df = df.copy()
        std_scaler_df.loc[:, :] = StandardScaler().fit_transform(std_scaler_df.loc[:, :])

        minmaxscaler_df = df.copy()
        minmaxscaler_df.loc[:, :] = MinMaxScaler().fit_transform(minmaxscaler_df.loc[:, :])

        rank_transformed_df = df.copy()
        rank_transformed_df = rank_transformed_df.rank()

        outlier_handled_df = [std_scaler_df, minmaxscaler_df, rank_transformed_df]
        outlier_handled_df_name = ["std_scaler_df", "minmaxscaler_df", "rank_transformed_df"]

        for j, i in enumerate(outlier_handled_df):
            X_train, X_test, y_train, y_test = tts(i, y[i.index], test_size=test_size, random_state=random_state)
            evaluationer.evaluation(f"{outlier_handled_df_name[j]}", X_train, X_test, y_train, y_test, model, root_mean_squared_error, eva="class")
        return evaluationer.classification_evaluation_df, outlier_handled_df, outlier_handled_df_name
        # returning evaluation dataframe
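A compact sketch tying the two helpers together (hypothetical data; it assumes the `evaluationer` module above so the shared `reg_evaluation_df` exists, and passes `outlier_cols=None` so the whole-numeric-frame branch runs):

# hypothetical usage sketch, not part of the uploaded file
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import outliers

rng = np.random.default_rng(1)
df = pd.DataFrame({"f1": rng.normal(50, 5, 300), "f2": rng.exponential(2.0, 300)})
y = pd.Series(2 * df["f1"] + df["f2"] + rng.normal(0, 1, 300))

# per-column outlier report plus the row indexes flagged as outliers
summary, idx = outliers.detect_outliers(df, num_cols=df.columns)

# evaluates the model on each transformed copy of the data and returns the evaluation table
results, frames, names = outliers.outlier_handling(df, y, LinearRegression(),
                                                   outlier_indexes=idx, eva="reg")
print(results[["evaluation_df_method", "test_evaluation"]])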
requirements.txt
ADDED
@@ -0,0 +1,10 @@
streamlit==1.34.0
joblib==1.4.2
numpy==1.26.4
pandas==2.2.2
scikit-learn==1.4.2
seaborn==0.13.2
matplotlib==3.9.0
xgboost==2.0.3
lightgbm==4.3.0
statsmodels==0.14.2