Wilson-ZheLin committed
Commit 9183c57 • 1 Parent(s): 7c98744

Initial commit

.gitignore ADDED
@@ -0,0 +1,7 @@
+ *.pyc
+ *.swp
+ *.swo
+ *.DS_Store
+ *.ipynb_checkpoints
+ data/
+ test.py
app/__init__.py ADDED
File without changes
app/app.py ADDED
@@ -0,0 +1,109 @@
+ import time
+ import streamlit as st
+ from streamlit_lottie import st_lottie
+ from util import load_lottie, stream_data, welcome_message, introduction_message
+ from prediction_model import prediction_model_pipeline
+ from cluster_model import cluster_model_pipeline
+ from regression_model import regression_model_pipeline
+ from visualization import data_visualization
+ from src.util import read_file_from_streamlit
+
+ st.set_page_config(page_title="Streamline Analyst", page_icon=":rocket:", layout="wide")
+
+ # TITLE SECTION
+ with st.container():
+     st.subheader("Hello there 👋")
+     st.title("Welcome to Streamline Analyst!")
+     if 'initialized' not in st.session_state:
+         st.session_state.initialized = True
+     if st.session_state.initialized:
+         st.session_state.welcome_message = welcome_message()
+         st.write(stream_data(st.session_state.welcome_message))
+         time.sleep(0.5)
+         st.write("[Github > ](https://github.com/Wilson-ZheLin/Streamline-Analyst)")
+         st.session_state.initialized = False
+     else:
+         st.write(st.session_state.welcome_message)
+         st.write("[Github > ](https://github.com/Wilson-ZheLin/Streamline-Analyst)")
+
+ # INTRO SECTION
+ with st.container():
+     st.divider()
+     if 'lottie' not in st.session_state:
+         st.session_state.lottie_url1, st.session_state.lottie_url2 = load_lottie()
+         st.session_state.lottie = True
+
+     left_column_r1, right_column_r1 = st.columns([6, 4])
+     with left_column_r1:
+         st.header("What can Streamline Analyst do?")
+         st.write(introduction_message()[0])
+     with right_column_r1:
+         if st.session_state.lottie:
+             st_lottie(st.session_state.lottie_url1, height=280, key="animation1")
+
+     left_column_r2, _, right_column_r2 = st.columns([6, 1, 5])
+     with left_column_r2:
+         if st.session_state.lottie:
+             st_lottie(st.session_state.lottie_url2, height=200, key="animation2")
+     with right_column_r2:
+         st.header("Simple to Use")
+         st.write(introduction_message()[1])
+
+ # MAIN SECTION
+ with st.container():
+     st.divider()
+     st.header("Let's Get Started")
+     left_column, right_column = st.columns([6, 4])
+     with left_column:
+         API_KEY = st.text_input(
+             "Your API Key won't be stored or shared!",
+             placeholder="Enter your API key here...",
+         )
+         st.write("👆Your OpenAI API key:")
+         uploaded_file = st.file_uploader("Choose a data file. Your data won't be stored either!", accept_multiple_files=False, type=['csv', 'json', 'xls', 'xlsx'])
+         if uploaded_file:
+             if uploaded_file.getvalue():
+                 uploaded_file.seek(0)
+                 st.session_state.DF_uploaded = read_file_from_streamlit(uploaded_file)
+                 st.session_state.is_file_empty = False
+             else:
+                 st.session_state.is_file_empty = True
+
+     with right_column:
+         SELECTED_MODEL = st.selectbox(
+             'Which OpenAI model do you want to use?',
+             ('GPT-4-Turbo', 'GPT-3.5-Turbo'))
+
+         MODE = st.selectbox(
+             'Select proper data analysis mode',
+             ('Predictive Classification', 'Clustering Model', 'Regression Model', 'Data Visualization'))
+
+         st.write(f'Model selected: :green[{SELECTED_MODEL}]')
+         st.write(f'Data analysis mode: :green[{MODE}]')
+
+     # Proceed Button
+     is_proceed_enabled = (uploaded_file is not None and API_KEY != "") or (uploaded_file is not None and MODE == "Data Visualization")
+
+     # Initialize the 'button_clicked' state
+     if 'button_clicked' not in st.session_state:
+         st.session_state.button_clicked = False
+     if st.button('Start Analysis', disabled=(not is_proceed_enabled) or st.session_state.button_clicked, type="primary"):
+         st.session_state.button_clicked = True
+     if "is_file_empty" in st.session_state and st.session_state.is_file_empty:
+         st.caption('Your data file is empty!')
+
+     # Start Analysis
+     if st.session_state.button_clicked:
+         GPT_MODEL = 4 if SELECTED_MODEL == 'GPT-4-Turbo' else 3.5
+         with st.container():
+             if "DF_uploaded" not in st.session_state:
+                 st.error("File is empty!")
+             else:
+                 if MODE == 'Predictive Classification':
+                     prediction_model_pipeline(st.session_state.DF_uploaded, API_KEY, GPT_MODEL)
+                 elif MODE == 'Clustering Model':
+                     cluster_model_pipeline(st.session_state.DF_uploaded, API_KEY, GPT_MODEL)
+                 elif MODE == 'Regression Model':
+                     regression_model_pipeline(st.session_state.DF_uploaded, API_KEY, GPT_MODEL)
+                 elif MODE == 'Data Visualization':
+                     data_visualization(st.session_state.DF_uploaded)
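Note: app.py relies on src.util.read_file_from_streamlit to turn the uploaded CSV/JSON/Excel file into a DataFrame, but that helper is not part of the files shown in this commit view. A minimal sketch of such a reader, assuming it simply dispatches on the file extension with pandas (read_uploaded_file is an illustrative name, not the actual implementation):

import pandas as pd

def read_uploaded_file(uploaded_file):
    # Dispatch on the uploaded file's extension and return a pandas DataFrame.
    name = uploaded_file.name.lower()
    if name.endswith(".csv"):
        return pd.read_csv(uploaded_file)
    if name.endswith(".json"):
        return pd.read_json(uploaded_file)
    if name.endswith((".xls", ".xlsx")):
        return pd.read_excel(uploaded_file)
    raise ValueError(f"Unsupported file type: {name}")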
app/cluster_model.py ADDED
@@ -0,0 +1,285 @@
+ import streamlit as st
+ from util import developer_info, developer_info_static
+ from src.plot import plot_clusters, correlation_matrix_plotly
+ from src.handle_null_value import contains_missing_value, remove_high_null, fill_null_values
+ from src.preprocess import convert_to_numeric, remove_duplicates, transform_data_for_clustering
+ from src.llm_service import decide_fill_null, decide_encode_type, decide_cluster_model
+ from src.pca import decide_pca, perform_PCA_for_clustering
+ from src.model_service import save_model, calculate_silhouette_score, calculate_calinski_harabasz_score, calculate_davies_bouldin_score, gmm_predict, estimate_optimal_clusters
+ from src.cluster_model import train_select_cluster_model
+ from src.util import contain_null_attributes_info, separate_fill_null_list, check_all_columns_numeric, non_numeric_columns_and_head, separate_decode_list, get_cluster_method_name
+
+ def start_training_model():
+     st.session_state["start_training"] = True
+
+ def cluster_model_pipeline(DF, API_KEY, GPT_MODEL):
+     st.divider()
+     st.subheader('Data Overview')
+     if 'data_origin' not in st.session_state:
+         st.session_state.data_origin = DF
+     st.dataframe(st.session_state.data_origin.describe(), width=1200)
+
+     # Data Imputation
+     st.subheader('Handle and Impute Missing Values')
+     if "contain_null" not in st.session_state:
+         st.session_state.contain_null = contains_missing_value(st.session_state.data_origin)
+
+     if 'filled_df' not in st.session_state:
+         if st.session_state.contain_null:
+             with st.status("Processing **missing values** in the data...", expanded=True) as status:
+                 st.write("Filtering out high-frequency missing rows and columns...")
+                 filled_df = remove_high_null(DF)
+                 st.write("Large language model analysis...")
+                 attributes, types_info, description_info = contain_null_attributes_info(filled_df)
+                 fill_result_dict = decide_fill_null(attributes, types_info, description_info, GPT_MODEL, API_KEY)
+                 st.write("Imputing missing values...")
+                 mean_list, median_list, mode_list, new_category_list, interpolation_list = separate_fill_null_list(fill_result_dict)
+                 filled_df = fill_null_values(filled_df, mean_list, median_list, mode_list, new_category_list, interpolation_list)
+                 # Store the imputed DataFrame in session_state
+                 st.session_state.filled_df = filled_df
+                 DF = filled_df
+                 status.update(label='Missing value processing completed!', state="complete", expanded=False)
+             st.download_button(
+                 label="Download Data with Missing Values Imputed",
+                 data=st.session_state.filled_df.to_csv(index=False).encode('utf-8'),
+                 file_name="imputed_missing_values.csv",
+                 mime='text/csv')
+         else:
+             st.session_state.filled_df = DF
+             st.success("No missing values detected. Processing skipped.")
+     else:
+         st.success("Missing value processing completed!")
+         if st.session_state.contain_null:
+             st.download_button(
+                 label="Download Data with Missing Values Imputed",
+                 data=st.session_state.filled_df.to_csv(index=False).encode('utf-8'),
+                 file_name="imputed_missing_values.csv",
+                 mime='text/csv')
+
+     # Data Encoding
+     st.subheader("Process Data Encoding")
+     st.caption("*For considerations of processing time, **NLP features** like **TF-IDF** have not been included in the current pipeline; long text attributes may be dropped.")
+     if 'all_numeric' not in st.session_state:
+         st.session_state.all_numeric = check_all_columns_numeric(st.session_state.data_origin)
+
+     if 'encoded_df' not in st.session_state:
+         if not st.session_state.all_numeric:
+             with st.status("Encoding non-numeric data using **numeric mapping** and **one-hot**...", expanded=True) as status:
+                 non_numeric_attributes, non_numeric_head = non_numeric_columns_and_head(DF)
+                 st.write("Large language model analysis...")
+                 encode_result_dict = decide_encode_type(non_numeric_attributes, non_numeric_head, GPT_MODEL, API_KEY)
+                 st.write("Encoding the data...")
+                 convert_int_cols, one_hot_cols, drop_cols = separate_decode_list(encode_result_dict, "")
+                 encoded_df, mappings = convert_to_numeric(DF, convert_int_cols, one_hot_cols, drop_cols)
+                 # Store the encoded DataFrame in session_state
+                 st.session_state.encoded_df = encoded_df
+                 DF = encoded_df
+                 status.update(label='Data encoding completed!', state="complete", expanded=False)
+             st.download_button(
+                 label="Download Encoded Data",
+                 data=st.session_state.encoded_df.to_csv(index=False).encode('utf-8'),
+                 file_name="encoded_data.csv",
+                 mime='text/csv')
+         else:
+             st.session_state.encoded_df = DF
+             st.success("All columns are numeric. Processing skipped.")
+     else:
+         st.success("Data encoding completed using numeric mapping and one-hot!")
+         if not st.session_state.all_numeric:
+             st.download_button(
+                 label="Download Encoded Data",
+                 data=st.session_state.encoded_df.to_csv(index=False).encode('utf-8'),
+                 file_name="encoded_data.csv",
+                 mime='text/csv')
+
+     # Correlation Heatmap
+     if 'df_cleaned1' not in st.session_state:
+         st.session_state.df_cleaned1 = DF
+     st.subheader('Correlation Between Attributes')
+     st.plotly_chart(correlation_matrix_plotly(st.session_state.df_cleaned1))
+
+     # Remove duplicate entities
+     st.subheader('Remove Duplicate Entities')
+     if 'df_cleaned2' not in st.session_state:
+         st.session_state.df_cleaned2 = remove_duplicates(st.session_state.df_cleaned1)
+         # DF = remove_duplicates(DF)
+     st.info("Duplicate rows removed.")
+
+     # Data Transformation
+     st.subheader('Data Transformation')
+     if 'data_transformed' not in st.session_state:
+         st.session_state.data_transformed = transform_data_for_clustering(st.session_state.df_cleaned2)
+     st.success("Data transformed by standardization and Box-Cox if applicable.")
+
+     # PCA
+     st.subheader('Principal Component Analysis')
+     st.write("Deciding whether to perform PCA...")
+     if 'df_pca' not in st.session_state:
+         _, n_components = decide_pca(st.session_state.df_cleaned2)
+         st.session_state.df_pca = perform_PCA_for_clustering(st.session_state.data_transformed, n_components)
+     st.success("Completed!")
+
+     # Splitting and Balancing
+     if 'test_percentage' not in st.session_state:
+         st.session_state.test_percentage = 20
+     if 'balance_data' not in st.session_state:
+         st.session_state.balance_data = False
+     if "start_training" not in st.session_state:
+         st.session_state["start_training"] = False
+     if 'model_trained' not in st.session_state:
+         st.session_state['model_trained'] = False
+
+     splitting_column, balance_column = st.columns(2)
+     with splitting_column:
+         st.subheader(':grey[Data Splitting]')
+         st.caption('Data splitting is not applicable to clustering models.')
+         st.slider('Percentage of test set', 1, 25, st.session_state.test_percentage, key='test_percentage', disabled=True)
+
+     with balance_column:
+         st.metric(label="Test Data", value="--%", delta=None)
+         st.toggle('Class Balancing', value=st.session_state.balance_data, key='to_perform_balance', disabled=True)
+         st.caption('Class balancing is not applicable to clustering models.')
+
+     st.button("Start Training Model", on_click=start_training_model, type="primary", disabled=st.session_state['start_training'])
+
+     # Model Training
+     if st.session_state['start_training']:
+         with st.container():
+             st.header("Modeling")
+             if not st.session_state.get("data_prepared", False):
+                 st.session_state.X = st.session_state.df_pca
+                 st.session_state.data_prepared = True
+
+             # Decide model types:
+             if "decided_model" not in st.session_state:
+                 st.session_state["decided_model"] = False
+             if "all_set" not in st.session_state:
+                 st.session_state["all_set"] = False
+
+             if not st.session_state["decided_model"]:
+                 with st.spinner("Deciding models based on data..."):
+                     shape_info = str(st.session_state.X.shape)
+                     description_info = st.session_state.X.describe().to_csv()
+                     cluster_info = estimate_optimal_clusters(st.session_state.X)
+                     st.session_state.default_cluster = cluster_info
+                     model_dict = decide_cluster_model(shape_info, description_info, cluster_info, GPT_MODEL, API_KEY)
+                     model_list = list(model_dict.values())
+                     if 'model_list' not in st.session_state:
+                         st.session_state.model_list = model_list
+                     st.session_state.decided_model = True
+
+             # Display results
+             if st.session_state["decided_model"]:
+                 display_results(st.session_state.X)
+                 st.session_state["all_set"] = True
+
+             # Download models
+             if st.session_state["all_set"]:
+                 download_col1, download_col2, download_col3 = st.columns(3)
+                 with download_col1:
+                     st.download_button(label="Download Model", data=st.session_state.downloadable_model1, file_name=f"{st.session_state.model1_name}.joblib", mime="application/octet-stream")
+                 with download_col2:
+                     st.download_button(label="Download Model", data=st.session_state.downloadable_model2, file_name=f"{st.session_state.model2_name}.joblib", mime="application/octet-stream")
+                 with download_col3:
+                     st.download_button(label="Download Model", data=st.session_state.downloadable_model3, file_name=f"{st.session_state.model3_name}.joblib", mime="application/octet-stream")
+
+     # Footer
+     st.divider()
+     if "all_set" in st.session_state and st.session_state["all_set"]:
+         if "has_been_set" not in st.session_state:
+             st.session_state["has_been_set"] = True
+             developer_info()
+         else:
+             developer_info_static()
+
+ def display_results(X):
+     st.success("Models selected based on your data!")
+
+     # Data set metrics
+     st.metric(label="Total Data", value=len(X), delta=None)
+
+     # Model training
+     model_col1, model_col2, model_col3 = st.columns(3)
+     with model_col1:
+         if "model1_name" not in st.session_state:
+             st.session_state.model1_name = get_cluster_method_name(st.session_state.model_list[0])
+         st.subheader(st.session_state.model1_name)
+
+         # Slider for model parameters
+         if st.session_state.model_list[0] == 2:
+             st.caption('N-cluster is not applicable to DBSCAN.')
+         else:
+             st.caption(f'N-cluster for {st.session_state.model1_name}:')
+         n_clusters1 = st.slider('N clusters', 2, 20, st.session_state.default_cluster, label_visibility="collapsed", key='n_clusters1', disabled=st.session_state.model_list[0] == 2)
+
+         with st.spinner("Model training in progress..."):
+             st.session_state.model1 = train_select_cluster_model(X, n_clusters1, st.session_state.model_list[0])
+             st.session_state.downloadable_model1 = save_model(st.session_state.model1)
+
+         if st.session_state.model_list[0] != 3:
+             label1 = st.session_state.model1.labels_
+         else:
+             label1 = gmm_predict(X, st.session_state.model1)
+
+         # Visualization
+         st.pyplot(plot_clusters(X, label1))
+         # Model metrics
+         st.write(f"Silhouette score: ", f'\n:green[**{calculate_silhouette_score(X, label1)}**]')
+         st.write(f"Calinski-Harabasz score: ", f'\n:green[**{calculate_calinski_harabasz_score(X, label1)}**]')
+         st.write(f"Davies-Bouldin score: ", f'\n:green[**{calculate_davies_bouldin_score(X, label1)}**]')
+
+     with model_col2:
+         if "model2_name" not in st.session_state:
+             st.session_state.model2_name = get_cluster_method_name(st.session_state.model_list[1])
+         st.subheader(st.session_state.model2_name)
+
+         # Slider for model parameters
+         if st.session_state.model_list[1] == 2:
+             st.caption('N-cluster is not applicable to DBSCAN.')
+         else:
+             st.caption(f'N-cluster for {st.session_state.model2_name}:')
+         n_clusters2 = st.slider('N clusters', 2, 20, st.session_state.default_cluster, label_visibility="collapsed", key='n_clusters2', disabled=st.session_state.model_list[1] == 2)
+
+         with st.spinner("Model training in progress..."):
+             st.session_state.model2 = train_select_cluster_model(X, n_clusters2, st.session_state.model_list[1])
+             st.session_state.downloadable_model2 = save_model(st.session_state.model2)
+
+         if st.session_state.model_list[1] != 3:
+             label2 = st.session_state.model2.labels_
+         else:
+             label2 = gmm_predict(X, st.session_state.model2)
+
+         # Visualization
+         st.pyplot(plot_clusters(X, label2))
+         # Model metrics
+         st.write(f"Silhouette score: ", f'\n:green[**{calculate_silhouette_score(X, label2)}**]')
+         st.write(f"Calinski-Harabasz score: ", f'\n:green[**{calculate_calinski_harabasz_score(X, label2)}**]')
+         st.write(f"Davies-Bouldin score: ", f'\n:green[**{calculate_davies_bouldin_score(X, label2)}**]')
+
+     with model_col3:
+         if "model3_name" not in st.session_state:
+             st.session_state.model3_name = get_cluster_method_name(st.session_state.model_list[2])
+         st.subheader(st.session_state.model3_name)
+
+         # Slider for model parameters
+         if st.session_state.model_list[2] == 2:
+             st.caption('N-cluster is not applicable to DBSCAN.')
+         else:
+             st.caption(f'N-cluster for {st.session_state.model3_name}:')
+         n_clusters3 = st.slider('N clusters', 2, 20, st.session_state.default_cluster, label_visibility="collapsed", key='n_clusters3', disabled=st.session_state.model_list[2] == 2)
+
+         with st.spinner("Model training in progress..."):
+             st.session_state.model3 = train_select_cluster_model(X, n_clusters3, st.session_state.model_list[2])
+             st.session_state.downloadable_model3 = save_model(st.session_state.model3)
+
+         if st.session_state.model_list[2] != 3:
+             label3 = st.session_state.model3.labels_
+         else:
+             label3 = gmm_predict(X, st.session_state.model3)
+
+         # Visualization
+         st.pyplot(plot_clusters(X, label3))
+         # Model metrics
+         st.write(f"Silhouette score: ", f'\n:green[**{calculate_silhouette_score(X, label3)}**]')
+         st.write(f"Calinski-Harabasz score: ", f'\n:green[**{calculate_calinski_harabasz_score(X, label3)}**]')
+         st.write(f"Davies-Bouldin score: ", f'\n:green[**{calculate_davies_bouldin_score(X, label3)}**]')
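Note: the silhouette, Calinski-Harabasz, and Davies-Bouldin scores reported above come from src.model_service helpers that are not shown in this commit view. Assuming they wrap scikit-learn's standard implementations, an equivalent sketch (cluster_quality is an illustrative name, not the actual helper) would be:

from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

def cluster_quality(X, labels):
    # All three metrics require at least two distinct cluster labels.
    return {
        "silhouette": silhouette_score(X, labels),                # higher is better, range [-1, 1]
        "calinski_harabasz": calinski_harabasz_score(X, labels),  # higher is better
        "davies_bouldin": davies_bouldin_score(X, labels),        # lower is better
    }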
app/config/config.yaml ADDED
@@ -0,0 +1,15 @@
+ lottie_url1: "https://lottie.host/f89e48e2-55e5-4fdf-a406-0be2b00cc2af/ECJa6PGrCV.json"
+ lottie_url2: "https://lottie.host/05824020-0a23-4373-8418-721bd6e68504/FE5XXRT455.json"
+ welcome_template: "Streamline Analyst 🪄 is an advanced, open-source application powered by LLMs that streamlines the entire process of data analysis. It automates all the tasks from data preprocessing to model testing, simplifying complex data tasks with precision."
+ introduction_template1: |
+   As a data analysis agent, **Streamline Analyst** is capable of making autonomous decisions based on your data:
+   - Effortless Data Preprocessing
+   - Intelligent Encoding & Balancing
+   - Automated Model Selection & Training
+   - Dynamic Data Visualization
+   - And much more...
+ introduction_template2: |
+   **You only need to**:
+   1. **Select** your data file
+   2. **Choose** an analysis mode
+   3. **Press** the Start button
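Note: util.load_lottie, welcome_message, and introduction_message (imported by app.py but not shown in this commit view) presumably read this YAML file. A hedged sketch of that loading step, assuming PyYAML for the config and requests for the two Lottie animations (load_config and load_lottie_animations are illustrative names):

import requests
import yaml

def load_config(path="app/config/config.yaml"):
    # Read the YAML config that holds the Lottie URLs and message templates.
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

def load_lottie_animations(config):
    # Fetch the two Lottie JSON payloads referenced by lottie_url1 / lottie_url2.
    return (
        requests.get(config["lottie_url1"], timeout=10).json(),
        requests.get(config["lottie_url2"], timeout=10).json(),
    )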
app/prediction_model.py ADDED
@@ -0,0 +1,327 @@
+ import streamlit as st
+ from util import developer_info, developer_info_static
+ from src.plot import confusion_metrix, roc, correlation_matrix_plotly
+ from src.handle_null_value import contains_missing_value, remove_high_null, fill_null_values
+ from src.preprocess import convert_to_numeric, remove_rows_with_empty_target, remove_duplicates
+ from src.llm_service import decide_fill_null, decide_encode_type, decide_model, decide_target_attribute, decide_test_ratio, decide_balance
+ from src.pca import decide_pca, perform_pca
+ from src.model_service import split_data, check_and_balance, fpr_and_tpr, auc, save_model, calculate_f1_score
+ from src.predictive_model import train_selected_model
+ from src.util import select_Y, contain_null_attributes_info, separate_fill_null_list, check_all_columns_numeric, non_numeric_columns_and_head, separate_decode_list, get_data_overview, get_selected_models, get_model_name, count_unique, attribute_info, get_balance_info, get_balance_method_name
+
+ def update_balance_data():
+     st.session_state.balance_data = st.session_state.to_perform_balance
+
+ def start_training_model():
+     st.session_state["start_training"] = True
+
+ def prediction_model_pipeline(DF, API_KEY, GPT_MODEL):
+     st.divider()
+     st.subheader('Data Overview')
+     if 'data_origin' not in st.session_state:
+         st.session_state.data_origin = DF
+     st.dataframe(st.session_state.data_origin.describe(), width=1200)
+     attributes = st.session_state.data_origin.columns.tolist()
+
+     # Select the target variable
+     if 'target_selected' not in st.session_state:
+         st.session_state.target_selected = False
+     st.subheader('Target Variable')
+     if not st.session_state.target_selected:
+
+         with st.spinner("AI is analyzing the data..."):
+             attributes_for_target, types_info_for_target, head_info_for_target = attribute_info(st.session_state.data_origin)
+             st.session_state.target_Y = decide_target_attribute(attributes_for_target, types_info_for_target, head_info_for_target, GPT_MODEL, API_KEY)
+
+         if st.session_state.target_Y != -1:
+             selected_Y = st.session_state.target_Y
+             st.success("Target variable has been selected by the AI!")
+             st.write(f'Target attribute selected: :green[**{selected_Y}**]')
+             st.session_state.target_selected = True
+         else:
+             st.info("AI cannot determine the target variable from the data. Please select the target variable.")
+             target_col1, target_col2 = st.columns([9, 1])
+             with target_col1:
+                 selected_Y = st.selectbox(
+                     label = 'Select the target variable to predict:',
+                     options = attributes,
+                     index = len(attributes)-1,
+                     label_visibility='collapsed'
+                 )
+             with target_col2:
+                 if st.button("Confirm", type="primary"):
+                     st.session_state.target_selected = True
+         st.session_state.selected_Y = selected_Y
+     else:
+         if st.session_state.target_Y != -1:
+             st.success("Target variable has been selected by the AI!")
+         st.write(f"Target variable selected: :green[**{st.session_state.selected_Y}**]")
+
+     if st.session_state.target_selected:
+
+         # Data Imputation
+         st.subheader('Handle and Impute Missing Values')
+         if "contain_null" not in st.session_state:
+             st.session_state.contain_null = contains_missing_value(st.session_state.data_origin)
+
+         if 'filled_df' not in st.session_state:
+             if st.session_state.contain_null:
+                 with st.status("Processing **missing values** in the data...", expanded=True) as status:
+                     st.write("Filtering out high-frequency missing rows and columns...")
+                     filled_df = remove_high_null(DF)
+                     filled_df = remove_rows_with_empty_target(filled_df, st.session_state.selected_Y)
+                     st.write("Large language model analysis...")
+                     attributes, types_info, description_info = contain_null_attributes_info(filled_df)
+                     fill_result_dict = decide_fill_null(attributes, types_info, description_info, GPT_MODEL, API_KEY)
+                     st.write("Imputing missing values...")
+                     mean_list, median_list, mode_list, new_category_list, interpolation_list = separate_fill_null_list(fill_result_dict)
+                     filled_df = fill_null_values(filled_df, mean_list, median_list, mode_list, new_category_list, interpolation_list)
+                     # Store the imputed DataFrame in session_state
+                     st.session_state.filled_df = filled_df
+                     DF = filled_df
+                     status.update(label='Missing value processing completed!', state="complete", expanded=False)
+                 st.download_button(
+                     label="Download Data with Missing Values Imputed",
+                     data=st.session_state.filled_df.to_csv(index=False).encode('utf-8'),
+                     file_name="imputed_missing_values.csv",
+                     mime='text/csv')
+             else:
+                 st.session_state.filled_df = DF
+                 st.success("No missing values detected. Processing skipped.")
+         else:
+             st.success("Missing value processing completed!")
+             if st.session_state.contain_null:
+                 st.download_button(
+                     label="Download Data with Missing Values Imputed",
+                     data=st.session_state.filled_df.to_csv(index=False).encode('utf-8'),
+                     file_name="imputed_missing_values.csv",
+                     mime='text/csv')
+
+         # Data Encoding
+         st.subheader("Process Data Encoding")
+         st.caption("*For considerations of processing time, **NLP features** like **TF-IDF** have not been included in the current pipeline; long text attributes may be dropped.")
+         if 'all_numeric' not in st.session_state:
+             st.session_state.all_numeric = check_all_columns_numeric(st.session_state.data_origin)
+
+         if 'encoded_df' not in st.session_state:
+             if not st.session_state.all_numeric:
+                 with st.status("Encoding non-numeric data using **numeric mapping** and **one-hot**...", expanded=True) as status:
+                     non_numeric_attributes, non_numeric_head = non_numeric_columns_and_head(DF)
+                     st.write("Large language model analysis...")
+                     encode_result_dict = decide_encode_type(non_numeric_attributes, non_numeric_head, GPT_MODEL, API_KEY)
+                     st.write("Encoding the data...")
+                     convert_int_cols, one_hot_cols, drop_cols = separate_decode_list(encode_result_dict, st.session_state.selected_Y)
+                     encoded_df, mappings = convert_to_numeric(DF, convert_int_cols, one_hot_cols, drop_cols)
+                     # Store the encoded DataFrame in session_state
+                     st.session_state.encoded_df = encoded_df
+                     DF = encoded_df
+                     status.update(label='Data encoding completed!', state="complete", expanded=False)
+                 st.download_button(
+                     label="Download Encoded Data",
+                     data=st.session_state.encoded_df.to_csv(index=False).encode('utf-8'),
+                     file_name="encoded_data.csv",
+                     mime='text/csv')
+             else:
+                 st.session_state.encoded_df = DF
+                 st.success("All columns are numeric. Processing skipped.")
+         else:
+             st.success("Data encoding completed using numeric mapping and one-hot!")
+             if not st.session_state.all_numeric:
+                 st.download_button(
+                     label="Download Encoded Data",
+                     data=st.session_state.encoded_df.to_csv(index=False).encode('utf-8'),
+                     file_name="encoded_data.csv",
+                     mime='text/csv')
+
+         # Correlation Heatmap
+         if 'df_cleaned1' not in st.session_state:
+             st.session_state.df_cleaned1 = DF
+         st.subheader('Correlation Between Attributes')
+         st.plotly_chart(correlation_matrix_plotly(st.session_state.df_cleaned1))
+
+         # Remove duplicate entities
+         st.subheader('Remove Duplicate Entities')
+         if 'df_cleaned2' not in st.session_state:
+             st.session_state.df_cleaned2 = remove_duplicates(st.session_state.df_cleaned1)
+             # DF = remove_duplicates(DF)
+         st.info("Duplicate rows removed.")
+
+         # PCA
+         st.subheader('Principal Component Analysis')
+         st.write("Deciding whether to perform PCA...")
+         if 'df_pca' not in st.session_state:
+             to_perform_pca, n_components = decide_pca(st.session_state.df_cleaned2.drop(columns=[st.session_state.selected_Y]))
+             if 'to_perform_pca' not in st.session_state:
+                 st.session_state.to_perform_pca = to_perform_pca
+             if st.session_state.to_perform_pca:
+                 st.session_state.df_pca = perform_pca(st.session_state.df_cleaned2, n_components, st.session_state.selected_Y)
+             else:
+                 st.session_state.df_pca = st.session_state.df_cleaned2
+         st.success("Completed!")
+
+         # Splitting and Balancing
+         if 'balance_data' not in st.session_state:
+             st.session_state.balance_data = True
+         if "start_training" not in st.session_state:
+             st.session_state["start_training"] = False
+         if 'model_trained' not in st.session_state:
+             st.session_state['model_trained'] = False
+         if 'is_binary' not in st.session_state:
+             st.session_state['is_binary'] = count_unique(st.session_state.df_pca, st.session_state.selected_Y) == 2
+
+         # AI decide the testing set percentage
+         if 'test_percentage' not in st.session_state:
+             with st.spinner("Deciding testing set percentage based on data..."):
+                 st.session_state.test_percentage = int(decide_test_ratio(st.session_state.df_pca.shape, GPT_MODEL, API_KEY) * 100)
+
+         splitting_column, balance_column = st.columns(2)
+         with splitting_column:
+             st.subheader('Data Splitting')
+             st.caption('AI recommended test percentage for the model')
+             st.slider('Percentage of test set', 1, 25, st.session_state.test_percentage, key='test_percentage', disabled=st.session_state['start_training'])
+
+         with balance_column:
+             st.metric(label="Test Data", value=f"{st.session_state.test_percentage}%", delta=None)
+             st.toggle('Class Balancing', value=st.session_state.balance_data, key='to_perform_balance', on_change=update_balance_data, disabled=st.session_state['start_training'])
+             st.caption('Strategies for handling imbalanced data sets and enhancing machine learning model performance.')
+             st.caption('AI will select the most appropriate method to balance the data.')
+
+         st.button("Start Training Model", on_click=start_training_model, type="primary", disabled=st.session_state['start_training'])
+
+         # Model Training
+         if st.session_state['start_training']:
+             with st.container():
+                 st.header("Modeling")
+                 X, Y = select_Y(st.session_state.df_pca, st.session_state.selected_Y)
+
+                 # Balancing
+                 if st.session_state.balance_data and "balance_method" not in st.session_state:
+                     with st.spinner("AI is deciding the balance strategy for the data..."):
+                         shape_info_balance, description_info_balance, balance_info_balance = get_balance_info(st.session_state.df_pca, st.session_state.selected_Y)
+                         st.session_state.balance_method = int(decide_balance(shape_info_balance, description_info_balance, balance_info_balance, GPT_MODEL, API_KEY))
+                         X_train_res, Y_train_res = check_and_balance(X, Y, method = st.session_state.balance_method)
+                 else:
+                     X_train_res, Y_train_res = X, Y
+                     if 'balance_method' not in st.session_state:
+                         st.session_state.balance_method = 4
+
+                 # Splitting the data
+                 if not st.session_state.get("data_splitted", False):
+                     st.session_state.X_train, st.session_state.X_test, st.session_state.Y_train, st.session_state.Y_test = split_data(X_train_res, Y_train_res, st.session_state.test_percentage / 100, 42, st.session_state.to_perform_pca)
+                     st.session_state["data_splitted"] = True
+
+                 # Decide model types:
+                 if "decided_model" not in st.session_state:
+                     st.session_state["decided_model"] = False
+                 if "all_set" not in st.session_state:
+                     st.session_state["all_set"] = False
+
+                 if not st.session_state["decided_model"]:
+                     with st.spinner("Deciding models based on data..."):
+                         shape_info, head_info, nunique_info, description_info = get_data_overview(st.session_state.df_pca)
+                         model_dict = decide_model(shape_info, head_info, nunique_info, description_info, GPT_MODEL, API_KEY)
+                         model_list = get_selected_models(model_dict)
+                         if 'model_list' not in st.session_state:
+                             st.session_state.model_list = model_list
+                         st.session_state["decided_model"] = True
+
+                 # Display results
+                 if st.session_state["decided_model"]:
+                     display_results(st.session_state.X_train, st.session_state.X_test, st.session_state.Y_train, st.session_state.Y_test)
+                     st.session_state["all_set"] = True
+
+                 # Download models
+                 if st.session_state["all_set"]:
+                     download_col1, download_col2, download_col3 = st.columns(3)
+                     with download_col1:
+                         st.download_button(label="Download Model", data=st.session_state.downloadable_model1, file_name=f"{st.session_state.model1_name}.joblib", mime="application/octet-stream")
+                     with download_col2:
+                         st.download_button(label="Download Model", data=st.session_state.downloadable_model2, file_name=f"{st.session_state.model2_name}.joblib", mime="application/octet-stream")
+                     with download_col3:
+                         st.download_button(label="Download Model", data=st.session_state.downloadable_model3, file_name=f"{st.session_state.model3_name}.joblib", mime="application/octet-stream")
+
+         # Footer
+         st.divider()
+         if "all_set" in st.session_state and st.session_state["all_set"]:
+             if "has_been_set" not in st.session_state:
+                 st.session_state["has_been_set"] = True
+                 developer_info()
+             else:
+                 developer_info_static()
+
+ def display_results(X_train, X_test, Y_train, Y_test):
+     st.success("Models selected based on your data!")
+
+     # Data set metrics
+     data_col1, data_col2, data_col3, balance_col4 = st.columns(4)
+     with data_col1:
+         st.metric(label="Total Data", value=len(X_train)+len(X_test), delta=None)
+     with data_col2:
+         st.metric(label="Training Data", value=len(X_train), delta=None)
+     with data_col3:
+         st.metric(label="Testing Data", value=len(X_test), delta=None)
+     with balance_col4:
+         st.metric(label="Balance Strategy", value=get_balance_method_name(st.session_state.balance_method), delta=None)
+
+     # Model training
+     model_col1, model_col2, model_col3 = st.columns(3)
+     with model_col1:
+         if "model1_name" not in st.session_state:
+             st.session_state.model1_name = get_model_name(st.session_state.model_list[0])
+         st.subheader(st.session_state.model1_name)
+         with st.spinner("Model training in progress..."):
+             if 'model1' not in st.session_state:
+                 st.session_state.model1 = train_selected_model(X_train, Y_train, st.session_state.model_list[0])
+                 st.session_state.downloadable_model1 = save_model(st.session_state.model1)
+         # Model metrics
+         st.write(f"The accuracy of the {st.session_state.model1_name}: ", f'\n:green[**{st.session_state.model1.score(X_test, Y_test)}**]')
+         st.pyplot(confusion_metrix(st.session_state.model1_name, st.session_state.model1, X_test, Y_test))
+         st.write("F1 Score: ", f':green[**{calculate_f1_score(st.session_state.model1, X_test, Y_test, st.session_state.is_binary)}**]')
+         if st.session_state.model_list[0] != 2 and st.session_state['is_binary']:
+             if 'fpr1' not in st.session_state:
+                 fpr1, tpr1 = fpr_and_tpr(st.session_state.model1, X_test, Y_test)
+                 st.session_state.fpr1 = fpr1
+                 st.session_state.tpr1 = tpr1
+             st.pyplot(roc(st.session_state.model1_name, st.session_state.fpr1, st.session_state.tpr1))
+             st.write(f"The AUC of the {st.session_state.model1_name}: ", f'\n:green[**{auc(st.session_state.fpr1, st.session_state.tpr1)}**]')
+
+     with model_col2:
+         if "model2_name" not in st.session_state:
+             st.session_state.model2_name = get_model_name(st.session_state.model_list[1])
+         st.subheader(st.session_state.model2_name)
+         with st.spinner("Model training in progress..."):
+             if 'model2' not in st.session_state:
+                 st.session_state.model2 = train_selected_model(X_train, Y_train, st.session_state.model_list[1])
+                 st.session_state.downloadable_model2 = save_model(st.session_state.model2)
+         # Model metrics
+         st.write(f"The accuracy of the {st.session_state.model2_name}: ", f'\n:green[**{st.session_state.model2.score(X_test, Y_test)}**]')
+         st.pyplot(confusion_metrix(st.session_state.model2_name, st.session_state.model2, X_test, Y_test))
+         st.write("F1 Score: ", f':green[**{calculate_f1_score(st.session_state.model2, X_test, Y_test, st.session_state.is_binary)}**]')
+         if st.session_state.model_list[1] != 2 and st.session_state['is_binary']:
+             if 'fpr2' not in st.session_state:
+                 fpr2, tpr2 = fpr_and_tpr(st.session_state.model2, X_test, Y_test)
+                 st.session_state.fpr2 = fpr2
+                 st.session_state.tpr2 = tpr2
+             st.pyplot(roc(st.session_state.model2_name, st.session_state.fpr2, st.session_state.tpr2))
+             st.write(f"The AUC of the {st.session_state.model2_name}: ", f'\n:green[**{auc(st.session_state.fpr2, st.session_state.tpr2)}**]')
+
+     with model_col3:
+         if "model3_name" not in st.session_state:
+             st.session_state.model3_name = get_model_name(st.session_state.model_list[2])
+         st.subheader(st.session_state.model3_name)
+         with st.spinner("Model training in progress..."):
+             if 'model3' not in st.session_state:
+                 st.session_state.model3 = train_selected_model(X_train, Y_train, st.session_state.model_list[2])
+                 st.session_state.downloadable_model3 = save_model(st.session_state.model3)
+         # Model metrics
+         st.write(f"The accuracy of the {st.session_state.model3_name}: ", f'\n:green[**{st.session_state.model3.score(X_test, Y_test)}**]')
+         st.pyplot(confusion_metrix(st.session_state.model3_name, st.session_state.model3, X_test, Y_test))
+         st.write("F1 Score: ", f':green[**{calculate_f1_score(st.session_state.model3, X_test, Y_test, st.session_state.is_binary)}**]')
+         if st.session_state.model_list[2] != 2 and st.session_state['is_binary']:
+             if 'fpr3' not in st.session_state:
+                 fpr3, tpr3 = fpr_and_tpr(st.session_state.model3, X_test, Y_test)
+                 st.session_state.fpr3 = fpr3
+                 st.session_state.tpr3 = tpr3
+             st.pyplot(roc(st.session_state.model3_name, st.session_state.fpr3, st.session_state.tpr3))
+             st.write(f"The AUC of the {st.session_state.model3_name}: ", f'\n:green[**{auc(st.session_state.fpr3, st.session_state.tpr3)}**]')
+
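Note: for binary targets the classification pipeline plots an ROC curve through fpr_and_tpr and auc from src.model_service, neither of which appears in this commit view. Assuming they follow scikit-learn's usual pattern for classifiers that expose predict_proba, a sketch (the _sketch suffix marks these as illustrative, not the actual implementations):

from sklearn.metrics import roc_curve, auc as sklearn_auc

def fpr_and_tpr_sketch(model, X_test, Y_test):
    # Assumes a binary classifier with predict_proba; column 1 holds the positive-class probability.
    scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, scores)
    return fpr, tpr

def auc_sketch(fpr, tpr):
    # Area under the ROC curve computed from the points returned above.
    return sklearn_auc(fpr, tpr)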
app/regression_model.py ADDED
@@ -0,0 +1,300 @@
1
+ import streamlit as st
2
+ from util import developer_info, developer_info_static
3
+ from src.plot import correlation_matrix_plotly, plot_residuals, plot_predictions_vs_actual, plot_qq_plot
4
+ from src.handle_null_value import contains_missing_value, remove_high_null, fill_null_values
5
+ from src.preprocess import convert_to_numeric, remove_rows_with_empty_target, remove_duplicates, transform_data_for_clustering
6
+ from src.llm_service import decide_fill_null, decide_encode_type, decide_target_attribute, decide_test_ratio, decide_regression_model
7
+ from src.pca import decide_pca, perform_PCA_for_regression
8
+ from src.model_service import split_data, save_model, calculate_r2_score, calculate_mse_and_rmse, calculate_mae
9
+ from src.regression_model import train_selected_regression_model
10
+ from src.util import select_Y, contain_null_attributes_info, separate_fill_null_list, check_all_columns_numeric, non_numeric_columns_and_head, separate_decode_list, get_data_overview, attribute_info, get_regression_method_name
11
+
12
+ def start_training_model():
13
+ st.session_state["start_training"] = True
14
+
15
+ def regression_model_pipeline(DF, API_KEY, GPT_MODEL):
16
+ st.divider()
17
+ st.subheader('Data Overview')
18
+ if 'data_origin' not in st.session_state:
19
+ st.session_state.data_origin = DF
20
+ st.dataframe(st.session_state.data_origin.describe(), width=1200)
21
+ attributes = st.session_state.data_origin.columns.tolist()
22
+
23
+ # Select the target variable
24
+ if 'target_selected' not in st.session_state:
25
+ st.session_state.target_selected = False
26
+ st.subheader('Target Variable')
27
+ if not st.session_state.target_selected:
28
+
29
+ with st.spinner("AI is analyzing the data..."):
30
+ attributes_for_target, types_info_for_target, head_info_for_target = attribute_info(st.session_state.data_origin)
31
+ st.session_state.target_Y = decide_target_attribute(attributes_for_target, types_info_for_target, head_info_for_target, GPT_MODEL, API_KEY)
32
+
33
+ if st.session_state.target_Y != -1:
34
+ selected_Y = st.session_state.target_Y
35
+ st.success("Target variable has been selected by the AI!")
36
+ st.write(f'Target attribute selected: :green[**{selected_Y}**]')
37
+ st.session_state.target_selected = True
38
+ else:
39
+ st.info("AI cannot determine the target variable from the data. Please select the target variable")
40
+ target_col1, target_col2 = st.columns([9, 1])
41
+ with target_col1:
42
+ selected_Y = st.selectbox(
43
+ label = 'Select the target variable to predict:',
44
+ options = attributes,
45
+ index = len(attributes)-1,
46
+ label_visibility='collapsed'
47
+ )
48
+ with target_col2:
49
+ if st.button("Confirm", type="primary"):
50
+ st.session_state.target_selected = True
51
+ st.session_state.selected_Y = selected_Y
52
+ else:
53
+ if st.session_state.target_Y != -1:
54
+ st.success("Target variable has been selected by the AI!")
55
+ st.write(f"Target variable selected: :green[**{st.session_state.selected_Y}**]")
56
+
57
+ if st.session_state.target_selected:
58
+
59
+ # Data Imputation
60
+ st.subheader('Handle and Impute Missing Values')
61
+ if "contain_null" not in st.session_state:
62
+ st.session_state.contain_null = contains_missing_value(st.session_state.data_origin)
63
+
64
+ if 'filled_df' not in st.session_state:
65
+ if st.session_state.contain_null:
66
+ with st.status("Processing **missing values** in the data...", expanded=True) as status:
67
+ st.write("Filtering out high-frequency missing rows and columns...")
68
+ filled_df = remove_high_null(DF)
69
+ filled_df = remove_rows_with_empty_target(filled_df, st.session_state.selected_Y)
70
+ st.write("Large language model analysis...")
71
+ attributes, types_info, description_info = contain_null_attributes_info(filled_df)
72
+ fill_result_dict = decide_fill_null(attributes, types_info, description_info, GPT_MODEL, API_KEY)
73
+ st.write("Imputing missing values...")
74
+ mean_list, median_list, mode_list, new_category_list, interpolation_list = separate_fill_null_list(fill_result_dict)
75
+ filled_df = fill_null_values(filled_df, mean_list, median_list, mode_list, new_category_list, interpolation_list)
76
+ # Store the imputed DataFrame in session_state
77
+ st.session_state.filled_df = filled_df
78
+ DF = filled_df
79
+ status.update(label='Missing value processing completed!', state="complete", expanded=False)
80
+ st.download_button(
81
+ label="Download Data with Missing Values Imputed",
82
+ data=st.session_state.filled_df.to_csv(index=False).encode('utf-8'),
83
+ file_name="imputed_missing_values.csv",
84
+ mime='text/csv')
85
+ else:
86
+ st.session_state.filled_df = DF
87
+ st.success("No missing values detected. Processing skipped.")
88
+ else:
89
+ st.success("Missing value processing completed!")
90
+ if st.session_state.contain_null:
91
+ st.download_button(
92
+ label="Download Data with Missing Values Imputed",
93
+ data=st.session_state.filled_df.to_csv(index=False).encode('utf-8'),
94
+ file_name="imputed_missing_values.csv",
95
+ mime='text/csv')
96
+
97
+ # Data Encoding
98
+ st.subheader("Process Data Encoding")
99
+ st.caption("*For considerations of processing time, **NLP features** like **TF-IDF** have not been included in the current pipeline, long text attributes may be dropped.")
100
+ if 'all_numeric' not in st.session_state:
101
+ st.session_state.all_numeric = check_all_columns_numeric(st.session_state.data_origin)
102
+
103
+ if 'encoded_df' not in st.session_state:
104
+ if not st.session_state.all_numeric:
105
+ with st.status("Encoding non-numeric data using **numeric mapping** and **one-hot**...", expanded=True) as status:
106
+ non_numeric_attributes, non_numeric_head = non_numeric_columns_and_head(DF)
107
+ st.write("Large language model analysis...")
108
+ encode_result_dict = decide_encode_type(non_numeric_attributes, non_numeric_head, GPT_MODEL, API_KEY)
109
+ st.write("Encoding the data...")
110
+ convert_int_cols, one_hot_cols, drop_cols = separate_decode_list(encode_result_dict, st.session_state.selected_Y)
111
+ encoded_df, mappings = convert_to_numeric(DF, convert_int_cols, one_hot_cols, drop_cols)
112
+ # Store the imputed DataFrame in session_state
113
+ st.session_state.encoded_df = encoded_df
114
+ DF = encoded_df
115
+ status.update(label='Data encoding completed!', state="complete", expanded=False)
116
+ st.download_button(
117
+ label="Download Encoded Data",
118
+ data=st.session_state.encoded_df.to_csv(index=False).encode('utf-8'),
119
+ file_name="encoded_data.csv",
120
+ mime='text/csv')
121
+ else:
122
+ st.session_state.encoded_df = DF
123
+ st.success("All columns are numeric. Processing skipped.")
124
+ else:
125
+ st.success("Data encoded completed using numeric mapping and one-hot!")
126
+ if not st.session_state.all_numeric:
127
+ st.download_button(
128
+ label="Download Encoded Data",
129
+ data=st.session_state.encoded_df.to_csv(index=False).encode('utf-8'),
130
+ file_name="encoded_data.csv",
131
+ mime='text/csv')
132
+
133
+ # Correlation Heatmap
134
+ if 'df_cleaned1' not in st.session_state:
135
+ st.session_state.df_cleaned1 = DF
136
+ st.subheader('Correlation Between Attributes')
137
+ st.plotly_chart(correlation_matrix_plotly(st.session_state.df_cleaned1))
138
+
139
+ # Remove duplicate entities
140
+ st.subheader('Remove Duplicate Entities')
141
+ if 'df_cleaned2' not in st.session_state:
142
+ st.session_state.df_cleaned2 = remove_duplicates(st.session_state.df_cleaned1)
143
+ # DF = remove_duplicates(DF)
144
+ st.info("Duplicate rows removed.")
145
+
146
+ # Data Transformation
147
+ st.subheader('Data Transformation')
148
+ if 'data_transformed' not in st.session_state:
149
+ st.session_state.data_transformed = transform_data_for_clustering(st.session_state.df_cleaned2)
150
+ st.success("Data transformed by standardization and box-cox if applicable.")
151
+
152
+ # PCA
153
+ st.subheader('Principal Component Analysis')
154
+ st.write("Deciding whether to perform PCA...")
155
+ if 'df_pca' not in st.session_state:
156
+ _, n_components = decide_pca(st.session_state.df_cleaned2)
157
+ st.session_state.df_pca = perform_PCA_for_regression(st.session_state.data_transformed, n_components, st.session_state.selected_Y)
158
+ st.success("Completed!")
159
+
160
+ if "start_training" not in st.session_state:
161
+ st.session_state["start_training"] = False
162
+
163
+ # AI decide the testing set percentage
164
+ if 'test_percentage' not in st.session_state:
165
+ with st.spinner("Deciding testing set percentage based on data..."):
166
+ st.session_state.test_percentage = int(decide_test_ratio(st.session_state.df_pca.shape, GPT_MODEL, API_KEY) * 100)
167
+
168
+ splitting_column, balance_column = st.columns(2)
169
+ with splitting_column:
170
+ st.subheader('Data Splitting')
171
+ st.caption('AI recommended test percentage for the model')
172
+ st.slider('Percentage of test set', 1, 25, st.session_state.test_percentage, key='test_percentage', disabled=st.session_state['start_training'])
173
+
174
+ with balance_column:
175
+ st.metric(label="Test Data", value=f"{st.session_state.test_percentage}%", delta=None)
176
+ st.toggle('Class Balancing', value=False, key='to_perform_balance', disabled=True)
177
+ st.caption('Class balancing is not applicable to regression models.')
178
+
179
+ st.button("Start Training Model", on_click=start_training_model, type="primary", disabled=st.session_state['start_training'])
180
+
181
+ # Model Training
182
+ if st.session_state['start_training']:
183
+ with st.container():
184
+ st.header("Modeling")
185
+ X_train_res, Y_train_res = select_Y(st.session_state.df_pca, st.session_state.selected_Y)
186
+
187
+ # Splitting the data
188
+ if not st.session_state.get("data_splitted", False):
189
+ st.session_state.X_train, st.session_state.X_test, st.session_state.Y_train, st.session_state.Y_test = split_data(X_train_res, Y_train_res, st.session_state.test_percentage / 100, 42, True)
190
+ st.session_state["data_splitted"] = True
191
+
192
+ # Decide model types:
193
+ if "decided_model" not in st.session_state:
194
+ st.session_state["decided_model"] = False
195
+ if "all_set" not in st.session_state:
196
+ st.session_state["all_set"] = False
197
+
198
+ if not st.session_state["decided_model"]:
199
+ with st.spinner("Deciding models based on data..."):
200
+ shape_info, _, _, description_info = get_data_overview(st.session_state.df_pca)
201
+ model_dict = decide_regression_model(shape_info, description_info, st.session_state.selected_Y, GPT_MODEL, API_KEY)
202
+ model_list = list(model_dict.values())
203
+ if 'model_list' not in st.session_state:
204
+ st.session_state.model_list = model_list
205
+ st.session_state["decided_model"] = True
206
+
207
+ # Show modeling results
208
+ if st.session_state["decided_model"]:
209
+ display_results(st.session_state.X_train, st.session_state.X_test, st.session_state.Y_train, st.session_state.Y_test)
210
+ st.session_state["all_set"] = True
211
+
212
+ # Download models
213
+ if st.session_state["all_set"]:
214
+ download_col1, download_col2, download_col3 = st.columns(3)
215
+ with download_col1:
216
+ st.download_button(label="Download Model", data=st.session_state.downloadable_model1, file_name=f"{st.session_state.model1_name}.joblib", mime="application/octet-stream")
217
+ with download_col2:
218
+ st.download_button(label="Download Model", data=st.session_state.downloadable_model2, file_name=f"{st.session_state.model2_name}.joblib", mime="application/octet-stream")
219
+ with download_col3:
220
+ st.download_button(label="Download Model", data=st.session_state.downloadable_model3, file_name=f"{st.session_state.model3_name}.joblib", mime="application/octet-stream")
221
+
222
+ # Footer
223
+ st.divider()
224
+ if "all_set" in st.session_state and st.session_state["all_set"]:
225
+ if "has_been_set" not in st.session_state:
226
+ st.session_state["has_been_set"] = True
227
+ developer_info()
228
+ else:
229
+ developer_info_static()
230
+
231
+ def display_results(X_train, X_test, Y_train, Y_test):
232
+ st.success("Models selected based on your data!")
233
+
234
+ # Data set metrics
235
+ data_col1, data_col2, data_col3 = st.columns(3)
236
+ with data_col1:
237
+ st.metric(label="Total Data", value=len(X_train)+len(X_test), delta=None)
238
+ with data_col2:
239
+ st.metric(label="Training Data", value=len(X_train), delta=None)
240
+ with data_col3:
241
+ st.metric(label="Testing Data", value=len(X_test), delta=None)
242
+
243
+ # Model training
244
+ model_col1, model_col2, model_col3 = st.columns(3)
245
+ with model_col1:
246
+ if "model1_name" not in st.session_state:
247
+ st.session_state.model1_name = get_regression_method_name(st.session_state.model_list[0])
248
+ st.subheader(st.session_state.model1_name)
249
+ with st.spinner("Model training in progress..."):
250
+ if 'model1' not in st.session_state:
251
+ st.session_state.model1 = train_selected_regression_model(X_train, Y_train, st.session_state.model_list[0])
252
+ st.session_state.y_pred1 = st.session_state.model1.predict(X_test)
253
+ st.session_state.downloadable_model1 = save_model(st.session_state.model1)
254
+ # Model metrics
255
+ st.write("R2 Score: ", f':green[**{calculate_r2_score(st.session_state.y_pred1, Y_test)}**]')
256
+ st.pyplot(plot_predictions_vs_actual(st.session_state.y_pred1, Y_test))
257
+ mse1, rmse1 = calculate_mse_and_rmse(st.session_state.y_pred1, Y_test)
258
+ st.write("Mean Squared Error: ", f':green[**{mse1}**]')
259
+ st.write("Root Mean Squared Error: ", f':green[**{rmse1}**]')
260
+ st.pyplot(plot_residuals(st.session_state.y_pred1, Y_test))
261
+ st.write("Mean Absolute Error: ", f':green[**{calculate_mae(st.session_state.y_pred1, Y_test)}**]')
262
+ st.pyplot(plot_qq_plot(st.session_state.y_pred1, Y_test))
263
+
264
+ with model_col2:
265
+ if "model2_name" not in st.session_state:
266
+ st.session_state.model2_name = get_regression_method_name(st.session_state.model_list[1])
267
+ st.subheader(st.session_state.model2_name)
268
+ with st.spinner("Model training in progress..."):
269
+ if 'model2' not in st.session_state:
270
+ st.session_state.model2 = train_selected_regression_model(X_train, Y_train, st.session_state.model_list[1])
271
+ st.session_state.y_pred = st.session_state.model2.predict(X_test)
272
+ st.session_state.downloadable_model2 = save_model(st.session_state.model2)
273
+ # Model metrics
274
+ st.write("R2 Score: ", f':green[**{calculate_r2_score(st.session_state.y_pred, Y_test)}**]')
275
+ st.pyplot(plot_predictions_vs_actual(st.session_state.y_pred, Y_test))
276
+ mse2, rmse2 = calculate_mse_and_rmse(st.session_state.y_pred, Y_test)
277
+ st.write("Mean Squared Error: ", f':green[**{mse2}**]')
278
+ st.write("Root Mean Squared Error: ", f':green[**{rmse2}**]')
279
+ st.pyplot(plot_residuals(st.session_state.y_pred, Y_test))
280
+ st.write("Mean Absolute Error: ", f':green[**{calculate_mae(st.session_state.y_pred, Y_test)}**]')
281
+ st.pyplot(plot_qq_plot(st.session_state.y_pred, Y_test))
282
+
283
+ with model_col3:
284
+ if "model3_name" not in st.session_state:
285
+ st.session_state.model3_name = get_regression_method_name(st.session_state.model_list[2])
286
+ st.subheader(st.session_state.model3_name)
287
+ with st.spinner("Model training in progress..."):
288
+ if 'model3' not in st.session_state:
289
+ st.session_state.model3 = train_selected_regression_model(X_train, Y_train, st.session_state.model_list[2])
290
+ st.session_state.y_pred3 = st.session_state.model3.predict(X_test)
291
+ st.session_state.downloadable_model3 = save_model(st.session_state.model3)
292
+ # Model metrics
293
+ st.write("R2 Score: ", f':green[**{calculate_r2_score(st.session_state.y_pred3, Y_test)}**]')
294
+ st.pyplot(plot_predictions_vs_actual(st.session_state.y_pred3, Y_test))
295
+ mse3, rmse3 = calculate_mse_and_rmse(st.session_state.y_pred3, Y_test)
296
+ st.write("Mean Squared Error: ", f':green[**{mse3}**]')
297
+ st.write("Root Mean Squared Error: ", f':green[**{rmse3}**]')
298
+ st.pyplot(plot_residuals(st.session_state.y_pred3, Y_test))
299
+ st.write("Mean Absolute Error: ", f':green[**{calculate_mae(st.session_state.y_pred3, Y_test)}**]')
300
+ st.pyplot(plot_qq_plot(st.session_state.y_pred3, Y_test))
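The serialized models kept in st.session_state.downloadable_model1/2/3 are the raw bytes returned by save_model(), so they can be handed straight to st.download_button. A minimal sketch, illustrative only; the button label and file name below are assumptions, not part of this commit:

# Hypothetical follow-up on the same Streamlit page
st.download_button(
    label=f"Download {st.session_state.model1_name} (.joblib)",
    data=st.session_state.downloadable_model1,       # bytes produced by save_model()
    file_name="regression_model_1.joblib",
    mime="application/octet-stream",
)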
app/src/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from . import plot, util, pca, cluster_model, model_service, preprocess, predictive_model, llm_service, handle_null_value
app/src/cluster_model.py ADDED
@@ -0,0 +1,59 @@
1
+ import streamlit as st
2
+ from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering
3
+ from sklearn.mixture import GaussianMixture
4
+
5
+ @st.cache_data
6
+ def train_select_cluster_model(X_train, n, model_type, model_params=None):
7
+ """
8
+ Trains a clustering model based on the specified model type and parameters.
9
+
10
+ Parameters:
11
+ - X_train (array-like): The training data set.
12
+ - n (int): The number of clusters to form or the number of components for the Gaussian Mixture model.
13
+ - model_type (int): An integer representing the type of model to train.
14
+ 1 for KMeans, 2 for DBSCAN, 3 for GaussianMixture, 4 for Hierarchical clustering, and 5 for Spectral clustering.
15
+ - model_params (dict, optional): A dictionary of model-specific parameters. Default is None.
16
+
17
+ Returns:
18
+ - The trained clustering model object based on the specified model type.
19
+ """
20
+ if model_type == 1:
21
+ return KMeans_train(X_train, n, model_params)
22
+ elif model_type == 2:
23
+ return DBSCAN_train(X_train, model_params)
24
+ elif model_type == 3:
25
+ return GaussianMixture_train(X_train, n, model_params)
26
+ elif model_type == 4:
27
+ return Hierarchical_train(X_train, n, model_params)
28
+ elif model_type == 5:
29
+ return Spectral_train(X_train, n, model_params)
30
+
31
+ def KMeans_train(X_train, n_clusters=3, model_params=None):
32
+ if model_params is None: model_params = {}
33
+ kmeans = KMeans(n_clusters=n_clusters, **model_params)
34
+ kmeans.fit(X_train)
35
+ return kmeans
36
+
37
+ def DBSCAN_train(X_train, model_params=None):
38
+ if model_params is None: model_params = {}
39
+ dbscan = DBSCAN(**model_params)
40
+ dbscan.fit(X_train)
41
+ return dbscan
42
+
43
+ def GaussianMixture_train(X_train, n_components=1, model_params=None):
44
+ if model_params is None: model_params = {}
45
+ gmm = GaussianMixture(n_components=n_components, **model_params)
46
+ gmm.fit(X_train)
47
+ return gmm
48
+
49
+ def Hierarchical_train(X_train, n_clusters=3, model_params=None):
50
+ if model_params is None: model_params = {}
51
+ hierarchical = AgglomerativeClustering(n_clusters=n_clusters, **model_params)
52
+ hierarchical.fit(X_train)
53
+ return hierarchical
54
+
55
+ def Spectral_train(X_train, n_clusters=3, model_params=None):
56
+ if model_params is None: model_params = {}
57
+ spectral = SpectralClustering(n_clusters=n_clusters, **model_params)
58
+ spectral.fit(X_train)
59
+ return spectral
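A minimal usage sketch for train_select_cluster_model (illustrative only, not part of the commit); the integer codes follow the docstring above, the import path mirrors app.py, and the blob data stands in for an already preprocessed feature matrix:

import numpy as np
from sklearn.datasets import make_blobs
# from src.cluster_model import train_select_cluster_model  # import path assumed, mirroring app.py

X, _ = make_blobs(n_samples=300, centers=3, random_state=42)
kmeans = train_select_cluster_model(X, n=3, model_type=1)        # 1 -> KMeans
gmm = train_select_cluster_model(X, n=3, model_type=3)           # 3 -> GaussianMixture
dbscan = train_select_cluster_model(X, n=3, model_type=2,        # 2 -> DBSCAN; n is ignored here
                                    model_params={"eps": 0.8, "min_samples": 5})
print(np.unique(kmeans.labels_))                                 # the three cluster labels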
app/src/config/config.yaml ADDED
@@ -0,0 +1,43 @@
1
+ openai_api_key: "YOUR_OPENAI_API_KEY"
2
+ model4_name: "gpt-4-1106-preview"
3
+ model3_name: "gpt-3.5-turbo-1106"
4
+ numeric_attribute_template: |
5
+ You are a data analyst. You are cleaning the data and processing the attributes in the data that are not numeric. The columns to be processed include: {attributes}. The first 20 items of these data are as follows:
6
+ {data_frame_head}
7
+ Please help me decide whether each attribute should be processed as integer mapping or one-hot encoding based on content and semantics. If there's an attribute containing long text, consider dropping it. Integer mapping is represented by 1, one-hot encoding is represented by 2, and dropping the attribute is represented by 3. Only the data is returned in json format without any other explanation or content. Sample response: {{"color":2,"size":1,"country":2,"brand":2,"gender":1,"comments":3}}
8
+ null_attribute_template: |
9
+ You are a data analyst. You are preprocessing the attributes in the data that contain null values. The columns to be processed include: {attributes}. The types of these attributes are:
10
+ {types_info}
11
+ Statistics for these properties in csv format:
12
+ {description_info}
13
+ Please help me decide how to supplement null values for each attribute based on content, statistics and semantics. The mean filling is represented by 1, the median filling is represented by 2, the mode filling is represented by 3, the introduction of a new category to represent the unknown is represented by 4, and the interpolation filling is represented by 5. Only the data is returned in json format without any other explanation or content. Sample response: {{"grade":2,"annual_income":2,"temperature":1,"fault_type":3,"country":4,"weight":1,"stock price":5}}
14
+ decide_model_template: |
15
+ You are a data analyst. The shape of my data frame is {shape_info}. The head(5) of the data frame is:
16
+ {head_info}
17
+ The nunique() of the data frame is:
18
+ {nunique_info}
19
+ The description of the data frame is:
20
+ {description_info}
21
+ The data has been cleaned and preprocessed, nulls filled, and encoded ready to train the machine learning model. According to the data information provided, please help me decide which machine learning models should be used for classification prediction. Model options are: 1:LogisticRegression, 2:SVC, 3:GaussianNB, 4:RandomForestClassifier, 5:AdaBoostClassifier, 6:XGBClassifier, 7:GradientBoostingClassifier. Please select three models to take into account different model performance indicators. Only the data is returned in json format without any other explanation or content. Sample response: {{"model1":1,"model2":4,"model3":6}}
22
+ decide_clustering_model_template: |
23
+ You are a data analyst. The shape of my data frame is {shape_info}. The description of the data frame is:
24
+ {description_info}
25
+ The data has been cleaned and preprocessed, numerically transformed, and ready to train the clustering models. According to the data information provided, please help me decide which clustering models should be used for discovering natural groupings in the data. The expected number of clusters is {cluster_info}. Model options are: 1:KMeans, 2:DBSCAN, 3:GaussianMixture, 4:AgglomerativeClustering, 5:SpectralClustering. Please select three models to take into account different model performance indicators. Only the data is returned in json format without any other explanation or content. Sample response: {{"model1":1,"model2":2,"model3":3}}
26
+ decide_regression_model_template: |
27
+ You are a data analyst. You are trying to select some regression models to predict the target attribute. The shape of my data frame is {shape_info}. The target variable to be predicted is {Y_name}. The description of the data frame is:
28
+ {description_info}
29
+ The data has been cleaned and preprocessed, numerically transformed, and ready to train the regression models. According to the data information provided, please help me decide which regression models should be used to provide better prediction performance. Model options are: 1:LinearRegression, 2:Ridge, 3:Lasso, 4:RandomForestRegressor, 5:GradientBoostingRegressor, 6:ElasticNet. Please select three models to take into account different model performance indicators. Only the data is returned in json format without any other explanation or content. Sample response: {{"model1":1,"model2":2,"model3":3}}
30
+ decide_target_attribute_template: |
31
+ You are a data analyst. You are trying to find out which attribute is the target attribute from the data frame. The attributes are {attributes}. The types of these attributes are:
32
+ {types_info}
33
+ The head(10) of the data frame is:
34
+ {head_info}
35
+ Determine the target attribute to predict based on the data information provided. Only the data is returned in json format without any other explanation or content. Sample response: {{"target":"species"}}
36
+ If the provided data is not sufficient to determine the target, only return the data in json format {{"target":-1}}
37
+ decide_test_ratio_template: |
38
+ You are a data analyst. You are trying to split the data frame into training set and test set. The shape of my data frame is {shape_info}. Determine the test set ratio based on the shape information provided and it's assumed that the categories of the target variable are balanced. The test set ratio range is 0.01 to 0.25. Only the data is returned in json format without any other explanation or content. Sample response: {{"test_ratio":0.25}}
39
+ decide_balance_template: |
40
+ You are a data analyst. You have a cleaned and pre-processed data frame and you want to handle class imbalance before training the machine learning model. The shape of my data frame is {shape_info}. The description of the data frame is:
41
+ {description_info}
42
+ The number of each value of the target attribute is: {balance_info}
43
+ Determine the balance strategy based on the data information provided. The RandomOverSampler is represented by 1, the SMOTE is represented by 2, the ADASYN is represented by 3, and do not balance is represented by 4. Only the data is returned in json format without any other explanation or content. Sample response: {{"method":2}}
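For orientation, a small sketch of how these templates are consumed by llm_service.py below: the placeholders are filled through LangChain's PromptTemplate and the model is expected to reply with bare JSON. The file path and the sample values here are assumptions for illustration:

import json
import yaml
from langchain.prompts import PromptTemplate

with open("app/src/config/config.yaml") as f:      # path assumed, relative to the repo root
    config = yaml.safe_load(f)

prompt = PromptTemplate(
    input_variables=["attributes", "data_frame_head"],
    template=config["numeric_attribute_template"],
).format(attributes=["color", "comments"], data_frame_head="<first 20 rows>")

reply = '{"color": 2, "comments": 3}'              # a well-behaved model answer
print(json.loads(reply))                           # {'color': 2, 'comments': 3}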
app/src/handle_null_value.py ADDED
@@ -0,0 +1,112 @@
1
+ import numpy as np
2
+
3
+ def contains_missing_value(df):
4
+ """
5
+ Checks if the DataFrame contains any missing values.
6
+ """
7
+ return df.isnull().values.any()
8
+
9
+ def fill_null_values(df, mean_list, median_list, mode_list, new_category_list, interpolation_list):
10
+ """
11
+ Fills missing values in the DataFrame using specified methods for different columns.
12
+
13
+ Parameters:
14
+ - df (DataFrame): The DataFrame with missing values.
15
+ - mean_list (list): Columns to fill missing values with mean.
16
+ - median_list (list): Columns to fill missing values with median.
17
+ - mode_list (list): Columns to fill missing values with mode.
18
+ - new_category_list (list): Columns to fill missing values with a new category (previously intended for 'NaN', now uses interpolation).
19
+ - interpolation_list (list): Columns to fill missing values using interpolation.
20
+
21
+ Returns:
22
+ - df (DataFrame): The DataFrame after filling missing values.
23
+ """
24
+ if mean_list:
25
+ df = fill_with_mean(df, mean_list)
26
+ if median_list:
27
+ df = fill_with_median(df, median_list)
28
+ if mode_list:
29
+ df = fill_with_mode(df, mode_list)
30
+ if new_category_list:
31
+ # df = fill_with_NaN(df, new_category_list)
32
+ df = fill_with_interpolation(df, new_category_list)
33
+ if interpolation_list:
34
+ df = fill_with_interpolation(df, interpolation_list)
35
+ return df
36
+
37
+ def remove_high_null(df, threshold_row=0.5, threshold_col=0.7):
38
+ """
39
+ Remove rows and columns from a DataFrame where the proportion of null values
40
+ is greater than the specified threshold.
41
+
42
+ - param df: Pandas DataFrame to be processed.
43
+ - param threshold_row: Proportion threshold for null values (default is 0.5 for rows).
44
+ - param threshold_col: Proportion threshold for null values (default is 0.7 for columns).
45
+
46
+ - return: DataFrame with high-null rows and columns removed.
47
+ """
48
+ # Calculate the proportion of nulls in each column
49
+ null_prop_col = df.isnull().mean()
50
+ cols_to_drop = null_prop_col[null_prop_col > threshold_col].index
51
+
52
+ # Drop columns with high proportion of nulls
53
+ df_cleaned = df.drop(columns=cols_to_drop)
54
+
55
+ # Calculate the proportion of nulls in each row
56
+ null_prop_row = df_cleaned.isnull().mean(axis=1)
57
+ rows_to_drop = null_prop_row[null_prop_row > threshold_row].index
58
+
59
+ # Drop rows with high proportion of nulls
60
+ df_cleaned = df_cleaned.drop(index=rows_to_drop)
61
+
62
+ return df_cleaned
63
+
64
+ def fill_with_mean(df, attributes):
65
+ for attr in attributes:
66
+ if attr in df.columns:
67
+ df[attr] = df[attr].fillna(df[attr].mean())
68
+ return df
69
+
70
+ def fill_with_median(df, attributes):
71
+ for attr in attributes:
72
+ if attr in df.columns:
73
+ df[attr] = df[attr].fillna(df[attr].median())
74
+ return df
75
+
76
+ def fill_with_mode(df, attributes):
77
+ for attr in attributes:
78
+ if attr in df.columns:
79
+ mode_value = df[attr].mode()[0] if not df[attr].mode().empty else None
80
+ if mode_value is not None:
81
+ df[attr] = df[attr].fillna(mode_value)
82
+ return df
83
+
84
+ def fill_with_interpolation(df, attributes, method='linear'):
85
+ # method: default is 'linear'. 'time', 'index', 'pad', 'nearest', 'quadratic', 'cubic', etc.
86
+ for attr in attributes:
87
+ if attr in df.columns:
88
+ df[attr] = df[attr].interpolate(method=method)
89
+ return df
90
+
91
+ # Deprecated: replaced with interpolation to ensure no missing values
92
+ def fill_with_NaN(df, attributes):
93
+ for attr in attributes:
94
+ if attr in df.columns:
95
+ df[attr] = df[attr].fillna('NaN')
96
+ return df
97
+
98
+ def replace_placeholders_with_nan(df):
99
+ """
100
+ Replaces common placeholders for missing values in object columns with np.nan.
101
+
102
+ Parameters:
103
+ - df (DataFrame): The DataFrame to process.
104
+
105
+ Returns:
106
+ - df (DataFrame): Updated DataFrame with placeholders replaced.
107
+ """
108
+ placeholders = ["NA", "NULL", "?", "", "NaN", "None", "N/A", "n/a", "nan", "none"]
109
+ for col in df.columns:
110
+ if df[col].dtype == 'object':
111
+ df[col] = df[col].apply(lambda x: np.nan if str(x).lower() in placeholders else x)
112
+ return df
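A minimal end-to-end sketch of the intended cleaning order (toy DataFrame; the column names and fill choices are made up for illustration, and the import path is assumed):

import numpy as np
import pandas as pd
# from src.handle_null_value import (replace_placeholders_with_nan, remove_high_null,
#                                    fill_null_values, contains_missing_value)  # import path assumed

df = pd.DataFrame({
    "age": [25, np.nan, 31, 40],
    "income": [50000.0, 62000.0, np.nan, 58000.0],
    "grade": ["A", "NA", "B", "A"],
})
df = replace_placeholders_with_nan(df)   # the "NA" string in 'grade' becomes np.nan
df = remove_high_null(df)                # drop rows/columns dominated by nulls
df = fill_null_values(df, mean_list=["age"], median_list=[], mode_list=["grade"],
                      new_category_list=[], interpolation_list=["income"])
print(contains_missing_value(df))        # False once every strategy has been applied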
app/src/llm_service.py ADDED
@@ -0,0 +1,300 @@
1
+ import os
2
+ import yaml
3
+ import json
4
+ import re
5
+ import streamlit as st
6
+ from langchain.prompts import PromptTemplate
7
+ from langchain.schema import HumanMessage
8
+ from langchain.chat_models import ChatOpenAI
9
+
10
+ config_path = os.path.join(os.path.dirname(__file__), 'config', 'config.yaml')
11
+ with open(config_path, 'r') as file:
12
+ config = yaml.safe_load(file)
13
+ model4_name = config["model4_name"]
14
+ model3_name = config["model3_name"]
15
+ api_key = config["openai_api_key"]
16
+
17
+ def decide_encode_type(attributes, data_frame_head, model_type = 4, user_api_key = None):
18
+ """
19
+ Decides the encoding type for given attributes using a language model via the OpenAI API.
20
+
21
+ Parameters:
22
+ - attributes (list): A list of attributes for which to decide the encoding type.
23
+ - data_frame_head (DataFrame): The head of the DataFrame containing the attributes. This parameter is expected to be a representation of the DataFrame (e.g., a string or a small subset of the actual DataFrame) that gives an overview of the data.
24
+ - model_type (int, optional): Specifies the model to use. The default model_type=4 corresponds to a predefined model named `model4_name`. Another option is model_type=3, which corresponds to `model3_name`.
25
+ - user_api_key (str, optional): The user's OpenAI API key. If not provided, a default API key `api_key` is used.
26
+
27
+ Returns:
28
+ - A JSON object containing the recommended encoding types for the given attributes. Please refer to prompt templates in config.yaml for details.
29
+
30
+ Raises:
31
+ - Exception: If there is an issue accessing the OpenAI API, such as an invalid API key or a network connection error, the function will raise an exception with a message indicating the problem.
32
+ """
33
+ try:
34
+ model_name = model4_name if model_type == 4 else model3_name
35
+ user_api_key = api_key if user_api_key is None else user_api_key
36
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
37
+
38
+ template = config["numeric_attribute_template"]
39
+ prompt_template = PromptTemplate(input_variables=["attributes", "data_frame_head"], template=template)
40
+ summary_prompt = prompt_template.format(attributes=attributes, data_frame_head=data_frame_head)
41
+
42
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
43
+ json_str = llm_answer.content  # default to the raw reply; overridden below if a ```json fence is present
+ if '```json' in llm_answer.content:
44
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
45
+ if match: json_str = match.group(1)
46
+ else: json_str = llm_answer.content
47
+ return json.loads(json_str)
48
+ except Exception as e:
49
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
50
+ st.stop()
51
+
52
+ def decide_fill_null(attributes, types_info, description_info, model_type = 4, user_api_key = None):
53
+ """
54
+ Decides how to fill null values for the given attributes using a language model via the OpenAI API.
55
+
56
+ Parameters:
57
+ - attributes (list): List of attribute names that contain null values.
58
+ - types_info: The data types of the attributes.
+ - description_info: Descriptive statistics of the attributes, providing context for the fill decision.
59
+ - model_type (int, optional): The model to use, where 4 is the default. Can be customized to use a different model.
60
+ - user_api_key (str, optional): The user's OpenAI API key. If None, a default key is used.
61
+
62
+ Returns:
63
+ - dict: A JSON object with the recommended null-filling method for each attribute. Please refer to prompt templates in config.yaml for details.
64
+
65
+ Raises:
66
+ - Exception: If there is an issue accessing the OpenAI API, such as an invalid API key or a network connection error, the function will raise an exception with a message indicating the problem.
67
+ """
68
+ try:
69
+ model_name = model4_name if model_type == 4 else model3_name
70
+ user_api_key = api_key if user_api_key is None else user_api_key
71
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
72
+
73
+ template = config["null_attribute_template"]
74
+ prompt_template = PromptTemplate(input_variables=["attributes", "types_info", "description_info"], template=template)
75
+ summary_prompt = prompt_template.format(attributes=attributes, types_info=types_info, description_info=description_info)
76
+
77
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
78
+ json_str = llm_answer.content  # default to the raw reply; overridden below if a ```json fence is present
+ if '```json' in llm_answer.content:
79
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
80
+ if match: json_str = match.group(1)
81
+ else: json_str = llm_answer.content
82
+ return json.loads(json_str)
83
+ except Exception as e:
84
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
85
+ st.stop()
86
+
87
+ def decide_model(shape_info, head_info, nunique_info, description_info, model_type = 4, user_api_key = None):
88
+ """
89
+ Decides the most suitable machine learning model based on dataset characteristics.
90
+
91
+ Parameters:
92
+ - shape_info (dict): Information about the shape of the dataset.
93
+ - head_info (str or DataFrame): The head of the dataset or its string representation.
94
+ - nunique_info (dict): Information about the uniqueness of dataset attributes.
95
+ - description_info (str): Descriptive information about the dataset.
96
+ - model_type (int, optional): Specifies which model to consult for decision-making.
97
+ - user_api_key (str, optional): OpenAI API key for making requests.
98
+
99
+ Returns:
100
+ - dict: A JSON object naming the three recommended classification models. Please refer to prompt templates in config.yaml for details.
101
+
102
+ Raises:
103
+ - Exception: If there is an issue accessing the OpenAI API, such as an invalid API key or a network connection error, the function will raise an exception with a message indicating the problem.
104
+ """
105
+ try:
106
+ model_name = model4_name if model_type == 4 else model3_name
107
+ user_api_key = api_key if user_api_key is None else user_api_key
108
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
109
+
110
+ template = config["decide_model_template"]
111
+ prompt_template = PromptTemplate(input_variables=["shape_info", "head_info", "nunique_info", "description_info"], template=template)
112
+ summary_prompt = prompt_template.format(shape_info=shape_info, head_info=head_info, nunique_info=nunique_info, description_info=description_info)
113
+
114
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
115
+ json_str = llm_answer.content  # default to the raw reply; overridden below if a ```json fence is present
+ if '```json' in llm_answer.content:
116
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
117
+ if match: json_str = match.group(1)
118
+ else: json_str = llm_answer.content
119
+ return json.loads(json_str)
120
+ except Exception as e:
121
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
122
+ st.stop()
123
+
124
+ def decide_cluster_model(shape_info, description_info, cluster_info, model_type = 4, user_api_key = None):
125
+ """
126
+ Determines the appropriate clustering model based on dataset characteristics.
127
+
128
+ Parameters:
129
+ - shape_info: Information about the dataset shape.
130
+ - description_info: Descriptive statistics or information about the dataset.
131
+ - cluster_info: Additional information relevant to clustering.
132
+ - model_type (int, optional): The model type to use for decision making (default 4).
133
+ - user_api_key (str, optional): The user's API key for OpenAI.
134
+
135
+ Returns:
136
+ - A JSON object naming the three recommended clustering models. Please refer to prompt templates in config.yaml for details.
137
+
138
+ Raises:
139
+ - Exception: If unable to access the OpenAI API or another error occurs.
140
+ """
141
+ try:
142
+ model_name = model4_name if model_type == 4 else model3_name
143
+ user_api_key = api_key if user_api_key is None else user_api_key
144
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
145
+
146
+ template = config["decide_clustering_model_template"]
147
+ prompt_template = PromptTemplate(input_variables=["shape_info", "description_info", "cluster_info"], template=template)
148
+ summary_prompt = prompt_template.format(shape_info=shape_info, description_info=description_info, cluster_info=cluster_info)
149
+
150
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
151
+ json_str = llm_answer.content  # default to the raw reply; overridden below if a ```json fence is present
+ if '```json' in llm_answer.content:
152
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
153
+ if match: json_str = match.group(1)
154
+ else: json_str = llm_answer.content
155
+ return json.loads(json_str)
156
+ except Exception as e:
157
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
158
+ st.stop()
159
+
160
+ def decide_regression_model(shape_info, description_info, Y_name, model_type = 4, user_api_key = None):
161
+ """
162
+ Determines the appropriate regression model based on dataset characteristics and the target variable.
163
+
164
+ Parameters:
165
+ - shape_info: Information about the dataset shape.
166
+ - description_info: Descriptive statistics or information about the dataset.
167
+ - Y_name: The name of the target variable.
168
+ - model_type (int, optional): The model type to use for decision making (default 4).
169
+ - user_api_key (str, optional): The user's API key for OpenAI.
170
+
171
+ Returns:
172
+ - A JSON object naming the three recommended regression models. Please refer to prompt templates in config.yaml for details.
173
+
174
+ Raises:
175
+ - Exception: If unable to access the OpenAI API or another error occurs.
176
+ """
177
+ try:
178
+ model_name = model4_name if model_type == 4 else model3_name
179
+ user_api_key = api_key if user_api_key is None else user_api_key
180
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
181
+
182
+ template = config["decide_regression_model_template"]
183
+ prompt_template = PromptTemplate(input_variables=["shape_info", "description_info", "Y_name"], template=template)
184
+ summary_prompt = prompt_template.format(shape_info=shape_info, description_info=description_info, Y_name=Y_name)
185
+
186
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
187
+ json_str = llm_answer.content  # default to the raw reply; overridden below if a ```json fence is present
+ if '```json' in llm_answer.content:
188
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
189
+ if match: json_str = match.group(1)
190
+ else: json_str = llm_answer.content
191
+ return json.loads(json_str)
192
+ except Exception as e:
193
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
194
+ st.stop()
195
+
196
+ def decide_target_attribute(attributes, types_info, head_info, model_type = 4, user_api_key = None):
197
+ """
198
+ Determines the target attribute for modeling based on dataset attributes and characteristics.
199
+
200
+ Parameters:
201
+ - attributes: A list of dataset attributes.
202
+ - types_info: Information about the data types of the attributes.
203
+ - head_info: A snapshot of the dataset's first few rows.
204
+ - model_type (int, optional): The model type to use for decision making (default 4).
205
+ - user_api_key (str, optional): The user's API key for OpenAI.
206
+
207
+ Returns:
208
+ - The name of the recommended target attribute, or -1 if it cannot be determined. Please refer to prompt templates in config.yaml for details.
209
+
210
+ Raises:
211
+ - Exception: If unable to access the OpenAI API or another error occurs.
212
+ """
213
+ try:
214
+ model_name = model4_name if model_type == 4 else model3_name
215
+ user_api_key = api_key if user_api_key is None else user_api_key
216
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
217
+
218
+ template = config["decide_target_attribute_template"]
219
+ prompt_template = PromptTemplate(input_variables=["attributes", "types_info", "head_info"], template=template)
220
+ summary_prompt = prompt_template.format(attributes=attributes, types_info=types_info, head_info=head_info)
221
+
222
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
223
+ json_str = llm_answer.content  # default to the raw reply; overridden below if a ```json fence is present
+ if '```json' in llm_answer.content:
224
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
225
+ if match: json_str = match.group(1)
226
+ else: json_str = llm_answer.content
227
+ return json.loads(json_str)["target"]
228
+ except Exception as e:
229
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
230
+ st.stop()
231
+
232
+ def decide_test_ratio(shape_info, model_type = 4, user_api_key = None):
233
+ """
234
+ Determines the appropriate train-test split ratio based on dataset characteristics.
235
+
236
+ Parameters:
237
+ - shape_info: Information about the dataset shape.
238
+ - model_type (int, optional): The model type to use for decision making (default 4).
239
+ - user_api_key (str, optional): The user's API key for OpenAI.
240
+
241
+ Returns:
242
+ - The recommended train-test split ratio as a float. Please refer to prompt templates in config.yaml for details.
243
+
244
+ Raises:
245
+ - Exception: If unable to access the OpenAI API or another error occurs.
246
+ """
247
+ try:
248
+ model_name = model4_name if model_type == 4 else model3_name
249
+ user_api_key = api_key if user_api_key is None else user_api_key
250
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
251
+
252
+ template = config["decide_test_ratio_template"]
253
+ prompt_template = PromptTemplate(input_variables=["shape_info"], template=template)
254
+ summary_prompt = prompt_template.format(shape_info=shape_info)
255
+
256
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
257
+ json_str = llm_answer.content  # default to the raw reply; overridden below if a ```json fence is present
+ if '```json' in llm_answer.content:
258
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
259
+ if match: json_str = match.group(1)
260
+ else: json_str = llm_answer.content
261
+ return json.loads(json_str)["test_ratio"]
262
+ except Exception as e:
263
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
264
+ st.stop()
265
+
266
+ def decide_balance(shape_info, description_info, balance_info, model_type = 4, user_api_key = None):
267
+ """
268
+ Determines the appropriate method to balance the dataset based on its characteristics.
269
+
270
+ Parameters:
271
+ - shape_info: Information about the dataset shape.
272
+ - description_info: Descriptive statistics or information about the dataset.
273
+ - balance_info: Additional information relevant to dataset balancing.
274
+ - model_type (int, optional): The model type to use for decision making (default 4).
275
+ - user_api_key (str, optional): The user's API key for OpenAI.
276
+
277
+ Returns:
278
+ - The recommended method to balance the dataset. Please refer to prompt templates in config.yaml for details.
279
+
280
+ Raises:
281
+ - Exception: If unable to access the OpenAI API or another error occurs.
282
+ """
283
+ try:
284
+ model_name = model4_name if model_type == 4 else model3_name
285
+ user_api_key = api_key if user_api_key is None else user_api_key
286
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
287
+
288
+ template = config["decide_balance_template"]
289
+ prompt_template = PromptTemplate(input_variables=["shape_info", "description_info", "balance_info"], template=template)
290
+ summary_prompt = prompt_template.format(shape_info=shape_info, description_info=description_info, balance_info=balance_info)
291
+
292
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
293
+ json_str = llm_answer.content  # default to the raw reply; overridden below if a ```json fence is present
+ if '```json' in llm_answer.content:
294
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
295
+ if match: json_str = match.group(1)
296
+ else: json_str = llm_answer.content
297
+ return json.loads(json_str)["method"]
298
+ except Exception as e:
299
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
300
+ st.stop()
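A usage sketch for one of these helpers, decide_encode_type (illustrative only; it requires a valid OpenAI API key, the key shown is a placeholder, and the import path is assumed to mirror app.py):

import pandas as pd
# from src.llm_service import decide_encode_type   # import path assumed, mirroring app.py

df = pd.DataFrame({
    "color": ["red", "green", "blue"],
    "size": ["S", "M", "L"],
    "comments": ["great product", "ok", "very long free text ..."],
})
encode_plan = decide_encode_type(
    attributes=list(df.columns),
    data_frame_head=df.head(20),
    model_type=3,                 # 3 -> the GPT-3.5 model named in config.yaml
    user_api_key="sk-...",        # placeholder, never commit a real key
)
print(encode_plan)                # e.g. {"color": 2, "size": 1, "comments": 3}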
app/src/model_service.py ADDED
@@ -0,0 +1,196 @@
1
+ import io
2
+ import numpy as np
3
+ import streamlit as st
4
+ from collections import Counter
5
+ from sklearn import metrics
6
+ from sklearn.preprocessing import StandardScaler
7
+ from sklearn.cluster import KMeans
8
+ from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
9
+ from joblib import dump
10
+ from sklearn.metrics import roc_curve, silhouette_score, calinski_harabasz_score, davies_bouldin_score, f1_score, r2_score, mean_squared_error, mean_absolute_error
11
+ from sklearn.model_selection import train_test_split
12
+
13
+ def split_data(X, Y, test_size = 0.2, random_state = 42, perform_pca = False):
14
+ """
15
+ Splits the dataset into training and testing sets, optionally standardizing the data if PCA is not performed.
16
+
17
+ :param X: Feature matrix.
18
+ :param Y: Target vector.
19
+ :param test_size: Proportion of the dataset to include in the test split.
20
+ :param random_state: Controls the shuffling applied to the data before applying the split.
21
+ :param perform_pca: Has PCA been performed or not. If not, standardizes the data.
22
+ :return: A tuple containing split and optionally transformed datasets: X_train, X_test, Y_train, Y_test.
23
+ """
24
+ X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)
25
+
26
+ if not perform_pca:
27
+ scaler = StandardScaler()
28
+ X_train = scaler.fit_transform(X_train)
29
+ X_test = scaler.transform(X_test)
30
+
31
+ return X_train, X_test, Y_train, Y_test
32
+
33
+ def check_and_balance(X, Y, balance_threshold=0.5, method=1):
34
+ """
35
+ Check if the dataset is imbalanced and perform oversampling if necessary using RandomOverSampler, SMOTE, or ADASYN.
36
+
37
+ Args:
38
+ X (DataFrame): Feature set.
39
+ Y (Series): Target variable.
40
+ balance_threshold (float): Threshold for class balance.
41
+ method (int): Oversampling method: 1 = RandomOverSampler, 2 = SMOTE, 3 = ADASYN, 4 = no balancing.
42
+
43
+ Returns:
44
+ X_resampled, Y_resampled (DataFrame/Series): Resampled data if imbalance is detected, else original data.
45
+ """
46
+ try:
47
+ # Check the distribution of the target variable
48
+ class_distribution = Counter(Y)
49
+
50
+ # Determine if the dataset is imbalanced
51
+ min_class_samples = min(class_distribution.values())
52
+ max_class_samples = max(class_distribution.values())
53
+ is_imbalanced = min_class_samples / max_class_samples < balance_threshold
54
+
55
+ if is_imbalanced and method != 4:
56
+ if method == 1:
57
+ oversampler = RandomOverSampler(random_state=0)
58
+ elif method == 2:
59
+ oversampler = SMOTE(random_state=0)
60
+ elif method == 3:
61
+ oversampler = ADASYN(random_state=0)
62
+
63
+ X_resampled, Y_resampled = oversampler.fit_resample(X, Y)
64
+ return X_resampled, Y_resampled
65
+ else:
66
+ return X, Y
67
+ except Exception as e:
68
+ st.error("The target attribute may be continuous. Please check the data type.")
69
+ st.stop()
70
+
71
+ def estimate_optimal_clusters(df):
72
+ """
73
+ Estimates the optimal number of clusters for KMeans clustering using the elbow method and silhouette scores.
74
+
75
+ :param df: DataFrame containing the dataset to cluster.
76
+ :return: The estimated optimal number of clusters.
77
+ """
78
+ sse = {}
79
+ for k in range(2, 11):
80
+ kmeans = KMeans(n_clusters=k, random_state=42).fit(df)
81
+ sse[k] = kmeans.inertia_
82
+
83
+ # Find the elbow point: compute the first and second differences of the SSE
84
+ sse_values = list(sse.values())
85
+ first_diff = np.diff(sse_values) # first difference
86
+ second_diff = np.diff(first_diff) # second difference
87
+ knee_point = np.argmax(second_diff) + 2
88
+
89
+ # find the optimal number of clusters around the knee point
90
+ silhouette_avg_scores = {}
91
+ for k in range(knee_point - 1, knee_point + 2):
92
+ if k >= 2: # make sure k is at least 2
93
+ kmeans = KMeans(n_clusters=k, random_state=42).fit(df)
94
+ silhouette_avg_scores[k] = silhouette_score(df, kmeans.labels_)
95
+
96
+ # Find the optimal number of clusters based on the highest average silhouette score
97
+ optimal_clusters = max(silhouette_avg_scores, key=silhouette_avg_scores.get)
98
+
99
+ return optimal_clusters
100
+
101
+ def calculate_f1_score(model, X_test, Y_test, binary_classification=True):
102
+ """
103
+ Calculates the F1 score for the predictions made by a model on a test set.
104
+
105
+ The function supports both binary and multi-class settings by adjusting the 'average' parameter in the f1_score calculation.
106
+
107
+ :param model: The trained machine learning model used for predictions.
108
+ :param X_test: The feature matrix for the test set.
109
+ :param Y_test: The true labels for the test set.
110
+ :param binary_classification: If True, calculates the F1 score for binary classification. Otherwise, calculates for multi-class classification using the 'macro' average.
111
+ :return: The F1 score of the model predictions.
112
+ """
113
+ y_pred = model.predict(X_test)
114
+ if binary_classification:
115
+ f1 = f1_score(Y_test, y_pred, average='binary')
116
+ else:
117
+ f1 = f1_score(Y_test, y_pred, average='macro')
118
+ return f1
119
+
120
+ def model_score(model, X_test, Y_test):
121
+ """
122
+ Calculate the model score for classification models.
123
+ """
124
+ score = model.score(X_test, Y_test)
125
+ return score
126
+
127
+ def fpr_and_tpr(model, X_test, Y_test):
128
+ """
129
+ Calculate the false positive rate and true positive rate for classification models.
130
+ """
131
+ Y_pred = model.predict_proba(X_test)[:, 1]
132
+ fpr, tpr, _ = roc_curve(Y_test, Y_pred)
133
+ return fpr, tpr
134
+
135
+ def auc(fpr, tpr):
136
+ """
137
+ Calculate the area under the ROC curve for classification models.
138
+ """
139
+ auc = metrics.auc(fpr, tpr)
140
+ return auc
141
+
142
+ def calculate_silhouette_score(X, labels):
143
+ """
144
+ Calculate the silhouette score for clustering models.
145
+ """
146
+ return silhouette_score(X, labels)
147
+
148
+ def calculate_calinski_harabasz_score(X, labels):
149
+ """
150
+ Calculate the calinski harabasz score for clustering models.
151
+ """
152
+ return calinski_harabasz_score(X, labels)
153
+
154
+ def calculate_davies_bouldin_score(X, labels):
155
+ """
156
+ Calculate the davies bouldin score for clustering models.
157
+ """
158
+ return davies_bouldin_score(X, labels)
159
+
160
+ def gmm_predict(X, model):
161
+ """
162
+ Get the predicted labels for a GMM model.
163
+ """
164
+ labels = model.predict(X)
165
+ return labels
166
+
167
+ def calculate_r2_score(y_pred, Y_test):
168
+ """
169
+ Calculate the r2 score for regression models.
170
+ """
171
+ r2 = r2_score(Y_test, y_pred)
172
+ return r2
173
+
174
+ def calculate_mse_and_rmse(y_pred, Y_test):
175
+ """
176
+ Calculate the mean squared error and root mean squared error for regression models.
177
+ """
178
+ mse = mean_squared_error(Y_test, y_pred)
179
+ rmse = np.sqrt(mse)
180
+ return mse, rmse
181
+
182
+ def calculate_mae(y_pred, Y_test):
183
+ """
184
+ Calculate the mean absolute error for regression models.
185
+ """
186
+ mae = mean_absolute_error(Y_test, y_pred)
187
+ return mae
188
+
189
+ def save_model(model):
190
+ """
191
+ Serializes a machine learning model into a binary format using joblib's dump function and stores it in a BytesIO buffer.
192
+ """
193
+ buffer = io.BytesIO()
194
+ dump(model, buffer)
195
+ buffer.seek(0)
196
+ return buffer.getvalue()
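A minimal classification-flow sketch tying these helpers together (illustrative only; the synthetic data, the LogisticRegression choice, and the import path are assumptions):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
# from src.model_service import split_data, check_and_balance, model_score, calculate_f1_score  # assumed

X, Y = make_classification(n_samples=500, weights=[0.85, 0.15], random_state=42)
X, Y = pd.DataFrame(X), pd.Series(Y)
X_bal, Y_bal = check_and_balance(X, Y, method=2)                    # 2 -> SMOTE
X_train, X_test, Y_train, Y_test = split_data(X_bal, Y_bal, test_size=0.2)
clf = LogisticRegression(max_iter=1000).fit(X_train, Y_train)
print(model_score(clf, X_test, Y_test))                             # accuracy
print(calculate_f1_score(clf, X_test, Y_test))                      # binary F1 by default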
app/src/pca.py ADDED
@@ -0,0 +1,140 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.decomposition import PCA
4
+ from sklearn.preprocessing import StandardScaler
5
+ from src.preprocess import convert_to_integer
6
+
7
+ def decide_pca(df, cumulative_variance_threshold=0.95, min_dim_reduction_ratio=0.1):
8
+ """
9
+ Determines whether PCA should be performed based on cumulative variance threshold and dimension reduction ratio.
10
+
11
+ Parameters:
12
+ - df (DataFrame): The input DataFrame.
13
+ - cumulative_variance_threshold (float): The threshold of explained variance to retain. Default is 0.95.
14
+ - min_dim_reduction_ratio (float): The minimum ratio of dimension reduction required to perform PCA. Default is 0.1.
15
+
16
+ Returns:
17
+ - perform_pca (bool): Whether PCA should be performed.
18
+ - n_components (int): The number of principal components to retain.
19
+ """
20
+ # Remove non-numeric columns
21
+ numeric_df = df.select_dtypes(include=[np.number])
22
+
23
+ # Standardizing the Data
24
+ scaler = StandardScaler()
25
+ scaled_data = scaler.fit_transform(numeric_df)
26
+
27
+ # PCA for Explained Variance
28
+ pca = PCA()
29
+ pca.fit(scaled_data)
30
+
31
+ # Calculate cumulative variance
32
+ cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
33
+
34
+ # Find the number of components for the desired threshold
35
+ n_components = np.where(cumulative_variance >= cumulative_variance_threshold)[0][0] + 1
36
+
37
+ # Calculate the dimension reduction ratio
38
+ dim_reduction_ratio = 1 - (n_components / df.shape[1])
39
+
40
+ # Check if PCA should be performed based on the dimension reduction ratio
41
+ perform_pca = dim_reduction_ratio >= min_dim_reduction_ratio
42
+ return perform_pca, n_components
43
+
44
+ def perform_pca(df, n_components, Y_name):
45
+ """
46
+ Performs PCA on the dataset, optionally excluding a target column, and standardizes the data.
47
+
48
+ Parameters:
49
+ - df (DataFrame): The input DataFrame.
50
+ - n_components (int): The number of principal components to retain.
51
+ - Y_name (str): The name of the target column to exclude from PCA and reattach after the transformation.
52
+
53
+ Returns:
54
+ - pca_df (DataFrame): DataFrame with principal components and optionally the target column.
55
+ """
56
+ # Save the target column data
57
+ drop_columns = []
58
+ if Y_name:
59
+ target_data = df[Y_name]
60
+ drop_columns.append(Y_name)
61
+
62
+ # Remove non-numeric columns and the target column
63
+ numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')
64
+
65
+ # Standardizing the Data
66
+ scaler = StandardScaler()
67
+ scaled_data = scaler.fit_transform(numeric_df)
68
+
69
+ # Applying PCA
70
+ pca = PCA(n_components=n_components)
71
+ principal_components = pca.fit_transform(scaled_data)
72
+
73
+ # Create a new DataFrame with principal components
74
+ columns = [f'PC{i+1}' for i in range(n_components)]
75
+ pca_df = pd.DataFrame(data=principal_components, columns=columns)
76
+
77
+ # Reattach the target column
78
+ if Y_name:
79
+ pca_df[Y_name] = target_data.reset_index(drop=True)
80
+ pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])
81
+
82
+ return pca_df
83
+
84
+ def perform_PCA_for_clustering(df, n_components):
85
+ """
86
+ Applies PCA transformation for clustering tasks on the given DataFrame.
87
+
88
+ Parameters:
89
+ - df (DataFrame): The input DataFrame to apply PCA.
90
+ - n_components (int): The number of principal components to retain.
91
+
92
+ Returns:
93
+ - pca_df (DataFrame): DataFrame of the principal components.
94
+ """
95
+ # Applying PCA
96
+ pca = PCA(n_components=n_components)
97
+ principal_components = pca.fit_transform(df)
98
+
99
+ # Create a new DataFrame with principal components
100
+ columns = [f'PC{i+1}' for i in range(n_components)]
101
+ pca_df = pd.DataFrame(data=principal_components, columns=columns)
102
+
103
+ return pca_df
104
+
105
+ def perform_PCA_for_regression(df, n_components, Y_name):
106
+ """
107
+ Applies PCA for regression tasks, excluding a specified target column from the transformation.
108
+
109
+ Parameters:
110
+ - df (DataFrame): The input DataFrame.
111
+ - n_components (int): The number of principal components to retain.
112
+ - Y_name (str): The name of the target column to exclude from PCA and append back after the transformation.
113
+
114
+ Returns:
115
+ - pca_df (DataFrame): A new DataFrame with principal components and the target column.
116
+ """
117
+
118
+ # Save the target column data
119
+ drop_columns = []
120
+ if Y_name:
121
+ target_data = df[Y_name]
122
+ drop_columns.append(Y_name)
123
+
124
+ # Remove non-numeric columns and the target column
125
+ numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')
126
+
127
+ # Applying PCA
128
+ pca = PCA(n_components=n_components)
129
+ principal_components = pca.fit_transform(numeric_df)
130
+
131
+ # Create a new DataFrame with principal components
132
+ columns = [f'PC{i+1}' for i in range(n_components)]
133
+ pca_df = pd.DataFrame(data=principal_components, columns=columns)
134
+
135
+ # Reattach the target column
136
+ if Y_name:
137
+ pca_df[Y_name] = target_data.reset_index(drop=True)
138
+ pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])
139
+
140
+ return pca_df
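A short sketch of the decide-then-transform flow (illustrative; the synthetic, highly correlated features are chosen so that PCA is worthwhile, and the import path is assumed):

import numpy as np
import pandas as pd
# from src.pca import decide_pca, perform_pca      # import path assumed

rng = np.random.default_rng(0)
base = rng.normal(size=(200, 3))
noisy_copy = base + 0.01 * rng.normal(size=(200, 3))
df = pd.DataFrame(np.hstack([base, noisy_copy]), columns=[f"f{i}" for i in range(6)])
df["target"] = (base[:, 0] > 0).astype(int)

should_reduce, n_components = decide_pca(df.drop(columns=["target"]))
if should_reduce:
    df = perform_pca(df, n_components, "target")    # features replaced by PCs, 'target' kept
print(df.columns.tolist())                          # typically ['PC1', ..., 'target']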
app/src/plot.py ADDED
@@ -0,0 +1,394 @@
1
+ import nltk
2
+ import seaborn as sns
3
+ import numpy as np
4
+ import pandas as pd
5
+ import streamlit as st
6
+ import matplotlib.pyplot as plt
7
+ import plotly.express as px
8
+ import plotly.graph_objects as go
9
+ import scipy.stats as stats
10
+ from sklearn.decomposition import PCA
11
+ from wordcloud import WordCloud
12
+ from sklearn.metrics import confusion_matrix
13
+ from nltk import regexp_tokenize
14
+
15
+ # Single attribute visualization
16
+ def distribution_histogram(df, attribute):
17
+ """
18
+ Histogram of the distribution of a single attribute.
19
+ """
20
+ if df[attribute].dtype == 'object' or pd.api.types.is_categorical_dtype(df[attribute]):
21
+ codes, uniques = pd.factorize(df[attribute])
22
+ temp_df = pd.DataFrame({attribute: codes})
23
+ fig, ax = plt.subplots(figsize=(8, 6))
24
+ sns.histplot(temp_df[attribute], ax=ax, discrete=True, color='#e17160')
25
+ ax.set_xticks(range(len(uniques)))
26
+ ax.set_xticklabels(uniques, rotation=45, ha='right')
27
+ else:
28
+ fig, ax = plt.subplots(figsize=(6, 4))
29
+ sns.histplot(df[attribute], ax=ax, color='#e17160')
30
+
31
+ ax.set_title(f"Distribution of {attribute}")
32
+ return fig
33
+
34
+ def distribution_boxplot(df, attribute):
35
+ """
36
+ Boxplot of the distribution of a single attribute.
37
+ """
38
+ if df[attribute].dtype == 'object' or pd.api.types.is_categorical_dtype(df[attribute]):
39
+ return -1
40
+ fig, ax = plt.subplots(figsize=(8, 6))
41
+ sns.boxenplot(data=df[attribute], palette=["#32936f", "#26a96c", "#2bc016"])
42
+ ax.set_title(f"Boxplot of {attribute}")
43
+ return fig
44
+
45
+ def count_Y(df, Y_name):
46
+ """
47
+ Donut chart of the distribution of a single attribute.
48
+ """
49
+ if Y_name in df.columns and df[Y_name].nunique() >= 1:
50
+ value_counts = df[Y_name].value_counts()
51
+ fig = px.pie(names=value_counts.index,
52
+ values=value_counts.values,
53
+ title=f'Distribution of {Y_name}',
54
+ hole=0.5,
55
+ color_discrete_sequence=px.colors.sequential.Cividis_r)
56
+ return fig
57
+
58
+ def density_plot(df, column_name):
59
+ """
60
+ Density plot of the distribution of a single attribute.
61
+ """
62
+ if column_name in df.columns:
63
+ fig = px.density_contour(df, x=column_name, y=column_name,
64
+ title=f'Density Plot of {column_name}',
65
+ color_discrete_sequence=px.colors.sequential.Inferno)
66
+ return fig
67
+
68
+ # Multiple attribute visualization
69
+ def box_plot(df, column_names):
70
+ """
71
+ Box plot of multiple attributes.
72
+ """
73
+ if len(column_names) > 1 and not all(df[column_names].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
74
+ return -1
75
+ valid_columns = [col for col in column_names if col in df.columns]
76
+ if valid_columns:
77
+ fig = px.box(df, y=valid_columns,
78
+ title=f'Box Plot of {", ".join(valid_columns)}',
79
+ color_discrete_sequence=px.colors.sequential.Cividis_r)
80
+ return fig
81
+
82
+ def violin_plot(df, column_names):
83
+ """
84
+ Violin plot of multiple attributes.
85
+ """
86
+ if len(column_names) > 1 and not all(df[column_names].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
87
+ return -1
88
+ valid_columns = [col for col in column_names if col in df.columns]
89
+ if valid_columns:
90
+ fig = px.violin(df, y=valid_columns,
91
+ title=f'Violin Plot of {", ".join(valid_columns)}',
92
+ color_discrete_sequence=px.colors.sequential.Cividis_r)
93
+ return fig
94
+
95
+ def strip_plot(df, column_names):
96
+ """
97
+ Strip plot of multiple attributes.
98
+ """
99
+ if len(column_names) > 1 and not all(df[column_names].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
100
+ return -1
101
+ valid_columns = [col for col in column_names if col in df.columns]
102
+ if valid_columns:
103
+ fig = px.strip(df, y=valid_columns,
104
+ title=f'Strip Plot of {", ".join(valid_columns)}',
105
+ color_discrete_sequence=px.colors.sequential.Cividis_r)
106
+ return fig
107
+
108
+ def multi_plot_scatter(df, selected_attributes):
109
+ """
110
+ Scatter plot of multiple attributes.
111
+ """
112
+ if len(selected_attributes) < 2:
113
+ return -1
114
+
115
+ plt.figure(figsize=(10, 6))
116
+ if df[selected_attributes[0]].dtype not in [np.float64, np.int64]:
117
+ x, x_labels = pd.factorize(df[selected_attributes[0]])
118
+ plt.xticks(ticks=np.arange(len(x_labels)), labels=x_labels, rotation=45)
119
+ else:
120
+ x = df[selected_attributes[0]]
121
+
122
+ if df[selected_attributes[1]].dtype not in [np.float64, np.int64]:
123
+ y, y_labels = pd.factorize(df[selected_attributes[1]])
124
+ plt.yticks(ticks=np.arange(len(y_labels)), labels=y_labels)
125
+ else:
126
+ y = df[selected_attributes[1]]
127
+
128
+ plt.scatter(x, y, c=np.linspace(0, 1, len(df)), cmap='viridis')
129
+ plt.colorbar()
130
+ plt.xlabel(selected_attributes[0])
131
+ plt.ylabel(selected_attributes[1])
132
+ plt.title(f'Scatter Plot of {selected_attributes[0]} vs {selected_attributes[1]}')
133
+ return plt.gcf()
134
+
135
+ def multi_plot_line(df, selected_attributes):
136
+ """
137
+ Line plot of multiple attributes.
138
+ """
139
+ if not all(df[selected_attributes].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
140
+ return -1
141
+ if len(selected_attributes) >= 2:
142
+ plt.figure(figsize=(10, 6))
143
+ colors = plt.cm.viridis(np.linspace(0, 1, len(selected_attributes)))
144
+ for i, attribute in enumerate(selected_attributes):
145
+ plt.plot(df.index, df[attribute], marker='', linewidth=2, color=colors[i], label=attribute)
146
+ plt.legend()
147
+ plt.xlabel(selected_attributes[0])
148
+ plt.ylabel(selected_attributes[1])
149
+ plt.title(f'Line Plot of {selected_attributes[0]} vs {selected_attributes[1]}')
150
+ return plt.gcf()
151
+ else:
152
+ return -2
153
+
154
+ def multi_plot_heatmap(df, selected_attributes):
155
+ """
156
+ Correlation heatmap of multiple attributes.
157
+ """
158
+ if not all(df[selected_attributes].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
159
+ return -1
160
+ if len(selected_attributes) >= 1:
161
+ sns.set_theme()
162
+ plt.figure(figsize=(10, 8))
163
+ sns.heatmap(df[selected_attributes].corr(), annot=True, cmap='viridis')
164
+ plt.title('Heatmap of Correlation')
165
+ return plt.gcf()
166
+
167
+ # Overall visualization
168
+ @st.cache_data
169
+ def correlation_matrix(df):
170
+ """
171
+ Correlation heatmap of all attributes using Seaborn.
172
+ """
173
+ plt.figure(figsize=(16, 12))
174
+ sns.set(font_scale=0.9)
175
+ sns.heatmap(df.corr(), annot=True, cmap='viridis', annot_kws={"size": 12})
176
+ return plt.gcf()
177
+
178
+ @st.cache_data
179
+ def correlation_matrix_plotly(df):
180
+ """
181
+ Correlation heatmap of all attributes using Plotly.
182
+ """
183
+ corr_matrix = df.corr()
184
+ labels = corr_matrix.columns
185
+ text = [[f'{corr_matrix.iloc[i, j]:.2f}' for j in range(len(labels))] for i in range(len(labels))]
186
+ fig = go.Figure(data=go.Heatmap(
187
+ z=corr_matrix.values,
188
+ x=labels,
189
+ y=labels,
190
+ colorscale='Viridis',
191
+ colorbar=dict(title='Correlation'),
192
+ text=text,
193
+ hoverinfo='text',
194
+ ))
195
+ fig.update_layout(
196
+ title='Correlation Matrix Between Attributes',
197
+ xaxis=dict(tickmode='linear'),
198
+ yaxis=dict(tickmode='linear'),
199
+ width=800,
200
+ height=700,
201
+ )
202
+ fig.update_layout(font=dict(size=10))
203
+ return fig
204
+
205
+ @st.cache_data
206
+ def list_all(df, max_plots=16):
207
+ """
208
+ Display histograms of all attributes in the DataFrame.
209
+ """
210
+
211
+ # Calculate the number of plots to display (up to 16)
212
+ num_plots = min(len(df.columns), max_plots)
213
+ nrows = int(np.ceil(num_plots / 4))
214
+ ncols = min(num_plots, 4)
215
+ fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 4 * nrows))
216
+ fig.suptitle('Attribute Distributions', fontsize=20)
217
+ plt.style.use('ggplot')
218
+ sns.set(style="darkgrid")
219
+
220
+ # if only one plot, convert to list
221
+ if num_plots == 1: axes = np.array([axes])  # wrap the single Axes so flatten() below works
222
+
223
+ # Flatten the axes array
224
+ axes = axes.flatten()
225
+
226
+ # Display the histograms
227
+ for i, column in enumerate(df.columns[:num_plots]):
228
+ sns.histplot(ax=axes[i], data=df, x=column, color='#1867ac')
229
+
230
+ # Hide additional subplots
231
+ for ax in axes[num_plots:]: ax.axis('off')
232
+
233
+ plt.tight_layout()
234
+ plt.subplots_adjust(top=0.95) # Adjust the top to accommodate the title
235
+ return fig
236
+
237
+ # Model evaluation
238
+ def confusion_metrix(model_name, model, X_test, Y_test):
239
+ """
240
+ Confusion matrix plot for classification models
241
+ """
242
+ Y_pred = model.predict(X_test)
243
+ matrix = confusion_matrix(Y_test, Y_pred)
244
+ plt.figure(figsize=(10, 7)) # temporary
245
+ sns_heatmap = sns.heatmap(matrix, annot=True, cmap='Blues', fmt='g', annot_kws={"size": 20})
246
+ plt.title(f"Confusion Matrix for {model_name}", fontsize=20)
247
+ plt.xlabel('Predicted labels', fontsize=16)
248
+ plt.ylabel('True labels', fontsize=16)
249
+ return sns_heatmap.figure
250
+
251
+ def roc(model_name, fpr, tpr):
252
+ """
253
+ ROC curve for classification models
254
+ """
255
+ fig = plt.figure()
256
+ plt.style.use('ggplot')
257
+ plt.plot([0,1],[0,1],'k--')
258
+ plt.plot(fpr, tpr, label=model_name)
259
+ plt.xlabel('False Positive rate')
260
+ plt.ylabel('True Positive rate')
261
+ plt.title(f'ROC Curve - {model_name}')
262
+ plt.legend(loc='best')
263
+ plt.xticks(rotation=45)
264
+ return fig
265
+
266
+ def plot_clusters(X, labels):
267
+ """
268
+ Scatter plot of clusters for clustering models
269
+ """
270
+ sns.set(style="whitegrid")
271
+ pca = PCA(n_components=2)
272
+ X_pca = pca.fit_transform(X)
273
+ unique_labels = set(labels)
274
+ colors = plt.cm.viridis(np.linspace(0, 1, len(unique_labels)))
275
+
276
+ fig, ax = plt.subplots()
277
+ for color, label in zip(colors, unique_labels):
278
+ idx = labels == label
279
+ ax.scatter(X_pca[idx, 0], X_pca[idx, 1], color=color, label=f'Cluster {label}', s=50)
280
+
281
+ ax.set_title('Cluster Scatter Plot')
282
+ ax.legend()
283
+ return fig
284
+
285
+ def plot_residuals(y_pred, Y_test):
286
+ """
287
+ Residual plot for regression models
288
+ """
289
+ residuals = Y_test - y_pred
290
+ fig, ax = plt.subplots()
291
+ sns.residplot(x=y_pred, y=residuals, lowess=True, ax=ax, scatter_kws={'alpha': 0.7}, line_kws={'color': 'purple', 'lw': 2})
292
+ ax.set_xlabel('Predicted Values')
293
+ ax.set_ylabel('Residuals')
294
+ ax.set_title('Residual Plot')
295
+ return fig
296
+
297
+ def plot_predictions_vs_actual(y_pred, Y_test):
298
+ """
299
+ Scatter plot of predicted vs. actual values for regression models
300
+ """
301
+ fig, ax = plt.subplots()
302
+ ax.scatter(Y_test, y_pred, c='#10a37f', marker='x')
303
+ ax.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'k--', lw=2)
304
+ ax.set_xlabel('Actual')
305
+ ax.set_ylabel('Predicted')
306
+ ax.set_title('Actual vs. Predicted')
307
+ ax.set_facecolor('white')
308
+ ax.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray')
309
+ ax.spines['top'].set_visible(False)
310
+ ax.spines['right'].set_visible(False)
311
+ return fig
312
+
313
+ def plot_qq_plot(y_pred, Y_test):
314
+ """
315
+ Quantile-Quantile plot for regression models
316
+ """
317
+ residuals = Y_test - y_pred
318
+ fig, ax = plt.subplots()
319
+ (osm, osr), (slope, intercept, r) = stats.probplot(residuals, dist="norm", plot=None)
320
+ line = slope * osm + intercept
321
+ ax.plot(osm, line, 'grey', lw=2)
322
+ ax.scatter(osm, osr, alpha=0.8, edgecolors='#e8b517', c='yellow', label='Data Points')
323
+ ax.set_title('Quantile-Quantile Plot')
324
+ ax.set_facecolor('white')
325
+ ax.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray')
326
+ ax.spines['top'].set_visible(False)
327
+ ax.spines['right'].set_visible(False)
328
+ ax.set_xlabel('Theoretical Quantiles')
329
+ ax.set_ylabel('Ordered Values')
330
+ return fig
331
+
332
+ # Advanced Visualization
333
+ @st.cache_data
334
+ def word_cloud_plot(text):
335
+ """
336
+ Generates and displays a word cloud from the given text.
337
+
338
+ The word cloud visualizes the frequency of occurrence of words in the text, with the size of each word indicating its frequency.
339
+
340
+ :param text: The input text from which to generate the word cloud.
341
+ :return: A matplotlib figure object containing the word cloud if successful, -1 otherwise.
342
+ """
343
+ try:
344
+ words = regexp_tokenize(text, pattern=r'\w+')
345
+ text_dist = nltk.FreqDist(words)
346
+ wordcloud = WordCloud(width=1200, height=600, background_color='white').generate_from_frequencies(text_dist)
347
+ fig, ax = plt.subplots(figsize=(10, 7.5))
348
+ ax.imshow(wordcloud, interpolation='bilinear')
349
+ ax.axis('off')
350
+ return fig
351
+ except Exception:
352
+ return -1
353
+
354
+ @st.cache_data
355
+ def world_map(df, country_column, key_attribute):
356
+ """
357
+ Creates a choropleth world map visualization based on the specified DataFrame.
358
+
359
+ The function highlights countries based on a key attribute, providing an interactive map that can be used to analyze geographical data distributions.
360
+
361
+ :param df: DataFrame containing the data to be visualized; it must also include an 'iso_alpha' column with ISO 3166-1 alpha-3 country codes, which px.choropleth uses for its locations.
362
+ :param country_column: Name of the column in df that contains country names.
363
+ :param key_attribute: Name of the column in df that contains the data to visualize on the map.
364
+ :return: A Plotly figure object representing the choropleth map if successful, -1 otherwise.
365
+ """
366
+ try:
367
+ hover_data_columns = [col for col in df.columns if col != country_column]
368
+ fig = px.choropleth(df, locations="iso_alpha",
369
+ color=key_attribute,
370
+ hover_name=country_column,
371
+ hover_data=hover_data_columns,
372
+ color_continuous_scale=px.colors.sequential.Cividis,
373
+ projection="equirectangular",)
374
+ return fig
375
+ except Exception:
376
+ return -1
377
+
378
+ @st.cache_data
379
+ def scatter_3d(df, x, y, z):
380
+ """
381
+ Generates a 3D scatter plot from the given DataFrame.
382
+
383
+ Each point in the plot corresponds to a row in the DataFrame, with its position determined by three specified columns. Points are colored based on the values of the z-axis.
384
+
385
+ :param df: DataFrame containing the data to be visualized.
386
+ :param x: Name of the column in df to use for the x-axis values.
387
+ :param y: Name of the column in df to use for the y-axis values.
388
+ :param z: Name of the column in df to use for the z-axis values and color coding.
389
+ :return: A Plotly figure object containing the 3D scatter plot if successful, -1 otherwise.
390
+ """
391
+ try:
392
+ return px.scatter_3d(df, x=x, y=y, z=z, color=z, color_continuous_scale=px.colors.sequential.Viridis)
393
+ except Exception:
394
+ return -1
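Usage note (illustrative, not part of the commit): the regression diagnostics above only need a vector of predictions and the held-out targets, so they can be exercised outside the Streamlit app. A minimal sketch follows, assuming it is run from the app/ directory (so src.plot resolves) with the pinned dependencies installed; the synthetic data and output file name are placeholders.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from src.plot import plot_predictions_vs_actual, plot_residuals, plot_qq_plot

# Build a small synthetic regression problem
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.5, -2.0, 0.7]) + rng.normal(scale=0.3, size=200)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=0)

y_pred = LinearRegression().fit(X_train, Y_train).predict(X_test)

fig_scatter = plot_predictions_vs_actual(y_pred, Y_test)  # actual vs. predicted scatter
fig_resid = plot_residuals(y_pred, Y_test)                # residuals vs. predictions
fig_qq = plot_qq_plot(y_pred, Y_test)                     # normality check on residuals
fig_scatter.savefig("pred_vs_actual.png")                 # or st.pyplot(fig_scatter) inside the app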
app/src/predictive_model.py ADDED
@@ -0,0 +1,81 @@
1
+ import streamlit as st
2
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
3
+ from sklearn.linear_model import LogisticRegression
4
+ from sklearn.naive_bayes import GaussianNB
5
+ from sklearn.svm import SVC
6
+ from xgboost import XGBClassifier
7
+
8
+ @st.cache_data
9
+ def train_selected_model(X_train, Y_train, model_type, model_params=None):
10
+ """
11
+ Trains a specific classification model based on the provided model type and parameters.
12
+
13
+ Parameters:
14
+ - X_train (array-like): The training input samples.
15
+ - Y_train (array-like): The target labels for classification.
16
+ - model_type (int): Specifies the type of classification model to be trained.
17
+ 1 for Logistic Regression, 2 for Support Vector Machine (SVM), 3 for Naive Bayes,
18
+ 4 for Random Forest, 5 for AdaBoost, 6 for XGBoost, and 7 for Gradient Boosting.
19
+ - model_params (dict, optional): A dictionary of parameters for the model. Defaults to None.
20
+
21
+ Returns:
22
+ - model: The trained model object based on the specified type.
23
+ """
24
+ if model_type == 1:
25
+ return LogisticRegression_train(X_train, Y_train, model_params)
26
+ elif model_type == 2:
27
+ return SVM_train(X_train, Y_train, model_params)
28
+ elif model_type == 3:
29
+ return NaiveBayes_train(X_train, Y_train, model_params)
30
+ elif model_type == 4:
31
+ return RandomForest_train(X_train, Y_train, model_params=model_params)
32
+ elif model_type == 5:
33
+ return AdaBoost_train(X_train, Y_train, model_params)
34
+ elif model_type == 6:
35
+ return XGBoost_train(X_train, Y_train, model_params)
36
+ elif model_type == 7:
37
+ return GradientBoosting_train(X_train, Y_train, model_params)
38
+
39
+ def LogisticRegression_train(X_train, Y_train, model_params=None):
40
+ if model_params is None: model_params = {}
41
+ logreg = LogisticRegression(**model_params)
42
+ logreg.fit(X_train, Y_train)
43
+ return logreg
44
+
45
+ def SVM_train(X_train, Y_train, model_params=None):
46
+ if model_params is None: model_params = {}
47
+ svm = SVC(**model_params)
48
+ svm.fit(X_train, Y_train)
49
+ return svm
50
+
51
+ def NaiveBayes_train(X_train, Y_train, model_params=None):
52
+ if model_params is None: model_params = {}
53
+ nb = GaussianNB(**model_params)
54
+ nb.fit(X_train, Y_train)
55
+ return nb
56
+
57
+ def RandomForest_train(X_train, Y_train, n_estimators=100, random_state=None, model_params=None):
58
+ if model_params is None: model_params = {}
59
+ rf_params = {'n_estimators': n_estimators, 'random_state': random_state}
60
+ rf_params.update(model_params)
61
+ rf = RandomForestClassifier(**rf_params)
62
+ rf.fit(X_train, Y_train)
63
+ return rf
64
+
65
+ def AdaBoost_train(X_train, Y_train, model_params=None):
66
+ if model_params is None: model_params = {}
67
+ ab = AdaBoostClassifier(**model_params)
68
+ ab.fit(X_train, Y_train)
69
+ return ab
70
+
71
+ def XGBoost_train(X_train, Y_train, model_params=None):
72
+ if model_params is None: model_params = {}
73
+ xgb = XGBClassifier(**model_params)
74
+ xgb.fit(X_train, Y_train)
75
+ return xgb
76
+
77
+ def GradientBoosting_train(X_train, Y_train, model_params=None):
78
+ if model_params is None: model_params = {}
79
+ gb = GradientBoostingClassifier(**model_params)
80
+ gb.fit(X_train, Y_train)
81
+ return gb
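Usage note (illustrative, not part of the commit): train_selected_model maps the integer model_type to one of the scikit-learn/XGBoost trainers above and forwards model_params to the estimator's constructor. A minimal sketch on a toy dataset, assuming the interpreter is started from the app/ directory; caching behaviour of @st.cache_data outside a running Streamlit session may differ.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from src.predictive_model import train_selected_model

X, y = make_classification(n_samples=300, n_features=8, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=42)

# model_type=4 selects the Random Forest trainer; the params go straight to RandomForestClassifier
model = train_selected_model(X_train, Y_train, model_type=4,
                             model_params={"n_estimators": 200, "random_state": 42})
print("held-out accuracy:", model.score(X_test, Y_test))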
app/src/preprocess.py ADDED
@@ -0,0 +1,122 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from scipy import stats
4
+ from sklearn.preprocessing import StandardScaler, PowerTransformer
5
+
6
+ def convert_to_numeric(df, convert_int_cols_list, one_hot_cols_list, drop_cols):
7
+ """
8
+ Convert specified columns in the DataFrame to numeric formats and drop specified columns.
9
+ Integer conversion and one-hot encoding are applied based on the provided lists of columns.
10
+ Returns a modified DataFrame and a dictionary of mappings used for conversions.
11
+
12
+ :param df: Pandas DataFrame to be processed.
13
+ :param convert_int_cols_list: List of column names to be converted to integer type.
14
+ :param one_hot_cols_list: List of column names to be converted to one-hot encoding.
15
+ :param drop_cols: List of column names to be dropped from the DataFrame.
16
+ :return: A tuple with two elements:
17
+ 1. DataFrame with specified columns converted and specified columns dropped.
18
+ 2. Dictionary of mappings for each conversion type ('integer_mappings' and 'one_hot_mappings').
19
+ """
20
+ df, int_mapping = convert_to_integer(df, convert_int_cols_list)
21
+ df, one_hot_mapping = convert_to_one_hot(df, one_hot_cols_list)
22
+ df = df.drop(columns=drop_cols, errors='ignore')
23
+ mappings = {'integer_mappings': int_mapping, 'one_hot_mappings': one_hot_mapping}
24
+ return df, mappings
25
+
26
+ def convert_to_integer(df, columns_to_convert=[]):
27
+ """
28
+ Convert specified non-numeric columns in the DataFrame to integer type,
29
+ and return a dictionary of mappings from original values to integers.
30
+
31
+ :param df: Pandas DataFrame to be processed.
32
+ :param columns_to_convert: List of column names to be converted to integer type.
33
+ :return: A tuple with two elements:
34
+ 1. DataFrame with specified columns converted to integer type.
35
+ 2. Dictionary of mappings for each converted column.
36
+ """
37
+ mappings = {}
38
+ for column in columns_to_convert:
39
+
40
+ if df[column].dtype == 'object':
41
+ # Create a mapping from unique values to integers
42
+ unique_values = df[column].unique()
43
+ int_to_value_map = {i: value for i, value in enumerate(unique_values)}
44
+ mappings[column] = int_to_value_map
45
+
46
+ # Apply the reversed mapping to the DataFrame
47
+ value_to_int_map = {v: k for k, v in int_to_value_map.items()}
48
+ df[column] = df[column].map(value_to_int_map)
49
+
50
+ return df, mappings
51
+
52
+ def convert_to_one_hot(df, columns_to_convert=[]):
53
+ """
54
+ Convert specified non-numeric columns in the DataFrame to one-hot encoding,
55
+ and return a modified DataFrame and a dictionary of mappings used for one-hot encoding.
56
+
57
+ :param df: Pandas DataFrame to be processed.
58
+ :param columns_to_convert: List of column names to be converted to one-hot encoding.
59
+ :return: A tuple with two elements:
60
+ 1. DataFrame with specified columns converted to one-hot encoding.
61
+ 2. Dictionary of mappings for each converted column.
62
+ """
63
+ mappings = {}
64
+ df_modified = df.copy()
65
+
66
+ for column in columns_to_convert:
67
+ # Check if the column is categorical
68
+ if df[column].dtype == 'object' or df[column].dtype == 'category':
69
+ # Perform one-hot encoding
70
+ one_hot = pd.get_dummies(df[column], prefix=column)
71
+ # Add the new columns to the modified DataFrame
72
+ df_modified = pd.concat([df_modified, one_hot], axis=1)
73
+ # Drop the original column
74
+ df_modified = df_modified.drop(column, axis=1)
75
+
76
+ # Store the mapping
77
+ mappings[column] = {i: column + '_' + str(i) for i in df[column].unique()}
78
+
79
+ return df_modified, mappings
80
+
81
+ def remove_rows_with_empty_target(df, Y_name):
82
+ """
83
+ Remove rows from the DataFrame where the target column has empty values.
84
+
85
+ :param df: Pandas DataFrame to be processed.
86
+ :param Y_name: Name of the target column to check for empty values.
87
+ :return: DataFrame with rows removed where target column value is empty.
88
+ """
89
+ # Remove rows where the target column is empty (NaN)
90
+ cleaned_df = df.dropna(subset=[Y_name])
91
+ return cleaned_df
92
+
93
+ def remove_duplicates(df):
94
+ """
95
+ Remove duplicate rows from the DataFrame.
96
+ """
97
+ return df.drop_duplicates()
98
+
99
+ def transform_data_for_clustering(df):
100
+ """
101
+ Transform numeric columns in the DataFrame for clustering.
102
+ Applies a PowerTransformer to columns with skewness over a threshold and standardizes them.
103
+ This can help in making the clustering algorithm more effective by normalizing the scale of numerical features.
104
+
105
+ :param df: Pandas DataFrame to be transformed.
106
+ :return: DataFrame with transformed numeric columns suitable for clustering.
107
+ """
108
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
109
+ transformed_df = df.copy()
110
+ pt = PowerTransformer(method='box-cox', standardize=False)
111
+
112
+ for col in numeric_cols:
113
+ if (transformed_df[col] > 0).all():
114
+ skewness = stats.skew(transformed_df[col])
115
+ if abs(skewness) > 0.5:
116
+ transformed_data = pt.fit_transform(transformed_df[[col]])
117
+ transformed_df[col] = transformed_data
118
+
119
+ scaler = StandardScaler()
120
+ transformed_df[numeric_cols] = scaler.fit_transform(transformed_df[numeric_cols])
121
+
122
+ return transformed_df
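Usage note (illustrative, not part of the commit): a small sketch of the encoding helpers above, assuming src.preprocess is importable from the app/ directory. The toy DataFrame and column choices are placeholders.

import pandas as pd
from src.preprocess import remove_duplicates, convert_to_numeric

df = pd.DataFrame({
    "city":   ["Paris", "Rome", "Paris", "Rome"],
    "tier":   ["low", "high", "high", "low"],
    "id":     [1, 2, 3, 4],
    "target": [0, 1, 1, 0],
})

df = remove_duplicates(df)
encoded, mappings = convert_to_numeric(
    df,
    convert_int_cols_list=["tier"],  # label-encode to integers
    one_hot_cols_list=["city"],      # expand into city_* indicator columns
    drop_cols=["id"],                # drop entirely
)
print(encoded.dtypes)
print(mappings["integer_mappings"]["tier"])  # e.g. {0: 'low', 1: 'high'}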
app/src/regression_model.py ADDED
@@ -0,0 +1,68 @@
1
+ import streamlit as st
2
+ from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
3
+ from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
4
+
5
+ @st.cache_data
6
+ def train_selected_regression_model(X_train, Y_train, model_type, model_params=None):
7
+ """
8
+ Trains a regression model based on the specified model type and parameters.
9
+
10
+ Parameters:
11
+ - X_train (array-like): The training input samples.
12
+ - Y_train (array-like): The target values (real numbers).
13
+ - model_type (int): An integer representing the type of regression model to train.
14
+ 1 for Linear Regression, 2 for Ridge Regression, 3 for Lasso Regression,
15
+ 4 for Random Forest Regressor, 5 for Gradient Boosting Regressor, and 6 for ElasticNet Regression.
16
+ - model_params (dict, optional): A dictionary of model-specific parameters. Default is None.
17
+
18
+ Returns:
19
+ - The trained regression model object based on the specified model type.
20
+ """
21
+ if model_type == 1:
22
+ return LinearRegression_train(X_train, Y_train, model_params)
23
+ elif model_type == 2:
24
+ return RidgeRegression_train(X_train, Y_train, model_params)
25
+ elif model_type == 3:
26
+ return LassoRegression_train(X_train, Y_train, model_params)
27
+ elif model_type == 4:
28
+ return RandomForestRegressor_train(X_train, Y_train, model_params)
29
+ elif model_type == 5:
30
+ return GradientBoostingRegressor_train(X_train, Y_train, model_params)
31
+ elif model_type == 6:
32
+ return ElasticNetRegressor_train(X_train, Y_train, model_params)
33
+
34
+ def LinearRegression_train(X_train, Y_train, model_params=None):
35
+ if model_params is None: model_params = {}
36
+ lr = LinearRegression(**model_params)
37
+ lr.fit(X_train, Y_train)
38
+ return lr
39
+
40
+ def RidgeRegression_train(X_train, Y_train, model_params=None):
41
+ if model_params is None: model_params = {}
42
+ ridge = Ridge(**model_params)
43
+ ridge.fit(X_train, Y_train)
44
+ return ridge
45
+
46
+ def LassoRegression_train(X_train, Y_train, model_params=None):
47
+ if model_params is None: model_params = {}
48
+ lasso = Lasso(**model_params)
49
+ lasso.fit(X_train, Y_train)
50
+ return lasso
51
+
52
+ def RandomForestRegressor_train(X_train, Y_train, model_params=None):
53
+ if model_params is None: model_params = {}
54
+ rf = RandomForestRegressor(**model_params)
55
+ rf.fit(X_train, Y_train)
56
+ return rf
57
+
58
+ def GradientBoostingRegressor_train(X_train, Y_train, model_params=None):
59
+ if model_params is None: model_params = {}
60
+ gbr = GradientBoostingRegressor(**model_params)
61
+ gbr.fit(X_train, Y_train)
62
+ return gbr
63
+
64
+ def ElasticNetRegressor_train(X_train, Y_train, model_params=None):
65
+ if model_params is None: model_params = {}
66
+ en = ElasticNet(**model_params)
67
+ en.fit(X_train, Y_train)
68
+ return en
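Usage note (illustrative, not part of the commit): train_selected_regression_model mirrors the classification dispatcher, mapping model_type to a scikit-learn regressor and passing model_params through. A minimal sketch, under the same working-directory assumption as above.

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from src.regression_model import train_selected_regression_model

X, y = make_regression(n_samples=300, n_features=5, noise=10.0, random_state=0)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=0)

# model_type=2 selects Ridge Regression; alpha is forwarded to sklearn's Ridge
ridge = train_selected_regression_model(X_train, Y_train, model_type=2,
                                        model_params={"alpha": 0.5})
print("held-out R^2:", ridge.score(X_test, Y_test))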
app/src/util.py ADDED
@@ -0,0 +1,242 @@
1
+ import os
2
+ import io
3
+ import pandas as pd
4
+
5
+ def read_file(file_path):
6
+ """
7
+ Read a file from a given path.
8
+ """
9
+ # Check the size of the file
10
+ if os.path.getsize(file_path) > 200 * 1024 * 1024: # 200MB in bytes
11
+ raise ValueError("Too large file")
12
+
13
+ # Extract the file extension
14
+ file_extension = file_path.split('.')[-1]
15
+
16
+ if file_extension == 'csv':
17
+ # Read CSV file
18
+ return pd.read_csv(file_path)
19
+ elif file_extension == 'json':
20
+ # Read JSON file
21
+ return pd.read_json(file_path)
22
+ elif file_extension in ['xls', 'xlsx']:
23
+ # Read Excel file
24
+ return pd.read_excel(file_path, engine='openpyxl')
25
+ else:
26
+ raise ValueError("Unsupported file format: " + file_extension)
27
+
28
+ def read_file_from_streamlit(uploaded_file):
29
+ """
30
+ Read a file from a given streamlit file.
31
+ """
32
+ # Check the size of the file
33
+ if uploaded_file.size > 200 * 1024 * 1024: # 200MB in bytes
34
+ raise ValueError("Too large file")
35
+
36
+ # Extract the file extension
37
+ file_extension = uploaded_file.name.split('.')[-1]
38
+
39
+ if file_extension == 'csv':
40
+ # Read CSV file
41
+ return pd.read_csv(uploaded_file)
42
+ elif file_extension == 'json':
43
+ # Read JSON file
44
+ return pd.read_json(uploaded_file)
45
+ elif file_extension in ['xls', 'xlsx']:
46
+ # Read Excel file
47
+ # Use io.BytesIO to handle the binary stream
48
+ return pd.read_excel(io.BytesIO(uploaded_file.read()), engine='openpyxl')
49
+ else:
50
+ raise ValueError("Unsupported file format: " + file_extension)
51
+
52
+ def select_Y(df, Y_name):
53
+ """
54
+ Select the target variable from the DataFrame.
55
+ """
56
+ if Y_name in df.columns:
57
+ X = df.drop(Y_name, axis=1)
58
+ Y = df[Y_name]
59
+ return X, Y
60
+ else:
61
+ return -1
62
+
63
+ def check_all_columns_numeric(df):
64
+ """
65
+ Check if all columns in a DataFrame are numeric. Return True if so, False otherwise.
66
+ """
67
+ return df.select_dtypes(include='number').shape[1] == df.shape[1]
68
+
69
+ def non_numeric_columns_and_head(df, num_rows=20):
70
+ """
71
+ Identify non-numeric columns in a DataFrame and return their names and head.
72
+
73
+ :param df: Pandas DataFrame to be examined.
74
+ :param num_rows: Number of rows to include in the head (default is 20).
75
+ :return: A tuple with two elements:
76
+ 1. List of column names that are not numeric (integer or float).
77
+ 2. CSV-formatted string containing the head of the non-numeric columns.
78
+ """
79
+ # Identify columns that are not of numeric data type
80
+ non_numeric_cols = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]
81
+
82
+ # Get the head of the non-numeric columns
83
+ non_numeric_head = df[non_numeric_cols].head(num_rows).to_csv()
84
+
85
+ return non_numeric_cols, non_numeric_head
86
+
87
+ def contain_null_attributes_info(df):
88
+ """
89
+ Identifies columns with missing values, summarizes their statistics, and reports their data types.
90
+
91
+ This function checks for attributes within a DataFrame that contain null values,
92
+ generates descriptive statistics for these attributes, and compiles information about their data types.
93
+
94
+ :param df: A pandas DataFrame to be analyzed.
95
+ :return: A tuple containing:
96
+ - A list of columns that contain null values.
97
+ - A string representation of data types for these columns.
98
+ - A CSV-formatted string containing descriptive statistics (count, mean, median, and standard deviation) for these columns.
99
+ Returns an empty list, -1, and -1 if no columns with null values are found.
100
+ """
101
+ attributes = df.columns[df.isnull().any()].tolist()
102
+ if not attributes: return [], -1, -1
103
+
104
+ description_info = df[attributes].describe(percentiles=[.5])
105
+ description_info = description_info.loc[['count', 'mean', '50%', 'std']].round(2).to_csv()
106
+
107
+ dtypes_df = df[attributes].dtypes
108
+ types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()])
109
+
110
+ return attributes, types_info, description_info
111
+
112
+ def attribute_info(df):
113
+ """
114
+ Obtain the attributes, types, and head information of the DataFrame.
115
+ """
116
+ attributes = df.columns.tolist()
117
+ dtypes_df = df.dtypes
118
+ types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()])
119
+ head_info = df.head(10).to_csv()
120
+
121
+ return attributes, types_info, head_info
122
+
123
+ def get_data_overview(df):
124
+ """
125
+ Obtain the shape, head, nunique, and description information of the DataFrame.
126
+ """
127
+ shape_info = str(df.shape)
128
+ head_info = df.head().to_csv()
129
+ nunique_info = df.nunique().to_csv()
130
+ description_info = df.describe(include='all').to_csv()
131
+ return shape_info, head_info, nunique_info, description_info
132
+
133
+ def get_balance_info(df, Y_name):
134
+ """
135
+ Obtain the shape, description, and balance information of the DataFrame.
136
+ """
137
+ shape_info = df.shape
138
+ description_info = df.describe().to_csv()
139
+ balance_info = df[Y_name].value_counts().to_dict()
140
+ return shape_info, description_info, balance_info
141
+
142
+ def separate_decode_list(decided_dict, Y_name):
143
+ """
144
+ Process the LLM response and return the lists of columns to be converted to integer, one-hot encoding, and drop
145
+ """
146
+ convert_int_cols = [key for key, value in decided_dict.items() if value == 1]
147
+ one_hot_cols = [key for key, value in decided_dict.items() if value == 2]
148
+ drop_cols = [key for key, value in decided_dict.items() if value == 3]
149
+ if Y_name and Y_name in one_hot_cols:
150
+ one_hot_cols.remove(Y_name)
151
+ convert_int_cols.append(Y_name)
152
+ if Y_name and Y_name in drop_cols:
153
+ drop_cols.remove(Y_name)
154
+ convert_int_cols.append(Y_name)
155
+ return convert_int_cols, one_hot_cols, drop_cols
156
+
157
+ def separate_fill_null_list(fill_null_dict):
158
+ """
159
+ Process the LLM response and return the lists of columns to be filled with mean, median, mode, new category, interpolation
160
+ """
161
+ mean_list = [key for key, value in fill_null_dict.items() if value == 1]
162
+ median_list = [key for key, value in fill_null_dict.items() if value == 2]
163
+ mode_list = [key for key, value in fill_null_dict.items() if value == 3]
164
+ new_category_list = [key for key, value in fill_null_dict.items() if value == 4]
165
+ interpolation_list = [key for key, value in fill_null_dict.items() if value == 5]
166
+ return mean_list, median_list, mode_list, new_category_list, interpolation_list
167
+
168
+ def get_selected_models(model_dict):
169
+ """
170
+ Convert the dictionary of models to a list.
171
+ """
172
+ return list(model_dict.values())
173
+
174
+ def get_model_name(model_no):
175
+ """
176
+ Returns the name of the classification model based on the model number.
177
+ """
178
+ if model_no == 1:
179
+ return "Logistic Regression"
180
+ elif model_no == 2:
181
+ return "SVM"
182
+ elif model_no == 3:
183
+ return "Naive Bayes"
184
+ elif model_no == 4:
185
+ return "Random Forest"
186
+ elif model_no == 5:
187
+ return "AdaBoost"
188
+ elif model_no == 6:
189
+ return "XGBoost"
190
+ elif model_no == 7:
191
+ return "Gradient Boosting"
192
+
193
+ def get_cluster_method_name(method):
194
+ """
195
+ Returns the name of the clustering method based on the method number.
196
+ """
197
+ if method == 1:
198
+ return "K-Means"
199
+ elif method == 2:
200
+ return "DBSCAN"
201
+ elif method == 3:
202
+ return "Gaussian Mixture"
203
+ elif method == 4:
204
+ return "Agglomerative Clustering"
205
+ elif method == 5:
206
+ return "Spectral Clustering"
207
+
208
+ def get_balance_method_name(method):
209
+ """
210
+ Returns the name of the balance method based on the method number.
211
+ """
212
+ if method == 1:
213
+ return "ROS"
214
+ elif method == 2:
215
+ return "SMOTE"
216
+ elif method == 3:
217
+ return "ADASYN"
218
+ elif method == 4:
219
+ return "None"
220
+
221
+ def get_regression_method_name(method):
222
+ """
223
+ Returns the name of the regression method based on the method number.
224
+ """
225
+ if method == 1:
226
+ return "Linear Regression"
227
+ elif method == 2:
228
+ return "Ridge Regression"
229
+ elif method == 3:
230
+ return "Lasso Regression"
231
+ elif method == 4:
232
+ return "Random Forest"
233
+ elif method == 5:
234
+ return "Gradient Boosting"
235
+ elif method == 6:
236
+ return "Elastic Net"
237
+
238
+ def count_unique(df, Y):
239
+ """
240
+ Counts the number of unique values in a specified column of a DataFrame.
241
+ """
242
+ return df[Y].nunique()
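Usage note (illustrative, not part of the commit): separate_decode_list turns an LLM-style {column: code} decision dict into the three lists consumed by convert_to_numeric, always keeping the target as an integer-encoded column. A small sketch, assuming src.util is importable from app/.

import pandas as pd
from src.util import separate_decode_list, select_Y, check_all_columns_numeric

decided_dict = {"city": 2, "tier": 1, "id": 3, "target": 2}  # 1=integer, 2=one-hot, 3=drop
convert_int_cols, one_hot_cols, drop_cols = separate_decode_list(decided_dict, "target")
print(convert_int_cols, one_hot_cols, drop_cols)  # ['tier', 'target'] ['city'] ['id']

df = pd.DataFrame({"f1": [1, 2, 3], "target": [0, 1, 0]})
X, Y = select_Y(df, "target")
print(check_all_columns_numeric(df))  # True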
app/util.py ADDED
@@ -0,0 +1,37 @@
1
+ import streamlit as st
2
+ import requests
3
+ import yaml
4
+ import time
5
+ import random
6
+ import os
7
+
8
+ config_path = os.path.join(os.path.dirname(__file__), 'config', 'config.yaml')
9
+ with open(config_path, 'r') as file:
10
+ config_data = yaml.safe_load(file)
11
+
12
+ def load_lottie():
13
+ r1, r2 = requests.get(config_data['lottie_url1']), requests.get(config_data['lottie_url2'])
14
+ if r1.status_code != 200 or r2.status_code != 200:
15
+ return None
16
+ return r1.json(), r2.json()
17
+
18
+ # write a stream of words
19
+ def stream_data(line):
20
+ for word in line.split():
21
+ yield word + " "
22
+ time.sleep(random.uniform(0.02, 0.05))
23
+
24
+ # Store the welcome message and introduction
25
+ def welcome_message():
26
+ return config_data['welcome_template']
27
+
28
+ def introduction_message():
29
+ return config_data['introduction_template1'], config_data['introduction_template2']
30
+
31
+ # Show developer info at the bottom
32
+ def developer_info():
33
+ time.sleep(2)
34
+ st.write(stream_data(":grey[Streamline Analyst is developed by *Zhe Lin*. You can reach out to me via] :blue[wilson.linzhe@gmail.com] :grey[or] :blue[[GitHub](https://github.com/Wilson-ZheLin)]"))
35
+
36
+ def developer_info_static():
37
+ st.write(":grey[Streamline Analyst is developed by *Zhe Lin*. You can reach out to me via] :blue[wilson.linzhe@gmail.com] :grey[or] :blue[[GitHub](https://github.com/Wilson-ZheLin)]")
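Note (assumption; the config file itself is not shown in this excerpt): these helpers read app/config/config.yaml, which must at least provide the keys referenced above. Expressed as the dict that yaml.safe_load would return, the expected shape is roughly:

config_data = {
    "lottie_url1": "https://...",      # Lottie animation JSON URLs fetched by load_lottie
    "lottie_url2": "https://...",
    "welcome_template": "...",         # text returned by welcome_message
    "introduction_template1": "...",   # texts returned by introduction_message
    "introduction_template2": "...",
}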
app/visualization.py ADDED
@@ -0,0 +1,221 @@
1
+ import streamlit as st
2
+ from util import developer_info_static
3
+ from src.plot import list_all, distribution_histogram, distribution_boxplot, count_Y, box_plot, violin_plot, strip_plot, density_plot ,multi_plot_heatmap, multi_plot_scatter, multi_plot_line, word_cloud_plot, world_map, scatter_3d
4
+
5
+ def display_word_cloud(text):
6
+ _, word_cloud_col, _ = st.columns([1, 3, 1])
7
+ with word_cloud_col:
8
+ word_fig = word_cloud_plot(text)
9
+ if word_fig == -1:
10
+ st.error('Data not supported')
11
+ else:
12
+ st.pyplot(word_cloud_plot(text))
13
+
14
+ def data_visualization(DF):
15
+ st.divider()
16
+ st.subheader('Data Visualization')
17
+ attributes = DF.columns.tolist()
18
+
19
+ # Three tabs for three kinds of visualization
20
+ single_tab, multiple_tab, advanced_tab = st.tabs(['Single Attribute Visualization', 'Multiple Attributes Visualization', 'Advanced Visualization'])
21
+
22
+ # Single attribute visualization
23
+ with single_tab:
24
+ _, col_mid, _ = st.columns([1, 5, 1])
25
+ with col_mid:
26
+ plot_area = st.empty()
27
+
28
+ col1, col2 = st.columns(2)
29
+ with col1:
30
+ att = st.selectbox(
31
+ label = 'Select an attribute to visualize:',
32
+ options = attributes,
33
+ index = len(attributes)-1
34
+ )
35
+ st.write(f'Attribute selected: :green[{att}]')
36
+
37
+ with col2:
38
+ plot_types = ['Donut chart', 'Violin plot', 'Distribution histogram', 'Boxplot', 'Density plot', 'Strip plot', 'Distribution boxplot']
39
+ plot_type = st.selectbox(
40
+ key = 'plot_type1',
41
+ label = 'Select a plot type:',
42
+ options = plot_types,
43
+ index = 0
44
+ )
45
+ st.write(f'Plot type selected: :green[{plot_type}]')
46
+
47
+ if plot_type == 'Distribution histogram':
48
+ fig = distribution_histogram(DF, att)
49
+ plot_area.pyplot(fig)
50
+ elif plot_type == 'Distribution boxplot':
51
+ fig = distribution_boxplot(DF, att)
52
+ if fig == -1:
53
+ plot_area.error('The attribute is not numeric')
54
+ else:
55
+ plot_area.pyplot(fig)
56
+ elif plot_type == 'Donut chart':
57
+ fig = count_Y(DF, att)
58
+ plot_area.plotly_chart(fig)
59
+ elif plot_type == 'Boxplot':
60
+ fig = box_plot(DF, [att])
61
+ plot_area.plotly_chart(fig)
62
+ elif plot_type == 'Violin plot':
63
+ fig = violin_plot(DF, [att])
64
+ plot_area.plotly_chart(fig)
65
+ elif plot_type == 'Strip plot':
66
+ fig = strip_plot(DF, [att])
67
+ plot_area.plotly_chart(fig)
68
+ elif plot_type == 'Density plot':
69
+ fig = density_plot(DF, att)
70
+ plot_area.plotly_chart(fig)
71
+
72
+ # Multiple attribute visualization
73
+ with multiple_tab:
74
+ col1, col2 = st.columns([6, 4])
75
+ with col1:
76
+ options = st.multiselect(
77
+ label = 'Select multiple attributes to visualize:',
78
+ options = attributes,
79
+ default = []
80
+ )
81
+ with col2:
82
+ plot_types = ["Violin plot", "Boxplot", "Heatmap", "Strip plot", "Line plot", "Scatter plot"]
83
+ plot_type = st.selectbox(
84
+ key = 'plot_type2',
85
+ label = 'Select a plot type:',
86
+ options = plot_types,
87
+ index = 0
88
+ )
89
+ _, col_mid, _ = st.columns([1, 5, 1])
90
+ with col_mid:
91
+ plot_area = st.empty()
92
+
93
+ if options:
94
+ if plot_type == 'Scatter plot':
95
+ fig = multi_plot_scatter(DF, options)
96
+ if fig == -1:
97
+ plot_area.error('Scatter plot requires two attributes')
98
+ else:
99
+ plot_area.pyplot(fig)
100
+ elif plot_type == 'Heatmap':
101
+ fig = multi_plot_heatmap(DF, options)
102
+ if fig == -1:
103
+ plot_area.error('The attributes are not numeric')
104
+ else:
105
+ plot_area.pyplot(fig)
106
+ elif plot_type == 'Boxplot':
107
+ fig = box_plot(DF, options)
108
+ if fig == -1:
109
+ plot_area.error('The attributes are not numeric')
110
+ else:
111
+ plot_area.plotly_chart(fig)
112
+ elif plot_type == 'Violin plot':
113
+ fig = violin_plot(DF, options)
114
+ if fig == -1:
115
+ plot_area.error('The attributes are not numeric')
116
+ else:
117
+ plot_area.plotly_chart(fig)
118
+ elif plot_type == 'Strip plot':
119
+ fig = strip_plot(DF, options)
120
+ if fig == -1:
121
+ plot_area.error('The attributes are not numeric')
122
+ else:
123
+ plot_area.plotly_chart(fig)
124
+ elif plot_type == 'Line plot':
125
+ fig = multi_plot_line(DF, options)
126
+ if fig == -1:
127
+ plot_area.error('The attributes are not numeric')
128
+ elif fig == -2:
129
+ plot_area.error('Line plot requires two attributes')
130
+ else:
131
+ plot_area.pyplot(fig)
132
+
133
+ # Advanced visualization
134
+ with advanced_tab:
135
+ st.subheader("3D Scatter Plot")
136
+ column_1, column_2, column_3 = st.columns(3)
137
+ with column_1:
138
+ x = st.selectbox(
139
+ key = 'x',
140
+ label = 'Select the x attribute:',
141
+ options = attributes,
142
+ index = 0
143
+ )
144
+ with column_2:
145
+ y = st.selectbox(
146
+ key = 'y',
147
+ label = 'Select the y attribute:',
148
+ options = attributes,
149
+ index = 1 if len(attributes) > 1 else 0
150
+ )
151
+ with column_3:
152
+ z = st.selectbox(
153
+ key = 'z',
154
+ label = 'Select the z attribute:',
155
+ options = attributes,
156
+ index = 2 if len(attributes) > 2 else 0
157
+ )
158
+ if st.button('Generate 3D Plot'):
159
+ _, fig_3d_col, _ = st.columns([1, 3, 1])
160
+ with fig_3d_col:
161
+ fig_3d_1 = scatter_3d(DF, x, y, z)
162
+ if fig_3d_1 == -1:
163
+ st.error('Data not supported')
164
+ else:
165
+ st.plotly_chart(fig_3d_1)
166
+ st.divider()
167
+
168
+ st.subheader('Word Cloud')
169
+ upload_txt_checkbox = st.checkbox('Upload a new text file instead')
170
+ if upload_txt_checkbox:
171
+ uploaded_txt = st.file_uploader("Choose a text file", accept_multiple_files=False, type="txt")
172
+ if uploaded_txt:
173
+ text = uploaded_txt.read().decode("utf-8")
174
+ display_word_cloud(text)
175
+ else:
176
+ text_attr = st.selectbox(
177
+ label = 'Select the text attribute:',
178
+ options = attributes,
179
+ index = 0)
180
+ if st.button('Generate Word Cloud'):
181
+ text = DF[text_attr].astype(str).str.cat(sep=' ')
182
+ display_word_cloud(text)
183
+ st.divider()
184
+
185
+ st.subheader('World Heat Map')
186
+ col_1, col_2 = st.columns(2)
187
+ with col_1:
188
+ country_col = st.selectbox(
189
+ key = 'country_col',
190
+ label = 'Select the country attribute:',
191
+ options = attributes,
192
+ index = 0
193
+ )
194
+ with col_2:
195
+ heat_attribute = st.selectbox(
196
+ key = 'heat_attribute',
197
+ label = 'Select the attribute to display in heat map:',
198
+ options = attributes,
199
+ index = len(attributes) - 1
200
+ )
201
+ if st.button("Show Heatmap"):
202
+ _, map_col, _ = st.columns([1, 3, 1])
203
+ with map_col:
204
+ world_fig = world_map(DF, country_col, heat_attribute)
205
+ if world_fig == -1:
206
+ st.error('Data not supported')
207
+ else:
208
+ st.plotly_chart(world_fig)
209
+ st.divider()
210
+
211
+ # Data Overview
212
+ st.subheader('Data Overview')
213
+ if 'data_origin' not in st.session_state:
214
+ st.session_state.data_origin = DF
215
+ st.dataframe(st.session_state.data_origin.describe(), width=1200)
216
+ if 'overall_plot' not in st.session_state:
217
+ st.session_state.overall_plot = list_all(st.session_state.data_origin)
218
+ st.pyplot(st.session_state.overall_plot)
219
+
220
+ st.divider()
221
+ developer_info_static()
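Usage note (illustrative, not part of the commit): data_visualization takes a plain DataFrame and renders all three tabs, so it can also be driven by a tiny standalone page. A minimal sketch, assuming the file is saved inside app/ and launched with streamlit run.

import pandas as pd
import streamlit as st
from visualization import data_visualization

uploaded = st.file_uploader("Choose a CSV file", type="csv")
if uploaded:
    data_visualization(pd.read_csv(uploaded))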
requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ imbalanced_learn==0.12.0
2
+ joblib==1.2.0
3
+ openai==1.3.4
4
+ langchain==0.1.6
5
+ matplotlib==3.7.2
6
+ nltk==3.8.1
7
+ numpy==1.24.3
8
+ pandas==2.2.0
9
+ plotly==5.18.0
10
+ PyYAML==6.0.1
11
+ Requests==2.31.0
12
+ scikit_learn==1.4.0
13
+ scipy==1.12.0
14
+ seaborn==0.13.2
15
+ streamlit==1.31.0
16
+ streamlit_lottie==0.0.5
17
+ wordcloud==1.9.3
18
+ xgboost==2.0.3
19
+ statsmodels==0.14.0
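The pinned dependencies above can be installed into a fresh virtual environment with pip install -r requirements.txt before launching the app.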