Wilson-ZheLin committed on
Commit
9183c57
1 Parent(s): 7c98744

Initial commit

.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ *.pyc
2
+ *.swp
3
+ *.swo
4
+ *.DS_Store
5
+ *.ipynb_checkpoints
6
+ data/
7
+ test.py
app/__init__.py ADDED
File without changes
app/app.py ADDED
@@ -0,0 +1,109 @@
1
+ import time
2
+ import streamlit as st
3
+ from streamlit_lottie import st_lottie
4
+ from util import load_lottie, stream_data, welcome_message, introduction_message
5
+ from prediction_model import prediction_model_pipeline
6
+ from cluster_model import cluster_model_pipeline
7
+ from regression_model import regression_model_pipeline
8
+ from visualization import data_visualization
9
+ from src.util import read_file_from_streamlit
10
+
11
+ st.set_page_config(page_title="Streamline Analyst", page_icon=":rocket:", layout="wide")
12
+
13
+ # TITLE SECTION
14
+ with st.container():
15
+ st.subheader("Hello there 👋")
16
+ st.title("Welcome to Streamline Analyst!")
17
+ if 'initialized' not in st.session_state:
18
+ st.session_state.initialized = True
19
+ if st.session_state.initialized:
20
+ st.session_state.welcome_message = welcome_message()
21
+ st.write(stream_data(st.session_state.welcome_message))
22
+ time.sleep(0.5)
23
+ st.write("[GitHub >](https://github.com/Wilson-ZheLin/Streamline-Analyst)")
24
+ st.session_state.initialized = False
25
+ else:
26
+ st.write(st.session_state.welcome_message)
27
+ st.write("[GitHub >](https://github.com/Wilson-ZheLin/Streamline-Analyst)")
28
+
29
+ # INTRO SECTION
30
+ with st.container():
31
+ st.divider()
32
+ if 'lottie' not in st.session_state:
33
+ st.session_state.lottie_url1, st.session_state.lottie_url2 = load_lottie()
34
+ st.session_state.lottie = True
35
+
36
+ left_column_r1, right_column_r1 = st.columns([6, 4])
37
+ with left_column_r1:
38
+ st.header("What can Streamline Analyst do?")
39
+ st.write(introduction_message()[0])
40
+ with right_column_r1:
41
+ if st.session_state.lottie:
42
+ st_lottie(st.session_state.lottie_url1, height=280, key="animation1")
43
+
44
+ left_column_r2, _, right_column_r2 = st.columns([6, 1, 5])
45
+ with left_column_r2:
46
+ if st.session_state.lottie:
47
+ st_lottie(st.session_state.lottie_url2, height=200, key="animation2")
48
+ with right_column_r2:
49
+ st.header("Simple to Use")
50
+ st.write(introduction_message()[1])
51
+
52
+ # MAIN SECTION
53
+ with st.container():
54
+ st.divider()
55
+ st.header("Let's Get Started")
56
+ left_column, right_column = st.columns([6, 4])
57
+ with left_column:
58
+ API_KEY = st.text_input(
59
+ "Your API Key won't be stored or shared!",
60
+ placeholder="Enter your API key here...",
61
+ )
62
+ st.write("👆Your OpenAI API key:")
63
+ uploaded_file = st.file_uploader("Choose a data file. Your data won't be stored either!", accept_multiple_files=False, type=['csv', 'json', 'xls', 'xlsx'])
64
+ if uploaded_file:
65
+ if uploaded_file.getvalue():
66
+ uploaded_file.seek(0)
67
+ st.session_state.DF_uploaded = read_file_from_streamlit(uploaded_file)
68
+ st.session_state.is_file_empty = False
69
+ else:
70
+ st.session_state.is_file_empty = True
71
+
72
+ with right_column:
73
+ SELECTED_MODEL = st.selectbox(
74
+ 'Which OpenAI model do you want to use?',
75
+ ('GPT-4-Turbo', 'GPT-3.5-Turbo'))
76
+
77
+ MODE = st.selectbox(
78
+ 'Select a data analysis mode',
79
+ ('Predictive Classification', 'Clustering Model', 'Regression Model', 'Data Visualization'))
80
+
81
+ st.write(f'Model selected: :green[{SELECTED_MODEL}]')
82
+ st.write(f'Data analysis mode: :green[{MODE}]')
83
+
84
+ # Proceed Button
85
+ is_proceed_enabled = uploaded_file is not None and (API_KEY != "" or MODE == "Data Visualization")
86
+
87
+ # Initialize the 'button_clicked' state
88
+ if 'button_clicked' not in st.session_state:
89
+ st.session_state.button_clicked = False
90
+ if st.button('Start Analysis', disabled=(not is_proceed_enabled) or st.session_state.button_clicked, type="primary"):
91
+ st.session_state.button_clicked = True
92
+ if "is_file_empty" in st.session_state and st.session_state.is_file_empty:
93
+ st.caption('Your data file is empty!')
94
+
95
+ # Start Analysis
96
+ if st.session_state.button_clicked:
97
+ GPT_MODEL = 4 if SELECTED_MODEL == 'GPT-4-Turbo' else 3.5
98
+ with st.container():
99
+ if "DF_uploaded" not in st.session_state:
100
+ st.error("File is empty!")
101
+ else:
102
+ if MODE == 'Predictive Classification':
103
+ prediction_model_pipeline(st.session_state.DF_uploaded, API_KEY, GPT_MODEL)
104
+ elif MODE == 'Clustering Model':
105
+ cluster_model_pipeline(st.session_state.DF_uploaded, API_KEY, GPT_MODEL)
106
+ elif MODE == 'Regression Model':
107
+ regression_model_pipeline(st.session_state.DF_uploaded, API_KEY, GPT_MODEL)
108
+ elif MODE == 'Data Visualization':
109
+ data_visualization(st.session_state.DF_uploaded)
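
`read_file_from_streamlit` is imported from `src.util`, which is not shown in this section of the commit. A minimal sketch of what such a helper might look like, assuming pandas handles the four accepted formats (the committed implementation may differ):

# Hypothetical sketch of src/util.read_file_from_streamlit. Illustrative only.
import pandas as pd

def read_file_from_streamlit(uploaded_file):
    """Load an uploaded CSV/JSON/Excel file into a pandas DataFrame."""
    name = uploaded_file.name.lower()
    if name.endswith('.csv'):
        return pd.read_csv(uploaded_file)
    if name.endswith('.json'):
        return pd.read_json(uploaded_file)
    if name.endswith(('.xls', '.xlsx')):
        return pd.read_excel(uploaded_file)  # needs an Excel engine such as openpyxl
    raise ValueError(f"Unsupported file type: {uploaded_file.name}")
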
app/cluster_model.py ADDED
@@ -0,0 +1,285 @@
1
+ import streamlit as st
2
+ from util import developer_info, developer_info_static
3
+ from src.plot import plot_clusters, correlation_matrix_plotly
4
+ from src.handle_null_value import contains_missing_value, remove_high_null, fill_null_values
5
+ from src.preprocess import convert_to_numeric, remove_duplicates, transform_data_for_clustering
6
+ from src.llm_service import decide_fill_null, decide_encode_type, decide_cluster_model
7
+ from src.pca import decide_pca, perform_PCA_for_clustering
8
+ from src.model_service import save_model, calculate_silhouette_score, calculate_calinski_harabasz_score, calculate_davies_bouldin_score, gmm_predict, estimate_optimal_clusters
9
+ from src.cluster_model import train_select_cluster_model
10
+ from src.util import contain_null_attributes_info, separate_fill_null_list, check_all_columns_numeric, non_numeric_columns_and_head, separate_decode_list, get_cluster_method_name
11
+
12
+ def start_training_model():
13
+ st.session_state["start_training"] = True
14
+
15
+ def cluster_model_pipeline(DF, API_KEY, GPT_MODEL):
16
+ st.divider()
17
+ st.subheader('Data Overview')
18
+ if 'data_origin' not in st.session_state:
19
+ st.session_state.data_origin = DF
20
+ st.dataframe(st.session_state.data_origin.describe(), width=1200)
21
+
22
+ # Data Imputation
23
+ st.subheader('Handle and Impute Missing Values')
24
+ if "contain_null" not in st.session_state:
25
+ st.session_state.contain_null = contains_missing_value(st.session_state.data_origin)
26
+
27
+ if 'filled_df' not in st.session_state:
28
+ if st.session_state.contain_null:
29
+ with st.status("Processing **missing values** in the data...", expanded=True) as status:
30
+ st.write("Filtering out high-frequency missing rows and columns...")
31
+ filled_df = remove_high_null(DF)
32
+ st.write("Large language model analysis...")
33
+ attributes, types_info, description_info = contain_null_attributes_info(filled_df)
34
+ fill_result_dict = decide_fill_null(attributes, types_info, description_info, GPT_MODEL, API_KEY)
35
+ st.write("Imputing missing values...")
36
+ mean_list, median_list, mode_list, new_category_list, interpolation_list = separate_fill_null_list(fill_result_dict)
37
+ filled_df = fill_null_values(filled_df, mean_list, median_list, mode_list, new_category_list, interpolation_list)
38
+ # Store the imputed DataFrame in session_state
39
+ st.session_state.filled_df = filled_df
40
+ DF = filled_df
41
+ status.update(label='Missing value processing completed!', state="complete", expanded=False)
42
+ st.download_button(
43
+ label="Download Data with Missing Values Imputed",
44
+ data=st.session_state.filled_df.to_csv(index=False).encode('utf-8'),
45
+ file_name="imputed_missing_values.csv",
46
+ mime='text/csv')
47
+ else:
48
+ st.session_state.filled_df = DF
49
+ st.success("No missing values detected. Processing skipped.")
50
+ else:
51
+ st.success("Missing value processing completed!")
52
+ if st.session_state.contain_null:
53
+ st.download_button(
54
+ label="Download Data with Missing Values Imputed",
55
+ data=st.session_state.filled_df.to_csv(index=False).encode('utf-8'),
56
+ file_name="imputed_missing_values.csv",
57
+ mime='text/csv')
58
+
59
+ # Data Encoding
60
+ st.subheader("Process Data Encoding")
61
+ st.caption("*To keep processing time reasonable, **NLP features** such as **TF-IDF** are not included in the current pipeline; long text attributes may be dropped.")
62
+ if 'all_numeric' not in st.session_state:
63
+ st.session_state.all_numeric = check_all_columns_numeric(st.session_state.data_origin)
64
+
65
+ if 'encoded_df' not in st.session_state:
66
+ if not st.session_state.all_numeric:
67
+ with st.status("Encoding non-numeric data using **numeric mapping** and **one-hot**...", expanded=True) as status:
68
+ non_numeric_attributes, non_numeric_head = non_numeric_columns_and_head(DF)
69
+ st.write("Large language model analysis...")
70
+ encode_result_dict = decide_encode_type(non_numeric_attributes, non_numeric_head, GPT_MODEL, API_KEY)
71
+ st.write("Encoding the data...")
72
+ convert_int_cols, one_hot_cols, drop_cols = separate_decode_list(encode_result_dict, "")
73
+ encoded_df, mappings = convert_to_numeric(DF, convert_int_cols, one_hot_cols, drop_cols)
74
+ # Store the encoded DataFrame in session_state
75
+ st.session_state.encoded_df = encoded_df
76
+ DF = encoded_df
77
+ status.update(label='Data encoding completed!', state="complete", expanded=False)
78
+ st.download_button(
79
+ label="Download Encoded Data",
80
+ data=st.session_state.encoded_df.to_csv(index=False).encode('utf-8'),
81
+ file_name="encoded_data.csv",
82
+ mime='text/csv')
83
+ else:
84
+ st.session_state.encoded_df = DF
85
+ st.success("All columns are numeric. Processing skipped.")
86
+ else:
87
+ st.success("Data encoding completed using numeric mapping and one-hot!")
88
+ if not st.session_state.all_numeric:
89
+ st.download_button(
90
+ label="Download Encoded Data",
91
+ data=st.session_state.encoded_df.to_csv(index=False).encode('utf-8'),
92
+ file_name="encoded_data.csv",
93
+ mime='text/csv')
94
+
95
+ # Correlation Heatmap
96
+ if 'df_cleaned1' not in st.session_state:
97
+ st.session_state.df_cleaned1 = DF
98
+ st.subheader('Correlation Between Attributes')
99
+ st.plotly_chart(correlation_matrix_plotly(st.session_state.df_cleaned1))
100
+
101
+ # Remove duplicate entities
102
+ st.subheader('Remove Duplicate Entities')
103
+ if 'df_cleaned2' not in st.session_state:
104
+ st.session_state.df_cleaned2 = remove_duplicates(st.session_state.df_cleaned1)
105
+ # DF = remove_duplicates(DF)
106
+ st.info("Duplicate rows removed.")
107
+
108
+ # Data Transformation
109
+ st.subheader('Data Transformation')
110
+ if 'data_transformed' not in st.session_state:
111
+ st.session_state.data_transformed = transform_data_for_clustering(st.session_state.df_cleaned2)
112
+ st.success("Data transformed by standardization and Box-Cox where applicable.")
113
+
114
+ # PCA
115
+ st.subheader('Principal Component Analysis')
116
+ st.write("Deciding whether to perform PCA...")
117
+ if 'df_pca' not in st.session_state:
118
+ _, n_components = decide_pca(st.session_state.df_cleaned2)
119
+ st.session_state.df_pca = perform_PCA_for_clustering(st.session_state.data_transformed, n_components)
120
+ st.success("Completed!")
121
+
122
+ # Splitting and Balancing
123
+ if 'test_percentage' not in st.session_state:
124
+ st.session_state.test_percentage = 20
125
+ if 'balance_data' not in st.session_state:
126
+ st.session_state.balance_data = False
127
+ if "start_training" not in st.session_state:
128
+ st.session_state["start_training"] = False
129
+ if 'model_trained' not in st.session_state:
130
+ st.session_state['model_trained'] = False
131
+
132
+ splitting_column, balance_column = st.columns(2)
133
+ with splitting_column:
134
+ st.subheader(':grey[Data Splitting]')
135
+ st.caption('Data splitting is not applicable to clustering models.')
136
+ st.slider('Percentage of test set', 1, 25, st.session_state.test_percentage, key='test_percentage', disabled=True)
137
+
138
+ with balance_column:
139
+ st.metric(label="Test Data", value="--%", delta=None)
140
+ st.toggle('Class Balancing', value=st.session_state.balance_data, key='to_perform_balance', disabled=True)
141
+ st.caption('Class balancing is not applicable to clustering models.')
142
+
143
+ st.button("Start Training Model", on_click=start_training_model, type="primary", disabled=st.session_state['start_training'])
144
+
145
+ # Model Training
146
+ if st.session_state['start_training']:
147
+ with st.container():
148
+ st.header("Modeling")
149
+ if not st.session_state.get("data_prepared", False):
150
+ st.session_state.X = st.session_state.df_pca
151
+ st.session_state.data_prepared = True
152
+
153
+ # Decide model types:
154
+ if "decided_model" not in st.session_state:
155
+ st.session_state["decided_model"] = False
156
+ if "all_set" not in st.session_state:
157
+ st.session_state["all_set"] = False
158
+
159
+ if not st.session_state["decided_model"]:
160
+ with st.spinner("Deciding models based on data..."):
161
+ shape_info = str(st.session_state.X.shape)
162
+ description_info = st.session_state.X.describe().to_csv()
163
+ cluster_info = estimate_optimal_clusters(st.session_state.X)
164
+ st.session_state.default_cluster = cluster_info
165
+ model_dict = decide_cluster_model(shape_info, description_info, cluster_info, GPT_MODEL, API_KEY)
166
+ model_list = list(model_dict.values())
167
+ if 'model_list' not in st.session_state:
168
+ st.session_state.model_list = model_list
169
+ st.session_state.decided_model = True
170
+
171
+ # Display results
172
+ if st.session_state["decided_model"]:
173
+ display_results(st.session_state.X)
174
+ st.session_state["all_set"] = True
175
+
176
+ # Download models
177
+ if st.session_state["all_set"]:
178
+ download_col1, download_col2, download_col3 = st.columns(3)
179
+ with download_col1:
180
+ st.download_button(label="Download Model", data=st.session_state.downloadable_model1, file_name=f"{st.session_state.model1_name}.joblib", mime="application/octet-stream")
181
+ with download_col2:
182
+ st.download_button(label="Download Model", data=st.session_state.downloadable_model2, file_name=f"{st.session_state.model2_name}.joblib", mime="application/octet-stream")
183
+ with download_col3:
184
+ st.download_button(label="Download Model", data=st.session_state.downloadable_model3, file_name=f"{st.session_state.model3_name}.joblib", mime="application/octet-stream")
185
+
186
+ # Footer
187
+ st.divider()
188
+ if "all_set" in st.session_state and st.session_state["all_set"]:
189
+ if "has_been_set" not in st.session_state:
190
+ st.session_state["has_been_set"] = True
191
+ developer_info()
192
+ else:
193
+ developer_info_static()
194
+
195
+ def display_results(X):
196
+ st.success("Models selected based on your data!")
197
+
198
+ # Data set metrics
199
+ st.metric(label="Total Data", value=len(X), delta=None)
200
+
201
+ # Model training
202
+ model_col1, model_col2, model_col3 = st.columns(3)
203
+ with model_col1:
204
+ if "model1_name" not in st.session_state:
205
+ st.session_state.model1_name = get_cluster_method_name(st.session_state.model_list[0])
206
+ st.subheader(st.session_state.model1_name)
207
+
208
+ # Slider for model parameters
209
+ if st.session_state.model_list[0] == 2:
210
+ st.caption('N-cluster is not applicable to DBSCAN.')
211
+ else:
212
+ st.caption(f'N-cluster for {st.session_state.model1_name}:')
213
+ n_clusters1 = st.slider('N clusters', 2, 20, st.session_state.default_cluster, label_visibility="collapsed", key='n_clusters1', disabled=st.session_state.model_list[0] == 2)
214
+
215
+ with st.spinner("Model training in progress..."):
216
+ st.session_state.model1 = train_select_cluster_model(X, n_clusters1, st.session_state.model_list[0])
217
+ st.session_state.downloadable_model1 = save_model(st.session_state.model1)
218
+
219
+ if st.session_state.model_list[0] != 3:
220
+ label1 = st.session_state.model1.labels_
221
+ else:
222
+ label1 = gmm_predict(X, st.session_state.model1)
223
+
224
+ # Visualization
225
+ st.pyplot(plot_clusters(X, label1))
226
+ # Model metrics
227
+ st.write(f"Silhouette score: ", f'\n:green[**{calculate_silhouette_score(X, label1)}**]')
228
+ st.write(f"Calinski-Harabasz score: ", f'\n:green[**{calculate_calinski_harabasz_score(X, label1)}**]')
229
+ st.write(f"Davies-Bouldin score: ", f'\n:green[**{calculate_davies_bouldin_score(X, label1)}**]')
230
+
231
+ with model_col2:
232
+ if "model2_name" not in st.session_state:
233
+ st.session_state.model2_name = get_cluster_method_name(st.session_state.model_list[1])
234
+ st.subheader(st.session_state.model2_name)
235
+
236
+ # Slider for model parameters
237
+ if st.session_state.model_list[1] == 2:
238
+ st.caption('N-cluster is not applicable to DBSCAN.')
239
+ else:
240
+ st.caption(f'N-cluster for {st.session_state.model2_name}:')
241
+ n_clusters2 = st.slider('N clusters', 2, 20, st.session_state.default_cluster, label_visibility="collapsed", key='n_clusters2', disabled=st.session_state.model_list[1] == 2)
242
+
243
+ with st.spinner("Model training in progress..."):
244
+ st.session_state.model2 = train_select_cluster_model(X, n_clusters2, st.session_state.model_list[1])
245
+ st.session_state.downloadable_model2 = save_model(st.session_state.model2)
246
+
247
+ if st.session_state.model_list[1] != 3:
248
+ label2 = st.session_state.model2.labels_
249
+ else:
250
+ label2 = gmm_predict(X, st.session_state.model2)
251
+
252
+ # Visualization
253
+ st.pyplot(plot_clusters(X, label2))
254
+ # Model metrics
255
+ st.write(f"Silhouette score: ", f'\n:green[**{calculate_silhouette_score(X, label2)}**]')
256
+ st.write(f"Calinski-Harabasz score: ", f'\n:green[**{calculate_calinski_harabasz_score(X, label2)}**]')
257
+ st.write(f"Davies-Bouldin score: ", f'\n:green[**{calculate_davies_bouldin_score(X, label2)}**]')
258
+
259
+ with model_col3:
260
+ if "model3_name" not in st.session_state:
261
+ st.session_state.model3_name = get_cluster_method_name(st.session_state.model_list[2])
262
+ st.subheader(st.session_state.model3_name)
263
+
264
+ # Slider for model parameters
265
+ if st.session_state.model_list[2] == 2:
266
+ st.caption('N-cluster is not applicable to DBSCAN.')
267
+ else:
268
+ st.caption(f'N-cluster for {st.session_state.model3_name}:')
269
+ n_clusters3 = st.slider('N clusters', 2, 20, st.session_state.default_cluster, label_visibility="collapsed", key='n_clusters3', disabled=st.session_state.model_list[2] == 2)
270
+
271
+ with st.spinner("Model training in progress..."):
272
+ st.session_state.model3 = train_select_cluster_model(X, n_clusters3, st.session_state.model_list[2])
273
+ st.session_state.downloadable_model3 = save_model(st.session_state.model3)
274
+
275
+ if st.session_state.model_list[2] != 3:
276
+ label3 = st.session_state.model3.labels_
277
+ else:
278
+ label3 = gmm_predict(X, st.session_state.model3)
279
+
280
+ # Visualization
281
+ st.pyplot(plot_clusters(X, label3))
282
+ # Model metrics
283
+ st.write(f"Silhouette score: ", f'\n:green[**{calculate_silhouette_score(X, label3)}**]')
284
+ st.write(f"Calinski-Harabasz score: ", f'\n:green[**{calculate_calinski_harabasz_score(X, label3)}**]')
285
+ st.write(f"Davies-Bouldin score: ", f'\n:green[**{calculate_davies_bouldin_score(X, label3)}**]')
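
The clustering metrics above (`calculate_silhouette_score`, `calculate_calinski_harabasz_score`, `calculate_davies_bouldin_score`) come from `src.model_service`, which is not shown in this section. A plausible sketch, assuming they are thin wrappers around scikit-learn's metrics (the rounding is an assumption):

# Hypothetical sketch of the clustering metric helpers in src/model_service.py.
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

def calculate_silhouette_score(X, labels):
    # Higher is better; requires at least 2 distinct cluster labels.
    return round(silhouette_score(X, labels), 4)

def calculate_calinski_harabasz_score(X, labels):
    # Higher is better.
    return round(calinski_harabasz_score(X, labels), 4)

def calculate_davies_bouldin_score(X, labels):
    # Lower is better.
    return round(davies_bouldin_score(X, labels), 4)
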
app/config/config.yaml ADDED
@@ -0,0 +1,15 @@
1
+ lottie_url1: "https://lottie.host/f89e48e2-55e5-4fdf-a406-0be2b00cc2af/ECJa6PGrCV.json"
2
+ lottie_url2: "https://lottie.host/05824020-0a23-4373-8418-721bd6e68504/FE5XXRT455.json"
3
+ welcome_template: "Streamline Analyst 🪄 is an advanced, open-source application powered by LLMs that streamlines the entire process of data analysis. It automates all the tasks from data preprocessing to model testing, simplifying complex data tasks with precision."
4
+ introduction_template1: |
5
+ As a data analysis agent, **Streamline Analyst** is capable of making autonomous decisions based on your data:
6
+ - Effortless Data Preprocessing
7
+ - Intelligent Encoding & Balancing
8
+ - Automated Model Selection & Training
9
+ - Dynamic Data Visualization
10
+ - And much more...
11
+ introduction_template2: |
12
+ **You only need to**:
13
+ 1. **Select** your data file
14
+ 2. **Choose** an analysis mode
15
+ 3. **Press** the Start button
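
The `load_lottie`, `welcome_message`, and `introduction_message` helpers imported in app/app.py read from this file but are not part of this section. A minimal sketch of how the YAML might be consumed, assuming PyYAML and requests are available (function names mirror the imports above; the real util module may differ):

# Hypothetical sketch of the config-reading helpers in app/util.py.
import os
import requests
import yaml

CONFIG_PATH = os.path.join(os.path.dirname(__file__), "config", "config.yaml")

def _load_config():
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

def welcome_message():
    return _load_config()["welcome_template"]

def introduction_message():
    config = _load_config()
    return config["introduction_template1"], config["introduction_template2"]

def load_lottie():
    # st_lottie renders the parsed animation JSON, so fetch and decode each URL.
    config = _load_config()
    return (requests.get(config["lottie_url1"]).json(),
            requests.get(config["lottie_url2"]).json())
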
app/prediction_model.py ADDED
@@ -0,0 +1,327 @@
1
+ import streamlit as st
2
+ from util import developer_info, developer_info_static
3
+ from src.plot import confusion_metrix, roc, correlation_matrix_plotly
4
+ from src.handle_null_value import contains_missing_value, remove_high_null, fill_null_values
5
+ from src.preprocess import convert_to_numeric, remove_rows_with_empty_target, remove_duplicates
6
+ from src.llm_service import decide_fill_null, decide_encode_type, decide_model, decide_target_attribute, decide_test_ratio, decide_balance
7
+ from src.pca import decide_pca, perform_pca
8
+ from src.model_service import split_data, check_and_balance, fpr_and_tpr, auc, save_model, calculate_f1_score
9
+ from src.predictive_model import train_selected_model
10
+ from src.util import select_Y, contain_null_attributes_info, separate_fill_null_list, check_all_columns_numeric, non_numeric_columns_and_head, separate_decode_list, get_data_overview, get_selected_models, get_model_name, count_unique, attribute_info, get_balance_info, get_balance_method_name
11
+
12
+ def update_balance_data():
13
+ st.session_state.balance_data = st.session_state.to_perform_balance
14
+
15
+ def start_training_model():
16
+ st.session_state["start_training"] = True
17
+
18
+ def prediction_model_pipeline(DF, API_KEY, GPT_MODEL):
19
+ st.divider()
20
+ st.subheader('Data Overview')
21
+ if 'data_origin' not in st.session_state:
22
+ st.session_state.data_origin = DF
23
+ st.dataframe(st.session_state.data_origin.describe(), width=1200)
24
+ attributes = st.session_state.data_origin.columns.tolist()
25
+
26
+ # Select the target variable
27
+ if 'target_selected' not in st.session_state:
28
+ st.session_state.target_selected = False
29
+ st.subheader('Target Variable')
30
+ if not st.session_state.target_selected:
31
+
32
+ with st.spinner("AI is analyzing the data..."):
33
+ attributes_for_target, types_info_for_target, head_info_for_target = attribute_info(st.session_state.data_origin)
34
+ st.session_state.target_Y = decide_target_attribute(attributes_for_target, types_info_for_target, head_info_for_target, GPT_MODEL, API_KEY)
35
+
36
+ if st.session_state.target_Y != -1:
37
+ selected_Y = st.session_state.target_Y
38
+ st.success("Target variable has been selected by the AI!")
39
+ st.write(f'Target attribute selected: :green[**{selected_Y}**]')
40
+ st.session_state.target_selected = True
41
+ else:
42
+ st.info("The AI could not determine the target variable from the data. Please select it manually.")
43
+ target_col1, target_col2 = st.columns([9, 1])
44
+ with target_col1:
45
+ selected_Y = st.selectbox(
46
+ label = 'Select the target variable to predict:',
47
+ options = attributes,
48
+ index = len(attributes)-1,
49
+ label_visibility='collapsed'
50
+ )
51
+ with target_col2:
52
+ if st.button("Confirm", type="primary"):
53
+ st.session_state.target_selected = True
54
+ st.session_state.selected_Y = selected_Y
55
+ else:
56
+ if st.session_state.target_Y != -1:
57
+ st.success("Target variable has been selected by the AI!")
58
+ st.write(f"Target variable selected: :green[**{st.session_state.selected_Y}**]")
59
+
60
+ if st.session_state.target_selected:
61
+
62
+ # Data Imputation
63
+ st.subheader('Handle and Impute Missing Values')
64
+ if "contain_null" not in st.session_state:
65
+ st.session_state.contain_null = contains_missing_value(st.session_state.data_origin)
66
+
67
+ if 'filled_df' not in st.session_state:
68
+ if st.session_state.contain_null:
69
+ with st.status("Processing **missing values** in the data...", expanded=True) as status:
70
+ st.write("Filtering out high-frequency missing rows and columns...")
71
+ filled_df = remove_high_null(DF)
72
+ filled_df = remove_rows_with_empty_target(filled_df, st.session_state.selected_Y)
73
+ st.write("Large language model analysis...")
74
+ attributes, types_info, description_info = contain_null_attributes_info(filled_df)
75
+ fill_result_dict = decide_fill_null(attributes, types_info, description_info, GPT_MODEL, API_KEY)
76
+ st.write("Imputing missing values...")
77
+ mean_list, median_list, mode_list, new_category_list, interpolation_list = separate_fill_null_list(fill_result_dict)
78
+ filled_df = fill_null_values(filled_df, mean_list, median_list, mode_list, new_category_list, interpolation_list)
79
+ # Store the imputed DataFrame in session_state
80
+ st.session_state.filled_df = filled_df
81
+ DF = filled_df
82
+ status.update(label='Missing value processing completed!', state="complete", expanded=False)
83
+ st.download_button(
84
+ label="Download Data with Missing Values Imputed",
85
+ data=st.session_state.filled_df.to_csv(index=False).encode('utf-8'),
86
+ file_name="imputed_missing_values.csv",
87
+ mime='text/csv')
88
+ else:
89
+ st.session_state.filled_df = DF
90
+ st.success("No missing values detected. Processing skipped.")
91
+ else:
92
+ st.success("Missing value processing completed!")
93
+ if st.session_state.contain_null:
94
+ st.download_button(
95
+ label="Download Data with Missing Values Imputed",
96
+ data=st.session_state.filled_df.to_csv(index=False).encode('utf-8'),
97
+ file_name="imputed_missing_values.csv",
98
+ mime='text/csv')
99
+
100
+ # Data Encoding
101
+ st.subheader("Process Data Encoding")
102
+ st.caption("*To keep processing time reasonable, **NLP features** such as **TF-IDF** are not included in the current pipeline; long text attributes may be dropped.")
103
+ if 'all_numeric' not in st.session_state:
104
+ st.session_state.all_numeric = check_all_columns_numeric(st.session_state.data_origin)
105
+
106
+ if 'encoded_df' not in st.session_state:
107
+ if not st.session_state.all_numeric:
108
+ with st.status("Encoding non-numeric data using **numeric mapping** and **one-hot**...", expanded=True) as status:
109
+ non_numeric_attributes, non_numeric_head = non_numeric_columns_and_head(DF)
110
+ st.write("Large language model analysis...")
111
+ encode_result_dict = decide_encode_type(non_numeric_attributes, non_numeric_head, GPT_MODEL, API_KEY)
112
+ st.write("Encoding the data...")
113
+ convert_int_cols, one_hot_cols, drop_cols = separate_decode_list(encode_result_dict, st.session_state.selected_Y)
114
+ encoded_df, mappings = convert_to_numeric(DF, convert_int_cols, one_hot_cols, drop_cols)
115
+ # Store the encoded DataFrame in session_state
116
+ st.session_state.encoded_df = encoded_df
117
+ DF = encoded_df
118
+ status.update(label='Data encoding completed!', state="complete", expanded=False)
119
+ st.download_button(
120
+ label="Download Encoded Data",
121
+ data=st.session_state.encoded_df.to_csv(index=False).encode('utf-8'),
122
+ file_name="encoded_data.csv",
123
+ mime='text/csv')
124
+ else:
125
+ st.session_state.encoded_df = DF
126
+ st.success("All columns are numeric. Processing skipped.")
127
+ else:
128
+ st.success("Data encoding completed using numeric mapping and one-hot!")
129
+ if not st.session_state.all_numeric:
130
+ st.download_button(
131
+ label="Download Encoded Data",
132
+ data=st.session_state.encoded_df.to_csv(index=False).encode('utf-8'),
133
+ file_name="encoded_data.csv",
134
+ mime='text/csv')
135
+
136
+ # Correlation Heatmap
137
+ if 'df_cleaned1' not in st.session_state:
138
+ st.session_state.df_cleaned1 = DF
139
+ st.subheader('Correlation Between Attributes')
140
+ st.plotly_chart(correlation_matrix_plotly(st.session_state.df_cleaned1))
141
+
142
+ # Remove duplicate entities
143
+ st.subheader('Remove Duplicate Entities')
144
+ if 'df_cleaned2' not in st.session_state:
145
+ st.session_state.df_cleaned2 = remove_duplicates(st.session_state.df_cleaned1)
146
+ # DF = remove_duplicates(DF)
147
+ st.info("Duplicate rows removed.")
148
+
149
+ # PCA
150
+ st.subheader('Principal Component Analysis')
151
+ st.write("Deciding whether to perform PCA...")
152
+ if 'df_pca' not in st.session_state:
153
+ to_perform_pca, n_components = decide_pca(st.session_state.df_cleaned2.drop(columns=[st.session_state.selected_Y]))
154
+ if 'to_perform_pca' not in st.session_state:
155
+ st.session_state.to_perform_pca = to_perform_pca
156
+ if st.session_state.to_perform_pca:
157
+ st.session_state.df_pca = perform_pca(st.session_state.df_cleaned2, n_components, st.session_state.selected_Y)
158
+ else:
159
+ st.session_state.df_pca = st.session_state.df_cleaned2
160
+ st.success("Completed!")
161
+
162
+ # Splitting and Balancing
163
+ if 'balance_data' not in st.session_state:
164
+ st.session_state.balance_data = True
165
+ if "start_training" not in st.session_state:
166
+ st.session_state["start_training"] = False
167
+ if 'model_trained' not in st.session_state:
168
+ st.session_state['model_trained'] = False
169
+ if 'is_binary' not in st.session_state:
170
+ st.session_state['is_binary'] = count_unique(st.session_state.df_pca, st.session_state.selected_Y) == 2
171
+
172
+ # AI decide the testing set percentage
173
+ if 'test_percentage' not in st.session_state:
174
+ with st.spinner("Deciding testing set percentage based on data..."):
175
+ st.session_state.test_percentage = int(decide_test_ratio(st.session_state.df_pca.shape, GPT_MODEL, API_KEY) * 100)
176
+
177
+ splitting_column, balance_column = st.columns(2)
178
+ with splitting_column:
179
+ st.subheader('Data Splitting')
180
+ st.caption('AI recommended test percentage for the model')
181
+ st.slider('Percentage of test set', 1, 25, st.session_state.test_percentage, key='test_percentage', disabled=st.session_state['start_training'])
182
+
183
+ with balance_column:
184
+ st.metric(label="Test Data", value=f"{st.session_state.test_percentage}%", delta=None)
185
+ st.toggle('Class Balancing', value=st.session_state.balance_data, key='to_perform_balance', on_change=update_balance_data, disabled=st.session_state['start_training'])
186
+ st.caption('Strategies for handling imbalanced data sets and to enhance machine learning model performance.')
187
+ st.caption('AI will select the most appropriate method to balance the data.')
188
+
189
+ st.button("Start Training Model", on_click=start_training_model, type="primary", disabled=st.session_state['start_training'])
190
+
191
+ # Model Training
192
+ if st.session_state['start_training']:
193
+ with st.container():
194
+ st.header("Modeling")
195
+ X, Y = select_Y(st.session_state.df_pca, st.session_state.selected_Y)
196
+
197
+ # Balancing
198
+ if st.session_state.balance_data and "balance_method" not in st.session_state:
199
+ with st.spinner("AI is deciding the balance strategy for the data..."):
200
+ shape_info_balance, description_info_balance, balance_info_balance = get_balance_info(st.session_state.df_pca, st.session_state.selected_Y)
201
+ st.session_state.balance_method = int(decide_balance(shape_info_balance, description_info_balance, balance_info_balance, GPT_MODEL, API_KEY))
202
+ X_train_res, Y_train_res = check_and_balance(X, Y, method = st.session_state.balance_method)
203
+ else:
204
+ X_train_res, Y_train_res = X, Y
205
+ if 'balance_method' not in st.session_state:
206
+ st.session_state.balance_method = 4
207
+
208
+ # Splitting the data
209
+ if not st.session_state.get("data_splitted", False):
210
+ st.session_state.X_train, st.session_state.X_test, st.session_state.Y_train, st.session_state.Y_test = split_data(X_train_res, Y_train_res, st.session_state.test_percentage / 100, 42, st.session_state.to_perform_pca)
211
+ st.session_state["data_splitted"] = True
212
+
213
+ # Decide model types:
214
+ if "decided_model" not in st.session_state:
215
+ st.session_state["decided_model"] = False
216
+ if "all_set" not in st.session_state:
217
+ st.session_state["all_set"] = False
218
+
219
+ if not st.session_state["decided_model"]:
220
+ with st.spinner("Deciding models based on data..."):
221
+ shape_info, head_info, nunique_info, description_info = get_data_overview(st.session_state.df_pca)
222
+ model_dict = decide_model(shape_info, head_info, nunique_info, description_info, GPT_MODEL, API_KEY)
223
+ model_list = get_selected_models(model_dict)
224
+ if 'model_list' not in st.session_state:
225
+ st.session_state.model_list = model_list
226
+ st.session_state["decided_model"] = True
227
+
228
+ # Display results
229
+ if st.session_state["decided_model"]:
230
+ display_results(st.session_state.X_train, st.session_state.X_test, st.session_state.Y_train, st.session_state.Y_test)
231
+ st.session_state["all_set"] = True
232
+
233
+ # Download models
234
+ if st.session_state["all_set"]:
235
+ download_col1, download_col2, download_col3 = st.columns(3)
236
+ with download_col1:
237
+ st.download_button(label="Download Model", data=st.session_state.downloadable_model1, file_name=f"{st.session_state.model1_name}.joblib", mime="application/octet-stream")
238
+ with download_col2:
239
+ st.download_button(label="Download Model", data=st.session_state.downloadable_model2, file_name=f"{st.session_state.model2_name}.joblib", mime="application/octet-stream")
240
+ with download_col3:
241
+ st.download_button(label="Download Model", data=st.session_state.downloadable_model3, file_name=f"{st.session_state.model3_name}.joblib", mime="application/octet-stream")
242
+
243
+ # Footer
244
+ st.divider()
245
+ if "all_set" in st.session_state and st.session_state["all_set"]:
246
+ if "has_been_set" not in st.session_state:
247
+ st.session_state["has_been_set"] = True
248
+ developer_info()
249
+ else:
250
+ developer_info_static()
251
+
252
+ def display_results(X_train, X_test, Y_train, Y_test):
253
+ st.success("Models selected based on your data!")
254
+
255
+ # Data set metrics
256
+ data_col1, data_col2, data_col3, balance_col4 = st.columns(4)
257
+ with data_col1:
258
+ st.metric(label="Total Data", value=len(X_train)+len(X_test), delta=None)
259
+ with data_col2:
260
+ st.metric(label="Training Data", value=len(X_train), delta=None)
261
+ with data_col3:
262
+ st.metric(label="Testing Data", value=len(X_test), delta=None)
263
+ with balance_col4:
264
+ st.metric(label="Balance Strategy", value=get_balance_method_name(st.session_state.balance_method), delta=None)
265
+
266
+ # Model training
267
+ model_col1, model_col2, model_col3 = st.columns(3)
268
+ with model_col1:
269
+ if "model1_name" not in st.session_state:
270
+ st.session_state.model1_name = get_model_name(st.session_state.model_list[0])
271
+ st.subheader(st.session_state.model1_name)
272
+ with st.spinner("Model training in progress..."):
273
+ if 'model1' not in st.session_state:
274
+ st.session_state.model1 = train_selected_model(X_train, Y_train, st.session_state.model_list[0])
275
+ st.session_state.downloadable_model1 = save_model(st.session_state.model1)
276
+ # Model metrics
277
+ st.write(f"The accuracy of the {st.session_state.model1_name}: ", f'\n:green[**{st.session_state.model1.score(X_test, Y_test)}**]')
278
+ st.pyplot(confusion_metrix(st.session_state.model1_name, st.session_state.model1, X_test, Y_test))
279
+ st.write("F1 Score: ", f':green[**{calculate_f1_score(st.session_state.model1, X_test, Y_test, st.session_state.is_binary)}**]')
280
+ if st.session_state.model_list[0] != 2 and st.session_state['is_binary']:
281
+ if 'fpr1' not in st.session_state:
282
+ fpr1, tpr1 = fpr_and_tpr(st.session_state.model1, X_test, Y_test)
283
+ st.session_state.fpr1 = fpr1
284
+ st.session_state.tpr1 = tpr1
285
+ st.pyplot(roc(st.session_state.model1_name, st.session_state.fpr1, st.session_state.tpr1))
286
+ st.write(f"The AUC of the {st.session_state.model1_name}: ", f'\n:green[**{auc(st.session_state.fpr1, st.session_state.tpr1)}**]')
287
+
288
+ with model_col2:
289
+ if "model2_name" not in st.session_state:
290
+ st.session_state.model2_name = get_model_name(st.session_state.model_list[1])
291
+ st.subheader(st.session_state.model2_name)
292
+ with st.spinner("Model training in progress..."):
293
+ if 'model2' not in st.session_state:
294
+ st.session_state.model2 = train_selected_model(X_train, Y_train, st.session_state.model_list[1])
295
+ st.session_state.downloadable_model2 = save_model(st.session_state.model2)
296
+ # Model metrics
297
+ st.write(f"The accuracy of the {st.session_state.model2_name}: ", f'\n:green[**{st.session_state.model2.score(X_test, Y_test)}**]')
298
+ st.pyplot(confusion_metrix(st.session_state.model2_name, st.session_state.model2, X_test, Y_test))
299
+ st.write("F1 Score: ", f':green[**{calculate_f1_score(st.session_state.model2, X_test, Y_test, st.session_state.is_binary)}**]')
300
+ if st.session_state.model_list[1] != 2 and st.session_state['is_binary']:
301
+ if 'fpr2' not in st.session_state:
302
+ fpr2, tpr2 = fpr_and_tpr(st.session_state.model2, X_test, Y_test)
303
+ st.session_state.fpr2 = fpr2
304
+ st.session_state.tpr2 = tpr2
305
+ st.pyplot(roc(st.session_state.model2_name, st.session_state.fpr2, st.session_state.tpr2))
306
+ st.write(f"The AUC of the {st.session_state.model2_name}: ", f'\n:green[**{auc(st.session_state.fpr2, st.session_state.tpr2)}**]')
307
+
308
+ with model_col3:
309
+ if "model3_name" not in st.session_state:
310
+ st.session_state.model3_name = get_model_name(st.session_state.model_list[2])
311
+ st.subheader(st.session_state.model3_name)
312
+ with st.spinner("Model training in progress..."):
313
+ if 'model3' not in st.session_state:
314
+ st.session_state.model3 = train_selected_model(X_train, Y_train, st.session_state.model_list[2])
315
+ st.session_state.downloadable_model3 = save_model(st.session_state.model3)
316
+ # Model metrics
317
+ st.write(f"The accuracy of the {st.session_state.model3_name}: ", f'\n:green[**{st.session_state.model3.score(X_test, Y_test)}**]')
318
+ st.pyplot(confusion_metrix(st.session_state.model3_name, st.session_state.model3, X_test, Y_test))
319
+ st.write("F1 Score: ", f':green[**{calculate_f1_score(st.session_state.model3, X_test, Y_test, st.session_state.is_binary)}**]')
320
+ if st.session_state.model_list[2] != 2 and st.session_state['is_binary']:
321
+ if 'fpr3' not in st.session_state:
322
+ fpr3, tpr3 = fpr_and_tpr(st.session_state.model3, X_test, Y_test)
323
+ st.session_state.fpr3 = fpr3
324
+ st.session_state.tpr3 = tpr3
325
+ st.pyplot(roc(st.session_state.model3_name, st.session_state.fpr3, st.session_state.tpr3))
326
+ st.write(f"The AUC of the {st.session_state.model3_name}: ", f'\n:green[**{auc(st.session_state.fpr3, st.session_state.tpr3)}**]')
327
+
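
`split_data` and `check_and_balance` are imported from `src.model_service`, which is not shown in this section. A minimal sketch of the splitting helper, assuming it wraps scikit-learn's train_test_split; the fifth argument passed at the call sites (the PCA flag) has an unknown role, so this sketch simply accepts and ignores it:

# Hypothetical sketch of src/model_service.split_data. Illustrative only.
from sklearn.model_selection import train_test_split

def split_data(X, Y, test_size=0.2, random_state=42, perform_pca=False):
    # perform_pca is accepted to match the call sites above; its real effect is unknown,
    # so this sketch performs a plain random split.
    return train_test_split(X, Y, test_size=test_size, random_state=random_state)
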
app/regression_model.py ADDED
@@ -0,0 +1,300 @@
1
+ import streamlit as st
2
+ from util import developer_info, developer_info_static
3
+ from src.plot import correlation_matrix_plotly, plot_residuals, plot_predictions_vs_actual, plot_qq_plot
4
+ from src.handle_null_value import contains_missing_value, remove_high_null, fill_null_values
5
+ from src.preprocess import convert_to_numeric, remove_rows_with_empty_target, remove_duplicates, transform_data_for_clustering
6
+ from src.llm_service import decide_fill_null, decide_encode_type, decide_target_attribute, decide_test_ratio, decide_regression_model
7
+ from src.pca import decide_pca, perform_PCA_for_regression
8
+ from src.model_service import split_data, save_model, calculate_r2_score, calculate_mse_and_rmse, calculate_mae
9
+ from src.regression_model import train_selected_regression_model
10
+ from src.util import select_Y, contain_null_attributes_info, separate_fill_null_list, check_all_columns_numeric, non_numeric_columns_and_head, separate_decode_list, get_data_overview, attribute_info, get_regression_method_name
11
+
12
+ def start_training_model():
13
+ st.session_state["start_training"] = True
14
+
15
+ def regression_model_pipeline(DF, API_KEY, GPT_MODEL):
16
+ st.divider()
17
+ st.subheader('Data Overview')
18
+ if 'data_origin' not in st.session_state:
19
+ st.session_state.data_origin = DF
20
+ st.dataframe(st.session_state.data_origin.describe(), width=1200)
21
+ attributes = st.session_state.data_origin.columns.tolist()
22
+
23
+ # Select the target variable
24
+ if 'target_selected' not in st.session_state:
25
+ st.session_state.target_selected = False
26
+ st.subheader('Target Variable')
27
+ if not st.session_state.target_selected:
28
+
29
+ with st.spinner("AI is analyzing the data..."):
30
+ attributes_for_target, types_info_for_target, head_info_for_target = attribute_info(st.session_state.data_origin)
31
+ st.session_state.target_Y = decide_target_attribute(attributes_for_target, types_info_for_target, head_info_for_target, GPT_MODEL, API_KEY)
32
+
33
+ if st.session_state.target_Y != -1:
34
+ selected_Y = st.session_state.target_Y
35
+ st.success("Target variable has been selected by the AI!")
36
+ st.write(f'Target attribute selected: :green[**{selected_Y}**]')
37
+ st.session_state.target_selected = True
38
+ else:
39
+ st.info("The AI could not determine the target variable from the data. Please select it manually.")
40
+ target_col1, target_col2 = st.columns([9, 1])
41
+ with target_col1:
42
+ selected_Y = st.selectbox(
43
+ label = 'Select the target variable to predict:',
44
+ options = attributes,
45
+ index = len(attributes)-1,
46
+ label_visibility='collapsed'
47
+ )
48
+ with target_col2:
49
+ if st.button("Confirm", type="primary"):
50
+ st.session_state.target_selected = True
51
+ st.session_state.selected_Y = selected_Y
52
+ else:
53
+ if st.session_state.target_Y != -1:
54
+ st.success("Target variable has been selected by the AI!")
55
+ st.write(f"Target variable selected: :green[**{st.session_state.selected_Y}**]")
56
+
57
+ if st.session_state.target_selected:
58
+
59
+ # Data Imputation
60
+ st.subheader('Handle and Impute Missing Values')
61
+ if "contain_null" not in st.session_state:
62
+ st.session_state.contain_null = contains_missing_value(st.session_state.data_origin)
63
+
64
+ if 'filled_df' not in st.session_state:
65
+ if st.session_state.contain_null:
66
+ with st.status("Processing **missing values** in the data...", expanded=True) as status:
67
+ st.write("Filtering out high-frequency missing rows and columns...")
68
+ filled_df = remove_high_null(DF)
69
+ filled_df = remove_rows_with_empty_target(filled_df, st.session_state.selected_Y)
70
+ st.write("Large language model analysis...")
71
+ attributes, types_info, description_info = contain_null_attributes_info(filled_df)
72
+ fill_result_dict = decide_fill_null(attributes, types_info, description_info, GPT_MODEL, API_KEY)
73
+ st.write("Imputing missing values...")
74
+ mean_list, median_list, mode_list, new_category_list, interpolation_list = separate_fill_null_list(fill_result_dict)
75
+ filled_df = fill_null_values(filled_df, mean_list, median_list, mode_list, new_category_list, interpolation_list)
76
+ # Store the imputed DataFrame in session_state
77
+ st.session_state.filled_df = filled_df
78
+ DF = filled_df
79
+ status.update(label='Missing value processing completed!', state="complete", expanded=False)
80
+ st.download_button(
81
+ label="Download Data with Missing Values Imputed",
82
+ data=st.session_state.filled_df.to_csv(index=False).encode('utf-8'),
83
+ file_name="imputed_missing_values.csv",
84
+ mime='text/csv')
85
+ else:
86
+ st.session_state.filled_df = DF
87
+ st.success("No missing values detected. Processing skipped.")
88
+ else:
89
+ st.success("Missing value processing completed!")
90
+ if st.session_state.contain_null:
91
+ st.download_button(
92
+ label="Download Data with Missing Values Imputed",
93
+ data=st.session_state.filled_df.to_csv(index=False).encode('utf-8'),
94
+ file_name="imputed_missing_values.csv",
95
+ mime='text/csv')
96
+
97
+ # Data Encoding
98
+ st.subheader("Process Data Encoding")
99
+ st.caption("*To keep processing time reasonable, **NLP features** such as **TF-IDF** are not included in the current pipeline; long text attributes may be dropped.")
100
+ if 'all_numeric' not in st.session_state:
101
+ st.session_state.all_numeric = check_all_columns_numeric(st.session_state.data_origin)
102
+
103
+ if 'encoded_df' not in st.session_state:
104
+ if not st.session_state.all_numeric:
105
+ with st.status("Encoding non-numeric data using **numeric mapping** and **one-hot**...", expanded=True) as status:
106
+ non_numeric_attributes, non_numeric_head = non_numeric_columns_and_head(DF)
107
+ st.write("Large language model analysis...")
108
+ encode_result_dict = decide_encode_type(non_numeric_attributes, non_numeric_head, GPT_MODEL, API_KEY)
109
+ st.write("Encoding the data...")
110
+ convert_int_cols, one_hot_cols, drop_cols = separate_decode_list(encode_result_dict, st.session_state.selected_Y)
111
+ encoded_df, mappings = convert_to_numeric(DF, convert_int_cols, one_hot_cols, drop_cols)
112
+ # Store the imputed DataFrame in session_state
113
+ st.session_state.encoded_df = encoded_df
114
+ DF = encoded_df
115
+ status.update(label='Data encoding completed!', state="complete", expanded=False)
116
+ st.download_button(
117
+ label="Download Encoded Data",
118
+ data=st.session_state.encoded_df.to_csv(index=False).encode('utf-8'),
119
+ file_name="encoded_data.csv",
120
+ mime='text/csv')
121
+ else:
122
+ st.session_state.encoded_df = DF
123
+ st.success("All columns are numeric. Processing skipped.")
124
+ else:
125
+ st.success("Data encoding completed using numeric mapping and one-hot!")
126
+ if not st.session_state.all_numeric:
127
+ st.download_button(
128
+ label="Download Encoded Data",
129
+ data=st.session_state.encoded_df.to_csv(index=False).encode('utf-8'),
130
+ file_name="encoded_data.csv",
131
+ mime='text/csv')
132
+
133
+ # Correlation Heatmap
134
+ if 'df_cleaned1' not in st.session_state:
135
+ st.session_state.df_cleaned1 = DF
136
+ st.subheader('Correlation Between Attributes')
137
+ st.plotly_chart(correlation_matrix_plotly(st.session_state.df_cleaned1))
138
+
139
+ # Remove duplicate entities
140
+ st.subheader('Remove Duplicate Entities')
141
+ if 'df_cleaned2' not in st.session_state:
142
+ st.session_state.df_cleaned2 = remove_duplicates(st.session_state.df_cleaned1)
143
+ # DF = remove_duplicates(DF)
144
+ st.info("Duplicate rows removed.")
145
+
146
+ # Data Transformation
147
+ st.subheader('Data Transformation')
148
+ if 'data_transformed' not in st.session_state:
149
+ st.session_state.data_transformed = transform_data_for_clustering(st.session_state.df_cleaned2)
150
+ st.success("Data transformed by standardization and Box-Cox where applicable.")
151
+
152
+ # PCA
153
+ st.subheader('Principal Component Analysis')
154
+ st.write("Deciding whether to perform PCA...")
155
+ if 'df_pca' not in st.session_state:
156
+ _, n_components = decide_pca(st.session_state.df_cleaned2)
157
+ st.session_state.df_pca = perform_PCA_for_regression(st.session_state.data_transformed, n_components, st.session_state.selected_Y)
158
+ st.success("Completed!")
159
+
160
+ if "start_training" not in st.session_state:
161
+ st.session_state["start_training"] = False
162
+
163
+ # AI decide the testing set percentage
164
+ if 'test_percentage' not in st.session_state:
165
+ with st.spinner("Deciding testing set percentage based on data..."):
166
+ st.session_state.test_percentage = int(decide_test_ratio(st.session_state.df_pca.shape, GPT_MODEL, API_KEY) * 100)
167
+
168
+ splitting_column, balance_column = st.columns(2)
169
+ with splitting_column:
170
+ st.subheader('Data Splitting')
171
+ st.caption('AI recommended test percentage for the model')
172
+ st.slider('Percentage of test set', 1, 25, st.session_state.test_percentage, key='test_percentage', disabled=st.session_state['start_training'])
173
+
174
+ with balance_column:
175
+ st.metric(label="Test Data", value=f"{st.session_state.test_percentage}%", delta=None)
176
+ st.toggle('Class Balancing', value=False, key='to_perform_balance', disabled=True)
177
+ st.caption('Class balancing is not applicable to regression models.')
178
+
179
+ st.button("Start Training Model", on_click=start_training_model, type="primary", disabled=st.session_state['start_training'])
180
+
181
+ # Model Training
182
+ if st.session_state['start_training']:
183
+ with st.container():
184
+ st.header("Modeling")
185
+ X_train_res, Y_train_res = select_Y(st.session_state.df_pca, st.session_state.selected_Y)
186
+
187
+ # Splitting the data
188
+ if not st.session_state.get("data_splitted", False):
189
+ st.session_state.X_train, st.session_state.X_test, st.session_state.Y_train, st.session_state.Y_test = split_data(X_train_res, Y_train_res, st.session_state.test_percentage / 100, 42, True)
190
+ st.session_state["data_splitted"] = True
191
+
192
+ # Decide model types:
193
+ if "decided_model" not in st.session_state:
194
+ st.session_state["decided_model"] = False
195
+ if "all_set" not in st.session_state:
196
+ st.session_state["all_set"] = False
197
+
198
+ if not st.session_state["decided_model"]:
199
+ with st.spinner("Deciding models based on data..."):
200
+ shape_info, _, _, description_info = get_data_overview(st.session_state.df_pca)
201
+ model_dict = decide_regression_model(shape_info, description_info, st.session_state.selected_Y, GPT_MODEL, API_KEY)
202
+ model_list = list(model_dict.values())
203
+ if 'model_list' not in st.session_state:
204
+ st.session_state.model_list = model_list
205
+ st.session_state["decided_model"] = True
206
+
207
+ # Show modeling results
208
+ if st.session_state["decided_model"]:
209
+ display_results(st.session_state.X_train, st.session_state.X_test, st.session_state.Y_train, st.session_state.Y_test)
210
+ st.session_state["all_set"] = True
211
+
212
+ # Download models
213
+ if st.session_state["all_set"]:
214
+ download_col1, download_col2, download_col3 = st.columns(3)
215
+ with download_col1:
216
+ st.download_button(label="Download Model", data=st.session_state.downloadable_model1, file_name=f"{st.session_state.model1_name}.joblib", mime="application/octet-stream")
217
+ with download_col2:
218
+ st.download_button(label="Download Model", data=st.session_state.downloadable_model2, file_name=f"{st.session_state.model2_name}.joblib", mime="application/octet-stream")
219
+ with download_col3:
220
+ st.download_button(label="Download Model", data=st.session_state.downloadable_model3, file_name=f"{st.session_state.model3_name}.joblib", mime="application/octet-stream")
221
+
222
+ # Footer
223
+ st.divider()
224
+ if "all_set" in st.session_state and st.session_state["all_set"]:
225
+ if "has_been_set" not in st.session_state:
226
+ st.session_state["has_been_set"] = True
227
+ developer_info()
228
+ else:
229
+ developer_info_static()
230
+
231
+ def display_results(X_train, X_test, Y_train, Y_test):
232
+ st.success("Models selected based on your data!")
233
+
234
+ # Data set metrics
235
+ data_col1, data_col2, data_col3 = st.columns(3)
236
+ with data_col1:
237
+ st.metric(label="Total Data", value=len(X_train)+len(X_test), delta=None)
238
+ with data_col2:
239
+ st.metric(label="Training Data", value=len(X_train), delta=None)
240
+ with data_col3:
241
+ st.metric(label="Testing Data", value=len(X_test), delta=None)
242
+
243
+ # Model training
244
+ model_col1, model_col2, model_col3 = st.columns(3)
245
+ with model_col1:
246
+ if "model1_name" not in st.session_state:
247
+ st.session_state.model1_name = get_regression_method_name(st.session_state.model_list[0])
248
+ st.subheader(st.session_state.model1_name)
249
+ with st.spinner("Model training in progress..."):
250
+ if 'model1' not in st.session_state:
251
+ st.session_state.model1 = train_selected_regression_model(X_train, Y_train, st.session_state.model_list[0])
252
+ st.session_state.y_pred1 = st.session_state.model1.predict(X_test)
253
+ st.session_state.downloadable_model1 = save_model(st.session_state.model1)
254
+ # Model metrics
255
+ st.write("R2 Score: ", f':green[**{calculate_r2_score(st.session_state.y_pred1, Y_test)}**]')
256
+ st.pyplot(plot_predictions_vs_actual(st.session_state.y_pred1, Y_test))
257
+ mse1, rmse1 = calculate_mse_and_rmse(st.session_state.y_pred1, Y_test)
258
+ st.write("Mean Squared Error: ", f':green[**{mse1}**]')
259
+ st.write("Root Mean Squared Error: ", f':green[**{rmse1}**]')
260
+ st.pyplot(plot_residuals(st.session_state.y_pred1, Y_test))
261
+ st.write("Mean Absolute Error: ", f':green[**{calculate_mae(st.session_state.y_pred1, Y_test)}**]')
262
+ st.pyplot(plot_qq_plot(st.session_state.y_pred1, Y_test))
263
+
264
+ with model_col2:
265
+ if "model2_name" not in st.session_state:
266
+ st.session_state.model2_name = get_regression_method_name(st.session_state.model_list[1])
267
+ st.subheader(st.session_state.model2_name)
268
+ with st.spinner("Model training in progress..."):
269
+ if 'model2' not in st.session_state:
270
+ st.session_state.model2 = train_selected_regression_model(X_train, Y_train, st.session_state.model_list[1])
271
+ st.session_state.y_pred2 = st.session_state.model2.predict(X_test)
272
+ st.session_state.downloadable_model2 = save_model(st.session_state.model2)
273
+ # Model metrics
274
+ st.write("R2 Score: ", f':green[**{calculate_r2_score(st.session_state.y_pred2, Y_test)}**]')
275
+ st.pyplot(plot_predictions_vs_actual(st.session_state.y_pred2, Y_test))
276
+ mse2, rmse2 = calculate_mse_and_rmse(st.session_state.y_pred2, Y_test)
277
+ st.write("Mean Squared Error: ", f':green[**{mse2}**]')
278
+ st.write("Root Mean Squared Error: ", f':green[**{rmse2}**]')
279
+ st.pyplot(plot_residuals(st.session_state.y_pred2, Y_test))
280
+ st.write("Mean Absolute Error: ", f':green[**{calculate_mae(st.session_state.y_pred2, Y_test)}**]')
281
+ st.pyplot(plot_qq_plot(st.session_state.y_pred2, Y_test))
282
+
283
+ with model_col3:
284
+ if "model3_name" not in st.session_state:
285
+ st.session_state.model3_name = get_regression_method_name(st.session_state.model_list[2])
286
+ st.subheader(st.session_state.model3_name)
287
+ with st.spinner("Model training in progress..."):
288
+ if 'model3' not in st.session_state:
289
+ st.session_state.model3 = train_selected_regression_model(X_train, Y_train, st.session_state.model_list[2])
290
+ st.session_state.y_pred3 = st.session_state.model3.predict(X_test)
291
+ st.session_state.downloadable_model3 = save_model(st.session_state.model3)
292
+ # Model metrics
293
+ st.write("R2 Score: ", f':green[**{calculate_r2_score(st.session_state.y_pred3, Y_test)}**]')
294
+ st.pyplot(plot_predictions_vs_actual(st.session_state.y_pred3, Y_test))
295
+ mse3, rmse3 = calculate_mse_and_rmse(st.session_state.y_pred3, Y_test)
296
+ st.write("Mean Squared Error: ", f':green[**{mse3}**]')
297
+ st.write("Root Mean Squared Error: ", f':green[**{rmse3}**]')
298
+ st.pyplot(plot_residuals(st.session_state.y_pred3, Y_test))
299
+ st.write("Mean Absolute Error: ", f':green[**{calculate_mae(st.session_state.y_pred3, Y_test)}**]')
300
+ st.pyplot(plot_qq_plot(st.session_state.y_pred3, Y_test))
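The three columns above repeat one caching pattern: train each selected regression model once, keep the fitted model and its predictions in st.session_state, and only redraw the metrics on reruns. A condensed sketch of that pattern, assuming the helper names from the app's own modules; the wrapper function and its slot argument are purely illustrative:

    def show_regression_column(slot, model_code, X_train, Y_train, X_test, Y_test):
        # Train once per session; Streamlit reruns reuse the cached objects
        model_key, pred_key = f"model{slot}", f"y_pred{slot}"
        if model_key not in st.session_state:
            st.session_state[model_key] = train_selected_regression_model(X_train, Y_train, model_code)
            st.session_state[pred_key] = st.session_state[model_key].predict(X_test)
        y_pred = st.session_state[pred_key]
        st.write("R2 Score: ", calculate_r2_score(y_pred, Y_test))
        st.pyplot(plot_predictions_vs_actual(y_pred, Y_test))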
app/src/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from . import plot, util, pca, cluster_model, model_service, preprocess, predictive_model, llm_service, handle_null_value
app/src/cluster_model.py ADDED
@@ -0,0 +1,59 @@
1
+ import streamlit as st
2
+ from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering
3
+ from sklearn.mixture import GaussianMixture
4
+
5
+ @st.cache_data
6
+ def train_select_cluster_model(X_train, n, model_type, model_params=None):
7
+ """
8
+ Trains a clustering model based on the specified model type and parameters.
9
+
10
+ Parameters:
11
+ - X_train (array-like): The training data set.
12
+ - n (int): The number of clusters to form or the number of components for the Gaussian Mixture model.
13
+ - model_type (int): An integer representing the type of model to train.
14
+ 1 for KMeans, 2 for DBSCAN, 3 for GaussianMixture, 4 for Hierarchical clustering, and 5 for Spectral clustering.
15
+ - model_params (dict, optional): A dictionary of model-specific parameters. Default is None.
16
+
17
+ Returns:
18
+ - The trained clustering model object based on the specified model type.
19
+ """
20
+ if model_type == 1:
21
+ return KMeans_train(X_train, n, model_params)
22
+ elif model_type == 2:
23
+ return DBSCAN_train(X_train, model_params)
24
+ elif model_type == 3:
25
+ return GaussianMixture_train(X_train, n, model_params)
26
+ elif model_type == 4:
27
+ return Hierarchical_train(X_train, n, model_params)
28
+ elif model_type == 5:
29
+ return Spectral_train(X_train, n, model_params)
30
+
31
+ def KMeans_train(X_train, n_clusters=3, model_params=None):
32
+ if model_params is None: model_params = {}
33
+ kmeans = KMeans(n_clusters=n_clusters, **model_params)
34
+ kmeans.fit(X_train)
35
+ return kmeans
36
+
37
+ def DBSCAN_train(X_train, model_params=None):
38
+ if model_params is None: model_params = {}
39
+ dbscan = DBSCAN(**model_params)
40
+ dbscan.fit(X_train)
41
+ return dbscan
42
+
43
+ def GaussianMixture_train(X_train, n_components=1, model_params=None):
44
+ if model_params is None: model_params = {}
45
+ gmm = GaussianMixture(n_components=n_components, **model_params)
46
+ gmm.fit(X_train)
47
+ return gmm
48
+
49
+ def Hierarchical_train(X_train, n_clusters=3, model_params=None):
50
+ if model_params is None: model_params = {}
51
+ hierarchical = AgglomerativeClustering(n_clusters=n_clusters, **model_params)
52
+ hierarchical.fit(X_train)
53
+ return hierarchical
54
+
55
+ def Spectral_train(X_train, n_clusters=3, model_params=None):
56
+ if model_params is None: model_params = {}
57
+ spectral = SpectralClustering(n_clusters=n_clusters, **model_params)
58
+ spectral.fit(X_train)
59
+ return spectral
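For reference, a minimal sketch of calling train_select_cluster_model from the clustering page; the toy data and cluster count are illustrative, model_type=1 selects KMeans as documented above, and the @st.cache_data decorator assumes the call happens inside the running Streamlit app:

    import pandas as pd
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=200, centers=3, random_state=42)   # toy data for illustration
    X_train = pd.DataFrame(X, columns=["f1", "f2"])

    kmeans_model = train_select_cluster_model(X_train, n=3, model_type=1)
    labels = kmeans_model.labels_   # cluster assignment for each row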
app/src/config/config.yaml ADDED
@@ -0,0 +1,43 @@
1
+ openai_api_key: "YOUR_OPENAI_API_KEY"
2
+ model4_name: "gpt-4-1106-preview"
3
+ model3_name: "gpt-3.5-turbo-1106"
4
+ numeric_attribute_template: |
5
+ You are a data analyst. You are cleaning the data and processing the attributes in the data that are not numeric. The columns to be processed include: {attributes}. The first 20 items of these data are as follows:
6
+ {data_frame_head}
7
+ Please help me decide whether each attribute should be processed as integer mapping or one-hot encoding based on content and semantics. If there's an attribute containing long text, consider dropping it. Integer mapping is represented by 1, one-hot encoding is represented by 2, and dropping the attribute is represented by 3. Only the data is returned in json format without any other explanation or content. Sample response: {{"color":2,"size":1,"country":2,"brand":2,"gender":1,"comments":3}}
8
+ null_attribute_template: |
9
+ You are a data analyst. You are preprocessing the attributes in the data that contain null values. The columns to be processed include: {attributes}. The types of these attributes are:
10
+ {types_info}
11
+ Statistics for these properties in csv format:
12
+ {description_info}
13
+ Please help me decide how to supplement null values for each attribute based on content, statistics and semantics. The mean filling is represented by 1, the median filling is represented by 2, the mode filling is represented by 3, the introduction of a new category to represent the unknown is represented by 4, and the interpolation filling is represented by 5. Only the data is returned in json format without any other explanation or content. Sample response: {{"grade":2,"annual_income":2,"temperature":1,"fault_type":3,"country":4,"weight":1,"stock price":5}}
14
+ decide_model_template: |
15
+ You are a data analyst. The shape of my data frame is {shape_info}. The head(5) of the data frame is:
16
+ {head_info}
17
+ The nunique() of the data frame is:
18
+ {nunique_info}
19
+ The description of the data frame is:
20
+ {description_info}
21
+ The data has been cleaned and preprocessed, nulls filled, and encoded ready to train the machine learning model. According to the data information provided, please help me decide which machine learning models should be used for classification prediction. Model options are: 1:LogisticRegression, 2:SVC, 3:GaussianNB, 4:RandomForestClassifier, 5:AdaBoostClassifier, 6:XGBClassifier, 7:GradientBoostingClassifier. Please select three models to take into account different model performance indicators. Only the data is returned in json format without any other explanation or content. Sample response: {{"model1":1,"model2":4,"model3":6}}
22
+ decide_clustering_model_template: |
23
+ You are a data analyst. The shape of my data frame is {shape_info}. The description of the data frame is:
24
+ {description_info}
25
+ The data has been cleaned and preprocessed, numerically transformed, and ready to train the clustering models. According to the data information provided, please help me decide which clustering models should be used for discovering natural groupings in the data. The expected number of clusters is {cluster_info}. Model options are: 1:KMeans, 2:DBSCAN, 3:GaussianMixture, 4:AgglomerativeClustering, 5:SpectralClustering. Please select three models to take into account different model performance indicators. Only the data is returned in json format without any other explanation or content. Sample response: {{"model1":1,"model2":2,"model3":3}}
26
+ decide_regression_model_template: |
27
+ You are a data analyst. You are trying to select some regression models to predict the target attribute. The shape of my data frame is {shape_info}. The target variable to be predicted is {Y_name}. The description of the data frame is:
28
+ {description_info}
29
+ The data has been cleaned and preprocessed, numerically transformed, and ready to train the regression models. According to the data information provided, please help me decide which regression models should be used to provide better prediction performance. Model options are: 1:LinearRegression, 2:Ridge, 3:Lasso, 4:RandomForestRegressor, 5:GradientBoostingRegressor, 6:ElasticNet. Please select three models to take into account different model performance indicators. Only the data is returned in json format without any other explanation or content. Sample response: {{"model1":1,"model2":2,"model3":3}}
30
+ decide_target_attribute_template: |
31
+ You are a data analyst. You are trying to find out which attribute is the target attribute from the data frame. The attributes are {attributes}. The types of these attributes are:
32
+ {types_info}
33
+ The head(10) of the data frame is:
34
+ {head_info}
35
+ Determine the target attribute to predict based on the data information provided. Only the data is returned in json format without any other explanation or content. Sample response: {{"target":"species"}}
36
+ If the provided data is not sufficient to determine the target, only return the data in json format {{"target":-1}}
37
+ decide_test_ratio_template: |
38
+ You are a data analyst. You are trying to split the data frame into training set and test set. The shape of my data frame is {shape_info}. Determine the test set ratio based on the shape information provided and it's assumed that the categories of the target variable are balanced. The test set ratio range is 0.01 to 0.25. Only the data is returned in json format without any other explanation or content. Sample response: {{"test_ratio":0.25}}
39
+ decide_balance_template: |
40
+ You are a data analyst. You have a cleaned and pre-processed data frame and you want to handle class imbalance before training the machine learning model. The shape of my data frame is {shape_info}. The description of the data frame is:
41
+ {description_info}
42
+ The number of each value of the target attribute is: {balance_info}
43
+ Determine the balance strategy based on the data information provided. The RandomOverSampler is represented by 1, the SMOTE is represented by 2, the ADASYN is represented by 3, and do not balance is represented by 4. Only the data is returned in json format without any other explanation or content. Sample response: {{"method":2}}
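These templates are consumed by app/src/llm_service.py below: each one is read from this YAML file, filled in with LangChain's PromptTemplate, and sent to the chat model, which is instructed to reply with JSON only. A minimal sketch of that flow; the attribute names and the data preview string are placeholders:

    import yaml
    from langchain.prompts import PromptTemplate

    with open("app/src/config/config.yaml") as f:
        config = yaml.safe_load(f)

    prompt = PromptTemplate(
        input_variables=["attributes", "data_frame_head"],
        template=config["numeric_attribute_template"],
    ).format(attributes=["color", "size"], data_frame_head="color size\nred   S\nblue  M")
    # `prompt` is then wrapped in a HumanMessage and passed to ChatOpenAI (see llm_service.py)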
app/src/handle_null_value.py ADDED
@@ -0,0 +1,112 @@
1
+ import numpy as np
2
+
3
+ def contains_missing_value(df):
4
+ """
5
+ Checks if the DataFrame contains any missing values.
6
+ """
7
+ return df.isnull().values.any()
8
+
9
+ def fill_null_values(df, mean_list, median_list, mode_list, new_category_list, interpolation_list):
10
+ """
11
+ Fills missing values in the DataFrame using specified methods for different columns.
12
+
13
+ Parameters:
14
+ - df (DataFrame): The DataFrame with missing values.
15
+ - mean_list (list): Columns to fill missing values with mean.
16
+ - median_list (list): Columns to fill missing values with median.
17
+ - mode_list (list): Columns to fill missing values with mode.
18
+ - new_category_list (list): Columns to fill missing values with a new category (previously intended for 'NaN', now uses interpolation).
19
+ - interpolation_list (list): Columns to fill missing values using interpolation.
20
+
21
+ Returns:
22
+ - df (DataFrame): The DataFrame after filling missing values.
23
+ """
24
+ if mean_list:
25
+ df = fill_with_mean(df, mean_list)
26
+ if median_list:
27
+ df = fill_with_median(df, median_list)
28
+ if mode_list:
29
+ df = fill_with_mode(df, mode_list)
30
+ if new_category_list:
31
+ # df = fill_with_NaN(df, new_category_list)
32
+ df = fill_with_interpolation(df, new_category_list)
33
+ if interpolation_list:
34
+ df = fill_with_interpolation(df, interpolation_list)
35
+ return df
36
+
37
+ def remove_high_null(df, threshold_row=0.5, threshold_col=0.7):
38
+ """
39
+ Remove rows and columns from a DataFrame where the proportion of null values
40
+ is greater than the specified threshold.
41
+
42
+ - param df: Pandas DataFrame to be processed.
43
+ - param threshold_row: Proportion threshold for null values (default is 0.5 for rows).
44
+ - param threshold_col: Proportion threshold for null values (default is 0.7 for columns).
45
+
46
+ - return: DataFrame with high-null rows and columns removed.
47
+ """
48
+ # Calculate the proportion of nulls in each column
49
+ null_prop_col = df.isnull().mean()
50
+ cols_to_drop = null_prop_col[null_prop_col > threshold_col].index
51
+
52
+ # Drop columns with high proportion of nulls
53
+ df_cleaned = df.drop(columns=cols_to_drop)
54
+
55
+ # Calculate the proportion of nulls in each row
56
+ null_prop_row = df_cleaned.isnull().mean(axis=1)
57
+ rows_to_drop = null_prop_row[null_prop_row > threshold_row].index
58
+
59
+ # Drop rows with high proportion of nulls
60
+ df_cleaned = df_cleaned.drop(index=rows_to_drop)
61
+
62
+ return df_cleaned
63
+
64
+ def fill_with_mean(df, attributes):
65
+ for attr in attributes:
66
+ if attr in df.columns:
67
+ df[attr] = df[attr].fillna(df[attr].mean())
68
+ return df
69
+
70
+ def fill_with_median(df, attributes):
71
+ for attr in attributes:
72
+ if attr in df.columns:
73
+ df[attr] = df[attr].fillna(df[attr].median())
74
+ return df
75
+
76
+ def fill_with_mode(df, attributes):
77
+ for attr in attributes:
78
+ if attr in df.columns:
79
+ mode_value = df[attr].mode()[0] if not df[attr].mode().empty else None
80
+ if mode_value is not None:
81
+ df[attr] = df[attr].fillna(mode_value)
82
+ return df
83
+
84
+ def fill_with_interpolation(df, attributes, method='linear'):
85
+ # method: default is 'linear'. 'time', 'index', 'pad', 'nearest', 'quadratic', 'cubic', etc.
86
+ for attr in attributes:
87
+ if attr in df.columns:
88
+ df[attr] = df[attr].interpolate(method=method)
89
+ return df
90
+
91
+ # Deprecated: replaced with interpolation to ensure no missing values
92
+ def fill_with_NaN(df, attributes):
93
+ for attr in attributes:
94
+ if attr in df.columns:
95
+ df[attr] = df[attr].fillna('NaN')
96
+ return df
97
+
98
+ def replace_placeholders_with_nan(df):
99
+ """
100
+ Replaces common placeholders for missing values in object columns with np.nan.
101
+
102
+ Parameters:
103
+ - df (DataFrame): The DataFrame to process.
104
+
105
+ Returns:
106
+ - df (DataFrame): Updated DataFrame with placeholders replaced.
107
+ """
108
+ placeholders = ["na", "null", "?", "", "nan", "none", "n/a"]  # lowercase, since values are lowercased before comparison below
109
+ for col in df.columns:
110
+ if df[col].dtype == 'object':
111
+ df[col] = df[col].apply(lambda x: np.nan if str(x).lower() in placeholders else x)
112
+ return df
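A short sketch of how these helpers are typically chained; the column names and per-column strategy lists are illustrative, and in the app the lists come from decide_fill_null in llm_service.py:

    import pandas as pd

    df = pd.DataFrame({"age": [25, None, 31, 40], "city": ["NY", "NY", "?", "LA"]})

    df = replace_placeholders_with_nan(df)   # "?" in the object column becomes np.nan
    df = remove_high_null(df)                # drop mostly-empty rows/columns first
    df = fill_null_values(df,
                          mean_list=["age"],     # numeric column -> mean fill
                          median_list=[],
                          mode_list=["city"],    # categorical column -> mode fill
                          new_category_list=[],
                          interpolation_list=[])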
app/src/llm_service.py ADDED
@@ -0,0 +1,300 @@
1
+ import os
2
+ import yaml
3
+ import json
4
+ import re
5
+ import streamlit as st
6
+ from langchain.prompts import PromptTemplate
7
+ from langchain.schema import HumanMessage
8
+ from langchain.chat_models import ChatOpenAI
9
+
10
+ config_path = os.path.join(os.path.dirname(__file__), 'config', 'config.yaml')
11
+ with open(config_path, 'r') as file:
12
+ config = yaml.safe_load(file)
13
+ model4_name = config["model4_name"]
14
+ model3_name = config["model3_name"]
15
+ api_key = config["openai_api_key"]
16
+
17
+ def decide_encode_type(attributes, data_frame_head, model_type = 4, user_api_key = None):
18
+ """
19
+ Decides the encoding type for given attributes using a language model via the OpenAI API.
20
+
21
+ Parameters:
22
+ - attributes (list): A list of attributes for which to decide the encoding type.
23
+ - data_frame_head (DataFrame): The head of the DataFrame containing the attributes. This parameter is expected to be a representation of the DataFrame (e.g., a string or a small subset of the actual DataFrame) that gives an overview of the data.
24
+ - model_type (int, optional): Specifies the model to use. The default model_type=4 corresponds to a predefined model named `model4_name`. Another option is model_type=3, which corresponds to `model3_name`.
25
+ - user_api_key (str, optional): The user's OpenAI API key. If not provided, a default API key `api_key` is used.
26
+
27
+ Returns:
28
+ - A JSON object containing the recommended encoding types for the given attributes. Please refer to the prompt templates in config.yaml for details.
29
+
30
+ Raises:
31
+ - Exception: If there is an issue accessing the OpenAI API, such as an invalid API key or a network connection error, the function will raise an exception with a message indicating the problem.
32
+ """
33
+ try:
34
+ model_name = model4_name if model_type == 4 else model3_name
35
+ user_api_key = api_key if user_api_key is None else user_api_key
36
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
37
+
38
+ template = config["numeric_attribute_template"]
39
+ prompt_template = PromptTemplate(input_variables=["attributes", "data_frame_head"], template=template)
40
+ summary_prompt = prompt_template.format(attributes=attributes, data_frame_head=data_frame_head)
41
+
42
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
43
+ json_str = llm_answer.content  # fall back to the raw reply when no ```json fence is present
44
+ if '```json' in llm_answer.content:
45
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
46
+ if match: json_str = match.group(1)
47
+ return json.loads(json_str)
48
+ except Exception as e:
49
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
50
+ st.stop()
51
+
52
+ def decide_fill_null(attributes, types_info, description_info, model_type = 4, user_api_key = None):
53
+ """
54
+ Decides how null values should be filled for the given attributes using a language model via the OpenAI API.
55
+
56
+ Parameters:
57
+ - attributes (list): List of attribute names that contain null values.
58
+ - types_info / description_info: The data types and descriptive statistics of these attributes, providing context for the filling decision.
59
+ - model_type (int, optional): The model to use, where 4 is the default. Can be customized to use a different model.
60
+ - user_api_key (str, optional): The user's OpenAI API key. If None, a default key is used.
61
+
62
+ Returns:
63
+ - dict: A JSON object with the recommended null-filling method for each attribute. Please refer to the prompt templates in config.yaml for details.
64
+
65
+ Raises:
66
+ - Exception: If there is an issue accessing the OpenAI API, such as an invalid API key or a network connection error, the function will raise an exception with a message indicating the problem.
67
+ """
68
+ try:
69
+ model_name = model4_name if model_type == 4 else model3_name
70
+ user_api_key = api_key if user_api_key is None else user_api_key
71
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
72
+
73
+ template = config["null_attribute_template"]
74
+ prompt_template = PromptTemplate(input_variables=["attributes", "types_info", "description_info"], template=template)
75
+ summary_prompt = prompt_template.format(attributes=attributes, types_info=types_info, description_info=description_info)
76
+
77
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
78
+ json_str = llm_answer.content  # fall back to the raw reply when no ```json fence is present
79
+ if '```json' in llm_answer.content:
80
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
81
+ if match: json_str = match.group(1)
82
+ return json.loads(json_str)
83
+ except Exception as e:
84
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
85
+ st.stop()
86
+
87
+ def decide_model(shape_info, head_info, nunique_info, description_info, model_type = 4, user_api_key = None):
88
+ """
89
+ Decides the most suitable machine learning model based on dataset characteristics.
90
+
91
+ Parameters:
92
+ - shape_info (dict): Information about the shape of the dataset.
93
+ - head_info (str or DataFrame): The head of the dataset or its string representation.
94
+ - nunique_info (dict): Information about the uniqueness of dataset attributes.
95
+ - description_info (str): Descriptive information about the dataset.
96
+ - model_type (int, optional): Specifies which model to consult for decision-making.
97
+ - user_api_key (str, optional): OpenAI API key for making requests.
98
+
99
+ Returns:
100
+ - dict: A JSON object containing the recommended classification models. Please refer to the prompt templates in config.yaml for details.
101
+
102
+ Raises:
103
+ - Exception: If there is an issue accessing the OpenAI API, such as an invalid API key or a network connection error, the function will raise an exception with a message indicating the problem.
104
+ """
105
+ try:
106
+ model_name = model4_name if model_type == 4 else model3_name
107
+ user_api_key = api_key if user_api_key is None else user_api_key
108
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
109
+
110
+ template = config["decide_model_template"]
111
+ prompt_template = PromptTemplate(input_variables=["shape_info", "head_info", "nunique_info", "description_info"], template=template)
112
+ summary_prompt = prompt_template.format(shape_info=shape_info, head_info=head_info, nunique_info=nunique_info, description_info=description_info)
113
+
114
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
115
+ json_str = llm_answer.content  # fall back to the raw reply when no ```json fence is present
116
+ if '```json' in llm_answer.content:
117
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
118
+ if match: json_str = match.group(1)
119
+ return json.loads(json_str)
120
+ except Exception as e:
121
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
122
+ st.stop()
123
+
124
+ def decide_cluster_model(shape_info, description_info, cluster_info, model_type = 4, user_api_key = None):
125
+ """
126
+ Determines the appropriate clustering model based on dataset characteristics.
127
+
128
+ Parameters:
129
+ - shape_info: Information about the dataset shape.
130
+ - description_info: Descriptive statistics or information about the dataset.
131
+ - cluster_info: Additional information relevant to clustering.
132
+ - model_type (int, optional): The model type to use for decision making (default 4).
133
+ - user_api_key (str, optional): The user's API key for OpenAI.
134
+
135
+ Returns:
136
+ - A JSON object with the recommended clustering models. Please refer to the prompt templates in config.yaml for details.
137
+
138
+ Raises:
139
+ - Exception: If unable to access the OpenAI API or another error occurs.
140
+ """
141
+ try:
142
+ model_name = model4_name if model_type == 4 else model3_name
143
+ user_api_key = api_key if user_api_key is None else user_api_key
144
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
145
+
146
+ template = config["decide_clustering_model_template"]
147
+ prompt_template = PromptTemplate(input_variables=["shape_info", "description_info", "cluster_info"], template=template)
148
+ summary_prompt = prompt_template.format(shape_info=shape_info, description_info=description_info, cluster_info=cluster_info)
149
+
150
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
151
+ json_str = llm_answer.content  # fall back to the raw reply when no ```json fence is present
152
+ if '```json' in llm_answer.content:
153
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
154
+ if match: json_str = match.group(1)
155
+ return json.loads(json_str)
156
+ except Exception as e:
157
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
158
+ st.stop()
159
+
160
+ def decide_regression_model(shape_info, description_info, Y_name, model_type = 4, user_api_key = None):
161
+ """
162
+ Determines the appropriate regression model based on dataset characteristics and the target variable.
163
+
164
+ Parameters:
165
+ - shape_info: Information about the dataset shape.
166
+ - description_info: Descriptive statistics or information about the dataset.
167
+ - Y_name: The name of the target variable.
168
+ - model_type (int, optional): The model type to use for decision making (default 4).
169
+ - user_api_key (str, optional): The user's API key for OpenAI.
170
+
171
+ Returns:
172
+ - A JSON object with the recommended regression models. Please refer to the prompt templates in config.yaml for details.
173
+
174
+ Raises:
175
+ - Exception: If unable to access the OpenAI API or another error occurs.
176
+ """
177
+ try:
178
+ model_name = model4_name if model_type == 4 else model3_name
179
+ user_api_key = api_key if user_api_key is None else user_api_key
180
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
181
+
182
+ template = config["decide_regression_model_template"]
183
+ prompt_template = PromptTemplate(input_variables=["shape_info", "description_info", "Y_name"], template=template)
184
+ summary_prompt = prompt_template.format(shape_info=shape_info, description_info=description_info, Y_name=Y_name)
185
+
186
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
187
+ json_str = llm_answer.content  # fall back to the raw reply when no ```json fence is present
188
+ if '```json' in llm_answer.content:
189
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
190
+ if match: json_str = match.group(1)
191
+ return json.loads(json_str)
192
+ except Exception as e:
193
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
194
+ st.stop()
195
+
196
+ def decide_target_attribute(attributes, types_info, head_info, model_type = 4, user_api_key = None):
197
+ """
198
+ Determines the target attribute for modeling based on dataset attributes and characteristics.
199
+
200
+ Parameters:
201
+ - attributes: A list of dataset attributes.
202
+ - types_info: Information about the data types of the attributes.
203
+ - head_info: A snapshot of the dataset's first few rows.
204
+ - model_type (int, optional): The model type to use for decision making (default 4).
205
+ - user_api_key (str, optional): The user's API key for OpenAI.
206
+
207
+ Returns:
208
+ - The name of the recommended target attribute. Please refer to the prompt templates in config.yaml for details.
209
+
210
+ Raises:
211
+ - Exception: If unable to access the OpenAI API or another error occurs.
212
+ """
213
+ try:
214
+ model_name = model4_name if model_type == 4 else model3_name
215
+ user_api_key = api_key if user_api_key is None else user_api_key
216
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
217
+
218
+ template = config["decide_target_attribute_template"]
219
+ prompt_template = PromptTemplate(input_variables=["attributes", "types_info", "head_info"], template=template)
220
+ summary_prompt = prompt_template.format(attributes=attributes, types_info=types_info, head_info=head_info)
221
+
222
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
223
+ json_str = llm_answer.content  # fall back to the raw reply when no ```json fence is present
224
+ if '```json' in llm_answer.content:
225
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
226
+ if match: json_str = match.group(1)
227
+ return json.loads(json_str)["target"]
228
+ except Exception as e:
229
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
230
+ st.stop()
231
+
232
+ def decide_test_ratio(shape_info, model_type = 4, user_api_key = None):
233
+ """
234
+ Determines the appropriate train-test split ratio based on dataset characteristics.
235
+
236
+ Parameters:
237
+ - shape_info: Information about the dataset shape.
238
+ - model_type (int, optional): The model type to use for decision making (default 4).
239
+ - user_api_key (str, optional): The user's API key for OpenAI.
240
+
241
+ Returns:
242
+ - The recommended train-test split ratio as a float. Please refer to the prompt templates in config.yaml for details.
243
+
244
+ Raises:
245
+ - Exception: If unable to access the OpenAI API or another error occurs.
246
+ """
247
+ try:
248
+ model_name = model4_name if model_type == 4 else model3_name
249
+ user_api_key = api_key if user_api_key is None else user_api_key
250
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
251
+
252
+ template = config["decide_test_ratio_template"]
253
+ prompt_template = PromptTemplate(input_variables=["shape_info"], template=template)
254
+ summary_prompt = prompt_template.format(shape_info=shape_info)
255
+
256
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
257
+ json_str = llm_answer.content  # fall back to the raw reply when no ```json fence is present
258
+ if '```json' in llm_answer.content:
259
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
260
+ if match: json_str = match.group(1)
261
+ return json.loads(json_str)["test_ratio"]
262
+ except Exception as e:
263
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
264
+ st.stop()
265
+
266
+ def decide_balance(shape_info, description_info, balance_info, model_type = 4, user_api_key = None):
267
+ """
268
+ Determines the appropriate method to balance the dataset based on its characteristics.
269
+
270
+ Parameters:
271
+ - shape_info: Information about the dataset shape.
272
+ - description_info: Descriptive statistics or information about the dataset.
273
+ - balance_info: Additional information relevant to dataset balancing.
274
+ - model_type (int, optional): The model type to use for decision making (default 4).
275
+ - user_api_key (str, optional): The user's API key for OpenAI.
276
+
277
+ Returns:
278
+ - The recommended method to balance the dataset. Please refer to the prompt templates in config.yaml for details.
279
+
280
+ Raises:
281
+ - Exception: If unable to access the OpenAI API or another error occurs.
282
+ """
283
+ try:
284
+ model_name = model4_name if model_type == 4 else model3_name
285
+ user_api_key = api_key if user_api_key is None else user_api_key
286
+ llm = ChatOpenAI(model_name=model_name, openai_api_key=user_api_key, temperature=0)
287
+
288
+ template = config["decide_balance_template"]
289
+ prompt_template = PromptTemplate(input_variables=["shape_info", "description_info", "balance_info"], template=template)
290
+ summary_prompt = prompt_template.format(shape_info=shape_info, description_info=description_info, balance_info=balance_info)
291
+
292
+ llm_answer = llm([HumanMessage(content=summary_prompt)])
293
+ json_str = llm_answer.content  # fall back to the raw reply when no ```json fence is present
294
+ if '```json' in llm_answer.content:
295
+ match = re.search(r'```json\n(.*?)```', llm_answer.content, re.DOTALL)
296
+ if match: json_str = match.group(1)
297
+ return json.loads(json_str)["method"]
298
+ except Exception as e:
299
+ st.error("Cannot access the OpenAI API. Please check your API key or network connection.")
300
+ st.stop()
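A minimal sketch of a call into this module; the DataFrame, the API key, and the returned mapping are placeholders, and the JSON shape follows numeric_attribute_template in config.yaml:

    import pandas as pd

    df = pd.DataFrame({"color": ["red", "blue"], "size": ["S", "M"]})

    encode_plan = decide_encode_type(
        attributes=["color", "size"],
        data_frame_head=df.head(20),
        model_type=3,              # GPT-3.5 model name from config.yaml
        user_api_key="sk-...",     # placeholder key
    )
    # expected shape, e.g. {"color": 2, "size": 1}: one-hot encode color, integer-map size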
app/src/model_service.py ADDED
@@ -0,0 +1,196 @@
1
+ import io
2
+ import numpy as np
3
+ import streamlit as st
4
+ from collections import Counter
5
+ from sklearn import metrics
6
+ from sklearn.preprocessing import StandardScaler
7
+ from sklearn.cluster import KMeans
8
+ from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
9
+ from joblib import dump
10
+ from sklearn.metrics import roc_curve, silhouette_score, calinski_harabasz_score, davies_bouldin_score, f1_score, r2_score, mean_squared_error, mean_absolute_error
11
+ from sklearn.model_selection import train_test_split
12
+
13
+ def split_data(X, Y, test_size = 0.2, random_state = 42, perform_pca = False):
14
+ """
15
+ Splits the dataset into training and testing sets, optionally standardizing the data if PCA is not performed.
16
+
17
+ :param X: Feature matrix.
18
+ :param Y: Target vector.
19
+ :param test_size: Proportion of the dataset to include in the test split.
20
+ :param random_state: Controls the shuffling applied to the data before applying the split.
21
+ :param perform_pca: Has PCA been performed or not. If not, standardizes the data.
22
+ :return: A tuple containing split and optionally transformed datasets: X_train, X_test, Y_train, Y_test.
23
+ """
24
+ X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)
25
+
26
+ if not perform_pca:
27
+ scaler = StandardScaler()
28
+ X_train = scaler.fit_transform(X_train)
29
+ X_test = scaler.transform(X_test)
30
+
31
+ return X_train, X_test, Y_train, Y_test
32
+
33
+ def check_and_balance(X, Y, balance_threshold=0.5, method=1):
34
+ """
35
+ Check if the dataset is imbalanced and perform oversampling if necessary using RandomOverSampler, SMOTE, or ADASYN.
36
+
37
+ Args:
38
+ X (DataFrame): Feature set.
39
+ Y (Series): Target variable.
40
+ balance_threshold (float): Threshold for class balance.
41
+ method (int): Method for oversampling. Options are 'random', 'smote', or 'adasyn'.
42
+
43
+ Returns:
44
+ X_resampled, Y_resampled (DataFrame/Series): Resampled data if imbalance is detected, else original data.
45
+ """
46
+ try:
47
+ # Check the distribution of the target variable
48
+ class_distribution = Counter(Y)
49
+
50
+ # Determine if the dataset is imbalanced
51
+ min_class_samples = min(class_distribution.values())
52
+ max_class_samples = max(class_distribution.values())
53
+ is_imbalanced = min_class_samples / max_class_samples < balance_threshold
54
+
55
+ if is_imbalanced and method != 4:
56
+ if method == 1:
57
+ oversampler = RandomOverSampler(random_state=0)
58
+ elif method == 2:
59
+ oversampler = SMOTE(random_state=0)
60
+ elif method == 3:
61
+ oversampler = ADASYN(random_state=0)
62
+
63
+ X_resampled, Y_resampled = oversampler.fit_resample(X, Y)
64
+ return X_resampled, Y_resampled
65
+ else:
66
+ return X, Y
67
+ except Exception as e:
68
+ st.error("The target attribute may be continuous. Please check the data type.")
69
+ st.stop()
70
+
71
+ def estimate_optimal_clusters(df):
72
+ """
73
+ Estimates the optimal number of clusters for KMeans clustering using the elbow method and silhouette scores.
74
+
75
+ :param df: DataFrame containing the dataset to cluster.
76
+ :return: The estimated optimal number of clusters.
77
+ """
78
+ sse = {}
79
+ for k in range(2, 11):
80
+ kmeans = KMeans(n_clusters=k, random_state=42).fit(df)
81
+ sse[k] = kmeans.inertia_
82
+
83
+ # Find the elbow point: compute the first and second differences of the SSE
84
+ sse_values = list(sse.values())
85
+ first_diff = np.diff(sse_values) # first difference
86
+ second_diff = np.diff(first_diff) # second difference
87
+ knee_point = np.argmax(second_diff) + 2
88
+
89
+ # find the optimal number of clusters around the knee point
90
+ silhouette_avg_scores = {}
91
+ for k in range(knee_point - 1, knee_point + 2):
92
+ if k >= 2: # make sure k is at least 2
93
+ kmeans = KMeans(n_clusters=k, random_state=42).fit(df)
94
+ silhouette_avg_scores[k] = silhouette_score(df, kmeans.labels_)
95
+
96
+ # Find the optimal number of clusters based on the highest average silhouette score
97
+ optimal_clusters = max(silhouette_avg_scores, key=silhouette_avg_scores.get)
98
+
99
+ return optimal_clusters
100
+
101
+ def calculate_f1_score(model, X_test, Y_test, binary_classification=True):
102
+ """
103
+ Calculates the F1 score for the predictions made by a model on a test set.
104
+
105
+ The function supports both binary and multi-class settings by adjusting the 'average' parameter in the f1_score calculation.
106
+
107
+ :param model: The trained machine learning model used for predictions.
108
+ :param X_test: The feature matrix for the test set.
109
+ :param Y_test: The true labels for the test set.
110
+ :param binary_classification: If True, calculates the F1 score for binary classification. Otherwise, calculates for multi-class classification using the 'macro' average.
111
+ :return: The F1 score of the model predictions.
112
+ """
113
+ y_pred = model.predict(X_test)
114
+ if binary_classification:
115
+ f1 = f1_score(Y_test, y_pred, average='binary')
116
+ else:
117
+ f1 = f1_score(Y_test, y_pred, average='macro')
118
+ return f1
119
+
120
+ def model_score(model, X_test, Y_test):
121
+ """
122
+ Calculate the model score for classification models.
123
+ """
124
+ score = model.score(X_test, Y_test)
125
+ return score
126
+
127
+ def fpr_and_tpr(model, X_test, Y_test):
128
+ """
129
+ Calculate the false positive rate and true positive rate for classification models.
130
+ """
131
+ Y_pred = model.predict_proba(X_test)[:, 1]
132
+ fpr, tpr, _ = roc_curve(Y_test, Y_pred)
133
+ return fpr, tpr
134
+
135
+ def auc(fpr, tpr):
136
+ """
137
+ Calculate the area under the ROC curve for classification models.
138
+ """
139
+ auc = metrics.auc(fpr, tpr)
140
+ return auc
141
+
142
+ def calculate_silhouette_score(X, labels):
143
+ """
144
+ Calculate the silhouette score for clustering models.
145
+ """
146
+ return silhouette_score(X, labels)
147
+
148
+ def calculate_calinski_harabasz_score(X, labels):
149
+ """
150
+ Calculate the calinski harabasz score for clustering models.
151
+ """
152
+ return calinski_harabasz_score(X, labels)
153
+
154
+ def calculate_davies_bouldin_score(X, labels):
155
+ """
156
+ Calculate the davies bouldin score for clustering models.
157
+ """
158
+ return davies_bouldin_score(X, labels)
159
+
160
+ def gmm_predict(X, model):
161
+ """
162
+ Get the predicted labels for a GMM model.
163
+ """
164
+ labels = model.predict(X)
165
+ return labels
166
+
167
+ def calculate_r2_score(y_pred, Y_test):
168
+ """
169
+ Calculate the r2 score for regression models.
170
+ """
171
+ r2 = r2_score(Y_test, y_pred)
172
+ return r2
173
+
174
+ def calculate_mse_and_rmse(y_pred, Y_test):
175
+ """
176
+ Calculate the mean squared error and root mean squared error for regression models.
177
+ """
178
+ mse = mean_squared_error(Y_test, y_pred)
179
+ rmse = np.sqrt(mse)
180
+ return mse, rmse
181
+
182
+ def calculate_mae(y_pred, Y_test):
183
+ """
184
+ Calculate the mean absolute error for regression models.
185
+ """
186
+ mae = mean_absolute_error(Y_test, y_pred)
187
+ return mae
188
+
189
+ def save_model(model):
190
+ """
191
+ Serializes a machine learning model into a binary format using joblib's dump function and stores it in a BytesIO buffer.
192
+ """
193
+ buffer = io.BytesIO()
194
+ dump(model, buffer)
195
+ buffer.seek(0)
196
+ return buffer.getvalue()
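Because save_model returns raw bytes, the result can be handed straight to st.download_button; a sketch, with a stand-in model and an illustrative widget label:

    import streamlit as st
    from sklearn.linear_model import LinearRegression

    model = LinearRegression().fit([[0], [1], [2]], [0, 1, 2])   # stand-in for a trained model

    st.download_button(
        label="Download trained model",
        data=save_model(model),          # joblib-serialized bytes from the BytesIO buffer
        file_name="model.joblib",
    )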
app/src/pca.py ADDED
@@ -0,0 +1,140 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.decomposition import PCA
4
+ from sklearn.preprocessing import StandardScaler
5
+ from src.preprocess import convert_to_integer
6
+
7
+ def decide_pca(df, cumulative_variance_threshold=0.95, min_dim_reduction_ratio=0.1):
8
+ """
9
+ Determines whether PCA should be performed based on cumulative variance threshold and dimension reduction ratio.
10
+
11
+ Parameters:
12
+ - df (DataFrame): The input DataFrame.
13
+ - cumulative_variance_threshold (float): The threshold of explained variance to retain. Default is 0.95.
14
+ - min_dim_reduction_ratio (float): The minimum ratio of dimension reduction required to perform PCA. Default is 0.1.
15
+
16
+ Returns:
17
+ - perform_pca (bool): Whether PCA should be performed.
18
+ - n_components (int): The number of principal components to retain.
19
+ """
20
+ # Remove non-numeric columns
21
+ numeric_df = df.select_dtypes(include=[np.number])
22
+
23
+ # Standardizing the Data
24
+ scaler = StandardScaler()
25
+ scaled_data = scaler.fit_transform(numeric_df)
26
+
27
+ # PCA for Explained Variance
28
+ pca = PCA()
29
+ pca.fit(scaled_data)
30
+
31
+ # Calculate cumulative variance
32
+ cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
33
+
34
+ # Find the number of components for the desired threshold
35
+ n_components = np.where(cumulative_variance >= cumulative_variance_threshold)[0][0] + 1
36
+
37
+ # Calculate the dimension reduction ratio
38
+ dim_reduction_ratio = 1 - (n_components / df.shape[1])
39
+
40
+ # Check if PCA should be performed based on the dimension reduction ratio
41
+ perform_pca = dim_reduction_ratio >= min_dim_reduction_ratio
42
+ return perform_pca, n_components
43
+
44
+ def perform_pca(df, n_components, Y_name):
45
+ """
46
+ Performs PCA on the dataset, optionally excluding a target column, and standardizes the data.
47
+
48
+ Parameters:
49
+ - df (DataFrame): The input DataFrame.
50
+ - n_components (int): The number of principal components to retain.
51
+ - Y_name (str): The name of the target column to exclude from PCA and reattach afterwards.
52
+
53
+ Returns:
54
+ - pca_df (DataFrame): DataFrame with principal components and optionally the target column.
55
+ """
56
+ # Save the target column data
57
+ drop_columns = []
58
+ if Y_name:
59
+ target_data = df[Y_name]
60
+ drop_columns.append(Y_name)
61
+
62
+ # Remove non-numeric columns and the target column
63
+ numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')
64
+
65
+ # Standardizing the Data
66
+ scaler = StandardScaler()
67
+ scaled_data = scaler.fit_transform(numeric_df)
68
+
69
+ # Applying PCA
70
+ pca = PCA(n_components=n_components)
71
+ principal_components = pca.fit_transform(scaled_data)
72
+
73
+ # Create a new DataFrame with principal components
74
+ columns = [f'PC{i+1}' for i in range(n_components)]
75
+ pca_df = pd.DataFrame(data=principal_components, columns=columns)
76
+
77
+ # Reattach the target column
78
+ if Y_name:
79
+ pca_df[Y_name] = target_data.reset_index(drop=True)
80
+ pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])
81
+
82
+ return pca_df
83
+
84
+ def perform_PCA_for_clustering(df, n_components):
85
+ """
86
+ Applies PCA transformation for clustering tasks on the given DataFrame.
87
+
88
+ Parameters:
89
+ - df (DataFrame): The input DataFrame to apply PCA.
90
+ - n_components (int): The number of principal components to retain.
91
+
92
+ Returns:
93
+ - pca_df (DataFrame): DataFrame of the principal components.
94
+ """
95
+ # Applying PCA
96
+ pca = PCA(n_components=n_components)
97
+ principal_components = pca.fit_transform(df)
98
+
99
+ # Create a new DataFrame with principal components
100
+ columns = [f'PC{i+1}' for i in range(n_components)]
101
+ pca_df = pd.DataFrame(data=principal_components, columns=columns)
102
+
103
+ return pca_df
104
+
105
+ def perform_PCA_for_regression(df, n_components, Y_name):
106
+ """
107
+ Applies PCA for regression tasks, excluding a specified target column from the transformation.
108
+
109
+ Parameters:
110
+ - df (DataFrame): The input DataFrame.
111
+ - n_components (int): The number of principal components to retain.
112
+ - Y_name (str): The name of the target column to exclude from PCA and append back after transformation.
113
+
114
+ Returns:
115
+ - pca_df (DataFrame): A new DataFrame with principal components and the target column.
116
+ """
117
+
118
+ # Save the target column data
119
+ drop_columns = []
120
+ if Y_name:
121
+ target_data = df[Y_name]
122
+ drop_columns.append(Y_name)
123
+
124
+ # Remove non-numeric columns and the target column
125
+ numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')
126
+
127
+ # Applying PCA
128
+ pca = PCA(n_components=n_components)
129
+ principal_components = pca.fit_transform(numeric_df)
130
+
131
+ # Create a new DataFrame with principal components
132
+ columns = [f'PC{i+1}' for i in range(n_components)]
133
+ pca_df = pd.DataFrame(data=principal_components, columns=columns)
134
+
135
+ # Reattach the target column
136
+ if Y_name:
137
+ pca_df[Y_name] = target_data.reset_index(drop=True)
138
+ pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])
139
+
140
+ return pca_df
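A sketch of how decide_pca and perform_pca fit together; the random DataFrame and the target column name are illustrative:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    df = pd.DataFrame(rng.random((100, 8)), columns=[f"x{i}" for i in range(8)])
    df["target"] = rng.integers(0, 2, 100)

    should_reduce, n_components = decide_pca(df.drop(columns=["target"]))
    if should_reduce:
        df = perform_pca(df, n_components, "target")   # principal components plus the target column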
app/src/plot.py ADDED
@@ -0,0 +1,394 @@
1
+ import nltk
2
+ import seaborn as sns
3
+ import numpy as np
4
+ import pandas as pd
5
+ import streamlit as st
6
+ import matplotlib.pyplot as plt
7
+ import plotly.express as px
8
+ import plotly.graph_objects as go
9
+ import scipy.stats as stats
10
+ from sklearn.decomposition import PCA
11
+ from wordcloud import WordCloud
12
+ from sklearn.metrics import confusion_matrix
13
+ from nltk import regexp_tokenize
14
+
15
+ # Single attribute visualization
16
+ def distribution_histogram(df, attribute):
17
+ """
18
+ Histogram of the distribution of a single attribute.
19
+ """
20
+ if df[attribute].dtype == 'object' or pd.api.types.is_categorical_dtype(df[attribute]):
21
+ codes, uniques = pd.factorize(df[attribute])
22
+ temp_df = pd.DataFrame({attribute: codes})
23
+ fig, ax = plt.subplots(figsize=(8, 6))
24
+ sns.histplot(temp_df[attribute], ax=ax, discrete=True, color='#e17160')
25
+ ax.set_xticks(range(len(uniques)))
26
+ ax.set_xticklabels(uniques, rotation=45, ha='right')
27
+ else:
28
+ fig, ax = plt.subplots(figsize=(6, 4))
29
+ sns.histplot(df[attribute], ax=ax, color='#e17160')
30
+
31
+ ax.set_title(f"Distribution of {attribute}")
32
+ return fig
33
+
34
+ def distribution_boxplot(df, attribute):
35
+ """
36
+ Boxplot of the distribution of a single attribute.
37
+ """
38
+ if df[attribute].dtype == 'object' or pd.api.types.is_categorical_dtype(df[attribute]):
39
+ return -1
40
+ fig, ax = plt.subplots(figsize=(8, 6))
41
+ sns.boxenplot(data=df[attribute], palette=["#32936f", "#26a96c", "#2bc016"])
42
+ ax.set_title(f"Boxplot of {attribute}")
43
+ return fig
44
+
45
+ def count_Y(df, Y_name):
46
+ """
47
+ Donut chart of the distribution of a single attribute.
48
+ """
49
+ if Y_name in df.columns and df[Y_name].nunique() >= 1:
50
+ value_counts = df[Y_name].value_counts()
51
+ fig = px.pie(names=value_counts.index,
52
+ values=value_counts.values,
53
+ title=f'Distribution of {Y_name}',
54
+ hole=0.5,
55
+ color_discrete_sequence=px.colors.sequential.Cividis_r)
56
+ return fig
57
+
58
+ def density_plot(df, column_name):
59
+ """
60
+ Density plot of the distribution of a single attribute.
61
+ """
62
+ if column_name in df.columns:
63
+ fig = px.density_contour(df, x=column_name, y=column_name,
64
+ title=f'Density Plot of {column_name}',
65
+ color_discrete_sequence=px.colors.sequential.Inferno)
66
+ return fig
67
+
68
+ # Mutiple attribute visualization
69
+ def box_plot(df, column_names):
70
+ """
71
+ Box plot of multiple attributes.
72
+ """
73
+ if len(column_names) > 1 and not all(df[column_names].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
74
+ return -1
75
+ valid_columns = [col for col in column_names if col in df.columns]
76
+ if valid_columns:
77
+ fig = px.box(df, y=valid_columns,
78
+ title=f'Box Plot of {", ".join(valid_columns)}',
79
+ color_discrete_sequence=px.colors.sequential.Cividis_r)
80
+ return fig
81
+
82
+ def violin_plot(df, column_names):
83
+ """
84
+ Violin plot of multiple attributes.
85
+ """
86
+ if len(column_names) > 1 and not all(df[column_names].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
87
+ return -1
88
+ valid_columns = [col for col in column_names if col in df.columns]
89
+ if valid_columns:
90
+ fig = px.violin(df, y=valid_columns,
91
+ title=f'Violin Plot of {", ".join(valid_columns)}',
92
+ color_discrete_sequence=px.colors.sequential.Cividis_r)
93
+ return fig
94
+
95
+ def strip_plot(df, column_names):
96
+ """
97
+ Strip plot of multiple attributes.
98
+ """
99
+ if len(column_names) > 1 and not all(df[column_names].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
100
+ return -1
101
+ valid_columns = [col for col in column_names if col in df.columns]
102
+ if valid_columns:
103
+ fig = px.strip(df, y=valid_columns,
104
+ title=f'Strip Plot of {", ".join(valid_columns)}',
105
+ color_discrete_sequence=px.colors.sequential.Cividis_r)
106
+ return fig
107
+
108
+ def multi_plot_scatter(df, selected_attributes):
109
+ """
110
+ Scatter plot of multiple attributes.
111
+ """
112
+ if len(selected_attributes) < 2:
113
+ return -1
114
+
115
+ plt.figure(figsize=(10, 6))
116
+ if df[selected_attributes[0]].dtype not in [np.float64, np.int64]:
117
+ x, x_labels = pd.factorize(df[selected_attributes[0]])
118
+ plt.xticks(ticks=np.arange(len(x_labels)), labels=x_labels, rotation=45)
119
+ else:
120
+ x = df[selected_attributes[0]]
121
+
122
+ if df[selected_attributes[1]].dtype not in [np.float64, np.int64]:
123
+ y, y_labels = pd.factorize(df[selected_attributes[1]])
124
+ plt.yticks(ticks=np.arange(len(y_labels)), labels=y_labels)
125
+ else:
126
+ y = df[selected_attributes[1]]
127
+
128
+ plt.scatter(x, y, c=np.linspace(0, 1, len(df)), cmap='viridis')
129
+ plt.colorbar()
130
+ plt.xlabel(selected_attributes[0])
131
+ plt.ylabel(selected_attributes[1])
132
+ plt.title(f'Scatter Plot of {selected_attributes[0]} vs {selected_attributes[1]}')
133
+ return plt.gcf()
134
+
135
+ def multi_plot_line(df, selected_attributes):
136
+ """
137
+ Line plot of multiple attributes.
138
+ """
139
+ if not all(df[selected_attributes].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
140
+ return -1
141
+ if len(selected_attributes) >= 2:
142
+ plt.figure(figsize=(10, 6))
143
+ colors = plt.cm.viridis(np.linspace(0, 1, len(selected_attributes)))
144
+ for i, attribute in enumerate(selected_attributes):
145
+ plt.plot(df.index, df[attribute], marker='', linewidth=2, color=colors[i], label=attribute)
146
+ plt.legend()
147
+ plt.xlabel(selected_attributes[0])
148
+ plt.ylabel(selected_attributes[1])
149
+ plt.title(f'Line Plot of {selected_attributes[0]} vs {selected_attributes[1]}')
150
+ return plt.gcf()
151
+ else:
152
+ return -2
153
+
154
+ def multi_plot_heatmap(df, selected_attributes):
155
+ """
156
+ Correlation heatmap of multiple attributes.
157
+ """
158
+ if not all(df[selected_attributes].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
159
+ return -1
160
+ if len(selected_attributes) >= 1:
161
+ sns.set_theme()
162
+ plt.figure(figsize=(10, 8))
163
+ sns.heatmap(df[selected_attributes].corr(), annot=True, cmap='viridis')
164
+ plt.title('Heatmap of Correlation')
165
+ return plt.gcf()
166
+
167
+ # Overall visualization
168
+ @st.cache_data
169
+ def correlation_matrix(df):
170
+ """
171
+ Correlation heatmap of all attributes using Seaborn.
172
+ """
173
+ plt.figure(figsize=(16, 12))
174
+ sns.set(font_scale=0.9)
175
+ sns.heatmap(df.corr(), annot=True, cmap='viridis', annot_kws={"size": 12})
176
+ return plt.gcf()
177
+
178
+ @st.cache_data
179
+ def correlation_matrix_plotly(df):
180
+ """
181
+ Correlation heatmap of all attributes using Plotly.
182
+ """
183
+ corr_matrix = df.corr()
184
+ labels = corr_matrix.columns
185
+ text = [[f'{corr_matrix.iloc[i, j]:.2f}' for j in range(len(labels))] for i in range(len(labels))]
186
+ fig = go.Figure(data=go.Heatmap(
187
+ z=corr_matrix.values,
188
+ x=labels,
189
+ y=labels,
190
+ colorscale='Viridis',
191
+ colorbar=dict(title='Correlation'),
192
+ text=text,
193
+ hoverinfo='text',
194
+ ))
195
+ fig.update_layout(
196
+ title='Correlation Matrix Between Attributes',
197
+ xaxis=dict(tickmode='linear'),
198
+ yaxis=dict(tickmode='linear'),
199
+ width=800,
200
+ height=700,
201
+ )
202
+ fig.update_layout(font=dict(size=10))
203
+ return fig
204
+
205
+ @st.cache_data
206
+ def list_all(df, max_plots=16):
207
+ """
208
+ Display histograms of all attributes in the DataFrame.
209
+ """
210
+
211
+ # Calculate the number of plots to display (up to 16)
212
+ num_plots = min(len(df.columns), max_plots)
213
+ nrows = int(np.ceil(num_plots / 4))
214
+ ncols = min(num_plots, 4)
215
+ fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 4 * nrows))
216
+ fig.suptitle('Attribute Distributions', fontsize=20)
217
+ plt.style.use('ggplot')
218
+ sns.set(style="darkgrid")
219
+
220
+ # if only one plot, convert to list
221
+ if num_plots == 1: axes = [axes]
222
+
223
+ # Flatten the axes array
224
+ axes = axes.flatten()
225
+
226
+ # Display the histograms
227
+ for i, column in enumerate(df.columns[:num_plots]):
228
+ sns.histplot(ax=axes[i], data=df, x=column, color='#1867ac')
229
+
230
+ # Hide additional subplots
231
+ for ax in axes[num_plots:]: ax.axis('off')
232
+
233
+ plt.tight_layout()
234
+ plt.subplots_adjust(top=0.95) # Adjust the top to accommodate the title
235
+ return fig
236
+
237
+ # Model evaluation
238
+ def confusion_metrix(model_name, model, X_test, Y_test):
239
+ """
240
+ Confusion matrix plot for classification models
241
+ """
242
+ Y_pred = model.predict(X_test)
243
+ matrix = confusion_matrix(Y_test, Y_pred)
244
+ plt.figure(figsize=(10, 7)) # temporary
245
+ sns_heatmap = sns.heatmap(matrix, annot=True, cmap='Blues', fmt='g', annot_kws={"size": 20})
246
+ plt.title(f"Confusion Matrix for {model_name}", fontsize=20)
247
+ plt.xlabel('Predicted labels', fontsize=16)
248
+ plt.ylabel('True labels', fontsize=16)
249
+ return sns_heatmap.figure
250
+
251
+ def roc(model_name, fpr, tpr):
252
+ """
253
+ ROC curve for classification models
254
+ """
255
+ fig = plt.figure()
256
+ plt.style.use('ggplot')
257
+ plt.plot([0,1],[0,1],'k--')
258
+ plt.plot(fpr, tpr, label=model_name)
259
+ plt.xlabel('False Positive rate')
260
+ plt.ylabel('True Positive rate')
261
+ plt.title(f'ROC Curve - {model_name}')
262
+ plt.legend(loc='best')
263
+ plt.xticks(rotation=45)
264
+ return fig
265
+
266
+ def plot_clusters(X, labels):
267
+ """
268
+ Scatter plot of clusters for clustering models
269
+ """
270
+ sns.set(style="whitegrid")
271
+ pca = PCA(n_components=2)
272
+ X_pca = pca.fit_transform(X)
273
+ unique_labels = set(labels)
274
+ colors = plt.cm.viridis(np.linspace(0, 1, len(unique_labels)))
275
+
276
+ fig, ax = plt.subplots()
277
+ for color, label in zip(colors, unique_labels):
278
+ idx = labels == label
279
+ ax.scatter(X_pca[idx, 0], X_pca[idx, 1], color=color, label=f'Cluster {label}', s=50)
280
+
281
+ ax.set_title('Cluster Scatter Plot')
282
+ ax.legend()
283
+ return fig
284
+
285
+ def plot_residuals(y_pred, Y_test):
286
+ """
287
+ Residual plot for regression models
288
+ """
289
+ residuals = Y_test - y_pred
290
+ fig, ax = plt.subplots()
291
+ sns.residplot(x=y_pred, y=residuals, lowess=True, ax=ax, scatter_kws={'alpha': 0.7}, line_kws={'color': 'purple', 'lw': 2})
292
+ ax.set_xlabel('Predicted Values')
293
+ ax.set_ylabel('Residuals')
294
+ ax.set_title('Residual Plot')
295
+ return fig
296
+
297
+ def plot_predictions_vs_actual(y_pred, Y_test):
298
+ """
299
+ Scatter plot of predicted vs. actual values for regression models
300
+ """
301
+ fig, ax = plt.subplots()
302
+ ax.scatter(Y_test, y_pred, c='#10a37f', marker='x')
303
+ ax.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'k--', lw=2)
304
+ ax.set_xlabel('Actual')
305
+ ax.set_ylabel('Predicted')
306
+ ax.set_title('Actual vs. Predicted')
307
+ ax.set_facecolor('white')
308
+ ax.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray')
309
+ ax.spines['top'].set_visible(False)
310
+ ax.spines['right'].set_visible(False)
311
+ return fig
312
+
313
+ def plot_qq_plot(y_pred, Y_test):
314
+ """
315
+ Quantile-Quantile plot for regression models
316
+ """
317
+ residuals = Y_test - y_pred
318
+ fig, ax = plt.subplots()
319
+ (osm, osr), (slope, intercept, r) = stats.probplot(residuals, dist="norm", plot=None)
320
+ line = slope * osm + intercept
321
+ ax.plot(osm, line, 'grey', lw=2)
322
+ ax.scatter(osm, osr, alpha=0.8, edgecolors='#e8b517', c='yellow', label='Data Points')
323
+ ax.set_title('Quantile-Quantile Plot')
324
+ ax.set_facecolor('white')
325
+ ax.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray')
326
+ ax.spines['top'].set_visible(False)
327
+ ax.spines['right'].set_visible(False)
328
+ ax.set_xlabel('Theoretical Quantiles')
329
+ ax.set_ylabel('Ordered Values')
330
+ return fig
331
+
332
+ # Advanced Visualization
333
+ @st.cache_data
334
+ def word_cloud_plot(text):
335
+ """
336
+ Generates and displays a word cloud from the given text.
337
+
338
+ The word cloud visualizes the frequency of occurrence of words in the text, with the size of each word indicating its frequency.
339
+
340
+ :param text: The input text from which to generate the word cloud.
341
+ :return: A matplotlib figure object containing the word cloud if successful, -1 otherwise.
342
+ """
343
+ try:
344
+ words = regexp_tokenize(text, pattern=r'\w+')
345
+ text_dist = nltk.FreqDist(words)
346
+ wordcloud = WordCloud(width=1200, height=600, background_color='white').generate_from_frequencies(text_dist)
347
+ fig, ax = plt.subplots(figsize=(10, 7.5))
348
+ ax.imshow(wordcloud, interpolation='bilinear')
349
+ ax.axis('off')
350
+ return fig
351
+ except Exception:
352
+ return -1
353
+
354
+ @st.cache_data
355
+ def world_map(df, country_column, key_attribute):
356
+ """
357
+ Creates a choropleth world map visualization based on the specified DataFrame.
358
+
359
+ The function highlights countries based on a key attribute, providing an interactive map that can be used to analyze geographical data distributions.
360
+
361
+ :param df: DataFrame containing the data to be visualized.
362
+ :param country_column: Name of the column in df that contains country names.
363
+ :param key_attribute: Name of the column in df that contains the data to visualize on the map.
364
+ :return: A Plotly figure object representing the choropleth map if successful, -1 otherwise.
365
+ """
366
+ try:
367
+ hover_data_columns = [col for col in df.columns if col != country_column]
368
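+ # NOTE: px.choropleth below reads ISO alpha-3 codes from an "iso_alpha" column in df (assumed to exist)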
+ fig = px.choropleth(df, locations="iso_alpha",
369
+ color=key_attribute,
370
+ hover_name=country_column,
371
+ hover_data=hover_data_columns,
372
+ color_continuous_scale=px.colors.sequential.Cividis,
373
+ projection="equirectangular",)
374
+ return fig
375
+ except Exception:
376
+ return -1
377
+
378
+ @st.cache_data
379
+ def scatter_3d(df, x, y, z):
380
+ """
381
+ Generates a 3D scatter plot from the given DataFrame.
382
+
383
+ Each point in the plot corresponds to a row in the DataFrame, with its position determined by three specified columns. Points are colored based on the values of the z-axis.
384
+
385
+ :param df: DataFrame containing the data to be visualized.
386
+ :param x: Name of the column in df to use for the x-axis values.
387
+ :param y: Name of the column in df to use for the y-axis values.
388
+ :param z: Name of the column in df to use for the z-axis values and color coding.
389
+ :return: A Plotly figure object containing the 3D scatter plot if successful, -1 otherwise.
390
+ """
391
+ try:
392
+ return px.scatter_3d(df, x=x, y=y, z=z, color=z, color_continuous_scale=px.colors.sequential.Viridis)
393
+ except Exception:
394
+ return -1
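Usage sketch (editor's note): a minimal example of how the clustering plot above could be driven. The toy data, the KMeans setup, and the `src.plot` import path are illustrative assumptions, not part of this commit.

import streamlit as st
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from src.plot import plot_clusters  # assumed import path

# Toy data: three Gaussian blobs in four dimensions
X, _ = make_blobs(n_samples=300, centers=3, n_features=4, random_state=0)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)

# plot_clusters projects X to 2D with PCA and colors each point by its cluster label
st.pyplot(plot_clusters(X, labels))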
app/src/predictive_model.py ADDED
@@ -0,0 +1,81 @@
1
+ import streamlit as st
2
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
3
+ from sklearn.linear_model import LogisticRegression
4
+ from sklearn.naive_bayes import GaussianNB
5
+ from sklearn.svm import SVC
6
+ from xgboost import XGBClassifier
7
+
8
+ @st.cache_data
9
+ def train_selected_model(X_train, Y_train, model_type, model_params=None):
10
+ """
11
+ Trains a specific classification model based on the provided model type and parameters.
12
+
13
+ Parameters:
14
+ - X_train (array-like): The training input samples.
15
+ - Y_train (array-like): The target labels for classification.
16
+ - model_type (int): Specifies the type of classification model to be trained.
17
+ 1 for Logistic Regression, 2 for Support Vector Machine (SVM), 3 for Naive Bayes,
18
+ 4 for Random Forest, 5 for AdaBoost, 6 for XGBoost, and 7 for Gradient Boosting.
19
+ - model_params (dict, optional): A dictionary of parameters for the model. Defaults to None.
20
+
21
+ Returns:
22
+ - model: The trained model object based on the specified type.
23
+ """
24
+ if model_type == 1:
25
+ return LogisticRegression_train(X_train, Y_train, model_params)
26
+ elif model_type == 2:
27
+ return SVM_train(X_train, Y_train, model_params)
28
+ elif model_type == 3:
29
+ return NaiveBayes_train(X_train, Y_train, model_params)
30
+ elif model_type == 4:
31
+ return RandomForest_train(X_train, Y_train, model_params=model_params)
32
+ elif model_type == 5:
33
+ return AdaBoost_train(X_train, Y_train, model_params)
34
+ elif model_type == 6:
35
+ return XGBoost_train(X_train, Y_train, model_params)
36
+ elif model_type == 7:
37
+ return GradientBoosting_train(X_train, Y_train, model_params)
38
+
39
+ def LogisticRegression_train(X_train, Y_train, model_params=None):
40
+ if model_params is None: model_params = {}
41
+ logreg = LogisticRegression(**model_params)
42
+ logreg.fit(X_train, Y_train)
43
+ return logreg
44
+
45
+ def SVM_train(X_train, Y_train, model_params=None):
46
+ if model_params is None: model_params = {}
47
+ svm = SVC(**model_params)
48
+ svm.fit(X_train, Y_train)
49
+ return svm
50
+
51
+ def NaiveBayes_train(X_train, Y_train, model_params=None):
52
+ if model_params is None: model_params = {}
53
+ nb = GaussianNB(**model_params)
54
+ nb.fit(X_train, Y_train)
55
+ return nb
56
+
57
+ def RandomForest_train(X_train, Y_train, n_estimators=100, random_state=None, model_params=None):
58
+ if model_params is None: model_params = {}
59
+ rf_params = {'n_estimators': n_estimators, 'random_state': random_state}
60
+ rf_params.update(model_params)
61
+ rf = RandomForestClassifier(**rf_params)
62
+ rf.fit(X_train, Y_train)
63
+ return rf
64
+
65
+ def AdaBoost_train(X_train, Y_train, model_params=None):
66
+ if model_params is None: model_params = {}
67
+ ab = AdaBoostClassifier(**model_params)
68
+ ab.fit(X_train, Y_train)
69
+ return ab
70
+
71
+ def XGBoost_train(X_train, Y_train, model_params=None):
72
+ if model_params is None: model_params = {}
73
+ xgb = XGBClassifier(**model_params)
74
+ xgb.fit(X_train, Y_train)
75
+ return xgb
76
+
77
+ def GradientBoosting_train(X_train, Y_train, model_params=None):
78
+ if model_params is None: model_params = {}
79
+ gb = GradientBoostingClassifier(**model_params)
80
+ gb.fit(X_train, Y_train)
81
+ return gb
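Usage sketch (editor's note): how the classification dispatcher above might be called. The Iris dataset, the split, and the hyperparameters are illustrative assumptions.

from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from src.predictive_model import train_selected_model  # assumed import path

X, y = load_iris(return_X_y=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# model_type=4 dispatches to RandomForest_train; model_params are forwarded to the estimator
model = train_selected_model(X_train, Y_train, model_type=4, model_params={"n_estimators": 200})
print(accuracy_score(Y_test, model.predict(X_test)))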
app/src/preprocess.py ADDED
@@ -0,0 +1,122 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from scipy import stats
4
+ from sklearn.preprocessing import StandardScaler, PowerTransformer
5
+
6
+ def convert_to_numeric(df, convert_int_cols_list, one_hot_cols_list, drop_cols):
7
+ """
8
+ Convert specified columns in the DataFrame to numeric formats and drop specified columns.
9
+ Integer conversion and one-hot encoding are applied based on the provided lists of columns.
10
+ Returns a modified DataFrame and a dictionary of mappings used for conversions.
11
+
12
+ :param df: Pandas DataFrame to be processed.
13
+ :param convert_int_cols_list: List of column names to be converted to integer type.
14
+ :param one_hot_cols_list: List of column names to be converted to one-hot encoding.
15
+ :param drop_cols: List of column names to be dropped from the DataFrame.
16
+ :return: A tuple with two elements:
17
+ 1. DataFrame with specified columns converted and specified columns dropped.
18
+ 2. Dictionary of mappings for each conversion type ('integer_mappings' and 'one_hot_mappings').
19
+ """
20
+ df, int_mapping = convert_to_integer(df, convert_int_cols_list)
21
+ df, one_hot_mapping = convert_to_one_hot(df, one_hot_cols_list)
22
+ df = df.drop(columns=drop_cols, errors='ignore')
23
+ mappings = {'integer_mappings': int_mapping, 'one_hot_mappings': one_hot_mapping}
24
+ return df, mappings
25
+
26
+ def convert_to_integer(df, columns_to_convert=[]):
27
+ """
28
+ Convert specified non-numeric columns in the DataFrame to integer type,
29
+ and return a dictionary of mappings from original values to integers.
30
+
31
+ :param df: Pandas DataFrame to be processed.
32
+ :param columns_to_convert: List of column names to be converted to integer type.
33
+ :return: A tuple with two elements:
34
+ 1. DataFrame with specified columns converted to integer type.
35
+ 2. Dictionary of mappings for each converted column.
36
+ """
37
+ mappings = {}
38
+ for column in columns_to_convert:
39
+
40
+ if df[column].dtype == 'object':
41
+ # Build the integer-code-to-original-value mapping (kept so the encoding can be decoded later)
42
+ unique_values = df[column].unique()
43
+ int_to_value_map = {i: value for i, value in enumerate(unique_values)}
44
+ mappings[column] = int_to_value_map
45
+
46
+ # Apply the reversed mapping to the DataFrame
47
+ value_to_int_map = {v: k for k, v in int_to_value_map.items()}
48
+ df[column] = df[column].map(value_to_int_map)
49
+
50
+ return df, mappings
51
+
52
+ def convert_to_one_hot(df, columns_to_convert=[]):
53
+ """
54
+ Convert specified non-numeric columns in the DataFrame to one-hot encoding,
55
+ and return a modified DataFrame and a dictionary of mappings used for one-hot encoding.
56
+
57
+ :param df: Pandas DataFrame to be processed.
58
+ :param columns_to_convert: List of column names to be converted to one-hot encoding.
59
+ :return: A tuple with two elements:
60
+ 1. DataFrame with specified columns converted to one-hot encoding.
61
+ 2. Dictionary of mappings for each converted column.
62
+ """
63
+ mappings = {}
64
+ df_modified = df.copy()
65
+
66
+ for column in columns_to_convert:
67
+ # Check if the column is categorical
68
+ if df[column].dtype == 'object' or df[column].dtype == 'category':
69
+ # Perform one-hot encoding
70
+ one_hot = pd.get_dummies(df[column], prefix=column)
71
+ # Add the new columns to the modified DataFrame
72
+ df_modified = pd.concat([df_modified, one_hot], axis=1)
73
+ # Drop the original column
74
+ df_modified = df_modified.drop(column, axis=1)
75
+
76
+ # Store the mapping
77
+ mappings[column] = {i: column + '_' + str(i) for i in df[column].unique()}
78
+
79
+ return df_modified, mappings
80
+
81
+ def remove_rows_with_empty_target(df, Y_name):
82
+ """
83
+ Remove rows from the DataFrame where the target column has empty values.
84
+
85
+ :param df: Pandas DataFrame to be processed.
86
+ :param Y_name: Name of the target column to check for empty values.
87
+ :return: DataFrame with rows removed where target column value is empty.
88
+ """
89
+ # Remove rows where the target column is empty (NaN)
90
+ cleaned_df = df.dropna(subset=[Y_name])
91
+ return cleaned_df
92
+
93
+ def remove_duplicates(df):
94
+ """
95
+ Remove duplicate rows from the DataFrame.
96
+ """
97
+ return df.drop_duplicates()
98
+
99
+ def transform_data_for_clustering(df):
100
+ """
101
+ Transform numeric columns in the DataFrame for clustering.
102
+ Applies a PowerTransformer to columns with skewness over a threshold and standardizes them.
103
+ This can help in making the clustering algorithm more effective by normalizing the scale of numerical features.
104
+
105
+ :param df: Pandas DataFrame to be transformed.
106
+ :return: DataFrame with transformed numeric columns suitable for clustering.
107
+ """
108
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
109
+ transformed_df = df.copy()
110
+ pt = PowerTransformer(method='box-cox', standardize=False)
111
+
112
+ for col in numeric_cols:
113
+ if (transformed_df[col] > 0).all():
114
+ skewness = stats.skew(transformed_df[col])
115
+ if abs(skewness) > 0.5:
116
+ transformed_data = pt.fit_transform(transformed_df[[col]])
117
+ transformed_df[col] = transformed_data
118
+
119
+ scaler = StandardScaler()
120
+ transformed_df[numeric_cols] = scaler.fit_transform(transformed_df[numeric_cols])
121
+
122
+ return transformed_df
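Usage sketch (editor's note): a small, self-contained run of the conversion pipeline above. The column names and encoding choices are illustrative assumptions.

import pandas as pd
from src.preprocess import convert_to_numeric  # assumed import path

df = pd.DataFrame({
    "city": ["Paris", "Lyon", "Paris"],   # to one-hot encode
    "grade": ["low", "high", "mid"],      # to integer-encode
    "user_id": [101, 102, 103],           # to drop
    "price": [10.0, 12.5, 9.9],
})
df_num, mappings = convert_to_numeric(
    df, convert_int_cols_list=["grade"], one_hot_cols_list=["city"], drop_cols=["user_id"]
)
print(df_num.columns.tolist())               # ['grade', 'price', 'city_Lyon', 'city_Paris']
print(mappings["integer_mappings"]["grade"]) # {0: 'low', 1: 'high', 2: 'mid'}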
app/src/regression_model.py ADDED
@@ -0,0 +1,68 @@
1
+ import streamlit as st
2
+ from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
3
+ from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
4
+
5
+ @st.cache_data
6
+ def train_selected_regression_model(X_train, Y_train, model_type, model_params=None):
7
+ """
8
+ Trains a regression model based on the specified model type and parameters.
9
+
10
+ Parameters:
11
+ - X_train (array-like): The training input samples.
12
+ - Y_train (array-like): The target values (real numbers).
13
+ - model_type (int): An integer representing the type of regression model to train.
14
+ 1 for Linear Regression, 2 for Ridge Regression, 3 for Lasso Regression,
15
+ 4 for Random Forest Regressor, 5 for Gradient Boosting Regressor, and 6 for ElasticNet Regression.
16
+ - model_params (dict, optional): A dictionary of model-specific parameters. Default is None.
17
+
18
+ Returns:
19
+ - The trained regression model object based on the specified model type.
20
+ """
21
+ if model_type == 1:
22
+ return LinearRegression_train(X_train, Y_train, model_params)
23
+ elif model_type == 2:
24
+ return RidgeRegression_train(X_train, Y_train, model_params)
25
+ elif model_type == 3:
26
+ return LassoRegression_train(X_train, Y_train, model_params)
27
+ elif model_type == 4:
28
+ return RandomForestRegressor_train(X_train, Y_train, model_params)
29
+ elif model_type == 5:
30
+ return GradientBoostingRegressor_train(X_train, Y_train, model_params)
31
+ elif model_type == 6:
32
+ return ElasticNetRegressor_train(X_train, Y_train, model_params)
33
+
34
+ def LinearRegression_train(X_train, Y_train, model_params=None):
35
+ if model_params is None: model_params = {}
36
+ lr = LinearRegression(**model_params)
37
+ lr.fit(X_train, Y_train)
38
+ return lr
39
+
40
+ def RidgeRegression_train(X_train, Y_train, model_params=None):
41
+ if model_params is None: model_params = {}
42
+ ridge = Ridge(**model_params)
43
+ ridge.fit(X_train, Y_train)
44
+ return ridge
45
+
46
+ def LassoRegression_train(X_train, Y_train, model_params=None):
47
+ if model_params is None: model_params = {}
48
+ lasso = Lasso(**model_params)
49
+ lasso.fit(X_train, Y_train)
50
+ return lasso
51
+
52
+ def RandomForestRegressor_train(X_train, Y_train, model_params=None):
53
+ if model_params is None: model_params = {}
54
+ rf = RandomForestRegressor(**model_params)
55
+ rf.fit(X_train, Y_train)
56
+ return rf
57
+
58
+ def GradientBoostingRegressor_train(X_train, Y_train, model_params=None):
59
+ if model_params is None: model_params = {}
60
+ gbr = GradientBoostingRegressor(**model_params)
61
+ gbr.fit(X_train, Y_train)
62
+ return gbr
63
+
64
+ def ElasticNetRegressor_train(X_train, Y_train, model_params=None):
65
+ if model_params is None: model_params = {}
66
+ en = ElasticNet(**model_params)
67
+ en.fit(X_train, Y_train)
68
+ return en
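Usage sketch (editor's note): calling the regression dispatcher above on synthetic data. The dataset and hyperparameters are illustrative assumptions.

from sklearn.datasets import make_regression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from src.regression_model import train_selected_regression_model  # assumed import path

X, y = make_regression(n_samples=500, n_features=8, noise=5.0, random_state=0)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# model_type=2 dispatches to RidgeRegression_train; params go straight to the sklearn constructor
ridge = train_selected_regression_model(X_train, Y_train, model_type=2, model_params={"alpha": 0.5})
print(r2_score(Y_test, ridge.predict(X_test)))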
app/src/util.py ADDED
@@ -0,0 +1,242 @@
1
+ import os
2
+ import io
3
+ import pandas as pd
4
+
5
+ def read_file(file_path):
6
+ """
7
+ Read a file from a given path.
8
+ """
9
+ # Check the size of the file
10
+ if os.path.getsize(file_path) > 200 * 1024 * 1024: # 200MB in bytes
11
+ raise ValueError("Too large file")
12
+
13
+ # Extract the file extension
14
+ file_extension = file_path.split('.')[-1]
15
+
16
+ if file_extension == 'csv':
17
+ # Read CSV file
18
+ return pd.read_csv(file_path)
19
+ elif file_extension == 'json':
20
+ # Read JSON file
21
+ return pd.read_json(file_path)
22
+ elif file_extension in ['xls', 'xlsx']:
23
+ # Read Excel file
24
+ return pd.read_excel(file_path, engine='openpyxl')
25
+ else:
26
+ raise ValueError("Unsupported file format: " + file_extension)
27
+
28
+ def read_file_from_streamlit(uploaded_file):
29
+ """
30
+ Read a file from a given streamlit file.
31
+ """
32
+ # Check the size of the file
33
+ if uploaded_file.size > 200 * 1024 * 1024: # 200MB in bytes
34
+ raise ValueError("Too large file")
35
+
36
+ # Extract the file extension
37
+ file_extension = uploaded_file.name.split('.')[-1]
38
+
39
+ if file_extension == 'csv':
40
+ # Read CSV file
41
+ return pd.read_csv(uploaded_file)
42
+ elif file_extension == 'json':
43
+ # Read JSON file
44
+ return pd.read_json(uploaded_file)
45
+ elif file_extension in ['xls', 'xlsx']:
46
+ # Read Excel file
47
+ # Use io.BytesIO to handle the binary stream
48
+ return pd.read_excel(io.BytesIO(uploaded_file.read()), engine='openpyxl')
49
+ else:
50
+ raise ValueError("Unsupported file format: " + file_extension)
51
+
52
+ def select_Y(df, Y_name):
53
+ """
54
+ Select the target variable from the DataFrame.
55
+ """
56
+ if Y_name in df.columns:
57
+ X = df.drop(Y_name, axis=1)
58
+ Y = df[Y_name]
59
+ return X, Y
60
+ else:
61
+ return -1
62
+
63
+ def check_all_columns_numeric(df):
64
+ """
65
+ Check if all columns in a DataFrame are numeric. Return True if so, False otherwise.
66
+ """
67
+ return df.select_dtypes(include=[int, float]).shape[1] == df.shape[1]
68
+
69
+ def non_numeric_columns_and_head(df, num_rows=20):
70
+ """
71
+ Identify non-numeric columns in a DataFrame and return their names and head.
72
+
73
+ :param df: Pandas DataFrame to be examined.
74
+ :param num_rows: Number of rows to include in the head (default is 20).
75
+ :return: A tuple with two elements:
76
+ 1. List of column names that are not numeric (integer or float).
77
+ 2. CSV-formatted string containing the head of the non-numeric columns.
78
+ """
79
+ # Identify columns that are not of numeric data type
80
+ non_numeric_cols = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]
81
+
82
+ # Get the head of the non-numeric columns
83
+ non_numeric_head = df[non_numeric_cols].head(num_rows).to_csv()
84
+
85
+ return non_numeric_cols, non_numeric_head
86
+
87
+ def contain_null_attributes_info(df):
88
+ """
89
+ Identifies columns with missing values, summarizes their statistics, and reports their data types.
90
+
91
+ This function checks for attributes within a DataFrame that contain null values,
92
+ generates descriptive statistics for these attributes, and compiles information about their data types.
93
+
94
+ :param df: A pandas DataFrame to be analyzed.
95
+ :return: A tuple containing:
96
+ - A list of columns that contain null values.
97
+ - A string representation of data types for these columns.
98
+ - A CSV-formatted string containing descriptive statistics (count, mean, median, and standard deviation) for these columns.
99
+ Returns an empty list, -1, and -1 if no columns with null values are found.
100
+ """
101
+ attributes = df.columns[df.isnull().any()].tolist()
102
+ if not attributes: return [], -1, -1
103
+
104
+ description_info = df[attributes].describe(percentiles=[.5])
105
+ description_info = description_info.loc[['count', 'mean', '50%', 'std']].round(2).to_csv()
106
+
107
+ dtypes_df = df[attributes].dtypes
108
+ types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()])
109
+
110
+ return attributes, types_info, description_info
111
+
112
+ def attribute_info(df):
113
+ """
114
+ Obtain the attributes, types, and head information of the DataFrame.
115
+ """
116
+ attributes = df.columns.tolist()
117
+ dtypes_df = df.dtypes
118
+ types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()])
119
+ head_info = df.head(10).to_csv()
120
+
121
+ return attributes, types_info, head_info
122
+
123
+ def get_data_overview(df):
124
+ """
125
+ Obtain the shape, head, nunique, and description information of the DataFrame.
126
+ """
127
+ shape_info = str(df.shape)
128
+ head_info = df.head().to_csv()
129
+ nunique_info = df.nunique().to_csv()
130
+ description_info = df.describe(include='all').to_csv()
131
+ return shape_info, head_info, nunique_info, description_info
132
+
133
+ def get_balance_info(df, Y_name):
134
+ """
135
+ Obtain the shape, description, and balance information of the DataFrame.
136
+ """
137
+ shape_info = df.shape
138
+ description_info = df.describe().to_csv()
139
+ balance_info = df[Y_name].value_counts().to_dict()
140
+ return shape_info, description_info, balance_info
141
+
142
+ def separate_decode_list(decided_dict, Y_name):
143
+ """
144
+ Process the LLM response and return the lists of columns to be converted to integer, one-hot encoding, and drop
145
+ """
146
+ convert_int_cols = [key for key, value in decided_dict.items() if value == 1]
147
+ one_hot_cols = [key for key, value in decided_dict.items() if value == 2]
148
+ drop_cols = [key for key, value in decided_dict.items() if value == 3]
149
+ if Y_name and Y_name in one_hot_cols:
150
+ one_hot_cols.remove(Y_name)
151
+ convert_int_cols.append(Y_name)
152
+ if Y_name and Y_name in drop_cols:
153
+ drop_cols.remove(Y_name)
154
+ convert_int_cols.append(Y_name)
155
+ return convert_int_cols, one_hot_cols, drop_cols
156
+
157
+ def separate_fill_null_list(fill_null_dict):
158
+ """
159
+ Process the LLM response and return the lists of columns to be filled with mean, median, mode, new category, interpolation
160
+ """
161
+ mean_list = [key for key, value in fill_null_dict.items() if value == 1]
162
+ median_list = [key for key, value in fill_null_dict.items() if value == 2]
163
+ mode_list = [key for key, value in fill_null_dict.items() if value == 3]
164
+ new_category_list = [key for key, value in fill_null_dict.items() if value == 4]
165
+ interpolation_list = [key for key, value in fill_null_dict.items() if value == 5]
166
+ return mean_list, median_list, mode_list, new_category_list, interpolation_list
167
+
168
+ def get_selected_models(model_dict):
169
+ """
170
+ Convert the dictionary of models to a list.
171
+ """
172
+ return list(model_dict.values())
173
+
174
+ def get_model_name(model_no):
175
+ """
176
+ Returns the name of the classification model based on the model number.
177
+ """
178
+ if model_no == 1:
179
+ return "Logistic Regression"
180
+ elif model_no == 2:
181
+ return "SVM"
182
+ elif model_no == 3:
183
+ return "Naive Bayes"
184
+ elif model_no == 4:
185
+ return "Random Forest"
186
+ elif model_no == 5:
187
+ return "ADA Boost"
188
+ elif model_no == 6:
189
+ return "XGBoost"
190
+ elif model_no == 7:
191
+ return "Grandient Boost"
192
+
193
+ def get_cluster_method_name(method):
194
+ """
195
+ Returns the name of the clustering method based on the method number.
196
+ """
197
+ if method == 1:
198
+ return "K-Means"
199
+ elif method == 2:
200
+ return "DBSCAN"
201
+ elif method == 3:
202
+ return "Gaussian Mixture"
203
+ elif method == 4:
204
+ return "Agglomerative Clustering"
205
+ elif method == 5:
206
+ return "Spectral Clustering"
207
+
208
+ def get_balance_method_name(method):
209
+ """
210
+ Returns the name of the balance method based on the method number.
211
+ """
212
+ if method == 1:
213
+ return "ROS"
214
+ elif method == 2:
215
+ return "SMOTE"
216
+ elif method == 3:
217
+ return "ADASYN"
218
+ elif method == 4:
219
+ return "None"
220
+
221
+ def get_regression_method_name(method):
222
+ """
223
+ Returns the name of the regression method based on the method number.
224
+ """
225
+ if method == 1:
226
+ return "Linear Regression"
227
+ elif method == 2:
228
+ return "Ridge Regression"
229
+ elif method == 3:
230
+ return "Lasso Regression"
231
+ elif method == 4:
232
+ return "Random Forest"
233
+ elif method == 5:
234
+ return "Gradient Boosting"
235
+ elif method == 6:
236
+ return "Elastic Net"
237
+
238
+ def count_unique(df, Y):
239
+ """
240
+ Counts the number of unique values in a specified column of a DataFrame.
241
+ """
242
+ return df[Y].nunique()
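Usage sketch (editor's note): how separate_decode_list interprets an LLM decision dictionary. The column names are illustrative assumptions.

from src.util import get_model_name, separate_decode_list  # assumed import path

# Decision codes: 1 = integer-encode, 2 = one-hot encode, 3 = drop
decided_dict = {"gender": 1, "country": 2, "user_id": 3, "label": 2}
convert_int_cols, one_hot_cols, drop_cols = separate_decode_list(decided_dict, Y_name="label")

# The target column is forced to integer encoding rather than being one-hot encoded or dropped
print(convert_int_cols)   # ['gender', 'label']
print(one_hot_cols)       # ['country']
print(drop_cols)          # ['user_id']
print(get_model_name(4))  # 'Random Forest'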
app/util.py ADDED
@@ -0,0 +1,37 @@
1
+ import streamlit as st
2
+ import requests
3
+ import yaml
4
+ import time
5
+ import random
6
+ import os
7
+
8
+ config_path = os.path.join(os.path.dirname(__file__), 'config', 'config.yaml')
9
+ with open(config_path, 'r') as file:
10
+ config_data = yaml.safe_load(file)
11
+
12
+ def load_lottie():
13
+ r1, r2 = requests.get(config_data['lottie_url1']), requests.get(config_data['lottie_url2'])
14
+ if r1.status_code != 200 or r2.status_code != 200:
15
+ return None
16
+ return r1.json(), r2.json()
17
+
18
+ # Yield the message word by word with a short random delay to simulate streaming output
19
+ def stream_data(line):
20
+ for word in line.split():
21
+ yield word + " "
22
+ time.sleep(random.uniform(0.02, 0.05))
23
+
24
+ # Store the welcome message and introduction
25
+ def welcome_message():
26
+ return config_data['welcome_template']
27
+
28
+ def introduction_message():
29
+ return config_data['introduction_template1'], config_data['introduction_template2']
30
+
31
+ # Show developer info at the bottom
32
+ def developer_info():
33
+ time.sleep(2)
34
+ st.write(stream_data(":grey[Streamline Analyst is developed by *Zhe Lin*. You can reach out to me via] :blue[wilson.linzhe@gmail.com] :grey[or] :blue[[GitHub](https://github.com/Wilson-ZheLin)]"))
35
+
36
+ def developer_info_static():
37
+ st.write(":grey[Streamline Analyst is developed by *Zhe Lin*. You can reach out to me via] :blue[wilson.linzhe@gmail.com] :grey[or] :blue[[GitHub](https://github.com/Wilson-ZheLin)]")
app/visualization.py ADDED
@@ -0,0 +1,221 @@
1
+ import streamlit as st
2
+ from util import developer_info_static
3
+ from src.plot import list_all, distribution_histogram, distribution_boxplot, count_Y, box_plot, violin_plot, strip_plot, density_plot ,multi_plot_heatmap, multi_plot_scatter, multi_plot_line, word_cloud_plot, world_map, scatter_3d
4
+
5
+ def display_word_cloud(text):
6
+ _, word_cloud_col, _ = st.columns([1, 3, 1])
7
+ with word_cloud_col:
8
+ word_fig = word_cloud_plot(text)
9
+ if word_fig == -1:
10
+ st.error('Data not supported')
11
+ else:
12
+ st.pyplot(word_fig)
13
+
14
+ def data_visualization(DF):
15
+ st.divider()
16
+ st.subheader('Data Visualization')
17
+ attributes = DF.columns.tolist()
18
+
19
+ # Three tabs for three kinds of visualization
20
+ single_tab, multiple_tab, advanced_tab = st.tabs(['Single Attribute Visualization', 'Multiple Attributes Visualization', 'Advanced Visualization'])
21
+
22
+ # Single attribute visualization
23
+ with single_tab:
24
+ _, col_mid, _ = st.columns([1, 5, 1])
25
+ with col_mid:
26
+ plot_area = st.empty()
27
+
28
+ col1, col2 = st.columns(2)
29
+ with col1:
30
+ att = st.selectbox(
31
+ label = 'Select an attribute to visualize:',
32
+ options = attributes,
33
+ index = len(attributes)-1
34
+ )
35
+ st.write(f'Attribute selected: :green[{att}]')
36
+
37
+ with col2:
38
+ plot_types = ['Donut chart', 'Violin plot', 'Distribution histogram', 'Boxplot', 'Density plot', 'Strip plot', 'Distribution boxplot']
39
+ plot_type = st.selectbox(
40
+ key = 'plot_type1',
41
+ label = 'Select a plot type:',
42
+ options = plot_types,
43
+ index = 0
44
+ )
45
+ st.write(f'Plot type selected: :green[{plot_type}]')
46
+
47
+ if plot_type == 'Distribution histogram':
48
+ fig = distribution_histogram(DF, att)
49
+ plot_area.pyplot(fig)
50
+ elif plot_type == 'Distribution boxplot':
51
+ fig = distribution_boxplot(DF, att)
52
+ if fig == -1:
53
+ plot_area.error('The attribute is not numeric')
54
+ else:
55
+ plot_area.pyplot(fig)
56
+ elif plot_type == 'Donut chart':
57
+ fig = count_Y(DF, att)
58
+ plot_area.plotly_chart(fig)
59
+ elif plot_type == 'Boxplot':
60
+ fig = box_plot(DF, [att])
61
+ plot_area.plotly_chart(fig)
62
+ elif plot_type == 'Violin plot':
63
+ fig = violin_plot(DF, [att])
64
+ plot_area.plotly_chart(fig)
65
+ elif plot_type == 'Strip plot':
66
+ fig = strip_plot(DF, [att])
67
+ plot_area.plotly_chart(fig)
68
+ elif plot_type == 'Density plot':
69
+ fig = density_plot(DF, att)
70
+ plot_area.plotly_chart(fig)
71
+
72
+ # Multiple attribute visualization
73
+ with multiple_tab:
74
+ col1, col2 = st.columns([6, 4])
75
+ with col1:
76
+ options = st.multiselect(
77
+ label = 'Select multiple attributes to visualize:',
78
+ options = attributes,
79
+ default = []
80
+ )
81
+ with col2:
82
+ plot_types = ["Violin plot", "Boxplot", "Heatmap", "Strip plot", "Line plot", "Scatter plot"]
83
+ plot_type = st.selectbox(
84
+ key = 'plot_type2',
85
+ label = 'Select a plot type:',
86
+ options = plot_types,
87
+ index = 0
88
+ )
89
+ _, col_mid, _ = st.columns([1, 5, 1])
90
+ with col_mid:
91
+ plot_area = st.empty()
92
+
93
+ if options:
94
+ if plot_type == 'Scatter plot':
95
+ fig = multi_plot_scatter(DF, options)
96
+ if fig == -1:
97
+ plot_area.error('Scatter plot requires two attributes')
98
+ else:
99
+ plot_area.pyplot(fig)
100
+ elif plot_type == 'Heatmap':
101
+ fig = multi_plot_heatmap(DF, options)
102
+ if fig == -1:
103
+ plot_area.error('The attributes are not numeric')
104
+ else:
105
+ plot_area.pyplot(fig)
106
+ elif plot_type == 'Boxplot':
107
+ fig = box_plot(DF, options)
108
+ if fig == -1:
109
+ plot_area.error('The attributes are not numeric')
110
+ else:
111
+ plot_area.plotly_chart(fig)
112
+ elif plot_type == 'Violin plot':
113
+ fig = violin_plot(DF, options)
114
+ if fig == -1:
115
+ plot_area.error('The attributes are not numeric')
116
+ else:
117
+ plot_area.plotly_chart(fig)
118
+ elif plot_type == 'Strip plot':
119
+ fig = strip_plot(DF, options)
120
+ if fig == -1:
121
+ plot_area.error('The attributes are not numeric')
122
+ else:
123
+ plot_area.plotly_chart(fig)
124
+ elif plot_type == 'Line plot':
125
+ fig = multi_plot_line(DF, options)
126
+ if fig == -1:
127
+ plot_area.error('The attributes are not numeric')
128
+ elif fig == -2:
129
+ plot_area.error('Line plot requires two attributes')
130
+ else:
131
+ plot_area.pyplot(fig)
132
+
133
+ # Advanced visualization
134
+ with advanced_tab:
135
+ st.subheader("3D Scatter Plot")
136
+ column_1, column_2, column_3 = st.columns(3)
137
+ with column_1:
138
+ x = st.selectbox(
139
+ key = 'x',
140
+ label = 'Select the x attribute:',
141
+ options = attributes,
142
+ index = 0
143
+ )
144
+ with column_2:
145
+ y = st.selectbox(
146
+ key = 'y',
147
+ label = 'Select the y attribute:',
148
+ options = attributes,
149
+ index = 1 if len(attributes) > 1 else 0
150
+ )
151
+ with column_3:
152
+ z = st.selectbox(
153
+ key = 'z',
154
+ label = 'Select the z attribute:',
155
+ options = attributes,
156
+ index = 2 if len(attributes) > 2 else 0
157
+ )
158
+ if st.button('Generate 3D Plot'):
159
+ _, fig_3d_col, _ = st.columns([1, 3, 1])
160
+ with fig_3d_col:
161
+ fig_3d_1 = scatter_3d(DF, x, y, z)
162
+ if fig_3d_1 == -1:
163
+ st.error('Data not supported')
164
+ else:
165
+ st.plotly_chart(fig_3d_1)
166
+ st.divider()
167
+
168
+ st.subheader('Word Cloud')
169
+ upload_txt_checkbox = st.checkbox('Upload a new text file instead')
170
+ if upload_txt_checkbox:
171
+ uploaded_txt = st.file_uploader("Choose a text file", accept_multiple_files=False, type="txt")
172
+ if uploaded_txt:
173
+ text = uploaded_txt.read().decode("utf-8")
174
+ display_word_cloud(text)
175
+ else:
176
+ text_attr = st.selectbox(
177
+ label = 'Select the text attribute:',
178
+ options = attributes,
179
+ index = 0)
180
+ if st.button('Generate Word Cloud'):
181
+ text = DF[text_attr].astype(str).str.cat(sep=' ')
182
+ display_word_cloud(text)
183
+ st.divider()
184
+
185
+ st.subheader('World Heat Map')
186
+ col_1, col_2 = st.columns(2)
187
+ with col_1:
188
+ country_col = st.selectbox(
189
+ key = 'country_col',
190
+ label = 'Select the country attribute:',
191
+ options = attributes,
192
+ index = 0
193
+ )
194
+ with col_2:
195
+ heat_attribute = st.selectbox(
196
+ key = 'heat_attribute',
197
+ label = 'Select the attribute to display in heat map:',
198
+ options = attributes,
199
+ index = len(attributes) - 1
200
+ )
201
+ if st.button("Show Heatmap"):
202
+ _, map_col, _ = st.columns([1, 3, 1])
203
+ with map_col:
204
+ world_fig = world_map(DF, country_col, heat_attribute)
205
+ if world_fig == -1:
206
+ st.error('Data not supported')
207
+ else:
208
+ st.plotly_chart(world_fig)
209
+ st.divider()
210
+
211
+ # Data Overview
212
+ st.subheader('Data Overview')
213
+ if 'data_origin' not in st.session_state:
214
+ st.session_state.data_origin = DF
215
+ st.dataframe(st.session_state.data_origin.describe(), width=1200)
216
+ if 'overall_plot' not in st.session_state:
217
+ st.session_state.overall_plot = list_all(st.session_state.data_origin)
218
+ st.pyplot(st.session_state.overall_plot)
219
+
220
+ st.divider()
221
+ developer_info_static()
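Usage sketch (editor's note): data_visualization expects a pandas DataFrame and is meant to run inside the Streamlit app; the toy DataFrame below is an illustrative assumption standing in for the user-uploaded file.

import pandas as pd
from visualization import data_visualization

# Illustrative toy dataset standing in for the uploaded data
df = pd.DataFrame({
    "age": [23, 35, 41, 29],
    "income": [30_000, 52_000, 61_000, 45_000],
    "country": ["FR", "DE", "FR", "ES"],
})
data_visualization(df)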
requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ imbalanced_learn==0.12.0
2
+ joblib==1.2.0
3
+ openai==1.3.4
4
+ langchain==0.1.6
5
+ matplotlib==3.7.2
6
+ nltk==3.8.1
7
+ numpy==1.24.3
8
+ pandas==2.2.0
9
+ plotly==5.18.0
10
+ PyYAML==6.0.1
11
+ Requests==2.31.0
12
+ scikit_learn==1.4.0
13
+ scipy==1.12.0
14
+ seaborn==0.13.2
15
+ streamlit==1.31.0
16
+ streamlit_lottie==0.0.5
17
+ wordcloud==1.9.3
18
+ xgboost==2.0.3
19
+ statsmodels==0.14.0