import os
import io

import pandas as pd


def read_file(file_path):
    """
    Read a CSV, JSON, or Excel file from the given path into a DataFrame.
    """
    # Reject files larger than 200 MB before attempting to parse them.
    if os.path.getsize(file_path) > 200 * 1024 * 1024:
        raise ValueError("File is too large (limit: 200 MB)")

    file_extension = file_path.split('.')[-1]

    if file_extension == 'csv':
        return pd.read_csv(file_path)
    elif file_extension == 'json':
        return pd.read_json(file_path)
    elif file_extension in ['xls', 'xlsx']:
        return pd.read_excel(file_path, engine='openpyxl')
    else:
        raise ValueError("Unsupported file format: " + file_extension)


def read_file_from_streamlit(uploaded_file):
    """
    Read an uploaded Streamlit file (CSV, JSON, or Excel) into a DataFrame.
    """
    # Reject uploads larger than 200 MB.
    if uploaded_file.size > 200 * 1024 * 1024:
        raise ValueError("File is too large (limit: 200 MB)")

    file_extension = uploaded_file.name.split('.')[-1]

    if file_extension == 'csv':
        return pd.read_csv(uploaded_file)
    elif file_extension == 'json':
        return pd.read_json(uploaded_file)
    elif file_extension in ['xls', 'xlsx']:
        # Wrap the uploaded bytes in a BytesIO buffer so pandas can read them.
        return pd.read_excel(io.BytesIO(uploaded_file.read()), engine='openpyxl')
    else:
        raise ValueError("Unsupported file format: " + file_extension)


def select_Y(df, Y_name):
    """
    Split the DataFrame into features X and target Y.

    Returns (X, Y) if Y_name is a column of df, otherwise -1.
    """
    if Y_name in df.columns:
        X = df.drop(Y_name, axis=1)
        Y = df[Y_name]
        return X, Y
    else:
        return -1
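
# Illustrative behaviour (hypothetical DataFrame): for df with columns
# ['age', 'income', 'label'], select_Y(df, 'label') returns (X, Y) with
# X containing ['age', 'income'] and Y being df['label'];
# select_Y(df, 'missing') returns -1.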


def check_all_columns_numeric(df):
    """
    Check if all columns in a DataFrame are numeric. Return True if so, False otherwise.
    """
    # include='number' matches every numeric dtype (int32, float32, etc.),
    # which is more robust than listing int and float explicitly.
    return df.select_dtypes(include='number').shape[1] == df.shape[1]


def non_numeric_columns_and_head(df, num_rows=20):
    """
    Identify non-numeric columns in a DataFrame and return their names and head.

    :param df: Pandas DataFrame to be examined.
    :param num_rows: Number of rows to include in the head (default is 20).
    :return: A tuple with two elements:
        1. List of column names that are not numeric (integer or float).
        2. CSV-formatted string containing the head of the non-numeric columns.
    """
    non_numeric_cols = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]
    non_numeric_head = df[non_numeric_cols].head(num_rows).to_csv()
    return non_numeric_cols, non_numeric_head
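
# Illustrative behaviour (hypothetical DataFrame): for df with a numeric 'age'
# column and string columns 'name' and 'city', non_numeric_columns_and_head(df)
# returns (['name', 'city'], <CSV string of the first 20 rows of those columns>).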


def contain_null_attributes_info(df):
    """
    Identifies columns with missing values, summarizes their statistics, and reports their data types.

    This function checks for attributes within a DataFrame that contain null values,
    generates descriptive statistics for these attributes, and compiles information about their data types.

    :param df: A pandas DataFrame to be analyzed.
    :return: A tuple containing:
        - A list of columns that contain null values.
        - A string representation of data types for these columns.
        - A CSV-formatted string containing descriptive statistics (count, mean, median, and standard deviation) for these columns.
        Returns an empty list, -1, and -1 if no columns with null values are found.
    """
    attributes = df.columns[df.isnull().any()].tolist()
    if not attributes:
        return [], -1, -1

    # Keep only the count, mean, median (50%), and standard deviation rows.
    description_info = df[attributes].describe(percentiles=[.5])
    description_info = description_info.loc[['count', 'mean', '50%', 'std']].round(2).to_csv()

    dtypes_df = df[attributes].dtypes
    types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()])

    return attributes, types_info, description_info
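
# Illustrative return value (hypothetical data): if only 'age' and 'income'
# contain nulls, this yields (['age', 'income'], "age:float64\nincome:float64",
# <CSV string with count/mean/50%/std rows for those columns>).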


def attribute_info(df):
    """
    Obtain the attributes, types, and head information of the DataFrame.
    """
    attributes = df.columns.tolist()
    dtypes_df = df.dtypes
    types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()])
    head_info = df.head(10).to_csv()

    return attributes, types_info, head_info


def get_data_overview(df):
    """
    Obtain the shape, head, nunique, and description information of the DataFrame.
    """
    shape_info = str(df.shape)
    head_info = df.head().to_csv()
    nunique_info = df.nunique().to_csv()
    description_info = df.describe(include='all').to_csv()
    return shape_info, head_info, nunique_info, description_info


def get_balance_info(df, Y_name):
    """
    Obtain the shape, description, and class balance information of the DataFrame.
    """
    shape_info = df.shape
    description_info = df.describe().to_csv()
    # Class distribution of the target column, e.g. {0: 500, 1: 120}.
    balance_info = df[Y_name].value_counts().to_dict()
    return shape_info, description_info, balance_info


def separate_decode_list(decided_dict, Y_name):
    """
    Process the LLM response and return the lists of columns to be converted to
    integer encoding, one-hot encoding, and dropped.
    """
    # 1 = integer encoding, 2 = one-hot encoding, 3 = drop the column.
    convert_int_cols = [key for key, value in decided_dict.items() if value == 1]
    one_hot_cols = [key for key, value in decided_dict.items() if value == 2]
    drop_cols = [key for key, value in decided_dict.items() if value == 3]
    # The target column is always integer-encoded, never one-hot encoded or dropped.
    if Y_name and Y_name in one_hot_cols:
        one_hot_cols.remove(Y_name)
        convert_int_cols.append(Y_name)
    if Y_name and Y_name in drop_cols:
        drop_cols.remove(Y_name)
        convert_int_cols.append(Y_name)
    return convert_int_cols, one_hot_cols, drop_cols
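
# Illustrative usage (hypothetical LLM decision):
#   separate_decode_list({'income': 1, 'gender': 2, 'id': 3, 'label': 2}, 'label')
# returns (['income', 'label'], ['gender'], ['id']); the target 'label' is
# moved from the one-hot list into the integer-encoding list.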


def separate_fill_null_list(fill_null_dict):
    """
    Process the LLM response and return the lists of columns to be filled with
    mean, median, mode, a new category, or interpolation.
    """
    # 1 = mean, 2 = median, 3 = mode, 4 = new category, 5 = interpolation.
    mean_list = [key for key, value in fill_null_dict.items() if value == 1]
    median_list = [key for key, value in fill_null_dict.items() if value == 2]
    mode_list = [key for key, value in fill_null_dict.items() if value == 3]
    new_category_list = [key for key, value in fill_null_dict.items() if value == 4]
    interpolation_list = [key for key, value in fill_null_dict.items() if value == 5]
    return mean_list, median_list, mode_list, new_category_list, interpolation_list
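
# Illustrative usage (hypothetical LLM decision):
#   separate_fill_null_list({'age': 1, 'salary': 2, 'city': 3, 'segment': 4, 'score': 5})
# returns (['age'], ['salary'], ['city'], ['segment'], ['score']).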


def get_selected_models(model_dict):
    """
    Convert the dictionary of selected models to a list.
    """
    return list(model_dict.values())


def get_model_name(model_no):
    """
    Returns the name of the classification model based on the model number.
    """
    if model_no == 1:
        return "Logistic Regression"
    elif model_no == 2:
        return "SVM"
    elif model_no == 3:
        return "Naive Bayes"
    elif model_no == 4:
        return "Random Forest"
    elif model_no == 5:
        return "AdaBoost"
    elif model_no == 6:
        return "XGBoost"
    elif model_no == 7:
        return "Gradient Boosting"


def get_cluster_method_name(method):
    """
    Returns the name of the clustering method based on the method number.
    """
    if method == 1:
        return "K-Means"
    elif method == 2:
        return "DBSCAN"
    elif method == 3:
        return "Gaussian Mixture"
    elif method == 4:
        return "Agglomerative Clustering"
    elif method == 5:
        return "Spectral Clustering"


def get_balance_method_name(method):
    """
    Returns the name of the balance method based on the method number.
    """
    if method == 1:
        return "ROS"
    elif method == 2:
        return "SMOTE"
    elif method == 3:
        return "ADASYN"
    elif method == 4:
        return "None"


def get_regression_method_name(method):
    """
    Returns the name of the regression method based on the method number.
    """
    if method == 1:
        return "Linear Regression"
    elif method == 2:
        return "Ridge Regression"
    elif method == 3:
        return "Lasso Regression"
    elif method == 4:
        return "Random Forest"
    elif method == 5:
        return "Gradient Boosting"
    elif method == 6:
        return "Elastic Net"


def count_unique(df, Y):
    """
    Counts the number of unique values in a specified column of a DataFrame.
    """
    return df[Y].nunique()