import os
import io

import pandas as pd


def read_file(file_path):
    """
    Read a file from a given path.
    """
    # Check the size of the file
    if os.path.getsize(file_path) > 200 * 1024 * 1024:  # 200MB in bytes
        raise ValueError("File is too large (limit is 200MB)")

    # Extract the file extension
    file_extension = file_path.split('.')[-1]
    if file_extension == 'csv':
        # Read CSV file
        return pd.read_csv(file_path)
    elif file_extension == 'json':
        # Read JSON file
        return pd.read_json(file_path)
    elif file_extension in ['xls', 'xlsx']:
        # Read Excel file
        return pd.read_excel(file_path, engine='openpyxl')
    else:
        raise ValueError("Unsupported file format: " + file_extension)


def read_file_from_streamlit(uploaded_file):
    """
    Read a file uploaded through a Streamlit file uploader.
    """
    # Check the size of the file
    if uploaded_file.size > 200 * 1024 * 1024:  # 200MB in bytes
        raise ValueError("File is too large (limit is 200MB)")

    # Extract the file extension
    file_extension = uploaded_file.name.split('.')[-1]
    if file_extension == 'csv':
        # Read CSV file
        return pd.read_csv(uploaded_file)
    elif file_extension == 'json':
        # Read JSON file
        return pd.read_json(uploaded_file)
    elif file_extension in ['xls', 'xlsx']:
        # Read Excel file
        # Use io.BytesIO to handle the binary stream
        return pd.read_excel(io.BytesIO(uploaded_file.read()), engine='openpyxl')
    else:
        raise ValueError("Unsupported file format: " + file_extension)


def select_Y(df, Y_name):
    """
    Select the target variable from the DataFrame.

    Returns a tuple (X, Y) if Y_name is a column of df, otherwise -1.
    """
    if Y_name in df.columns:
        X = df.drop(Y_name, axis=1)
        Y = df[Y_name]
        return X, Y
    else:
        return -1


def check_all_columns_numeric(df):
    """
    Check if all columns in a DataFrame are numeric.
    Return True if so, False otherwise.
    """
    # 'number' covers every numeric dtype (e.g. int32, float32), matching
    # the is_numeric_dtype check used in non_numeric_columns_and_head.
    return df.select_dtypes(include='number').shape[1] == df.shape[1]


def non_numeric_columns_and_head(df, num_rows=20):
    """
    Identify non-numeric columns in a DataFrame and return their names and head.

    :param df: Pandas DataFrame to be examined.
    :param num_rows: Number of rows to include in the head (default is 20).
    :return: A tuple with two elements:
        1. List of column names that are not numeric (integer or float).
        2. CSV-formatted string containing the head of the non-numeric columns.
    """
    # Identify columns that are not of numeric data type
    non_numeric_cols = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]

    # Get the head of the non-numeric columns
    non_numeric_head = df[non_numeric_cols].head(num_rows).to_csv()

    return non_numeric_cols, non_numeric_head
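
# Illustrative sketch of how the loaders and column checks above are typically
# combined; the default file name "example.csv" is a hypothetical placeholder
# and this helper is not called anywhere else in the module.
def _example_load_and_inspect(file_path="example.csv"):
    """
    Load a local file and report any non-numeric columns that would need
    encoding before modelling (sketch only, hypothetical path).
    """
    df = read_file(file_path)
    if check_all_columns_numeric(df):
        # Nothing to encode
        return df, [], ""
    non_numeric_cols, non_numeric_head = non_numeric_columns_and_head(df)
    return df, non_numeric_cols, non_numeric_head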
""" attributes = df.columns[df.isnull().any()].tolist() if not attributes: return [], -1, -1 description_info = df[attributes].describe(percentiles=[.5]) description_info = description_info.loc[['count', 'mean', '50%', 'std']].round(2).to_csv() dtypes_df = df[attributes].dtypes types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()]) return attributes, types_info, description_info def attribute_info(df): """ Obtain the attributes, types, and head information of the DataFrame. """ attributes = df.columns.tolist() dtypes_df = df.dtypes types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()]) head_info = df.head(10).to_csv() return attributes, types_info, head_info def get_data_overview(df): """ Obtain the shape, head, nunique, and description information of the DataFrame. """ shape_info = str(df.shape) head_info = df.head().to_csv() nunique_info = df.nunique().to_csv() description_info = df.describe(include='all').to_csv() return shape_info, head_info, nunique_info, description_info def get_balance_info(df, Y_name): """ Obtain the shape, description, and balance information of the DataFrame. """ shape_info = df.shape description_info = df.describe().to_csv() balance_info = df[Y_name].value_counts().to_dict() return shape_info, description_info, balance_info def separate_decode_list(decided_dict, Y_name): """ Process the LLM response and return the lists of columns to be converted to integer, one-hot encoding, and drop """ convert_int_cols = [key for key, value in decided_dict.items() if value == 1] one_hot_cols = [key for key, value in decided_dict.items() if value == 2] drop_cols = [key for key, value in decided_dict.items() if value == 3] if Y_name and Y_name in one_hot_cols: one_hot_cols.remove(Y_name) convert_int_cols.append(Y_name) if Y_name and Y_name in drop_cols: drop_cols.remove(Y_name) convert_int_cols.append(Y_name) return convert_int_cols, one_hot_cols, drop_cols def separate_fill_null_list(fill_null_dict): """ Process the LLM response and return the lists of columns to be filled with mean, median, mode, new category, interpolation """ mean_list = [key for key, value in fill_null_dict.items() if value == 1] median_list = [key for key, value in fill_null_dict.items() if value == 2] mode_list = [key for key, value in fill_null_dict.items() if value == 3] new_category_list = [key for key, value in fill_null_dict.items() if value == 4] interpolation_list = [key for key, value in fill_null_dict.items() if value == 5] return mean_list, median_list, mode_list, new_category_list, interpolation_list def get_selected_models(model_dict): """ Convert the dictionary of models to a list. """ return list(model_dict.values()) def get_model_name(model_no): """ Returns the name of the classification model based on the model number. """ if model_no == 1: return "Logistic Regression" elif model_no == 2: return "SVM" elif model_no == 3: return "Naive Bayes" elif model_no == 4: return "Random Forest" elif model_no == 5: return "ADA Boost" elif model_no == 6: return "XGBoost" elif model_no == 7: return "Grandient Boost" def get_cluster_method_name(method): """ Returns the name of the clustering method based on the method number. """ if method == 1: return "K-Means" elif method == 2: return "DBSCAN" elif method == 3: return "Gaussian Mixture" elif method == 4: return "Agglomerative Clustering" elif method == 5: return "Spectral Clustering" def get_balance_method_name(method): """ Returns the name of the balance method based on the method number. 
""" if method == 1: return "ROS" elif method == 2: return "SMOTE" elif method == 3: return "ADASYN" elif method == 4: return "None" def get_regression_method_name(method): """ Returns the name of the regression method based on the method number. """ if method == 1: return "Linear Regression" elif method == 2: return "Ridge Regression" elif method == 3: return "Lasso Regression" elif method == 4: return "Random Forest" elif method == 5: return "Gradient Boosting" elif method == 6: return "Elastic Net" def count_unique(df, Y): """ Counts the number of unique values in a specified column of a DataFrame. """ return df[Y].nunique()