Spaces:
Runtime error
Runtime error
File size: 8,389 Bytes
9183c57 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
import os
import io
import pandas as pd
def read_file(file_path, max_size=200 * 1024 * 1024):
    """
    Read a tabular data file from disk into a pandas DataFrame.

    The parser is chosen from the file extension, compared case-insensitively:
    ``csv``, ``json``, ``xls`` or ``xlsx``.

    :param file_path: Path of the file to read (``str`` or ``os.PathLike``).
    :param max_size: Largest accepted file size in bytes (default is 200MB).
    :return: The object produced by the matching pandas reader.
    :raises ValueError: If the file is larger than ``max_size`` or the
        extension is not one of the supported formats.
    """
    # Accept pathlib.Path and other path-like objects, not only plain strings
    file_path = os.fspath(file_path)

    # Reject oversized files before attempting to parse them
    if os.path.getsize(file_path) > max_size:
        raise ValueError("Too large file")

    # Text after the last dot; lower-cased so "DATA.CSV" is treated like "data.csv"
    raw_extension = file_path.rsplit('.', 1)[-1]
    file_extension = raw_extension.lower()

    if file_extension == 'csv':
        return pd.read_csv(file_path)
    if file_extension == 'json':
        return pd.read_json(file_path)
    if file_extension in ('xls', 'xlsx'):
        # openpyxl only understands the newer .xlsx format; for legacy .xls
        # files let pandas pick a suitable engine itself.
        engine = 'openpyxl' if file_extension == 'xlsx' else None
        return pd.read_excel(file_path, engine=engine)
    raise ValueError("Unsupported file format: " + raw_extension)
def read_file_from_streamlit(uploaded_file, max_size=200 * 1024 * 1024):
    """
    Read an uploaded file object into a pandas DataFrame.

    The parser is chosen from the extension of ``uploaded_file.name``, compared
    case-insensitively: ``csv``, ``json``, ``xls`` or ``xlsx``.

    :param uploaded_file: File-like object exposing ``name``, ``size`` and
        ``read()`` (as used below).
    :param max_size: Largest accepted upload size in bytes (default is 200MB).
    :return: The object produced by the matching pandas reader.
    :raises ValueError: If the upload is larger than ``max_size`` or the
        extension is not one of the supported formats.
    """
    # Reject oversized uploads before attempting to parse them
    if uploaded_file.size > max_size:
        raise ValueError("Too large file")

    # Text after the last dot; lower-cased so "DATA.CSV" is treated like "data.csv"
    raw_extension = uploaded_file.name.rsplit('.', 1)[-1]
    file_extension = raw_extension.lower()

    if file_extension == 'csv':
        return pd.read_csv(uploaded_file)
    if file_extension == 'json':
        return pd.read_json(uploaded_file)
    if file_extension in ('xls', 'xlsx'):
        # openpyxl only understands the newer .xlsx format; for legacy .xls
        # uploads let pandas pick a suitable engine itself.
        engine = 'openpyxl' if file_extension == 'xlsx' else None
        # Wrap the raw bytes in BytesIO so the reader gets a seekable binary stream
        return pd.read_excel(io.BytesIO(uploaded_file.read()), engine=engine)
    raise ValueError("Unsupported file format: " + raw_extension)
def select_Y(df, Y_name):
    """
    Split a DataFrame into feature columns and the target column.

    :param df: Pandas DataFrame holding features and target.
    :param Y_name: Name of the target column.
    :return: ``(X, Y)`` where ``X`` is ``df`` without the target column and
        ``Y`` is the target column; ``-1`` if ``Y_name`` is not a column of ``df``.
    """
    # Guard clause: an unknown target is signalled with the sentinel -1
    if Y_name not in df.columns:
        return -1
    features = df.drop(Y_name, axis=1)
    target = df[Y_name]
    return features, target
def check_all_columns_numeric(df):
    """
    Check if all columns in a DataFrame are numeric. Return True if so, False otherwise.

    Every column dtype is tested with ``pd.api.types.is_numeric_dtype``, the
    same predicate ``non_numeric_columns_and_head`` uses, so the two functions
    agree on narrow integer/float widths (e.g. ``int8``, ``float16``), unsigned
    integers and booleans. A DataFrame without columns yields True.

    :param df: Pandas DataFrame to be examined.
    :return: True when every column has a numeric dtype, False otherwise.
    """
    # Iterate dtypes rather than df[col] so duplicated column labels are handled
    return all(pd.api.types.is_numeric_dtype(dtype) for dtype in df.dtypes)
def non_numeric_columns_and_head(df, num_rows=20):
    """
    Find the non-numeric columns of a DataFrame and preview their first rows.

    :param df: Pandas DataFrame to be examined.
    :param num_rows: Number of leading rows to include in the preview (default is 20).
    :return: A tuple with two elements:
    1. List of column names whose dtype is not numeric.
    2. CSV-formatted string with the first ``num_rows`` rows of those columns.
    """
    is_numeric = pd.api.types.is_numeric_dtype

    # Collect, in column order, every column that fails the numeric-dtype test
    non_numeric_cols = []
    for column in df.columns:
        if is_numeric(df[column]):
            continue
        non_numeric_cols.append(column)

    # Serialise only the leading rows of the selected columns
    preview = df[non_numeric_cols].head(num_rows)
    return non_numeric_cols, preview.to_csv()
def contain_null_attributes_info(df):
    """
    Identifies columns with missing values, summarizes their statistics, and reports their data types.

    This function checks for attributes within a DataFrame that contain null values,
    generates descriptive statistics for these attributes, and compiles information about their data types.

    :param df: A pandas DataFrame to be analyzed.
    :return: A tuple containing:
    - A list of columns that contain null values.
    - A string with one ``name:dtype`` line per such column.
    - A CSV-formatted string containing descriptive statistics (count, mean, median, and standard deviation)
      as produced by ``DataFrame.describe`` for these columns; a statistic that
      ``describe`` does not report for the selected columns is left empty.
    Returns an empty list, -1, and -1 if no columns with null values are found.
    """
    attributes = df.columns[df.isnull().any()].tolist()
    if not attributes:
        return [], -1, -1

    summary = df[attributes].describe(percentiles=[.5])
    # reindex (rather than .loc) so a missing statistic row becomes NaN instead
    # of raising KeyError, e.g. when every null-containing column is
    # non-numeric and describe() reports count/unique/top/freq only.
    summary = summary.reindex(['count', 'mean', '50%', 'std'])
    description_info = summary.round(2).to_csv()

    dtypes_df = df[attributes].dtypes
    types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()])
    return attributes, types_info, description_info
def attribute_info(df):
    """
    Describe the columns of a DataFrame.

    :param df: Pandas DataFrame to be examined.
    :return: A tuple of (list of column names, string with one ``name:dtype``
        line per column, CSV-formatted string of the first 10 rows).
    """
    column_names = df.columns.tolist()

    # One "name:dtype" entry per column, joined with newlines
    type_lines = []
    for name, dtype in df.dtypes.items():
        type_lines.append("{}:{}".format(name, dtype))
    types_info = "\n".join(type_lines)

    first_rows = df.head(10)
    return column_names, types_info, first_rows.to_csv()
def get_data_overview(df):
    """
    Summarise a DataFrame as four strings.

    :param df: Pandas DataFrame to be summarised.
    :return: A tuple of (shape as a string, CSV of ``df.head()``, CSV of the
        per-column unique-value counts, CSV of ``df.describe(include='all')``).
    """
    first_rows = df.head()
    unique_counts = df.nunique()
    statistics = df.describe(include='all')
    return (
        str(df.shape),
        first_rows.to_csv(),
        unique_counts.to_csv(),
        statistics.to_csv(),
    )
def get_balance_info(df, Y_name):
    """
    Report the size, summary statistics and target distribution of a DataFrame.

    :param df: Pandas DataFrame to be examined.
    :param Y_name: Name of the column whose value frequencies are counted.
    :return: A tuple of (``df.shape``, CSV of ``df.describe()``, dict mapping
        each value of the ``Y_name`` column to its number of occurrences).
    """
    dimensions = df.shape
    statistics_csv = df.describe().to_csv()
    # Frequency of each distinct value in the target column
    value_frequencies = df[Y_name].value_counts()
    return dimensions, statistics_csv, value_frequencies.to_dict()
def separate_decode_list(decided_dict, Y_name):
    """
    Split an encoding decision mapping into three column lists.

    :param decided_dict: Mapping of column name to decision code
        (1 = convert to integer, 2 = one-hot encode, 3 = drop).
    :param Y_name: Name of the target column; may be falsy when there is none.
    :return: A tuple of (integer-conversion columns, one-hot columns, drop columns).
        The target column is never one-hot encoded or dropped: if it was assigned
        to either list it is moved to the integer-conversion list.
    """
    def columns_with(code):
        # Column names, in mapping order, whose decision equals the given code
        return [name for name, decision in decided_dict.items() if decision == code]

    convert_int_cols = columns_with(1)
    one_hot_cols = columns_with(2)
    drop_cols = columns_with(3)

    # Re-route the target column out of the one-hot / drop buckets
    for bucket in (one_hot_cols, drop_cols):
        if Y_name and Y_name in bucket:
            bucket.remove(Y_name)
            convert_int_cols.append(Y_name)

    return convert_int_cols, one_hot_cols, drop_cols
def separate_fill_null_list(fill_null_dict):
    """
    Split a null-filling decision mapping into five column lists.

    :param fill_null_dict: Mapping of column name to strategy code
        (1 = mean, 2 = median, 3 = mode, 4 = new category, 5 = interpolation).
    :return: A tuple of five lists of column names, ordered as
        (mean, median, mode, new category, interpolation).
    """
    # Build one list per strategy code 1..5, each preserving the mapping order
    return tuple(
        [column for column, strategy in fill_null_dict.items() if strategy == code]
        for code in range(1, 6)
    )
def get_selected_models(model_dict):
    """
    Return the values of the model dictionary as a list, in dictionary order.

    :param model_dict: Dictionary whose values are the selected models.
    :return: A new list containing every value of ``model_dict``.
    """
    selected = [*model_dict.values()]
    return selected
def get_model_name(model_no):
    """
    Returns the name of the classification model based on the model number.

    :param model_no: Model number, 1 through 7.
    :return: The model's display name, or None when the number is not recognised.
    """
    # Position in the tuple corresponds to model number (1-based)
    model_names = (
        "Logistic Regression",
        "SVM",
        "Naive Bayes",
        "Random Forest",
        "ADA Boost",
        "XGBoost",
        "Gradient Boost",  # previously misspelled as "Grandient Boost"
    )
    for number, name in enumerate(model_names, start=1):
        if model_no == number:
            return name
    return None
def get_cluster_method_name(method):
    """
    Translate a clustering method number into its display name.

    :param method: Method number, 1 through 5.
    :return: The method's name, or None when the number is not recognised.
    """
    # Position in the tuple corresponds to method number (1-based)
    cluster_names = (
        "K-Means",
        "DBSCAN",
        "Gaussian Mixture",
        "Agglomerative Clustering",
        "Spectral Clustering",
    )
    for number, label in enumerate(cluster_names, start=1):
        if method == number:
            return label
    return None
def get_balance_method_name(method):
    """
    Translate a balance method number into its display name.

    :param method: Method number, 1 through 4.
    :return: The method's name (the string "None" for method 4), or the value
        None when the number is not recognised.
    """
    numbered_names = (
        (1, "ROS"),
        (2, "SMOTE"),
        (3, "ADASYN"),
        (4, "None"),
    )
    # First entry whose number equals the requested method, else None
    return next((label for number, label in numbered_names if method == number), None)
def get_regression_method_name(method):
    """
    Translate a regression method number into its display name.

    :param method: Method number, 1 through 6.
    :return: The method's name, or None when the number is not recognised.
    """
    # Position in the list corresponds to method number (1-based)
    regression_names = [
        "Linear Regression",
        "Ridge Regression",
        "Lasso Regression",
        "Random Forest",
        "Gradient Boosting",
        "Elastic Net",
    ]
    number = 1
    for label in regression_names:
        if method == number:
            return label
        number += 1
    return None
def count_unique(df, Y):
    """
    Count the distinct values in one column of a DataFrame.

    :param df: Pandas DataFrame to be examined.
    :param Y: Name of the column to inspect.
    :return: The result of ``nunique()`` on that column.
    """
    target_column = df[Y]
    return target_column.nunique()
|