File size: 8,389 Bytes
9183c57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import os
import io
import pandas as pd

def read_file(file_path):
    """
    Read a tabular data file from disk into a pandas DataFrame.

    The format is chosen from the text after the last dot in ``file_path``,
    compared case-insensitively. Supported formats: csv, json, xls, xlsx.

    :param file_path: Path (string) of the file to load.
    :return: DataFrame holding the contents of the file.
    :raises ValueError: If the file is larger than 200MB or its extension is
                        not one of the supported formats.
    """
    # Reject oversized files before handing them to pandas
    if os.path.getsize(file_path) > 200 * 1024 * 1024:  # 200MB in bytes
        raise ValueError("Too large file")

    # Keep the extension as typed for the error message, but match on a
    # lower-cased copy so that e.g. "DATA.CSV" is accepted
    raw_extension = file_path.split('.')[-1]
    file_extension = raw_extension.lower()

    if file_extension == 'csv':
        return pd.read_csv(file_path)
    if file_extension == 'json':
        return pd.read_json(file_path)
    if file_extension == 'xlsx':
        return pd.read_excel(file_path, engine='openpyxl')
    if file_extension == 'xls':
        # openpyxl cannot parse the legacy .xls format, so let pandas choose
        # the engine instead of forcing openpyxl
        return pd.read_excel(file_path)
    raise ValueError("Unsupported file format: " + raw_extension)

def read_file_from_streamlit(uploaded_file):
    """
    Read an uploaded file object into a pandas DataFrame.

    The format is chosen from the text after the last dot in
    ``uploaded_file.name``, compared case-insensitively. Supported formats:
    csv, json, xls, xlsx.

    :param uploaded_file: Uploaded file object. This function uses its ``size``
                          and ``name`` attributes and, for Excel files, its
                          ``read()`` method.
    :return: DataFrame holding the contents of the upload.
    :raises ValueError: If the upload is larger than 200MB or its extension is
                        not one of the supported formats.
    """
    # Reject oversized uploads before handing them to pandas
    if uploaded_file.size > 200 * 1024 * 1024:  # 200MB in bytes
        raise ValueError("Too large file")

    # Keep the extension as typed for the error message, but match on a
    # lower-cased copy so that e.g. "DATA.CSV" is accepted
    raw_extension = uploaded_file.name.split('.')[-1]
    file_extension = raw_extension.lower()

    if file_extension == 'csv':
        return pd.read_csv(uploaded_file)
    if file_extension == 'json':
        return pd.read_json(uploaded_file)
    if file_extension in ('xls', 'xlsx'):
        # Use io.BytesIO to handle the binary stream
        workbook_bytes = io.BytesIO(uploaded_file.read())
        if file_extension == 'xlsx':
            return pd.read_excel(workbook_bytes, engine='openpyxl')
        # openpyxl cannot parse the legacy .xls format, so let pandas choose
        # the engine instead of forcing openpyxl
        return pd.read_excel(workbook_bytes)
    raise ValueError("Unsupported file format: " + raw_extension)

def select_Y(df, Y_name):
    """
    Split a DataFrame into feature columns and the target column.

    :param df: DataFrame holding both the features and the target.
    :param Y_name: Label of the target column.
    :return: ``(X, Y)`` where X is ``df`` without the target column and Y is
             the target column itself, or ``-1`` when ``Y_name`` is not a
             column of ``df``.
    """
    # A missing target is reported with the -1 sentinel, not an exception
    if Y_name not in df.columns:
        return -1

    target = df[Y_name]
    features = df.drop(columns=Y_name)
    return features, target

def check_all_columns_numeric(df):
    """
    Check if all columns in a DataFrame are numeric. Return True if so, False otherwise.

    A column counts as numeric when ``pd.api.types.is_numeric_dtype`` accepts its
    dtype. This covers every integer and float width (e.g. int8, uint8, float32),
    not only the default 64-bit ones; pandas also treats boolean dtypes as numeric.
    A DataFrame without columns yields True.

    :param df: Pandas DataFrame to be examined.
    :return: True when every column has a numeric dtype, False otherwise.
    """
    return all(pd.api.types.is_numeric_dtype(dtype) for dtype in df.dtypes)

def non_numeric_columns_and_head(df, num_rows=20):
    """
    List the non-numeric columns of a DataFrame together with a preview of them.

    :param df: Pandas DataFrame to be examined.
    :param num_rows: Number of leading rows to include in the preview (default is 20).
    :return: A tuple with two elements:
             1. List of column names whose dtype is not numeric.
             2. CSV-formatted string with the first ``num_rows`` rows of those columns.
    """
    is_numeric = pd.api.types.is_numeric_dtype

    # Collect, in column order, every column pandas does not consider numeric
    non_numeric_cols = []
    for column in df.columns:
        if is_numeric(df[column]):
            continue
        non_numeric_cols.append(column)

    # Serialise only the leading rows of those columns
    preview = df[non_numeric_cols].head(num_rows)
    return non_numeric_cols, preview.to_csv()

def contain_null_attributes_info(df):
    """
    Identifies columns with missing values, summarizes their statistics, and reports their data types.

    This function checks for attributes within a DataFrame that contain null values,
    generates descriptive statistics for these attributes, and compiles information about their data types.

    :param df: A pandas DataFrame to be analyzed.
    :return: A tuple containing:
             - A list of columns that contain null values.
             - A string with one ``name:dtype`` line per such column.
             - A CSV-formatted string containing descriptive statistics (count, mean, median, and
               standard deviation, rounded to 2 decimals) for these columns. Statistics that
               ``describe()`` does not produce for the columns involved are left empty.
             Returns an empty list, -1, and -1 if no columns with null values are found.
    """
    attributes = df.columns[df.isnull().any()].tolist()
    if not attributes:
        return [], -1, -1

    # describe() only emits the rows that apply to the dtypes present; e.g. when
    # every null-bearing column is text there are no 'mean'/'50%'/'std' rows.
    # reindex() always yields the four expected rows and leaves the unavailable
    # ones empty, whereas selecting them with .loc would raise KeyError.
    summary = df[attributes].describe(percentiles=[.5])
    description_info = summary.reindex(['count', 'mean', '50%', 'std']).round(2).to_csv()

    dtypes_df = df[attributes].dtypes
    types_info = "\n".join(f"{index}:{dtype}" for index, dtype in dtypes_df.items())

    return attributes, types_info, description_info

def attribute_info(df):
    """
    Describe the columns of a DataFrame.

    :param df: Pandas DataFrame to be described.
    :return: A tuple of (list of column names, string with one ``name:dtype``
             line per column, CSV-formatted string of the first 10 rows).
    """
    type_lines = (f"{name}:{dtype}" for name, dtype in df.dtypes.items())
    preview_csv = df.head(10).to_csv()
    return df.columns.tolist(), "\n".join(type_lines), preview_csv

def get_data_overview(df):
    """
    Summarise a DataFrame as four strings.

    :param df: Pandas DataFrame to be summarised.
    :return: A tuple of (shape as a string, CSV of the first 5 rows, CSV of the
             per-column unique-value counts, CSV of ``describe(include='all')``).
    """
    preview_csv = df.head().to_csv()
    unique_counts_csv = df.nunique().to_csv()
    statistics_csv = df.describe(include='all').to_csv()
    return str(df.shape), preview_csv, unique_counts_csv, statistics_csv

def get_balance_info(df, Y_name):
    """
    Report the size, statistics and target distribution of a DataFrame.

    :param df: Pandas DataFrame to be examined.
    :param Y_name: Label of the column whose value distribution is reported.
    :return: A tuple of (shape tuple of ``df``, CSV-formatted ``describe()``
             output, dict mapping each value of ``df[Y_name]`` to its count).
    """
    statistics_csv = df.describe().to_csv()
    class_counts = df[Y_name].value_counts()
    return df.shape, statistics_csv, class_counts.to_dict()

def separate_decode_list(decided_dict, Y_name):
    """
    Split the per-column encoding decisions into three column lists.

    :param decided_dict: Mapping of column name to decision code
                         (1 = convert to integer, 2 = one-hot encode, 3 = drop).
    :param Y_name: Name of the target column; may be falsy when there is none.
    :return: A tuple of (columns to convert to integer, columns to one-hot
             encode, columns to drop).
    """
    def columns_with(code):
        return [column for column, decision in decided_dict.items() if decision == code]

    convert_int_cols = columns_with(1)
    one_hot_cols = columns_with(2)
    drop_cols = columns_with(3)

    # The target is never one-hot encoded or dropped: it is moved to the
    # integer-conversion list instead
    if Y_name:
        for bucket in (one_hot_cols, drop_cols):
            if Y_name in bucket:
                bucket.remove(Y_name)
                convert_int_cols.append(Y_name)

    return convert_int_cols, one_hot_cols, drop_cols

def separate_fill_null_list(fill_null_dict):
    """
    Split the per-column null-filling decisions into five column lists.

    :param fill_null_dict: Mapping of column name to strategy code
                           (1 = mean, 2 = median, 3 = mode, 4 = new category,
                           5 = interpolation).
    :return: A tuple of five lists of column names, in the order
             (mean, median, mode, new category, interpolation).
    """
    # One list per strategy code, built in code order 1..5
    return tuple(
        [column for column, strategy in fill_null_dict.items() if strategy == code]
        for code in (1, 2, 3, 4, 5)
    )

def get_selected_models(model_dict):
    """
    Return the values of ``model_dict`` as a list, in the dictionary's iteration order.
    """
    return [*model_dict.values()]

def get_model_name(model_no):
    """
    Returns the name of the classification model based on the model number.

    :param model_no: Model number, 1 through 7.
    :return: The display name of the model, or None when ``model_no`` is not a
             known model number.
    """
    model_names = {
        1: "Logistic Regression",
        2: "SVM",
        3: "Naive Bayes",
        4: "Random Forest",
        5: "ADA Boost",
        6: "XGBoost",
        7: "Gradient Boost",  # was misspelled "Grandient Boost"
    }
    return model_names.get(model_no)
    
def get_cluster_method_name(method):
    """
    Look up the display name of a clustering method.

    :param method: Method number, 1 through 5.
    :return: The name of the clustering method, or None when ``method`` is not
             a known method number.
    """
    cluster_names = {
        1: "K-Means",
        2: "DBSCAN",
        3: "Gaussian Mixture",
        4: "Agglomerative Clustering",
        5: "Spectral Clustering",
    }
    return cluster_names.get(method)
    
def get_balance_method_name(method):
    """
    Look up the display name of a class-balancing method.

    :param method: Method number, 1 through 4.
    :return: The name of the balance method. Method 4 maps to the string
             "None"; an unknown method number yields the ``None`` object.
    """
    balance_names = {
        1: "ROS",
        2: "SMOTE",
        3: "ADASYN",
        4: "None",
    }
    return balance_names.get(method)
    
def get_regression_method_name(method):
    """
    Look up the display name of a regression method.

    :param method: Method number, 1 through 6.
    :return: The name of the regression method, or None when ``method`` is not
             a known method number.
    """
    regression_names = {
        1: "Linear Regression",
        2: "Ridge Regression",
        3: "Lasso Regression",
        4: "Random Forest",
        5: "Gradient Boosting",
        6: "Elastic Net",
    }
    return regression_names.get(method)
    
def count_unique(df, Y):
    """
    Count the distinct values in one column of a DataFrame.

    :param df: Pandas DataFrame to be examined.
    :param Y: Label of the column to inspect.
    :return: The result of ``nunique()`` on that column.
    """
    target_column = df[Y]
    return target_column.nunique()