import os
import io

import pandas as pd


def read_file(file_path):
    """
    Read a CSV, JSON, or Excel file from the given path into a DataFrame.
    """
    # Reject files larger than 200 MB before attempting to parse them.
    if os.path.getsize(file_path) > 200 * 1024 * 1024:
        raise ValueError("File is too large (limit: 200 MB)")

    file_extension = file_path.split('.')[-1]

    if file_extension == 'csv':
        return pd.read_csv(file_path)
    elif file_extension == 'json':
        return pd.read_json(file_path)
    elif file_extension in ['xls', 'xlsx']:
        return pd.read_excel(file_path, engine='openpyxl')
    else:
        raise ValueError("Unsupported file format: " + file_extension)


def read_file_from_streamlit(uploaded_file):
    """
    Read an uploaded Streamlit file (CSV, JSON, or Excel) into a DataFrame.
    """
    # Reject uploads larger than 200 MB.
    if uploaded_file.size > 200 * 1024 * 1024:
        raise ValueError("File is too large (limit: 200 MB)")

    file_extension = uploaded_file.name.split('.')[-1]

    if file_extension == 'csv':
        return pd.read_csv(uploaded_file)
    elif file_extension == 'json':
        return pd.read_json(uploaded_file)
    elif file_extension in ['xls', 'xlsx']:
        # Wrap the uploaded bytes in a BytesIO buffer so pandas can read them.
        return pd.read_excel(io.BytesIO(uploaded_file.read()), engine='openpyxl')
    else:
        raise ValueError("Unsupported file format: " + file_extension)


def select_Y(df, Y_name):
    """
    Split the DataFrame into features X and target Y.

    Returns (X, Y) if Y_name is a column of df, otherwise -1.
    """
    if Y_name in df.columns:
        X = df.drop(Y_name, axis=1)
        Y = df[Y_name]
        return X, Y
    else:
        return -1
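
# Illustrative behaviour (hypothetical DataFrame): for df with columns
# ['age', 'income', 'label'], select_Y(df, 'label') returns (X, Y) with
# X containing ['age', 'income'] and Y being df['label'];
# select_Y(df, 'missing') returns -1.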


def check_all_columns_numeric(df):
    """
    Check if all columns in a DataFrame are numeric. Return True if so, False otherwise.
    """
    # include='number' matches every numeric dtype (int32, float32, etc.),
    # which is more robust than listing int and float explicitly.
    return df.select_dtypes(include='number').shape[1] == df.shape[1]


def non_numeric_columns_and_head(df, num_rows=20):
    """
    Identify non-numeric columns in a DataFrame and return their names and head.

    :param df: Pandas DataFrame to be examined.
    :param num_rows: Number of rows to include in the head (default is 20).
    :return: A tuple with two elements:
        1. List of column names that are not numeric (integer or float).
        2. CSV-formatted string containing the head of the non-numeric columns.
    """
    non_numeric_cols = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]
    non_numeric_head = df[non_numeric_cols].head(num_rows).to_csv()
    return non_numeric_cols, non_numeric_head
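
# Illustrative behaviour (hypothetical DataFrame): for df with a numeric 'age'
# column and string columns 'name' and 'city', non_numeric_columns_and_head(df)
# returns (['name', 'city'], <CSV string of the first 20 rows of those columns>).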


def contain_null_attributes_info(df):
    """
    Identifies columns with missing values, summarizes their statistics, and reports their data types.

    This function checks for attributes within a DataFrame that contain null values,
    generates descriptive statistics for these attributes, and compiles information about their data types.

    :param df: A pandas DataFrame to be analyzed.
    :return: A tuple containing:
        - A list of columns that contain null values.
        - A string representation of data types for these columns.
        - A CSV-formatted string containing descriptive statistics (count, mean, median, and standard deviation) for these columns.
        Returns an empty list, -1, and -1 if no columns with null values are found.
    """
    attributes = df.columns[df.isnull().any()].tolist()
    if not attributes:
        return [], -1, -1

    # Keep only the count, mean, median (50%), and standard deviation rows.
    description_info = df[attributes].describe(percentiles=[.5])
    description_info = description_info.loc[['count', 'mean', '50%', 'std']].round(2).to_csv()

    dtypes_df = df[attributes].dtypes
    types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()])

    return attributes, types_info, description_info
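
# Illustrative return value (hypothetical data): if only 'age' and 'income'
# contain nulls, this yields (['age', 'income'], "age:float64\nincome:float64",
# <CSV string with count/mean/50%/std rows for those columns>).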


def attribute_info(df):
    """
    Obtain the attributes, types, and head information of the DataFrame.
    """
    attributes = df.columns.tolist()
    dtypes_df = df.dtypes
    types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()])
    head_info = df.head(10).to_csv()

    return attributes, types_info, head_info


def get_data_overview(df):
    """
    Obtain the shape, head, nunique, and description information of the DataFrame.
    """
    shape_info = str(df.shape)
    head_info = df.head().to_csv()
    nunique_info = df.nunique().to_csv()
    description_info = df.describe(include='all').to_csv()
    return shape_info, head_info, nunique_info, description_info


def get_balance_info(df, Y_name):
    """
    Obtain the shape, description, and class balance information of the DataFrame.
    """
    shape_info = df.shape
    description_info = df.describe().to_csv()
    # Class distribution of the target column, e.g. {0: 500, 1: 120}.
    balance_info = df[Y_name].value_counts().to_dict()
    return shape_info, description_info, balance_info


def separate_decode_list(decided_dict, Y_name):
    """
    Process the LLM response and return the lists of columns to be converted to
    integer encoding, one-hot encoding, and dropped.
    """
    # 1 = integer encoding, 2 = one-hot encoding, 3 = drop the column.
    convert_int_cols = [key for key, value in decided_dict.items() if value == 1]
    one_hot_cols = [key for key, value in decided_dict.items() if value == 2]
    drop_cols = [key for key, value in decided_dict.items() if value == 3]
    # The target column is always integer-encoded, never one-hot encoded or dropped.
    if Y_name and Y_name in one_hot_cols:
        one_hot_cols.remove(Y_name)
        convert_int_cols.append(Y_name)
    if Y_name and Y_name in drop_cols:
        drop_cols.remove(Y_name)
        convert_int_cols.append(Y_name)
    return convert_int_cols, one_hot_cols, drop_cols
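
# Illustrative usage (hypothetical LLM decision):
#   separate_decode_list({'income': 1, 'gender': 2, 'id': 3, 'label': 2}, 'label')
# returns (['income', 'label'], ['gender'], ['id']); the target 'label' is
# moved from the one-hot list into the integer-encoding list.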


def separate_fill_null_list(fill_null_dict):
    """
    Process the LLM response and return the lists of columns to be filled with
    mean, median, mode, a new category, or interpolation.
    """
    # 1 = mean, 2 = median, 3 = mode, 4 = new category, 5 = interpolation.
    mean_list = [key for key, value in fill_null_dict.items() if value == 1]
    median_list = [key for key, value in fill_null_dict.items() if value == 2]
    mode_list = [key for key, value in fill_null_dict.items() if value == 3]
    new_category_list = [key for key, value in fill_null_dict.items() if value == 4]
    interpolation_list = [key for key, value in fill_null_dict.items() if value == 5]
    return mean_list, median_list, mode_list, new_category_list, interpolation_list
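
# Illustrative usage (hypothetical LLM decision):
#   separate_fill_null_list({'age': 1, 'salary': 2, 'city': 3, 'segment': 4, 'score': 5})
# returns (['age'], ['salary'], ['city'], ['segment'], ['score']).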


def get_selected_models(model_dict):
    """
    Convert the dictionary of selected models to a list.
    """
    return list(model_dict.values())


def get_model_name(model_no):
    """
    Returns the name of the classification model based on the model number.
    """
    if model_no == 1:
        return "Logistic Regression"
    elif model_no == 2:
        return "SVM"
    elif model_no == 3:
        return "Naive Bayes"
    elif model_no == 4:
        return "Random Forest"
    elif model_no == 5:
        return "AdaBoost"
    elif model_no == 6:
        return "XGBoost"
    elif model_no == 7:
        return "Gradient Boosting"


def get_cluster_method_name(method):
    """
    Returns the name of the clustering method based on the method number.
    """
    if method == 1:
        return "K-Means"
    elif method == 2:
        return "DBSCAN"
    elif method == 3:
        return "Gaussian Mixture"
    elif method == 4:
        return "Agglomerative Clustering"
    elif method == 5:
        return "Spectral Clustering"


def get_balance_method_name(method):
    """
    Returns the name of the balance method based on the method number.
    """
    if method == 1:
        return "ROS"
    elif method == 2:
        return "SMOTE"
    elif method == 3:
        return "ADASYN"
    elif method == 4:
        return "None"


def get_regression_method_name(method):
    """
    Returns the name of the regression method based on the method number.
    """
    if method == 1:
        return "Linear Regression"
    elif method == 2:
        return "Ridge Regression"
    elif method == 3:
        return "Lasso Regression"
    elif method == 4:
        return "Random Forest"
    elif method == 5:
        return "Gradient Boosting"
    elif method == 6:
        return "Elastic Net"


def count_unique(df, Y):
    """
    Counts the number of unique values in a specified column of a DataFrame.
    """
    return df[Y].nunique()