Spaces:
Sleeping
Sleeping
import pandas as pd | |
import numpy as np | |
from sklearn.decomposition import PCA | |
from sklearn.preprocessing import StandardScaler | |
from src.preprocess import convert_to_integer | |
def decide_pca(df, cumulative_variance_threshold=0.95, min_dim_reduction_ratio=0.1):
    """
    Determines whether PCA should be performed based on cumulative variance threshold and dimension reduction ratio.

    Only the numeric columns of ``df`` are considered. They are standardized, a full PCA is
    fitted, and the smallest number of leading components whose cumulative explained-variance
    ratio reaches ``cumulative_variance_threshold`` is selected.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - cumulative_variance_threshold (float): The threshold of explained variance to retain. Default is 0.95.
    - min_dim_reduction_ratio (float): The minimum ratio of dimension reduction required to perform PCA. Default is 0.1.

    Returns:
    - perform_pca (bool): Whether PCA should be performed.
    - n_components (int): The number of principal components to retain. 0 when ``df`` has no numeric columns.
    """
    # Remove non-numeric columns
    numeric_df = df.select_dtypes(include=[np.number])
    n_features = numeric_df.shape[1]

    # Nothing to decompose: report "do not perform PCA" instead of failing inside the scaler.
    if n_features == 0:
        return False, 0

    # Standardizing the data so every feature contributes on the same scale
    scaled_data = StandardScaler().fit_transform(numeric_df)

    # PCA with all components, used only to read the explained variance
    pca = PCA()
    pca.fit(scaled_data)

    # Calculate cumulative variance
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Find the number of components for the desired threshold.
    # The threshold may never be reached (e.g. a threshold of 1.0 combined with
    # floating-point rounding, or a threshold above 1); in that case keep every
    # component rather than raising IndexError on an empty match.
    reached = np.flatnonzero(cumulative_variance >= cumulative_variance_threshold)
    if reached.size:
        n_components = int(reached[0]) + 1
    else:
        n_components = len(cumulative_variance)

    # Calculate the dimension reduction ratio against the columns PCA actually
    # replaces (the numeric ones); counting non-numeric columns would overstate it.
    dim_reduction_ratio = 1 - (n_components / n_features)

    # Check if PCA should be performed based on the dimension reduction ratio
    perform_pca = bool(dim_reduction_ratio >= min_dim_reduction_ratio)
    return perform_pca, n_components
def perform_pca(df, n_components, Y_name=None):
    """
    Performs PCA on the dataset, optionally excluding a target column, and standardizes the data.

    Non-numeric columns are dropped, the remaining features are standardized with
    ``StandardScaler`` and then projected with ``PCA``. The result has a fresh 0..n-1 index.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - n_components (int): The number of principal components to retain. Passed unchanged to ``PCA``.
    - Y_name (str, optional): The name of the target column to exclude from PCA. Default is None.

    Returns:
    - pca_df (DataFrame): DataFrame with principal components (columns ``PC1``, ``PC2``, ...) and optionally the target column.

    Raises:
    - KeyError: If ``Y_name`` is given but is not a column of ``df``.
    """
    # Save the target column data
    drop_columns = []
    target_data = None
    if Y_name:
        target_data = df[Y_name]
        drop_columns.append(Y_name)

    # Remove non-numeric columns and the target column.
    # errors='ignore' because a non-numeric target is already gone after select_dtypes.
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')

    # Standardizing the Data
    scaled_data = StandardScaler().fit_transform(numeric_df)

    # Applying PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(scaled_data)

    # Create a new DataFrame with principal components.
    # Labels follow the actual output width, so they stay correct for any
    # n_components value that PCA accepts, not only plain integers.
    columns = [f'PC{i+1}' for i in range(principal_components.shape[1])]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    # Reattach the target column
    if Y_name:
        # Drop the original index so rows align positionally with the PCA output.
        pca_df[Y_name] = target_data.reset_index(drop=True)
        # Project helper; presumably casts the target to an integer dtype - confirm in src.preprocess.
        pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])
    return pca_df
def perform_PCA_for_clustering(df, n_components):
    """
    Applies PCA transformation for clustering tasks on the given DataFrame.

    ``df`` is handed to ``PCA.fit_transform`` as-is: this function neither drops
    non-numeric columns nor standardizes the data. The result has a fresh 0..n-1 index.

    Parameters:
    - df (DataFrame): The input DataFrame to apply PCA.
    - n_components (int): The number of principal components to retain. Passed unchanged to ``PCA``.

    Returns:
    - pca_df (DataFrame): DataFrame of the principal components (columns ``PC1``, ``PC2``, ...).
    """
    # Applying PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df)

    # Create a new DataFrame with principal components.
    # Labels follow the actual output width, so they stay correct for any
    # n_components value that PCA accepts, not only plain integers.
    columns = [f'PC{i+1}' for i in range(principal_components.shape[1])]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)
    return pca_df
def perform_PCA_for_regression(df, n_components, Y_name=None):
    """
    Applies PCA for regression tasks, excluding a specified target column from the transformation.

    Non-numeric columns are dropped and the remaining features are projected with ``PCA``.
    No standardization is applied in this function. The result has a fresh 0..n-1 index.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - n_components (int): The number of principal components to retain. Passed unchanged to ``PCA``.
    - Y_name (str, optional): The name of the target column to exclude from PCA and append back after transformation. Default is None.

    Returns:
    - pca_df (DataFrame): A new DataFrame with principal components (columns ``PC1``, ``PC2``, ...) and the target column.

    Raises:
    - KeyError: If ``Y_name`` is given but is not a column of ``df``.
    """
    # Save the target column data
    drop_columns = []
    target_data = None
    if Y_name:
        target_data = df[Y_name]
        drop_columns.append(Y_name)

    # Remove non-numeric columns and the target column.
    # errors='ignore' because a non-numeric target is already gone after select_dtypes.
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')

    # Applying PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(numeric_df)

    # Create a new DataFrame with principal components.
    # Labels follow the actual output width, so they stay correct for any
    # n_components value that PCA accepts, not only plain integers.
    columns = [f'PC{i+1}' for i in range(principal_components.shape[1])]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    # Reattach the target column
    if Y_name:
        # Drop the original index so rows align positionally with the PCA output.
        pca_df[Y_name] = target_data.reset_index(drop=True)
        # Project helper; presumably casts the target to an integer dtype - confirm in src.preprocess.
        pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])
    return pca_df