# Source: initial commit 9183c57 by Wilson-ZheLin (5.1 kB)
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from src.preprocess import convert_to_integer
def decide_pca(df, cumulative_variance_threshold=0.95, min_dim_reduction_ratio=0.1):
    """
    Determines whether PCA should be performed based on cumulative variance threshold and dimension reduction ratio.

    Only the numeric columns of `df` are used. They are standardized, a full PCA is fitted, and the
    smallest number of leading components whose cumulative explained-variance ratio reaches the
    threshold is selected. PCA is recommended when dropping to that many components removes at least
    `min_dim_reduction_ratio` of the numeric columns.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - cumulative_variance_threshold (float): The threshold of explained variance to retain. Default is 0.95.
    - min_dim_reduction_ratio (float): The minimum ratio of dimension reduction required to perform PCA. Default is 0.1.

    Returns:
    - perform_pca (bool): Whether PCA should be performed.
    - n_components (int): The number of principal components to retain (0 when `df` has no numeric data).
    """
    # Remove non-numeric columns
    numeric_df = df.select_dtypes(include=[np.number])

    # No numeric columns or no rows: there is nothing to fit, so PCA cannot help
    if numeric_df.empty:
        return False, 0

    # Standardizing the Data
    scaled_data = StandardScaler().fit_transform(numeric_df)

    # PCA for Explained Variance (all components are kept so the full spectrum is available)
    pca = PCA()
    pca.fit(scaled_data)

    # Calculate cumulative variance
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Find the number of components for the desired threshold.
    # The cumulative sum may never reach the threshold (floating-point round-off when the
    # threshold is 1.0, or NaN ratios for zero-variance data); keep every component then
    # instead of failing on an empty index array.
    reached = np.flatnonzero(cumulative_variance >= cumulative_variance_threshold)
    if reached.size:
        n_components = int(reached[0]) + 1
    else:
        n_components = len(cumulative_variance)

    # Calculate the dimension reduction ratio relative to the columns PCA actually saw;
    # counting non-numeric columns would overstate the reduction.
    dim_reduction_ratio = 1 - (n_components / numeric_df.shape[1])

    # Check if PCA should be performed based on the dimension reduction ratio
    should_perform = bool(dim_reduction_ratio >= min_dim_reduction_ratio)
    return should_perform, n_components
def perform_pca(df, n_components, Y_name=None):
    """
    Performs PCA on the dataset, optionally excluding a target column, and standardizes the data.

    Non-numeric columns are dropped before the transformation. The returned DataFrame has a fresh
    0..n-1 row index; the original index of `df` is not carried over.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - n_components (int): The number of principal components to retain.
    - Y_name (str, optional): The name of the target column to exclude from PCA. Default is None.

    Returns:
    - pca_df (DataFrame): DataFrame with principal components ('PC1', 'PC2', ...) and optionally the target column.
    """
    # Save the target column data
    drop_columns = []
    target_data = None
    if Y_name:
        target_data = df[Y_name]
        drop_columns.append(Y_name)

    # Remove non-numeric columns and the target column
    # (errors='ignore' because a non-numeric target is already gone after select_dtypes)
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')

    # Standardizing the Data
    scaled_data = StandardScaler().fit_transform(numeric_df)

    # Applying PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(scaled_data)

    # Create a new DataFrame with principal components.
    # Labels follow the width of the projected array so they always match what PCA produced.
    columns = [f'PC{i + 1}' for i in range(principal_components.shape[1])]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    # Reattach the target column
    if Y_name:
        # Drop the old index so the target lines up positionally with the new 0..n-1 index
        pca_df[Y_name] = target_data.reset_index(drop=True)
        # convert_to_integer comes from src.preprocess; only its first return value is used
        pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])

    return pca_df
def perform_PCA_for_clustering(df, n_components):
    """
    Projects the given DataFrame onto its leading principal components for clustering tasks.

    The data is handed to PCA as-is; no scaling or column filtering is done here.

    Parameters:
    - df (DataFrame): The input data to transform.
    - n_components (int): How many principal components to keep.

    Returns:
    - pca_df (DataFrame): The principal components, in columns named 'PC1', 'PC2', ...
    """
    # Fit the decomposition and project the data in a single step
    projected = PCA(n_components=n_components).fit_transform(df)

    # Label the output columns PC1 .. PCn
    component_labels = []
    for position in range(n_components):
        component_labels.append('PC' + str(position + 1))

    return pd.DataFrame(data=projected, columns=component_labels)
def perform_PCA_for_regression(df, n_components, Y_name=None):
    """
    Applies PCA for regression tasks, excluding a specified target column from the transformation.

    Non-numeric columns are dropped before the transformation and the data is not standardized here.
    The returned DataFrame has a fresh 0..n-1 row index; the original index of `df` is not carried over.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - n_components (int): The number of principal components to retain.
    - Y_name (str, optional): The name of the target column to exclude from PCA and append back after transformation. Default is None.

    Returns:
    - pca_df (DataFrame): A new DataFrame with principal components ('PC1', 'PC2', ...) and the target column.
    """
    # Save the target column data
    drop_columns = []
    target_data = None
    if Y_name:
        target_data = df[Y_name]
        drop_columns.append(Y_name)

    # Remove non-numeric columns and the target column
    # (errors='ignore' because a non-numeric target is already gone after select_dtypes)
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')

    # Applying PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(numeric_df)

    # Create a new DataFrame with principal components.
    # Labels follow the width of the projected array so they always match what PCA produced.
    columns = [f'PC{i + 1}' for i in range(principal_components.shape[1])]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    # Reattach the target column
    if Y_name:
        # Drop the old index so the target lines up positionally with the new 0..n-1 index
        pca_df[Y_name] = target_data.reset_index(drop=True)
        # convert_to_integer comes from src.preprocess; only its first return value is used
        pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])

    return pca_df