File size: 5,097 Bytes
9183c57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from src.preprocess import convert_to_integer

def decide_pca(df, cumulative_variance_threshold=0.95, min_dim_reduction_ratio=0.1):
    """
    Determine whether PCA is worthwhile for this DataFrame.

    PCA is recommended when the number of components needed to retain the
    requested share of variance is sufficiently smaller than the number of
    numeric features actually analyzed.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - cumulative_variance_threshold (float): The share of explained variance to retain. Default is 0.95.
    - min_dim_reduction_ratio (float): The minimum ratio of dimension reduction required to perform PCA. Default is 0.1.

    Returns:
    - perform_pca (bool): Whether PCA should be performed.
    - n_components (int): The number of principal components to retain.
    """
    # PCA only applies to numeric data; drop everything else.
    numeric_df = df.select_dtypes(include=[np.number])

    # Standardize so each feature contributes equally to the variance analysis.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_df)

    # Full PCA fit to obtain the complete explained-variance spectrum.
    pca = PCA()
    pca.fit(scaled_data)

    # Running total of explained variance per added component.
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Smallest component count that meets the threshold. If the threshold is
    # never reached (e.g. a value > 1.0), fall back to keeping all components
    # instead of raising IndexError on an empty match.
    reached = np.nonzero(cumulative_variance >= cumulative_variance_threshold)[0]
    if reached.size:
        n_components = int(reached[0]) + 1
    else:
        n_components = int(len(cumulative_variance))

    # Compare against the numeric column count actually fed to PCA —
    # using df.shape[1] (which may include non-numeric columns) would
    # overstate the achievable reduction.
    dim_reduction_ratio = 1 - (n_components / numeric_df.shape[1])

    # Only recommend PCA when the dimensionality drop is meaningful.
    perform_pca = dim_reduction_ratio >= min_dim_reduction_ratio
    return perform_pca, n_components

def perform_pca(df, n_components, Y_name=None):
    """
    Performs PCA on the dataset, optionally excluding a target column, and standardizes the data.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - n_components (int): The number of principal components to retain.
    - Y_name (str, optional): The name of the target column to exclude from PCA. Default is None.

    Returns:
    - pca_df (DataFrame): DataFrame with principal components and optionally the target column.
    """
    # Save the target column so it can be reattached after the transform.
    # (Fix: Y_name now defaults to None as the docstring always claimed,
    # making the target column genuinely optional.)
    drop_columns = []
    if Y_name:
        target_data = df[Y_name]
        drop_columns.append(Y_name)

    # Keep only numeric features and exclude the target from the transform.
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')

    # Standardize so features with large scales don't dominate the components.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_df)

    # Project the standardized data onto the leading principal components.
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(scaled_data)

    # Wrap the components in a DataFrame with PC1..PCn column names.
    columns = [f'PC{i+1}' for i in range(n_components)]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    # Reattach the target column; reset_index aligns it with the fresh
    # RangeIndex of pca_df regardless of df's original index.
    if Y_name:
        pca_df[Y_name] = target_data.reset_index(drop=True)
        pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])

    return pca_df

def perform_PCA_for_clustering(df, n_components):
    """
    Applies PCA transformation for clustering tasks on the given DataFrame.

    Parameters:
    - df (DataFrame): The input DataFrame to apply PCA.
    - n_components (int): The number of principal components to retain.

    Returns:
    - pca_df (DataFrame): DataFrame of the principal components.
    """
    # Fit PCA and project the data in one step.
    transformed = PCA(n_components=n_components).fit_transform(df)

    # Label the components PC1..PCn and return them as a DataFrame.
    component_names = [f'PC{k}' for k in range(1, n_components + 1)]
    return pd.DataFrame(data=transformed, columns=component_names)

def perform_PCA_for_regression(df, n_components, Y_name=None):
    """
    Applies PCA for regression tasks, excluding a specified target column from the transformation.

    Unlike perform_pca, this variant does not standardize the features before
    the transform; the input is assumed to already be on comparable scales.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - n_components (int): The number of principal components to retain.
    - Y_name (str, optional): The name of the target column to exclude from PCA and append back after transformation. Default is None.

    Returns:
    - pca_df (DataFrame): A new DataFrame with principal components and the target column.
    """
    # Save the target column so it can be reattached after the transform.
    # (Fix: Y_name now defaults to None as the docstring always claimed,
    # making the target column genuinely optional.)
    drop_columns = []
    if Y_name:
        target_data = df[Y_name]
        drop_columns.append(Y_name)

    # Keep only numeric features and exclude the target from the transform.
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')

    # Project the (unscaled) features onto the leading principal components.
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(numeric_df)

    # Wrap the components in a DataFrame with PC1..PCn column names.
    columns = [f'PC{i+1}' for i in range(n_components)]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    # Reattach the target column; reset_index aligns it with the fresh
    # RangeIndex of pca_df regardless of df's original index.
    if Y_name:
        pca_df[Y_name] = target_data.reset_index(drop=True)
        pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])

    return pca_df