File size: 5,097 Bytes
9183c57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from src.preprocess import convert_to_integer

def decide_pca(df, cumulative_variance_threshold=0.95, min_dim_reduction_ratio=0.1):
    """
    Determine whether PCA is worthwhile for this DataFrame.

    PCA is recommended when the number of components needed to retain the
    requested share of variance is sufficiently smaller than the number of
    numeric features actually analyzed.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - cumulative_variance_threshold (float): The share of explained variance to retain. Default is 0.95.
    - min_dim_reduction_ratio (float): The minimum ratio of dimension reduction required to perform PCA. Default is 0.1.

    Returns:
    - perform_pca (bool): Whether PCA should be performed.
    - n_components (int): The number of principal components to retain.
    """
    # PCA only applies to numeric data; drop everything else.
    numeric_df = df.select_dtypes(include=[np.number])

    # Standardize so each feature contributes equally to the variance analysis.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_df)

    # Full PCA fit to obtain the complete explained-variance spectrum.
    pca = PCA()
    pca.fit(scaled_data)

    # Running total of explained variance per added component.
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Smallest component count that meets the threshold. If the threshold is
    # never reached (e.g. a value > 1.0), fall back to keeping all components
    # instead of raising IndexError on an empty match.
    reached = np.nonzero(cumulative_variance >= cumulative_variance_threshold)[0]
    if reached.size:
        n_components = int(reached[0]) + 1
    else:
        n_components = int(len(cumulative_variance))

    # Compare against the numeric column count actually fed to PCA —
    # using df.shape[1] (which may include non-numeric columns) would
    # overstate the achievable reduction.
    dim_reduction_ratio = 1 - (n_components / numeric_df.shape[1])

    # Only recommend PCA when the dimensionality drop is meaningful.
    perform_pca = dim_reduction_ratio >= min_dim_reduction_ratio
    return perform_pca, n_components

def perform_pca(df, n_components, Y_name=None):
    """
    Performs PCA on the dataset, optionally excluding a target column, and standardizes the data.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - n_components (int): The number of principal components to retain.
    - Y_name (str, optional): The name of the target column to exclude from PCA. Default is None.

    Returns:
    - pca_df (DataFrame): DataFrame with principal components and optionally the target column.
    """
    # Save the target column so it can be reattached after the transform.
    # (Fix: Y_name now defaults to None as the docstring always claimed,
    # making the target column genuinely optional.)
    drop_columns = []
    if Y_name:
        target_data = df[Y_name]
        drop_columns.append(Y_name)

    # Keep only numeric features and exclude the target from the transform.
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')

    # Standardize so features with large scales don't dominate the components.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_df)

    # Project the standardized data onto the leading principal components.
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(scaled_data)

    # Wrap the components in a DataFrame with PC1..PCn column names.
    columns = [f'PC{i+1}' for i in range(n_components)]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    # Reattach the target column; reset_index aligns it with the fresh
    # RangeIndex of pca_df regardless of df's original index.
    if Y_name:
        pca_df[Y_name] = target_data.reset_index(drop=True)
        pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])

    return pca_df

def perform_PCA_for_clustering(df, n_components):
    """
    Applies PCA transformation for clustering tasks on the given DataFrame.

    Parameters:
    - df (DataFrame): The input DataFrame to apply PCA.
    - n_components (int): The number of principal components to retain.

    Returns:
    - pca_df (DataFrame): DataFrame of the principal components.
    """
    # Fit PCA and project the data in one step.
    transformed = PCA(n_components=n_components).fit_transform(df)

    # Label the components PC1..PCn and return them as a DataFrame.
    component_names = [f'PC{k}' for k in range(1, n_components + 1)]
    return pd.DataFrame(data=transformed, columns=component_names)

def perform_PCA_for_regression(df, n_components, Y_name=None):
    """
    Applies PCA for regression tasks, excluding a specified target column from the transformation.

    Unlike perform_pca, this variant does not standardize the features before
    the transform; the input is assumed to already be on comparable scales.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - n_components (int): The number of principal components to retain.
    - Y_name (str, optional): The name of the target column to exclude from PCA and append back after transformation. Default is None.

    Returns:
    - pca_df (DataFrame): A new DataFrame with principal components and the target column.
    """
    # Save the target column so it can be reattached after the transform.
    # (Fix: Y_name now defaults to None as the docstring always claimed,
    # making the target column genuinely optional.)
    drop_columns = []
    if Y_name:
        target_data = df[Y_name]
        drop_columns.append(Y_name)

    # Keep only numeric features and exclude the target from the transform.
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')

    # Project the (unscaled) features onto the leading principal components.
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(numeric_df)

    # Wrap the components in a DataFrame with PC1..PCn column names.
    columns = [f'PC{i+1}' for i in range(n_components)]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    # Reattach the target column; reset_index aligns it with the fresh
    # RangeIndex of pca_df regardless of df's original index.
    if Y_name:
        pca_df[Y_name] = target_data.reset_index(drop=True)
        pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])

    return pca_df