File size: 5,338 Bytes
9183c57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, PowerTransformer

def convert_to_numeric(df, convert_int_cols_list, one_hot_cols_list, drop_cols):
    """
    Run the full numeric-conversion pipeline on a DataFrame.

    The steps are applied in this order: integer encoding via ``convert_to_integer``,
    one-hot encoding via ``convert_to_one_hot`` (on the result of the first step),
    and finally removal of the columns named in ``drop_cols``. Names in ``drop_cols``
    that are not present are silently skipped (``errors='ignore'``).

    :param df: Pandas DataFrame to be processed.
    :param convert_int_cols_list: List of column names passed to ``convert_to_integer``.
    :param one_hot_cols_list: List of column names passed to ``convert_to_one_hot``.
    :param drop_cols: List of column names to be dropped from the DataFrame.
    :return: A tuple with two elements:
             1. The DataFrame after both conversions and the column drop.
             2. A dictionary with the keys 'integer_mappings' and 'one_hot_mappings',
                holding the mappings returned by the respective conversion step.
    """
    encoded, integer_lookup = convert_to_integer(df, convert_int_cols_list)
    encoded, dummy_lookup = convert_to_one_hot(encoded, one_hot_cols_list)

    return (
        encoded.drop(columns=drop_cols, errors='ignore'),
        {'integer_mappings': integer_lookup, 'one_hot_mappings': dummy_lookup},
    )

def convert_to_integer(df, columns_to_convert=None):
    """
    Convert specified object-dtype columns in the DataFrame to integer codes,
    and return a dictionary of mappings from integer codes back to original values.

    Codes are assigned in order of first appearance in the column (0, 1, 2, ...).
    Listed columns whose dtype is not ``object`` are left unchanged and get no
    entry in the mappings. The input DataFrame is not modified; the conversion
    is performed on a copy.

    :param df: Pandas DataFrame to be processed.
    :param columns_to_convert: List of column names to be converted to integer codes.
                               ``None`` (the default) or an empty list converts nothing.
    :return: A tuple with two elements:
             1. A copy of the DataFrame with the specified columns converted.
             2. Dictionary ``{column: {code: original_value}}`` for each converted column.
    :raises KeyError: If a listed column is not present in the DataFrame.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if columns_to_convert is None:
        columns_to_convert = []

    # Work on a copy so the caller's DataFrame is never overwritten in place.
    converted_df = df.copy()
    mappings = {}

    for column in columns_to_convert:
        if converted_df[column].dtype != 'object':
            continue

        # code -> original value, in order of first appearance
        unique_values = converted_df[column].unique()
        int_to_value_map = dict(enumerate(unique_values))
        mappings[column] = int_to_value_map

        # Invert the mapping (original value -> code) to encode the column.
        value_to_int_map = {value: code for code, value in int_to_value_map.items()}
        converted_df[column] = converted_df[column].map(value_to_int_map)

    return converted_df, mappings

def convert_to_one_hot(df, columns_to_convert=None):
    """
    Convert specified categorical columns in the DataFrame to one-hot encoding,
    and return a modified DataFrame and a dictionary of mappings used for one-hot encoding.

    Only listed columns with dtype ``object`` or ``category`` are encoded; other
    listed columns are left unchanged. For each encoded column the indicator
    columns (named ``<column>_<value>``) are appended at the end of the DataFrame
    and the original column is dropped. The input DataFrame is not modified.

    :param df: Pandas DataFrame to be processed.
    :param columns_to_convert: List of column names to be converted to one-hot encoding.
                               ``None`` (the default) or an empty list converts nothing.
    :return: A tuple with two elements:
             1. DataFrame with specified columns converted to one-hot encoding.
             2. Dictionary ``{column: {original_value: indicator_column_name}}``
                for each converted column. Missing values (NaN/None) are not
                included, because no indicator column is created for them.
    :raises KeyError: If a listed column is not present in the DataFrame.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if columns_to_convert is None:
        columns_to_convert = []

    mappings = {}
    df_modified = df.copy()

    for column in columns_to_convert:
        column_dtype = df[column].dtype
        # Check if the column is categorical
        if column_dtype == 'object' or column_dtype == 'category':
            # Perform one-hot encoding
            one_hot = pd.get_dummies(df[column], prefix=column)
            # Append the indicator columns, then drop the original column
            df_modified = pd.concat([df_modified, one_hot], axis=1)
            df_modified = df_modified.drop(column, axis=1)

            # get_dummies skips missing values by default, so drop them here too;
            # otherwise the mapping would name an indicator column that does not exist.
            mappings[column] = {
                value: column + '_' + str(value)
                for value in df[column].dropna().unique()
            }

    return df_modified, mappings

def remove_rows_with_empty_target(df, Y_name):
    """
    Drop every row whose target column holds a missing value.

    The input DataFrame is left untouched; the surviving rows keep their
    original index labels.

    :param df: Pandas DataFrame to be processed.
    :param Y_name: Name of the target column to inspect for missing (NaN) values.
    :return: New DataFrame containing only the rows with a non-missing target.
    """
    # Only the target column is considered when deciding which rows to drop.
    target_columns = [Y_name]
    return df.dropna(axis=0, how='any', subset=target_columns)

def remove_duplicates(df):
    """
    Return the DataFrame without fully duplicated rows.

    Rows are compared across all columns and the first occurrence of each
    duplicated row is the one that is kept. The input DataFrame is not modified.

    :param df: Pandas DataFrame to be processed.
    :return: New DataFrame with duplicate rows removed.
    """
    deduplicated = df.drop_duplicates(subset=None, keep='first')
    return deduplicated

def transform_data_for_clustering(df, skew_threshold=0.5):
    """
    Transform numeric columns in the DataFrame for clustering.

    Two steps are applied to a copy of the input:

    1. Each numeric column that is strictly positive and whose absolute skewness
       exceeds ``skew_threshold`` is passed through a Box-Cox ``PowerTransformer``
       (fitted per column, without standardization).
    2. All numeric columns (transformed or not) are standardized with
       ``StandardScaler``.

    Non-numeric columns are returned unchanged. If the DataFrame has no numeric
    columns or no rows, there is nothing to fit and an unmodified copy is returned.

    :param df: Pandas DataFrame to be transformed.
    :param skew_threshold: Absolute skewness above which a strictly positive column
                           is Box-Cox transformed. Defaults to 0.5.
    :return: DataFrame with transformed numeric columns suitable for clustering.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    transformed_df = df.copy()

    # The scaler cannot be fitted on zero columns or zero rows, so return early
    # instead of letting it raise.
    if len(numeric_cols) == 0 or len(transformed_df) == 0:
        return transformed_df

    pt = PowerTransformer(method='box-cox', standardize=False)

    for col in numeric_cols:
        values = transformed_df[col]
        # Box-Cox is only defined for strictly positive data.
        if not (values > 0).all():
            continue

        skewness = stats.skew(values)
        if abs(skewness) > skew_threshold:
            # The transformer returns one column in 2-D form; flatten it before
            # assigning it back to a single DataFrame column.
            transformed_data = pt.fit_transform(transformed_df[[col]])
            transformed_df[col] = np.asarray(transformed_data).ravel()

    scaler = StandardScaler()
    transformed_df[numeric_cols] = scaler.fit_transform(transformed_df[numeric_cols])

    return transformed_df