File size: 4,132 Bytes
9183c57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import numpy as np

def contains_missing_value(df):
    """
    Report whether at least one cell of the DataFrame is null.

    Parameters:
    - df (DataFrame): The DataFrame to inspect.

    Returns:
    - A truthy boolean when any cell is null, a falsy one otherwise.
    """
    null_mask = df.isna()
    return null_mask.to_numpy().any()

def fill_null_values(df, mean_list, median_list, mode_list, new_category_list, interpolation_list):
    """
    Apply the per-column imputation strategies to a DataFrame, in a fixed order.

    The strategies run as: mean, median, mode, then interpolation for the
    "new category" columns, then interpolation for the interpolation columns.
    A strategy whose column list is empty (or otherwise falsy) is skipped.

    Parameters:
    - df (DataFrame): The DataFrame with missing values.
    - mean_list (list): Columns whose missing values are filled with the mean.
    - median_list (list): Columns whose missing values are filled with the median.
    - mode_list (list): Columns whose missing values are filled with the mode.
    - new_category_list (list): Columns originally meant to receive a new 'NaN'
      category; they are currently filled by interpolation instead.
    - interpolation_list (list): Columns whose missing values are interpolated.

    Returns:
    - result (DataFrame): The DataFrame after every requested strategy has run.
    """
    result = df

    if mean_list:
        result = fill_with_mean(result, mean_list)

    if median_list:
        result = fill_with_median(result, median_list)

    if mode_list:
        result = fill_with_mode(result, mode_list)

    # The "new category" columns deliberately share the interpolation path
    # (they are processed first, then the explicit interpolation columns).
    for columns in (new_category_list, interpolation_list):
        if columns:
            result = fill_with_interpolation(result, columns)

    return result

def remove_high_null(df, threshold_row=0.5, threshold_col=0.7):
    """
    Remove rows and columns from a DataFrame where the proportion of null values
    is greater than the specified threshold.

    Columns are filtered first; the per-row proportion is then computed over the
    remaining columns only. Filtering is positional, so a row (or column) is
    removed only when it exceeds the threshold itself, even if other rows
    (or columns) carry the same label. The input DataFrame is not modified.

    - param df: Pandas DataFrame to be processed.
    - param threshold_row: Proportion threshold for null values (default is 0.5 for rows).
    - param threshold_col: Proportion threshold for null values (default is 0.7 for columns).

    - return: New DataFrame with high-null rows and columns removed.
    """
    # Proportion of nulls in each column. The mask is built as "not greater than"
    # so an undefined proportion (NaN, e.g. a frame with no rows) keeps the column.
    col_null_share = df.isnull().mean()
    keep_cols = ~(col_null_share > threshold_col).to_numpy(dtype=bool)
    df_cleaned = df.loc[:, keep_cols]

    # Proportion of nulls in each row, measured on the surviving columns.
    row_null_share = df_cleaned.isnull().mean(axis=1)
    keep_rows = ~(row_null_share > threshold_row).to_numpy(dtype=bool)

    # Positional masks avoid label-based dropping, which would also remove
    # well-populated rows that merely share an index label with a high-null row.
    # The final copy hands back an independent frame that is safe to assign into.
    return df_cleaned.loc[keep_rows].copy()

def fill_with_mean(df, attributes):
    """
    Fill missing values in the given columns with each column's mean.

    Names in `attributes` that are not columns of `df` are ignored.
    Columns are reassigned on `df` itself, which is also returned.

    Parameters:
    - df (DataFrame): The DataFrame to update.
    - attributes (list): Column names to fill.

    Returns:
    - df (DataFrame): The same DataFrame with the listed columns filled.
    """
    present = [name for name in attributes if name in df.columns]
    for name in present:
        column = df[name]
        df[name] = column.fillna(column.mean())
    return df

def fill_with_median(df, attributes):
    """
    Fill missing values in the given columns with each column's median.

    Names in `attributes` that are not columns of `df` are ignored.
    Columns are reassigned on `df` itself, which is also returned.

    Parameters:
    - df (DataFrame): The DataFrame to update.
    - attributes (list): Column names to fill.

    Returns:
    - df (DataFrame): The same DataFrame with the listed columns filled.
    """
    known_columns = df.columns
    for column_name in attributes:
        if column_name not in known_columns:
            continue
        series = df[column_name]
        df[column_name] = series.fillna(series.median())
    return df

def fill_with_mode(df, attributes):
    """
    Fill missing values in the given columns with each column's mode.

    Names in `attributes` that are not columns of `df` are ignored. When a
    column has no mode (its mode result is empty), the column is left as is.
    If several values tie, the first entry of the mode result is used.
    Columns are reassigned on `df` itself, which is also returned.

    Parameters:
    - df (DataFrame): The DataFrame to update.
    - attributes (list): Column names to fill.

    Returns:
    - df (DataFrame): The same DataFrame with the listed columns filled.
    """
    for attr in attributes:
        if attr not in df.columns:
            continue
        # Compute the mode once per column; it is both the emptiness check
        # and the source of the fill value.
        modes = df[attr].mode()
        if modes.empty:
            continue
        # Positional access: take the first mode regardless of index labels.
        df[attr] = df[attr].fillna(modes.iloc[0])
    return df

def fill_with_interpolation(df, attributes, method='linear'):
    """
    Fill missing values in the given columns by interpolating each column.

    Names in `attributes` that are not columns of `df` are ignored.
    Columns are reassigned on `df` itself, which is also returned.

    Parameters:
    - df (DataFrame): The DataFrame to update.
    - attributes (list): Column names to interpolate.
    - method (str): Interpolation method passed to the column's `interpolate`
      call. Default is 'linear'; other options include 'time', 'index', 'pad',
      'nearest', 'quadratic', 'cubic', etc.

    Returns:
    - df (DataFrame): The same DataFrame with the listed columns interpolated.
    """
    # NOTE(review): with the default forward-direction linear interpolation,
    # NaNs that precede the first valid value appear to stay unfilled - confirm
    # whether callers rely on this step to remove every missing value.
    for column_name in filter(lambda name: name in df.columns, attributes):
        df[column_name] = df[column_name].interpolate(method=method)
    return df

# Deprecated: superseded by interpolation, which was adopted so that no
# missing values are left behind.
def fill_with_NaN(df, attributes):
    """
    Fill missing values in the given columns with the literal string 'NaN'.

    Names in `attributes` that are not columns of `df` are ignored.
    Columns are reassigned on `df` itself, which is also returned.

    Parameters:
    - df (DataFrame): The DataFrame to update.
    - attributes (list): Column names to fill.

    Returns:
    - df (DataFrame): The same DataFrame with the listed columns filled.
    """
    targets = [name for name in attributes if name in df.columns]
    for name in targets:
        filled = df[name].fillna('NaN')
        df[name] = filled
    return df

def replace_placeholders_with_nan(df):
    """
    Replaces common placeholders for missing values in object columns with np.nan.

    Matching is case-insensitive: each value is converted to a string,
    lower-cased, and compared against the placeholder set, so "NA", "na",
    "NULL", "null", "NaN", "None", "N/A" and similar spellings are all
    treated as missing. Only columns whose dtype is 'object' are processed.
    Columns are reassigned on `df` itself, which is also returned.

    Parameters:
    - df (DataFrame): The DataFrame to process.

    Returns:
    - df (DataFrame): Updated DataFrame with placeholders replaced.
    """
    # Stored lower-cased because values are lower-cased before the lookup;
    # mixed-case entries here could never match.
    placeholders = frozenset({"na", "null", "?", "", "nan", "none", "n/a"})

    def _to_nan_if_placeholder(value):
        # str() lets non-string cells (None, numbers) go through the same test.
        return np.nan if str(value).lower() in placeholders else value

    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].apply(_to_nan_if_placeholder)
    return df