Spaces:
Sleeping
Sleeping
import numpy as np | |
def contains_missing_value(df):
    """
    Report whether at least one cell of the DataFrame is missing.

    Parameters:
    - df (DataFrame): The DataFrame to inspect.

    Returns:
    - A truthy NumPy boolean if any cell is null, a falsy one otherwise.
    """
    null_mask = df.isna()
    return null_mask.to_numpy().any()
def fill_null_values(df, mean_list, median_list, mode_list, new_category_list, interpolation_list):
    """
    Impute missing values, applying one strategy per group of columns.

    The groups are processed in a fixed order: mean, median, mode,
    new-category, interpolation. A group that is empty (falsy) is skipped.

    Parameters:
    - df (DataFrame): The DataFrame with missing values.
    - mean_list (list): Columns whose gaps are filled with the column mean.
    - median_list (list): Columns whose gaps are filled with the column median.
    - mode_list (list): Columns whose gaps are filled with the column mode.
    - new_category_list (list): Columns originally meant to receive a new
      'NaN' category; they are currently filled by interpolation instead.
    - interpolation_list (list): Columns whose gaps are filled by interpolation.

    Returns:
    - df (DataFrame): The DataFrame produced by the last strategy applied
      (the input itself when every group is empty).
    """
    strategies = (
        (mean_list, fill_with_mean),
        (median_list, fill_with_median),
        (mode_list, fill_with_mode),
        # Routed through interpolation rather than fill_with_NaN.
        (new_category_list, fill_with_interpolation),
        (interpolation_list, fill_with_interpolation),
    )
    for columns, filler in strategies:
        if columns:
            df = filler(df, columns)
    return df
def remove_high_null(df, threshold_row=0.5, threshold_col=0.7):
    """
    Remove columns, then rows, whose proportion of null values exceeds a threshold.

    Columns are filtered first; row proportions are then computed over the
    remaining columns only. Selection is positional (boolean masks), so rows
    or columns that merely share a duplicated label with a high-null one are
    kept.

    - param df: Pandas DataFrame to be processed (not modified).
    - param threshold_row: A row is dropped when its null proportion is
      strictly greater than this value (default 0.5).
    - param threshold_col: A column is dropped when its null proportion is
      strictly greater than this value (default 0.7).
    - return: A new DataFrame with high-null columns and rows removed.
    """
    # Proportion of nulls per column; an undefined (NaN) proportion compares
    # False against the threshold, so such columns are kept.
    col_null_share = df.isnull().mean()
    drop_col = (col_null_share > threshold_col).to_numpy(dtype=bool)
    df_cleaned = df.loc[:, ~drop_col]

    # Proportion of nulls per row, measured on the surviving columns.
    row_null_share = df_cleaned.isnull().mean(axis=1)
    drop_row = (row_null_share > threshold_row).to_numpy(dtype=bool)

    # Return an independent frame so later column assignments by callers do
    # not act on a slice of the input.
    return df_cleaned.loc[~drop_row].copy()
def fill_with_mean(df, attributes):
    """
    Fill missing values in each listed column with that column's mean.

    Parameters:
    - df (DataFrame): The DataFrame to update; its columns are reassigned in place.
    - attributes (list): Column names to process; names not present in df are skipped.

    Returns:
    - df (DataFrame): The same DataFrame object that was passed in.
    """
    for name in attributes:
        if name not in df.columns:
            continue
        column = df[name]
        df[name] = column.fillna(column.mean())
    return df
def fill_with_median(df, attributes):
    """
    Fill missing values in each listed column with that column's median.

    Parameters:
    - df (DataFrame): The DataFrame to update; its columns are reassigned in place.
    - attributes (list): Column names to process; names not present in df are skipped.

    Returns:
    - df (DataFrame): The same DataFrame object that was passed in.
    """
    for name in attributes:
        if name not in df.columns:
            continue
        column = df[name]
        df[name] = column.fillna(column.median())
    return df
def fill_with_mode(df, attributes):
    """
    Fill missing values in each listed column with that column's mode.

    When several values tie, the first one returned by `mode()` is used.
    A column for which `mode()` returns nothing is left unchanged.

    Parameters:
    - df (DataFrame): The DataFrame to update; its columns are reassigned in place.
    - attributes (list): Column names to process; names not present in df are skipped.

    Returns:
    - df (DataFrame): The same DataFrame object that was passed in.
    """
    for name in attributes:
        if name not in df.columns:
            continue
        modes = df[name].mode()
        # Nothing to fill with when no mode could be determined.
        if modes.empty:
            continue
        most_common = modes[0]
        if most_common is None:
            continue
        df[name] = df[name].fillna(most_common)
    return df
def fill_with_interpolation(df, attributes, method='linear'):
    """
    Fill missing values in each listed column by interpolation.

    Gaps are filled in both directions, so leading NaNs (which pandas'
    default forward-only direction leaves untouched) are filled as well.
    With the default 'linear' method, leading and trailing gaps take the
    nearest valid value. A column with no valid values at all stays empty.

    Parameters:
    - df (DataFrame): The DataFrame to update; its columns are reassigned in place.
    - attributes (list): Column names to process; names not present in df are skipped.
    - method (str): Interpolation method passed to pandas. Default is 'linear';
      others include 'time', 'index', 'pad', 'nearest', 'quadratic', 'cubic'.

    Returns:
    - df (DataFrame): The same DataFrame object that was passed in.
    """
    # Pad/backfill-style methods only accept their own fixed direction, so
    # pandas' default is left in place for them.
    one_way_methods = ('pad', 'ffill', 'backfill', 'bfill')
    extra = {} if method in one_way_methods else {'limit_direction': 'both'}
    for attr in attributes:
        if attr in df.columns:
            df[attr] = df[attr].interpolate(method=method, **extra)
    return df
# Deprecated: interpolation is used instead, to ensure no missing values
def fill_with_NaN(df, attributes):
    """
    Fill missing values in each listed column with the literal string 'NaN'.

    Parameters:
    - df (DataFrame): The DataFrame to update; its columns are reassigned in place.
    - attributes (list): Column names to process; names not present in df are skipped.

    Returns:
    - df (DataFrame): The same DataFrame object that was passed in.
    """
    for name in attributes:
        if name not in df.columns:
            continue
        column = df[name]
        df[name] = column.fillna('NaN')
    return df
def replace_placeholders_with_nan(df):
    """
    Replaces common placeholders for missing values in object columns with np.nan.

    Matching is case-insensitive: each cell is converted to text and
    lower-cased before being compared, so "NA", "na", "Null" and "NONE" are
    all recognised. Only columns whose dtype is 'object' are examined.

    Parameters:
    - df (DataFrame): The DataFrame to process; its columns are reassigned in place.

    Returns:
    - df (DataFrame): Updated DataFrame with placeholders replaced.
    """
    # Compared against lower-cased cell text, so every entry here must be
    # lower case itself (upper-case entries could never match).
    placeholders = frozenset({"na", "null", "?", "", "nan", "none", "n/a"})

    def _to_nan(value):
        return np.nan if str(value).lower() in placeholders else value

    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].apply(_to_nan)
    return df