# Streamline-Analyst/app/src/handle_null_value.py
import numpy as np
def contains_missing_value(df):
"""
Checks if the DataFrame contains any missing values.
"""
return df.isnull().values.any()
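
# Illustrative usage (the DataFrame below is hypothetical):
#   import pandas as pd
#   contains_missing_value(pd.DataFrame({"age": [25, None, 40]}))  # -> True
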
def fill_null_values(df, mean_list, median_list, mode_list, new_category_list, interpolation_list):
"""
Fills missing values in the DataFrame using specified methods for different columns.
Parameters:
- df (DataFrame): The DataFrame with missing values.
- mean_list (list): Columns to fill missing values with mean.
- median_list (list): Columns to fill missing values with median.
- mode_list (list): Columns to fill missing values with mode.
    - new_category_list (list): Columns originally meant to receive a new 'NaN' category; these are currently filled using interpolation instead.
- interpolation_list (list): Columns to fill missing values using interpolation.
Returns:
- df (DataFrame): The DataFrame after filling missing values.
"""
if mean_list:
df = fill_with_mean(df, mean_list)
if median_list:
df = fill_with_median(df, median_list)
if mode_list:
df = fill_with_mode(df, mode_list)
if new_category_list:
        # df = fill_with_NaN(df, new_category_list)  # deprecated: interpolation is used instead
df = fill_with_interpolation(df, new_category_list)
if interpolation_list:
df = fill_with_interpolation(df, interpolation_list)
return df
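
# A minimal sketch of how fill_null_values might be called; the column names
# below are hypothetical, not columns this project defines:
#   df = fill_null_values(
#       df,
#       mean_list=["age"],                 # numeric, roughly symmetric
#       median_list=["income"],            # numeric, skewed
#       mode_list=["city"],                # categorical
#       new_category_list=["score"],       # currently interpolated (see above)
#       interpolation_list=["temperature"],
#   )
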
def remove_high_null(df, threshold_row=0.5, threshold_col=0.7):
"""
    Removes rows and columns from a DataFrame where the proportion of null values
    exceeds the specified thresholds.
    Parameters:
    - df (DataFrame): The DataFrame to be processed.
    - threshold_row (float): Null-proportion threshold above which a row is dropped (default 0.5).
    - threshold_col (float): Null-proportion threshold above which a column is dropped (default 0.7).
    Returns:
    - df_cleaned (DataFrame): The DataFrame with high-null rows and columns removed.
"""
# Calculate the proportion of nulls in each column
null_prop_col = df.isnull().mean()
cols_to_drop = null_prop_col[null_prop_col > threshold_col].index
# Drop columns with high proportion of nulls
df_cleaned = df.drop(columns=cols_to_drop)
# Calculate the proportion of nulls in each row
null_prop_row = df_cleaned.isnull().mean(axis=1)
rows_to_drop = null_prop_row[null_prop_row > threshold_row].index
# Drop rows with high proportion of nulls
df_cleaned = df_cleaned.drop(index=rows_to_drop)
return df_cleaned
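
# Illustrative calls (thresholds are proportions of nulls, not counts):
#   df = remove_high_null(df)                     # drop columns >70% null, then rows >50% null
#   df = remove_high_null(df, threshold_row=0.3)  # stricter on rows
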
def fill_with_mean(df, attributes):
for attr in attributes:
if attr in df.columns:
df[attr] = df[attr].fillna(df[attr].mean())
return df
def fill_with_median(df, attributes):
for attr in attributes:
if attr in df.columns:
df[attr] = df[attr].fillna(df[attr].median())
return df
def fill_with_mode(df, attributes):
    for attr in attributes:
        if attr in df.columns:
            # mode() returns an empty Series when the column has no non-null values
            mode_values = df[attr].mode()
            if not mode_values.empty:
                df[attr] = df[attr].fillna(mode_values[0])
    return df
def fill_with_interpolation(df, attributes, method='linear'):
    # method: interpolation method passed to pandas.Series.interpolate; defaults to 'linear',
    # with alternatives such as 'time', 'index', 'pad', 'nearest', 'quadratic', and 'cubic'.
for attr in attributes:
if attr in df.columns:
df[attr] = df[attr].interpolate(method=method)
return df
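
# Note: with pandas' default limit_direction, linear interpolation fills forward
# only, so NaNs before the first valid value remain. A variant (an assumption,
# not what fill_null_values currently does) could pass limit_direction='both':
#   df[attr] = df[attr].interpolate(method=method, limit_direction='both')
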
# Deprecated: fill_null_values now uses interpolation for these columns instead of filling with the literal string 'NaN'
def fill_with_NaN(df, attributes):
for attr in attributes:
if attr in df.columns:
df[attr] = df[attr].fillna('NaN')
return df
def replace_placeholders_with_nan(df):
"""
Replaces common placeholders for missing values in object columns with np.nan.
Parameters:
- df (DataFrame): The DataFrame to process.
Returns:
- df (DataFrame): Updated DataFrame with placeholders replaced.
"""
placeholders = ["NA", "NULL", "?", "", "NaN", "None", "N/A", "n/a", "nan", "none"]
for col in df.columns:
if df[col].dtype == 'object':
df[col] = df[col].apply(lambda x: np.nan if str(x).lower() in placeholders else x)
return df
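
# A minimal, illustrative demo; the sample data is made up purely to exercise
# the helpers in this module.
if __name__ == "__main__":
    import pandas as pd

    sample = pd.DataFrame({
        "age": [25, None, 40, "NA"],          # mixes a placeholder string with a real missing value
        "city": ["NYC", "NULL", "LA", None],
    })
    sample = replace_placeholders_with_nan(sample)  # "NA", "NULL", None -> np.nan
    sample["age"] = pd.to_numeric(sample["age"])    # restore numeric dtype after cleaning
    print(contains_missing_value(sample))           # True
    sample = fill_null_values(sample, mean_list=["age"], median_list=[],
                              mode_list=["city"], new_category_list=[],
                              interpolation_list=[])
    print(contains_missing_value(sample))           # False
    print(sample)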