Streamline-Analyst / app /src /preprocess.py
Wilson-ZheLin
Initial commit
9183c57
raw
history blame contribute delete
No virus
5.34 kB
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, PowerTransformer
def convert_to_numeric(df, convert_int_cols_list, one_hot_cols_list, drop_cols):
    """
    Run the numeric-conversion pipeline on a DataFrame.

    The steps run in a fixed order: integer-encode the requested columns,
    one-hot encode the requested columns, then drop the unwanted columns.
    Names in ``drop_cols`` that are not present in the frame are ignored.

    :param df: Pandas DataFrame to be processed.
    :param convert_int_cols_list: Column names to integer-encode.
    :param one_hot_cols_list: Column names to one-hot encode.
    :param drop_cols: Column names to remove from the result.
    :return: A tuple ``(converted_df, mappings)`` where ``mappings`` is a dict
             with the keys 'integer_mappings' and 'one_hot_mappings', holding
             the mapping dictionaries returned by the two encoding steps.
    """
    encoded, integer_mappings = convert_to_integer(df, convert_int_cols_list)
    encoded, one_hot_mappings = convert_to_one_hot(encoded, one_hot_cols_list)
    # errors='ignore' keeps the drop from raising on absent column names.
    result = encoded.drop(columns=drop_cols, errors='ignore')
    return result, {
        'integer_mappings': integer_mappings,
        'one_hot_mappings': one_hot_mappings,
    }
def convert_to_integer(df, columns_to_convert=None):
    """
    Integer-encode the specified object-dtype columns of a DataFrame.

    Each distinct value of a converted column is assigned an integer code in
    order of first appearance. Listed columns whose dtype is not ``object``
    are left unchanged and get no entry in the returned mappings. The input
    DataFrame is not modified; the encoding is applied to a copy.

    :param df: Pandas DataFrame to be processed.
    :param columns_to_convert: List of column names to be converted to integer
                               codes. ``None`` (the default) converts nothing.
    :return: A tuple with two elements:
             1. A copy of the DataFrame with the specified columns encoded.
             2. Dictionary ``{column: {code: original_value}}`` for each
                converted column.
    :raises KeyError: If a listed column is not present in the DataFrame.
    """
    # A literal [] default would be shared across calls; use None instead.
    if columns_to_convert is None:
        columns_to_convert = []

    mappings = {}
    # Work on a copy so the caller's frame keeps its original values.
    converted = df.copy()
    for column in columns_to_convert:
        if converted[column].dtype != 'object':
            continue
        # code -> original value, in order of first appearance
        code_to_value = dict(enumerate(converted[column].unique()))
        mappings[column] = code_to_value
        # Invert the mapping to translate the column's values into codes.
        value_to_code = {value: code for code, value in code_to_value.items()}
        converted[column] = converted[column].map(value_to_code)
    return converted, mappings
def convert_to_one_hot(df, columns_to_convert=None):
    """
    One-hot encode the specified object/category columns of a DataFrame.

    For every listed column whose dtype is ``object`` or ``category``, dummy
    columns named ``<column>_<value>`` are appended and the original column is
    dropped. Listed columns of any other dtype are left unchanged. The input
    DataFrame is not modified.

    :param df: Pandas DataFrame to be processed.
    :param columns_to_convert: List of column names to be one-hot encoded.
                               ``None`` (the default) converts nothing.
    :return: A tuple with two elements:
             1. A copy of the DataFrame with the specified columns replaced by
                their dummy columns.
             2. Dictionary ``{column: {original_value: dummy_column_name}}``
                for each converted column. Missing values are omitted because
                ``pd.get_dummies`` creates no dummy column for them.
    :raises KeyError: If a listed column is not present in the DataFrame.
    """
    # A literal [] default would be shared across calls; use None instead.
    if columns_to_convert is None:
        columns_to_convert = []

    mappings = {}
    df_modified = df.copy()
    for column in columns_to_convert:
        dtype = df[column].dtype
        # Only categorical-like columns are encoded.
        if dtype == 'object' or dtype == 'category':
            one_hot = pd.get_dummies(df[column], prefix=column)
            # Append the dummy columns, then remove the source column.
            df_modified = pd.concat([df_modified, one_hot], axis=1)
            df_modified = df_modified.drop(column, axis=1)
            # Skip NaN/None: get_dummies (dummy_na=False) emits no column for
            # missing values, so the mapping must not advertise one.
            mappings[column] = {
                value: f"{column}_{value}"
                for value in df[column].dropna().unique()
            }
    return df_modified, mappings
def remove_rows_with_empty_target(df, Y_name):
    """
    Drop every row whose target value is missing.

    :param df: Pandas DataFrame to be processed.
    :param Y_name: Name of the target column that is checked for missing (NaN) values.
    :return: New DataFrame containing only the rows where the target column is present.
    """
    target_columns = [Y_name]
    # Only the target column decides whether a row is kept.
    return df.dropna(subset=target_columns)
def remove_duplicates(df):
    """
    Return the DataFrame without repeated rows.

    :param df: Pandas DataFrame to be processed.
    :return: New DataFrame in which each distinct row appears once
             (the first occurrence is the one that is kept).
    """
    deduplicated = df.drop_duplicates()
    return deduplicated
def transform_data_for_clustering(df, skew_threshold=0.5):
    """
    Transform the numeric columns of a DataFrame for clustering.

    Strictly positive numeric columns whose absolute skewness exceeds
    ``skew_threshold`` are passed through a Box-Cox PowerTransformer; all
    numeric columns are then standardized with a StandardScaler. Non-numeric
    columns are left untouched. The input DataFrame is not modified.

    :param df: Pandas DataFrame to be transformed.
    :param skew_threshold: Absolute skewness above which a strictly positive
                           column is Box-Cox transformed. Defaults to 0.5.
    :return: A copy of the DataFrame with transformed numeric columns. If the
             frame has no numeric columns or no rows, the copy is returned
             unchanged.
    """
    transformed_df = df.copy()
    numeric_cols = transformed_df.select_dtypes(include=[np.number]).columns

    # Nothing to scale: fitting the scaler on an empty selection would raise.
    if len(numeric_cols) == 0 or len(transformed_df) == 0:
        return transformed_df

    # standardize=False because every numeric column is standardized below.
    pt = PowerTransformer(method='box-cox', standardize=False)
    for col in numeric_cols:
        values = transformed_df[col]
        # Box-Cox is only applied to strictly positive data.
        if not (values > 0).all():
            continue
        if abs(stats.skew(values)) > skew_threshold:
            transformed_df[col] = pt.fit_transform(transformed_df[[col]]).ravel()

    scaler = StandardScaler()
    transformed_df[numeric_cols] = scaler.fit_transform(transformed_df[numeric_cols])
    return transformed_df