import os import sys from dataclasses import dataclass import numpy as np import pandas as pd from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer from sklearn.preprocessing import OneHotEncoder, StandardScaler from src.logger import logging from src.exception import CustomException from src.utils import save_object @dataclass class DataTransformationConfig: preprocessor_obj_file_path = os.path.join("artifacts", "preprocessor.pkl") class DataTransformation: def __init__(self) -> None: self.data_transformation_config = DataTransformationConfig() def get_data_transformer_object(self): """ This function is responsible for data transformation """ try: numerical_columns = ["reading_score", "writing_score"] catogrical_columns = [ "gender", "race_ethnicity", "parental_level_of_education", "lunch", "test_preparation_course", ] num_pipeline = Pipeline( steps=[ ("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler()), ] ) logging.info("Numerical columns standard scaling completed") cat_pipeline = Pipeline( steps=[ ("imputer", SimpleImputer(strategy="most_frequent")), ("one_hot_encoder", OneHotEncoder()), # ("scaler", StandardScaler()), ] ) logging.info("Categorical columns encoding completed") logging.info(f"Numerical columns: {numerical_columns}") logging.info(f"Categorical columns: {catogrical_columns}") preprocessor = ColumnTransformer( transformers=[ ("num_pipeline", num_pipeline, numerical_columns), ("cat_pipeline", cat_pipeline, catogrical_columns), ] ) return preprocessor except Exception as e: raise CustomException(e, sys) def initiate_data_transformation(self, train_path, test_path): try: train_df = pd.read_csv(train_path) test_df = pd.read_csv(test_path) logging.info("Read train and test data completed") logging.info("Obtaining preprocessing object") preprocessing_obj = self.get_data_transformer_object() target_column_name = "math_score" # numerical_columns = (["reading_score", "writing_score"],) input_feature_train_df = train_df.drop(columns=[target_column_name], axis=1) target_feature_train_df = train_df[target_column_name] input_feature_test_df = test_df.drop(columns=[target_column_name], axis=1) target_feature_test_df = test_df[target_column_name] logging.info( f"Applying preprocessing object on training and testing dataframe" ) input_feature_train_arr = preprocessing_obj.fit_transform( input_feature_train_df ) input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df) train_arr = np.c_[ input_feature_train_arr, np.array(target_feature_train_df) ] test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)] save_object( file_path=self.data_transformation_config.preprocessor_obj_file_path, obj=preprocessing_obj, ) logging.info(f"Saved preprocessing object") return ( train_arr, test_arr, self.data_transformation_config.preprocessor_obj_file_path, ) except Exception as e: raise CustomException(e, sys)