import os
import sys
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from src.logger import logging
from src.exception import CustomException
from src.utils import save_object


@dataclass
class DataTransformationConfig:
    preprocessor_obj_file_path: str = os.path.join("artifacts", "preprocessor.pkl")


class DataTransformation:
    def __init__(self) -> None:
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformer_object(self):
        """Build and return the preprocessing object for numerical and categorical features."""
        try:
            numerical_columns = ["reading_score", "writing_score"]
            categorical_columns = [
                "gender",
                "race_ethnicity",
                "parental_level_of_education",
                "lunch",
                "test_preparation_course",
            ]

            # Numerical features: impute missing values with the median, then standardize.
            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ]
            )
            logging.info("Numerical pipeline created: median imputation and standard scaling")

            # Categorical features: impute with the most frequent value, then one-hot encode.
            cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("one_hot_encoder", OneHotEncoder()),
                ]
            )
            logging.info("Categorical pipeline created: mode imputation and one-hot encoding")

            logging.info(f"Numerical columns: {numerical_columns}")
            logging.info(f"Categorical columns: {categorical_columns}")

            preprocessor = ColumnTransformer(
                transformers=[
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipeline", cat_pipeline, categorical_columns),
                ]
            )

            return preprocessor

        except Exception as e:
            raise CustomException(e, sys)

    def initiate_data_transformation(self, train_path, test_path):
        """Read the train and test CSVs, fit and apply the preprocessing object, and persist it."""
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)
            logging.info("Reading train and test data completed")
            logging.info("Obtaining preprocessing object")

            preprocessing_obj = self.get_data_transformer_object()

            target_column_name = "math_score"

            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            logging.info("Applying preprocessing object on training and testing dataframes")

            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            # Re-attach the target column so downstream components receive a single array.
            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj,
            )
            logging.info("Saved preprocessing object")

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )
        except Exception as e:
            raise CustomException(e, sys)
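

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the train/test CSV paths
    # below are assumptions; point them at the output of your data-ingestion step.
    data_transformation = DataTransformation()
    train_arr, test_arr, preprocessor_path = data_transformation.initiate_data_transformation(
        os.path.join("artifacts", "train.csv"),
        os.path.join("artifacts", "test.csv"),
    )
    logging.info(
        f"Transformed train array shape: {train_arr.shape}, "
        f"test array shape: {test_arr.shape}, preprocessor saved at: {preprocessor_path}"
    )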