singhjagpreet committed on
Commit
bf0670d
1 Parent(s): eb2eadc

data transformation
logs/09_09_2023_20_29_47.log/09_09_2023_20_29_47.log ADDED
@@ -0,0 +1,9 @@
+ [ 2023-09-09 20:29:51,791 ] 25 root - INFO - Entered the data ingestion method or component
+ [ 2023-09-09 20:29:51,795 ] 28 root - INFO - read the dataset as dataframe
+ [ 2023-09-09 20:29:51,799 ] 37 root - INFO - Train test split initiated
+ [ 2023-09-09 20:29:51,804 ] 44 root - INFO - ingestion of data completed
+ [ 2023-09-09 20:29:51,806 ] 68 root - INFO - read train and test data completed
+ [ 2023-09-09 20:29:51,806 ] 70 root - INFO - obtaining preprocessing object
+ [ 2023-09-09 20:29:51,806 ] 44 root - INFO - numerical columns: ['writing_score', 'reading_score']
+ [ 2023-09-09 20:29:51,806 ] 51 root - INFO - categorical columns: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
+ [ 2023-09-09 20:29:51,806 ] 81 root - INFO - applying preprocessing object on training and testing dataframe
logs/09_09_2023_20_30_41.log/09_09_2023_20_30_41.log ADDED
@@ -0,0 +1,10 @@
+ [ 2023-09-09 20:30:42,388 ] 25 root - INFO - Entered the data ingestion method or component
+ [ 2023-09-09 20:30:42,391 ] 28 root - INFO - read the dataset as dataframe
+ [ 2023-09-09 20:30:42,394 ] 37 root - INFO - Train test split initiated
+ [ 2023-09-09 20:30:42,398 ] 44 root - INFO - ingestion of data completed
+ [ 2023-09-09 20:30:42,400 ] 68 root - INFO - read train and test data completed
+ [ 2023-09-09 20:30:42,400 ] 70 root - INFO - obtaining preprocessing object
+ [ 2023-09-09 20:30:42,400 ] 44 root - INFO - numerical columns: ['writing_score', 'reading_score']
+ [ 2023-09-09 20:30:42,400 ] 51 root - INFO - categorical columns: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
+ [ 2023-09-09 20:30:42,400 ] 81 root - INFO - applying preprocessing object on training and testing dataframe
+ [ 2023-09-09 20:30:42,408 ] 100 root - INFO - saved preprocessing object.
logs/09_09_2023_20_31_40.log/09_09_2023_20_31_40.log ADDED
@@ -0,0 +1,10 @@
+ [ 2023-09-09 20:31:41,630 ] 25 root - INFO - Entered the data ingestion method or component
+ [ 2023-09-09 20:31:41,633 ] 28 root - INFO - read the dataset as dataframe
+ [ 2023-09-09 20:31:41,635 ] 37 root - INFO - Train test split initiated
+ [ 2023-09-09 20:31:41,639 ] 44 root - INFO - ingestion of data completed
+ [ 2023-09-09 20:31:41,641 ] 68 root - INFO - read train and test data completed
+ [ 2023-09-09 20:31:41,641 ] 70 root - INFO - obtaining preprocessing object
+ [ 2023-09-09 20:31:41,641 ] 44 root - INFO - numerical columns: ['writing_score', 'reading_score']
+ [ 2023-09-09 20:31:41,641 ] 51 root - INFO - categorical columns: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
+ [ 2023-09-09 20:31:41,641 ] 81 root - INFO - applying preprocessing object on training and testing dataframe
+ [ 2023-09-09 20:31:41,648 ] 100 root - INFO - saved preprocessing object.
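The number after the timestamp in these logs is a %(lineno)d field, which is why it jumps around: records from data_ingestion.py (lines 25-44) and data_transformation.py (lines 44-100) interleave in the same file. src/logger.py itself is not part of this commit, but the [ timestamp ] lineno name - LEVEL - message layout, together with the doubled logs/<name>.log/<name>.log paths above, suggests a configuration along these lines (a hypothetical reconstruction, not the repository's file):

    import logging
    import os
    from datetime import datetime

    # Hypothetical sketch of src/logger.py, inferred from the log layout above.
    LOG_FILE = f"{datetime.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"

    # makedirs on a path ending in '<name>.log' would produce the
    # logs/<name>.log/<name>.log nesting seen in this commit.
    logs_dir = os.path.join(os.getcwd(), 'logs', LOG_FILE)
    os.makedirs(logs_dir, exist_ok=True)
    LOG_FILE_PATH = os.path.join(logs_dir, LOG_FILE)

    logging.basicConfig(
        filename=LOG_FILE_PATH,
        format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",
        level=logging.INFO,
    )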
src/__pycache__/utils.cpython-310.pyc ADDED
Binary file (651 Bytes)
src/components/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (158 Bytes)
src/components/__pycache__/data_transformation.cpython-310.pyc ADDED
Binary file (3.17 kB)
src/components/data_ingestion.py CHANGED
@@ -7,6 +7,9 @@ import pandas as pd
  from sklearn.model_selection import train_test_split
  from dataclasses import dataclass
 
+ from src.components.data_transformation import DataTransformation,DataTransformationConfig
+
+
  @dataclass
  class DataIngestionConfig:
      train_data_path: str=os.path.join('artifacts','train.csv')
@@ -49,4 +52,7 @@ class DataIngestion:
 
  if __name__ == '__main__':
      obj=DataIngestion()
-     obj.intiate_data_ingestion()
+     train_data_path, test_data_path = obj.intiate_data_ingestion()
+
+     data_transformation = DataTransformation()
+     data_transformation.initiate_data_transformation(train_data_path,test_data_path)
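Because the script now imports DataTransformation through the src package, it presumably has to be launched from the project root so that src is importable, e.g.:

    python -m src.components.data_ingestion

(running it as plain python src/components/data_ingestion.py would only resolve the src imports if the project root is on PYTHONPATH or the package is installed, e.g. via pip install -e .).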
src/components/data_transformation.py CHANGED
@@ -0,0 +1,111 @@
+ import sys
+ import os
+ from dataclasses import dataclass
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.compose import ColumnTransformer
+ from sklearn.impute import SimpleImputer
+ from sklearn.preprocessing import OneHotEncoder,StandardScaler
+ from sklearn.pipeline import Pipeline
+
+ from src.exception import CustomException
+ from src.logger import logging
+ from src.utils import save_object
+
+ @dataclass
+ class DataTransformationConfig:
+     preprocessor_ob_file_path = os.path.join('artifacts','preprocessor.pkl')
+
+ class DataTransformation:
+     def __init__(self):
+         self.data_transformation_config = DataTransformationConfig()
+         self.numerical_columns = ['writing_score','reading_score']
+         self.categorical_columns = ['gender',
+                                     'race_ethnicity',
+                                     'parental_level_of_education',
+                                     'lunch',
+                                     'test_preparation_course']
+         self.target_column_name = 'math_score'
+
+     def get_data_transformer_object(self):
+         """
+         Build and return the preprocessing object: a ColumnTransformer
+         combining a numerical and a categorical pipeline.
+         """
+         try:
+             # Numerical features: median imputation, then standardization.
+             num_pipeline = Pipeline(steps=[
+                 ('imputer',SimpleImputer(strategy='median')),
+                 ('scaler',StandardScaler())
+             ])
+             logging.info(f"numerical columns: {self.numerical_columns}")
+
+             # Categorical features: most-frequent imputation, then one-hot
+             # encoding. Note that drop='first' combined with
+             # handle_unknown='ignore' requires scikit-learn >= 1.0.
+             cat_pipeline = Pipeline(steps=[
+                 ('imputer',SimpleImputer(strategy='most_frequent')),
+                 ('ohe',OneHotEncoder(drop='first',handle_unknown='ignore'))
+             ])
+             logging.info(f"categorical columns: {self.categorical_columns}")
+
+             preprocessor = ColumnTransformer([
+                 ('num_pipeline',num_pipeline,self.numerical_columns),
+                 ('cat_pipeline',cat_pipeline,self.categorical_columns)
+             ])
+
+             return preprocessor
+         except Exception as e:
+             raise CustomException(e,sys)
+
+     def initiate_data_transformation(self, train_path, test_path):
+         try:
+             train_df = pd.read_csv(train_path)
+             test_df = pd.read_csv(test_path)
+
+             logging.info('read train and test data completed')
+             logging.info('obtaining preprocessing object')
+
+             preprocessing_obj = self.get_data_transformer_object()
+
+             # Separate input features from the target column.
+             input_feature_train_df = train_df.drop(self.target_column_name,axis=1)
+             target_feature_train_df = train_df[self.target_column_name]
+
+             input_feature_test_df = test_df.drop(self.target_column_name,axis=1)
+             target_feature_test_df = test_df[self.target_column_name]
+
+             logging.info("applying preprocessing object on training and testing dataframe")
+
+             # Fit on the training data only; reuse the fitted transform on the test data.
+             input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
+             input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)
+
+             # Append the target as the last column of each array.
+             train_arr = np.c_[
+                 input_feature_train_arr, np.array(target_feature_train_df)
+             ]
+             test_arr = np.c_[
+                 input_feature_test_arr, np.array(target_feature_test_df)
+             ]
+
+             save_object(
+                 file_path = self.data_transformation_config.preprocessor_ob_file_path,
+                 obj = preprocessing_obj
+             )
+             logging.info("saved preprocessing object.")
+
+             return (
+                 train_arr,
+                 test_arr,
+                 self.data_transformation_config.preprocessor_ob_file_path
+             )
+
+         except Exception as e:
+             raise CustomException(e,sys)
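initiate_data_transformation returns train_arr and test_arr with the target ('math_score') stacked as the last column, so a downstream consumer such as a model-trainer component (not part of this commit) would presumably split them back apart like this (a minimal sketch, not the repository's code):

    # train_arr / test_arr as returned by initiate_data_transformation;
    # the target was appended as the last column via np.c_.
    X_train, y_train = train_arr[:, :-1], train_arr[:, -1]
    X_test, y_test = test_arr[:, :-1], test_arr[:, -1]

The np.c_ stacking itself assumes ColumnTransformer returns a dense array; with these few one-hot columns the combined output appears dense enough for the default sparse_threshold to yield one, as the successful runs in the logs above suggest.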
src/utils.py CHANGED
@@ -0,0 +1,19 @@
+ import os
+ import sys
+
+ import numpy as np
+ import pandas as pd
+ import pickle
+
+ from src.exception import CustomException
+
+ def save_object(file_path, obj):
+     """Pickle obj to file_path, creating parent directories as needed."""
+     try:
+         dir_path = os.path.dirname(file_path)
+         os.makedirs(dir_path,exist_ok=True)
+         with open(file_path, 'wb') as file_obj:
+             pickle.dump(obj,file_obj)
+
+     except Exception as e:
+         raise CustomException(e,sys)
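save_object persists the fitted preprocessor with pickle; at prediction time something has to read it back. No loader is part of this commit, so the counterpart below is a hypothetical sketch (the name load_object and its placement in src/utils.py are assumptions):

    def load_object(file_path):
        # Hypothetical inverse of save_object: unpickle and return the object.
        try:
            with open(file_path, 'rb') as file_obj:
                return pickle.load(file_obj)
        except Exception as e:
            raise CustomException(e, sys)

    # e.g. preprocessor = load_object(os.path.join('artifacts', 'preprocessor.pkl'))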