molinari135 committed on
Commit
d449be0
1 Parent(s): 220a214

Delete product_return_prediction/modeling

product_return_prediction/modeling/__init__.py DELETED
File without changes
product_return_prediction/modeling/eval.py DELETED
@@ -1,101 +0,0 @@
- import pickle
- import typer
- import json
-
- import seaborn as sns
- import pandas as pd
- import matplotlib.pyplot as plt
-
- from loguru import logger
- from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
- from pathlib import Path
- from typing import Any
- from codecarbon import EmissionsTracker
-
- from product_return_prediction.dataset import scale_data_with_trained_scaler
- from product_return_prediction.config import (
-     MODELS_DIR,
-     PROCESSED_DATA_DIR,
-     TARGET_COLUMN,
-     REPORTS_DIR
- )
-
- app = typer.Typer()
-
-
- def evaluate_model(test_data: pd.DataFrame, scaler_file: Path, model: Any, model_name: str):
-     """
-     Evaluates the performance of a trained model on the provided test data.
-
-     The feature columns are scaled with a pre-trained scaler, the model makes predictions,
-     and accuracy is computed and logged. A detailed classification report is saved to a
-     JSON file, and the confusion matrix is plotted and saved so model performance can be
-     assessed visually. The emissions of the prediction step are tracked with CodeCarbon.
-
-     Args:
-         test_data (pd.DataFrame): The test dataset, which includes both features and the target column.
-         scaler_file (Path): Path to the pre-trained scaler file, used to scale the feature columns.
-         model (Any): The trained model object, used to make predictions on the test data.
-         model_name (str): The name of the model, used for naming the evaluation report.
-
-     Example:
-         ```python
-         evaluate_model(test_data, scaler_file=Path("scaler.pkl"), model=model, model_name="log_reg")
-         ```
-     """
-
-     X_test = test_data.drop(columns=[TARGET_COLUMN]).copy()
-     y_test = test_data[TARGET_COLUMN].copy()
-
-     X_test = scale_data_with_trained_scaler(X_test, scaler_file)
-
-     # Track the emissions of the prediction step with CodeCarbon
-     cc_file = f"{model_name}_emissions.csv"
-     tracker = EmissionsTracker(project_name="eval", output_dir=REPORTS_DIR, output_file=cc_file)
-     tracker.start()
-
-     y_pred = model.predict(X_test)
-
-     tracker.stop()
-
-     accuracy = accuracy_score(y_test, y_pred)
-     logger.info(f"Accuracy: {accuracy * 100:.2f}%")
-
-     report = classification_report(y_test, y_pred)
-     logger.info(f"Classification Report:\n{report}")
-
-     # Save a machine-readable copy of the report alongside the logged one
-     report_dict = classification_report(y_test, y_pred, output_dict=True)
-     with open(REPORTS_DIR / f"{model_name}.json", "w") as json_file:
-         json.dump(report_dict, json_file, indent=4)
-
-     cm = confusion_matrix(y_test, y_pred)
-     sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=model.classes_, yticklabels=model.classes_)
-     plt.title("Confusion Matrix")
-     plt.xlabel("Predicted Labels")
-     plt.ylabel("True Labels")
-
-     # Saving the confusion matrix in the reports/figures directory
-     plt.savefig(REPORTS_DIR / f"figures/cm_{model_name}.png", dpi=300, bbox_inches='tight')
-     plt.close()
-
-
- @app.command()
- def main(
-     test_file: Path = PROCESSED_DATA_DIR / "test.tsv",
-     scaler_file: Path = MODELS_DIR / "scaler.pkl",
-     log_reg_model_path: Path = MODELS_DIR / "log_reg.pkl",
-     svm_model_path: Path = MODELS_DIR / "svm.pkl",
- ):
-     test_data = pd.read_csv(test_file, sep='\t')
-
-     with open(log_reg_model_path, "rb") as f:
-         log_reg = pickle.load(f)
-
-     with open(svm_model_path, "rb") as f:
-         svm = pickle.load(f)
-
-     evaluate_model(test_data, scaler_file, log_reg, "log_reg_eval")
-     evaluate_model(test_data, scaler_file, svm, "svm_eval")
-
-
- if __name__ == "__main__":
-     app()
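For reference, the evaluation flow this commit removes can be reproduced with a short standalone script. This is a minimal sketch, assuming the pipeline's `test.tsv`, `scaler.pkl`, and model pickles still exist at their configured paths and that `scale_data_with_trained_scaler` remains available in `product_return_prediction.dataset`:

```python
# Minimal sketch of the removed evaluation flow. Paths and the scaling helper
# are assumptions carried over from the deleted eval.py, not a maintained API.
import pickle

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

from product_return_prediction.config import MODELS_DIR, PROCESSED_DATA_DIR, TARGET_COLUMN
from product_return_prediction.dataset import scale_data_with_trained_scaler

test_data = pd.read_csv(PROCESSED_DATA_DIR / "test.tsv", sep="\t")
X_test = test_data.drop(columns=[TARGET_COLUMN])
y_test = test_data[TARGET_COLUMN]

# Reuse the scaler fitted at training time, exactly as the deleted code did
X_test = scale_data_with_trained_scaler(X_test, MODELS_DIR / "scaler.pkl")

with open(MODELS_DIR / "log_reg.pkl", "rb") as f:
    model = pickle.load(f)

y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(classification_report(y_test, y_pred))
```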
 
product_return_prediction/modeling/predict.py DELETED
@@ -1,60 +0,0 @@
- from pathlib import Path
-
- import typer
- import pickle
- import json
- import pandas as pd
- from loguru import logger
- from codecarbon import EmissionsTracker
-
- from product_return_prediction.config import MODELS_DIR, INTERIM_DATA_DIR, EXTERNAL_DATA_DIR, REPORTS_DIR, RAW_DATA_DIR
- from product_return_prediction.dataset import prepare_inventory, scale_data_with_trained_scaler
-
- app = typer.Typer()
-
-
- @app.command()
- def main(
-     sales_path: Path = RAW_DATA_DIR / "sales.xlsx",
-     inventory_path: Path = EXTERNAL_DATA_DIR / "inventory.csv",
-     json_percentage: Path = INTERIM_DATA_DIR / "colour_return_percentage.json",
-     scaler_file: Path = MODELS_DIR / "scaler.pkl",
-     model_path: Path = MODELS_DIR / "svm.pkl",
- ):
-     sales = pd.read_excel(sales_path)
-     inventory = pd.read_csv(inventory_path)
-
-     with open(json_percentage, 'r') as f:
-         percentages = json.load(f)
-
-     # ---- Prepare inventory data for inference ----
-     inventory = prepare_inventory(sales, inventory, percentages)
-
-     with open(model_path, "rb") as f:
-         model = pickle.load(f)
-
-     # ---- Scale 5 random rows from the inventory ----
-     random_rows = inventory.sample(n=5)
-     logger.info(f"Your products:\n {random_rows}")
-     random_rows = scale_data_with_trained_scaler(random_rows, scaler_file)
-
-     # ---- Compute predictions and probabilities ----
-     cc_file = "svm_predict_emissions.csv"
-     tracker = EmissionsTracker(project_name="predict", output_dir=REPORTS_DIR, output_file=cc_file)
-     tracker.start()
-
-     predictions = model.predict(random_rows)
-     probabilities = model.predict_proba(random_rows)
-
-     tracker.stop()
-
-     # Report each prediction together with the probability of the predicted class
-     for pred, prob in zip(predictions, probabilities):
-         prob_confidence = prob.max()
-         if pred == 1:
-             logger.info(f"The product will be returned with {prob_confidence:.2f} confidence")
-         else:
-             logger.info(f"The product will NOT be returned with {prob_confidence:.2f} confidence")
-
-
- if __name__ == "__main__":
-     app()
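One subtlety in the removed prediction loop: `prob.max()` is the probability of whichever class the model predicted, not specifically the probability of a return. The standalone sketch below (toy data, plain scikit-learn, nothing from the deleted module) shows how the `predict_proba` columns line up with `classes_`. Note also that for `SVC(probability=True)` the Platt-scaled probabilities can occasionally disagree with `predict`, so a reported "confidence" may fall below 0.5; with `LogisticRegression` the two always agree.

```python
# How predict_proba columns relate to predict for a binary classifier.
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
clf = LogisticRegression().fit(X, y)

classes = list(clf.classes_)            # e.g. [0, 1]
for p_row, pred in zip(clf.predict_proba(X), clf.predict(X)):
    confidence = p_row.max()            # probability of the predicted class
    p_return = p_row[classes.index(1)]  # probability of class 1 ("returned")
    print(f"pred={pred}, confidence={confidence:.2f}, P(return)={p_return:.2f}")
```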
 
product_return_prediction/modeling/train.py DELETED
@@ -1,143 +0,0 @@
- import pickle
- from pathlib import Path
-
- import dagshub
- import mlflow
- import pandas as pd
- import typer
- from loguru import logger
- from sklearn.linear_model import LogisticRegression
- from sklearn.model_selection import GridSearchCV
- from sklearn.svm import SVC
- from codecarbon import EmissionsTracker
-
- from product_return_prediction.dataset import scale_data_with_trained_scaler
- from product_return_prediction.config import (
-     MODELS_DIR,
-     PROCESSED_DATA_DIR,
-     TARGET_COLUMN,
-     REPORTS_DIR
- )
-
- dagshub.init(repo_owner='se4ai2425-uniba', repo_name='product-return-prediction', mlflow=True)
-
- app = typer.Typer()
-
-
- # TODO The training dataset must have the following columns:
- # Product Type, Product Subtype, Product Gender, Net Sales (FA), Net Sales Units (FA),
- # TARGET_COLUMN, Product Order Count, Total Order Value, Main Material, Colour Return Percentage,
- # Total Customer Purchases, Total Customer Returns, Customer Return Percentage
- # TODO The scaler and model paths must be Pickle (.pkl) files
- def train_log_reg(train_data: pd.DataFrame, scaler_file: Path, model_path: Path):
-     """
-     Trains a Logistic Regression model on the provided training data and saves it to disk.
-
-     The feature columns are scaled with a pre-trained scaler before fitting. The run is
-     tracked with MLflow, the emissions of the fitting step are tracked with CodeCarbon,
-     and the trained model is pickled to the given path.
-
-     Args:
-         train_data (pd.DataFrame): The training data, including features and target column.
-         scaler_file (Path): Path to the pre-trained scaler file, used to scale the feature columns.
-         model_path (Path): Path where the trained Logistic Regression model will be saved.
-     """
-
-     run_name = model_path.stem
-     mlflow.start_run(run_name=run_name)
-     mlflow.sklearn.autolog()
-
-     # Apply scaling to the feature columns (excluding the target column)
-     X_train = train_data.drop(columns=[TARGET_COLUMN]).copy()
-     y_train = train_data[TARGET_COLUMN].copy()
-
-     # Scale X_train using the pre-trained scaler
-     X_train = scale_data_with_trained_scaler(X_train, scaler_file)
-
-     # Initialize the Logistic Regression model
-     model = LogisticRegression(max_iter=1000, class_weight="balanced")
-     logger.info(f"Model: {model}")
-
-     cc_file = "log_reg_train_emissions.csv"
-     tracker = EmissionsTracker(project_name="train", output_dir=REPORTS_DIR, output_file=cc_file)
-     tracker.start()
-
-     # Fit the model to the training data
-     model.fit(X_train, y_train)
-
-     tracker.stop()
-     mlflow.end_run()
-
-     # Save the trained model to disk
-     with open(model_path, "wb") as f:
-         pickle.dump(model, f)
-     logger.success(f"Model saved to {model_path}")
-
-
- # TODO The training dataset must have the following columns:
- # Product Type, Product Subtype, Product Gender, Net Sales (FA), Net Sales Units (FA),
- # TARGET_COLUMN, Product Order Count, Total Order Value, Main Material, Colour Return Percentage,
- # Total Customer Purchases, Total Customer Returns, Customer Return Percentage
- # TODO The scaler and model paths must be Pickle (.pkl) files
- def train_svm(train_data: pd.DataFrame, scaler_file: Path, model_path: Path):
-     """
-     Trains a Support Vector Machine (SVM) classifier with hyperparameter tuning and saves it to disk.
-
-     Hyperparameters are selected via grid search with cross-validation. The feature columns
-     are scaled with a pre-trained scaler before fitting. The run is tracked with MLflow,
-     the emissions of the final fit are tracked with CodeCarbon, and the trained model is
-     pickled to the given path.
-
-     Args:
-         train_data (pd.DataFrame): The training data, including features and target column.
-         scaler_file (Path): Path to the pre-trained scaler file, used to scale the feature columns.
-         model_path (Path): Path where the trained SVM model will be saved.
-     """
-
-     run_name = model_path.stem
-     mlflow.start_run(run_name=run_name)
-     mlflow.sklearn.autolog()
-
-     X_train = train_data.drop(columns=[TARGET_COLUMN]).copy()
-     y_train = train_data[TARGET_COLUMN].copy()
-
-     X_train = scale_data_with_trained_scaler(X_train, scaler_file)
-
-     param_grid = {"C": [0.1, 1, 10], "kernel": ["rbf"], "gamma": ["scale", "auto"]}
-
-     logger.info("Starting Grid Search for best hyperparameters")
-     grid_search = GridSearchCV(SVC(probability=True), param_grid, scoring="balanced_accuracy", cv=10)
-     grid_search.fit(X_train, y_train)
-     model = grid_search.best_estimator_
-
-     cc_file = "svm_train_emissions.csv"
-     tracker = EmissionsTracker(project_name="train", output_dir=REPORTS_DIR, output_file=cc_file)
-     tracker.start()
-
-     # Refit the best estimator so that only the final fit is measured by CodeCarbon
-     model.fit(X_train, y_train)
-
-     tracker.stop()
-     mlflow.end_run()
-
-     with open(model_path, "wb") as f:
-         pickle.dump(model, f)
-     logger.success(f"Model saved to {model_path}")
-
-
- @app.command()
- def main(
-     train_file: Path = PROCESSED_DATA_DIR / "train.tsv",
-     scaler_file: Path = MODELS_DIR / "scaler.pkl",
-     log_reg_model_path: Path = MODELS_DIR / "log_reg.pkl",
-     svm_model_path: Path = MODELS_DIR / "svm.pkl",
- ):
-     train_data = pd.read_csv(train_file, sep='\t')
-
-     # ---- Train models ----
-     train_log_reg(train_data, scaler_file, log_reg_model_path)
-     train_svm(train_data, scaler_file, svm_model_path)
-
-
- if __name__ == "__main__":
-     app()
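A note on the grid search above: with scikit-learn's default `refit=True`, `grid_search.best_estimator_` is already fitted on the full training set, so the extra `model.fit(X_train, y_train)` repeats that work; the deleted code apparently does this so that CodeCarbon measures exactly one final fit. A hedged alternative sketch (it changes what the emissions figure covers, not the resulting model) would wrap the whole search instead:

```python
# Alternative: track the entire grid search (all CV fits plus the final refit)
# instead of refitting the best estimator a second time just for measurement.
# X_train, y_train, and REPORTS_DIR are assumed to come from the surrounding
# script, as in the deleted train_svm.
from codecarbon import EmissionsTracker
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {"C": [0.1, 1, 10], "kernel": ["rbf"], "gamma": ["scale", "auto"]}
grid_search = GridSearchCV(SVC(probability=True), param_grid,
                           scoring="balanced_accuracy", cv=10)

tracker = EmissionsTracker(project_name="train", output_dir=REPORTS_DIR,
                           output_file="svm_train_emissions.csv")
tracker.start()
grid_search.fit(X_train, y_train)
tracker.stop()

model = grid_search.best_estimator_  # already refit on all of X_train
```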