{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "92e48866", "metadata": {}, "source": [ "## Model Training" ] }, { "cell_type": "markdown", "id": "25791a74", "metadata": {}, "source": [ "#### 1.1 Import Data and Required Packages\n", "##### Importing Pandas, NumPy, Matplotlib, Seaborn and the Warnings library." ] }, { "cell_type": "code", "execution_count": 2, "id": "b080dfb2", "metadata": {}, "outputs": [], "source": [ "# Basic Import\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# Modelling\n", "from sklearn.metrics import mean_squared_error, r2_score\n", "from sklearn.neighbors import KNeighborsRegressor\n", "from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor\n", "from sklearn.svm import SVR\n", "from sklearn.linear_model import LinearRegression, Ridge, Lasso\n", "from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n", "from sklearn.model_selection import RandomizedSearchCV\n", "from catboost import CatBoostRegressor\n", "from xgboost import XGBRegressor\n", "import warnings" ] }, { "cell_type": "markdown", "id": "e45079ad", "metadata": {}, "source": [ "#### Import the CSV Data as Pandas DataFrame" ] }, { "cell_type": "code", "execution_count": 3, "id": "e11c6255", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data/stud.csv')" ] }, { "cell_type": "markdown", "id": "20634923", "metadata": {}, "source": [ "#### Show Top 5 Records" ] }, { "cell_type": "code", "execution_count": 4, "id": "e7e412a2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderrace_ethnicityparental_level_of_educationlunchtest_preparation_coursemath_scorereading_scorewriting_score
0femalegroup Bbachelor's degreestandardnone727274
1femalegroup Csome collegestandardcompleted699088
2femalegroup Bmaster's degreestandardnone909593
3malegroup Aassociate's degreefree/reducednone475744
4malegroup Csome collegestandardnone767875
\n", "
" ], "text/plain": [ " gender race_ethnicity parental_level_of_education lunch \\\n", "0 female group B bachelor's degree standard \n", "1 female group C some college standard \n", "2 female group B master's degree standard \n", "3 male group A associate's degree free/reduced \n", "4 male group C some college standard \n", "\n", " test_preparation_course math_score reading_score writing_score \n", "0 none 72 72 74 \n", "1 completed 69 90 88 \n", "2 none 90 95 93 \n", "3 none 47 57 44 \n", "4 none 76 78 75 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "id": "fbd32281", "metadata": {}, "source": [ "#### Preparing X and Y variables" ] }, { "cell_type": "code", "execution_count": 5, "id": "56d72fde", "metadata": {}, "outputs": [], "source": [
  "# Feature frame: every column of df except the math_score column.\n",
  "X = df.drop(columns=\"math_score\")"
 ] }, { "cell_type": "code", "execution_count": 6, "id": "cd613177", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderrace_ethnicityparental_level_of_educationlunchtest_preparation_coursereading_scorewriting_score
0femalegroup Bbachelor's degreestandardnone7274
1femalegroup Csome collegestandardcompleted9088
2femalegroup Bmaster's degreestandardnone9593
3malegroup Aassociate's degreefree/reducednone5744
4malegroup Csome collegestandardnone7875
\n", "
" ], "text/plain": [ " gender race_ethnicity parental_level_of_education lunch \\\n", "0 female group B bachelor's degree standard \n", "1 female group C some college standard \n", "2 female group B master's degree standard \n", "3 male group A associate's degree free/reduced \n", "4 male group C some college standard \n", "\n", " test_preparation_course reading_score writing_score \n", "0 none 72 74 \n", "1 completed 90 88 \n", "2 none 95 93 \n", "3 none 57 44 \n", "4 none 78 75 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.head()" ] }, { "cell_type": "code", "execution_count": 7, "id": "f237ea14", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Categories in 'gender' variable: ['female' 'male']\n", "Categories in 'race_ethnicity' variable: ['group B' 'group C' 'group A' 'group D' 'group E']\n", "Categories in 'parental_level_of_education' variable: [\"bachelor's degree\" 'some college' \"master's degree\" \"associate's degree\"\n", " 'high school' 'some high school']\n", "Categories in 'lunch' variable: ['standard' 'free/reduced']\n", "Categories in 'test_preparation_course' variable: ['none' 'completed']\n" ] } ], "source": [
  "# Show the distinct values of every categorical (object-dtype) column.\n",
  "# Looping over the columns keeps each label identical to the real column name.\n",
  "for column in df.select_dtypes(include=\"object\").columns:\n",
  "    print(f\"Categories in '{column}' variable: {df[column].unique()}\")"
 ] }, { "cell_type": "code", "execution_count": 8, "id": "924b7f9d", "metadata": {}, "outputs": [], "source": [ "y = df[\"math_score\"]" ] }, { "cell_type": "code", 
"execution_count": 9, "id": "ffc69816", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 72\n", "1 69\n", "2 90\n", "3 47\n", "4 76\n", " ..\n", "995 88\n", "996 62\n", "997 59\n", "998 68\n", "999 77\n", "Name: math_score, Length: 1000, dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y" ] }, { "cell_type": "code", "execution_count": 10, "id": "1e290fe3", "metadata": {}, "outputs": [], "source": [ "# Create Column Transformer with 3 types of transformers\n", "num_features = X.select_dtypes(exclude=\"object\").columns\n", "cat_features = X.select_dtypes(include=\"object\").columns\n", "\n", "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", "from sklearn.compose import ColumnTransformer\n", "\n", "numeric_transformer = StandardScaler()\n", "oh_transformer = OneHotEncoder()\n", "\n", "preprocessor = ColumnTransformer(\n", " [\n", " (\"OneHotEncoder\", oh_transformer, cat_features),\n", " (\"StandardScaler\", numeric_transformer, num_features),\n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "id": "9c68f99a", "metadata": {}, "outputs": [], "source": [ "X = preprocessor.fit_transform(X)" ] }, { "cell_type": "code", "execution_count": 14, "id": "1f57b3ec", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 1. , 0. , 0. , ..., 1. ,\n", " 0.19399858, 0.39149181],\n", " [ 1. , 0. , 0. , ..., 0. ,\n", " 1.42747598, 1.31326868],\n", " [ 1. , 0. , 0. , ..., 1. ,\n", " 1.77010859, 1.64247471],\n", " ...,\n", " [ 1. , 0. , 0. , ..., 0. ,\n", " 0.12547206, -0.20107904],\n", " [ 1. , 0. , 0. , ..., 0. ,\n", " 0.60515772, 0.58901542],\n", " [ 1. , 0. , 0. , ..., 1. 
,\n", " 1.15336989, 1.18158627]])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 12, "id": "72459f1d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1000, 19)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.shape" ] }, { "cell_type": "code", "execution_count": 13, "id": "ed5c4e99", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((800, 19), (200, 19))" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# separate dataset into train and test\n",
  "from sklearn.model_selection import train_test_split\n",
  "\n",
  "X_train, X_test, y_train, y_test = train_test_split(\n",
  "    X, y, test_size=0.2, random_state=42\n",
  ")\n",
  "X_train.shape, X_test.shape"
 ] }, { "cell_type": "markdown", "id": "4cd80317", "metadata": {}, "source": [ "#### Create an Evaluate Function to give all metrics after model Training" ] }, { "cell_type": "code", "execution_count": 15, "id": "8c247bd0", "metadata": {}, "outputs": [], "source": [
  "def evaluate_model(true, predicted):\n",
  "    \"\"\"Score a set of predictions against the observed target values.\n",
  "\n",
  "    Args:\n",
  "        true: Observed target values.\n",
  "        predicted: Predicted values, in the same order as ``true``.\n",
  "\n",
  "    Returns:\n",
  "        Tuple ``(mae, rmse, r2)``: mean absolute error, root mean squared\n",
  "        error and R2 score, in that order.\n",
  "    \"\"\"\n",
  "    abs_error = mean_absolute_error(true, predicted)\n",
  "    # RMSE is the square root of the mean squared error.\n",
  "    root_mse = np.sqrt(mean_squared_error(true, predicted))\n",
  "    r2 = r2_score(true, predicted)\n",
  "    return abs_error, root_mse, r2"
 ] }, { "cell_type": "code", "execution_count": 16, "id": "79ccb8e7", "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Linear Regression\n", "Model performance for Training set\n", "- Root Mean Squared Error: 5.3231\n", "- Mean Absolute Error: 4.2667\n", "- R2 Score: 0.8743\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 5.3940\n", "- Mean Absolute Error: 4.2148\n", "- R2 Score: 0.8804\n", "===================================\n", "\n", "\n", "Lasso\n", "Model performance for 
Training set\n", "- Root Mean Squared Error: 6.5938\n", "- Mean Absolute Error: 5.2063\n", "- R2 Score: 0.8071\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 6.5197\n", "- Mean Absolute Error: 5.1579\n", "- R2 Score: 0.8253\n", "===================================\n", "\n", "\n", "Ridge\n", "Model performance for Training set\n", "- Root Mean Squared Error: 5.3233\n", "- Mean Absolute Error: 4.2650\n", "- R2 Score: 0.8743\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 5.3904\n", "- Mean Absolute Error: 4.2111\n", "- R2 Score: 0.8806\n", "===================================\n", "\n", "\n", "K-Neighbors Regressor\n", "Model performance for Training set\n", "- Root Mean Squared Error: 5.7079\n", "- Mean Absolute Error: 4.5168\n", "- R2 Score: 0.8555\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 7.2530\n", "- Mean Absolute Error: 5.6210\n", "- R2 Score: 0.7838\n", "===================================\n", "\n", "\n", "Decision Tree\n", "Model performance for Training set\n", "- Root Mean Squared Error: 0.2795\n", "- Mean Absolute Error: 0.0187\n", "- R2 Score: 0.9997\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 7.9294\n", "- Mean Absolute Error: 6.4050\n", "- R2 Score: 0.7416\n", "===================================\n", "\n", "\n", "Random Forest Regressor\n", "Model performance for Training set\n", "- Root Mean Squared Error: 2.3125\n", "- Mean Absolute Error: 1.8477\n", "- R2 Score: 0.9763\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 5.9646\n", "- Mean Absolute Error: 4.6275\n", "- R2 Score: 0.8538\n", "===================================\n", "\n", "\n", "XGBRegressor\n", "Model performance for Training set\n", "- Root Mean Squared Error: 1.0073\n", "- Mean 
Absolute Error: 0.6875\n", "- R2 Score: 0.9955\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 6.4733\n", "- Mean Absolute Error: 5.0577\n", "- R2 Score: 0.8278\n", "===================================\n", "\n", "\n", "CatBoosting Regressor\n", "Model performance for Training set\n", "- Root Mean Squared Error: 3.0427\n", "- Mean Absolute Error: 2.4054\n", "- R2 Score: 0.9589\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 6.0086\n", "- Mean Absolute Error: 4.6125\n", "- R2 Score: 0.8516\n", "===================================\n", "\n", "\n", "AdaBoost Regressor\n", "Model performance for Training set\n", "- Root Mean Squared Error: 5.8340\n", "- Mean Absolute Error: 4.7767\n", "- R2 Score: 0.8490\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 6.0478\n", "- Mean Absolute Error: 4.7079\n", "- R2 Score: 0.8497\n", "===================================\n", "\n", "\n" ] } ], "source": [
  "# Candidate regressors, keyed by the display name used in the report below.\n",
  "# NOTE(review): the tree/ensemble models are built without a fixed random_state,\n",
  "# so their metrics may change from run to run.\n",
  "models = {\n",
  "    \"Linear Regression\": LinearRegression(),\n",
  "    \"Lasso\": Lasso(),\n",
  "    \"Ridge\": Ridge(),\n",
  "    \"K-Neighbors Regressor\": KNeighborsRegressor(),\n",
  "    \"Decision Tree\": DecisionTreeRegressor(),\n",
  "    \"Random Forest Regressor\": RandomForestRegressor(),\n",
  "    \"XGBRegressor\": XGBRegressor(),\n",
  "    \"CatBoosting Regressor\": CatBoostRegressor(verbose=False),\n",
  "    \"AdaBoost Regressor\": AdaBoostRegressor(),\n",
  "}\n",
  "# Parallel lists: model_list[k] is the name whose test-set R2 is r2_list[k].\n",
  "model_list = []\n",
  "r2_list = []\n",
  "\n",
  "# Iterate over (name, estimator) pairs directly instead of rebuilding\n",
  "# list(models.keys()) / list(models.values()) on every iteration.\n",
  "for name, model in models.items():\n",
  "    model.fit(X_train, y_train)  # Train model\n",
  "\n",
  "    # Make predictions\n",
  "    y_train_pred = model.predict(X_train)\n",
  "    y_test_pred = model.predict(X_test)\n",
  "\n",
  "    # Evaluate Train and Test dataset\n",
  "    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(\n",
  "        y_train, y_train_pred\n",
  "    )\n",
  "    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(\n",
  "        y_test, y_test_pred\n",
  "    )\n",
  "\n",
  "    print(name)\n",
  "    model_list.append(name)\n",
  "\n",
  "    print(\"Model performance for Training set\")\n",
  "    print(\"- Root Mean Squared Error: {:.4f}\".format(model_train_rmse))\n",
  "    print(\"- Mean Absolute Error: {:.4f}\".format(model_train_mae))\n",
  "    print(\"- R2 Score: {:.4f}\".format(model_train_r2))\n",
  "\n",
  "    print(\"----------------------------------\")\n",
  "\n",
  "    print(\"Model performance for Test set\")\n",
  "    print(\"- Root Mean Squared Error: {:.4f}\".format(model_test_rmse))\n",
  "    print(\"- Mean Absolute Error: {:.4f}\".format(model_test_mae))\n",
  "    print(\"- R2 Score: {:.4f}\".format(model_test_r2))\n",
  "    r2_list.append(model_test_r2)\n",
  "\n",
  "    print(\"=\" * 35)\n",
  "    print(\"\\n\")"
 ] }, { "cell_type": "markdown", "id": "06480b5a", "metadata": {}, "source": [ "### Results" ] }, { "cell_type": "code", "execution_count": 17, "id": "e0159e5f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Model NameR2_Score
2Ridge0.880593
0Linear Regression0.880433
5Random Forest Regressor0.853797
7CatBoosting Regressor0.851632
8AdaBoost Regressor0.849691
6XGBRegressor0.827797
1Lasso0.825320
3K-Neighbors Regressor0.783813
4Decision Tree0.741615
\n", "
" ], "text/plain": [ " Model Name R2_Score\n", "2 Ridge 0.880593\n", "0 Linear Regression 0.880433\n", "5 Random Forest Regressor 0.853797\n", "7 CatBoosting Regressor 0.851632\n", "8 AdaBoost Regressor 0.849691\n", "6 XGBRegressor 0.827797\n", "1 Lasso 0.825320\n", "3 K-Neighbors Regressor 0.783813\n", "4 Decision Tree 0.741615" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Test-set R2 per model, best first.\n",
  "pd.DataFrame(\n",
  "    list(zip(model_list, r2_list)), columns=[\"Model Name\", \"R2_Score\"]\n",
  ").sort_values(by=[\"R2_Score\"], ascending=False)"
 ] }, { "cell_type": "markdown", "id": "357a7c1c", "metadata": {}, "source": [ "## Linear Regression" ] }, { "cell_type": "code", "execution_count": 18, "id": "9a6ad559", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "R2 score of the model on the test set: 88.04%\n" ] } ], "source": [
  "lin_model = LinearRegression(fit_intercept=True)\n",
  "lin_model = lin_model.fit(X_train, y_train)\n",
  "y_pred = lin_model.predict(X_test)\n",
  "\n",
  "# r2_score is the coefficient of determination of a regression, not an accuracy;\n",
  "# it is multiplied by 100 only so that it can be printed as a percentage.\n",
  "score = r2_score(y_test, y_pred) * 100\n",
  "print(\"R2 score of the model on the test set: %.2f%%\" % score)"
 ] }, { "cell_type": "markdown", "id": "1d31453e", "metadata": {}, "source": [ "## Plot y_pred and y_test" ] }, { "cell_type": "code", "execution_count": 19, "id": "eb557b0a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0, 0.5, 'Predicted')" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.scatter(y_test, y_pred)\n", "plt.xlabel(\"Actual\")\n", "plt.ylabel(\"Predicted\")" ] }, { "cell_type": "code", "execution_count": 22, "id": "1e707ec3", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sns.regplot(x=y_test, y=y_pred, ci=None, color=\"red\");" ] }, { "cell_type": "markdown", "id": "79c2fe28", "metadata": {}, "source": [ "#### Difference between Actual and Predicted Values" ] }, { "cell_type": "code", "execution_count": 23, "id": "7c9a8b48", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Actual ValuePredicted ValueDifference
5219176.38797014.612030
7375358.885970-5.885970
7408076.9902653.009735
6607476.851804-2.851804
4118487.627378-3.627378
............
4085243.4091498.590851
3326262.152214-0.152214
2087467.8883956.111605
6136567.022287-2.022287
786162.345132-1.345132
\n", "

200 rows × 3 columns

\n", "
" ], "text/plain": [ " Actual Value Predicted Value Difference\n", "521 91 76.387970 14.612030\n", "737 53 58.885970 -5.885970\n", "740 80 76.990265 3.009735\n", "660 74 76.851804 -2.851804\n", "411 84 87.627378 -3.627378\n", ".. ... ... ...\n", "408 52 43.409149 8.590851\n", "332 62 62.152214 -0.152214\n", "208 74 67.888395 6.111605\n", "613 65 67.022287 -2.022287\n", "78 61 62.345132 -1.345132\n", "\n", "[200 rows x 3 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Per-row comparison for the test split: Difference = actual - predicted.\n",
  "pred_df = pd.DataFrame(\n",
  "    {\"Actual Value\": y_test, \"Predicted Value\": y_pred}\n",
  ").assign(\n",
  "    Difference=lambda frame: frame[\"Actual Value\"] - frame[\"Predicted Value\"]\n",
  ")\n",
  "pred_df"
 ] }, { "cell_type": "code", "execution_count": null, "id": "3acf1fbc", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 5 }