{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Incremental/Online Training with Scikit-learn and Production with Skops\nIn this tutorial, we will train online learning models with Scikit-learn, look at differences between warm and cold start and use skops to improve reproducibility!","metadata":{}},{"cell_type":"code","source":"# Pinned to the versions this notebook was originally run with (see the captured pip output below).\n# %pip (rather than !pip) installs into the environment of the running kernel.\n%pip install datasets==2.1.0 skops==0.3.0","metadata":{"_kg_hide-input":true,"execution":{"iopub.status.busy":"2022-12-01T10:31:20.498705Z","iopub.execute_input":"2022-12-01T10:31:20.499374Z","iopub.status.idle":"2022-12-01T10:31:48.092799Z","shell.execute_reply.started":"2022-12-01T10:31:20.499234Z","shell.execute_reply":"2022-12-01T10:31:48.091058Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"Requirement already satisfied: datasets in /opt/conda/lib/python3.7/site-packages (2.1.0)\nRequirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.7/site-packages (from datasets) (4.64.0)\nRequirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.7/site-packages (from datasets) (2.27.1)\nRequirement already satisfied: xxhash in /opt/conda/lib/python3.7/site-packages (from datasets) (3.0.0)\nRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.7/site-packages (from datasets) (1.21.6)\nRequirement already satisfied: packaging in /opt/conda/lib/python3.7/site-packages (from datasets) (21.3)\nRequirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from datasets) (4.11.4)\nRequirement already satisfied: multiprocess in /opt/conda/lib/python3.7/site-packages (from datasets) (0.70.13)\nRequirement already satisfied: 
aiohttp in /opt/conda/lib/python3.7/site-packages (from datasets) (3.8.1)\nRequirement already satisfied: fsspec[http]>=2021.05.0 in /opt/conda/lib/python3.7/site-packages (from datasets) (2022.5.0)\nRequirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /opt/conda/lib/python3.7/site-packages (from datasets) (0.7.0)\nRequirement already satisfied: pyarrow>=5.0.0 in /opt/conda/lib/python3.7/site-packages (from datasets) (8.0.0)\nRequirement already satisfied: responses<0.19 in /opt/conda/lib/python3.7/site-packages (from datasets) (0.18.0)\nRequirement already satisfied: dill in /opt/conda/lib/python3.7/site-packages (from datasets) (0.3.5.1)\nRequirement already satisfied: pandas in /opt/conda/lib/python3.7/site-packages (from datasets) (1.3.5)\nRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (6.0)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.7/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.6.0)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.1.1)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.7/site-packages (from packaging->datasets) (3.0.9)\nRequirement already satisfied: charset-normalizer~=2.0.0 in /opt/conda/lib/python3.7/site-packages (from requests>=2.19.0->datasets) (2.0.12)\nRequirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests>=2.19.0->datasets) (1.26.9)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests>=2.19.0->datasets) (3.3)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests>=2.19.0->datasets) (2022.6.15)\nRequirement already satisfied: asynctest==0.13.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) 
(0.13.0)\nRequirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (1.3.0)\nRequirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (21.4.0)\nRequirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (1.2.0)\nRequirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (6.0.2)\nRequirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (4.0.2)\nRequirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (1.7.2)\nRequirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->datasets) (3.8.0)\nRequirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.7/site-packages (from pandas->datasets) (2.8.2)\nRequirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.7/site-packages (from pandas->datasets) (2022.1)\nRequirement already satisfied: six>=1.5 in /opt/conda/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.16.0)\n\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0mCollecting skops\n Downloading skops-0.3.0-py3-none-any.whl (58 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.8/58.8 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hRequirement already satisfied: tabulate>=0.8.8 in /opt/conda/lib/python3.7/site-packages (from skops) (0.8.9)\nRequirement already satisfied: typing-extensions>=3.7 in /opt/conda/lib/python3.7/site-packages (from skops) (4.1.1)\nCollecting huggingface-hub>=0.10.1\n Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m182.4/182.4 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hRequirement already satisfied: scikit-learn>=0.24 in /opt/conda/lib/python3.7/site-packages (from skops) (1.0.2)\nRequirement already satisfied: requests in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.10.1->skops) (2.27.1)\nRequirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.10.1->skops) (21.3)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.10.1->skops) (3.6.0)\nRequirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.10.1->skops) (4.11.4)\nRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.10.1->skops) (6.0)\nRequirement already satisfied: tqdm in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.10.1->skops) (4.64.0)\nRequirement already satisfied: joblib>=0.11 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.24->skops) (1.1.0)\nRequirement already satisfied: scipy>=1.1.0 in /opt/conda/lib/python3.7/site-packages (from 
scikit-learn>=0.24->skops) (1.7.3)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.24->skops) (3.1.0)\nRequirement already satisfied: numpy>=1.14.6 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.24->skops) (1.21.6)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.7/site-packages (from packaging>=20.9->huggingface-hub>=0.10.1->skops) (3.0.9)\nRequirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->huggingface-hub>=0.10.1->skops) (3.8.0)\nRequirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->huggingface-hub>=0.10.1->skops) (1.26.9)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests->huggingface-hub>=0.10.1->skops) (2022.6.15)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->huggingface-hub>=0.10.1->skops) (3.3)\nRequirement already satisfied: charset-normalizer~=2.0.0 in /opt/conda/lib/python3.7/site-packages (from requests->huggingface-hub>=0.10.1->skops) (2.0.12)\nInstalling collected packages: huggingface-hub, skops\n Attempting uninstall: huggingface-hub\n Found existing installation: huggingface-hub 0.7.0\n Uninstalling huggingface-hub-0.7.0:\n Successfully uninstalled huggingface-hub-0.7.0\n\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\ncached-path 1.1.3 requires huggingface-hub<0.8.0,>=0.0.12, but you have huggingface-hub 0.11.1 which is incompatible.\u001b[0m\u001b[31m\n\u001b[0mSuccessfully installed huggingface-hub-0.11.1 skops-0.3.0\n\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"code","source":"# All imports for the notebook live in this cell: standard library first, then third-party.\nimport os\n\nimport datasets\nimport numpy as np\nimport pandas as pd\nimport sklearn.neural_network\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.pipeline import Pipeline","metadata":{"execution":{"iopub.status.busy":"2022-12-01T10:31:48.096112Z","iopub.execute_input":"2022-12-01T10:31:48.096713Z","iopub.status.idle":"2022-12-01T10:31:49.955513Z","shell.execute_reply.started":"2022-12-01T10:31:48.096642Z","shell.execute_reply":"2022-12-01T10:31:49.954231Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\n# os.walk yields nothing when the directory does not exist, so this cell is a no-op outside Kaggle.\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"execution":{"iopub.status.busy":"2022-12-01T10:31:49.957048Z","iopub.execute_input":"2022-12-01T10:31:49.957951Z","iopub.status.idle":"2022-12-01T10:31:49.965242Z","shell.execute_reply.started":"2022-12-01T10:31:49.957910Z","shell.execute_reply":"2022-12-01T10:31:49.964020Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"markdown","source":"We will use the `scikit-learn/auto-mpg` dataset, loaded with the Hugging Face `datasets` library.","metadata":{}},{"cell_type":"code","source":"dataset = datasets.load_dataset(\"scikit-learn/auto-mpg\")\ndf = pd.DataFrame(dataset[\"train\"])","metadata":{"execution":{"iopub.status.busy":"2022-12-01T10:31:49.968676Z","iopub.execute_input":"2022-12-01T10:31:49.969228Z","iopub.status.idle":"2022-12-01T10:31:51.003435Z","shell.execute_reply.started":"2022-12-01T10:31:49.969175Z","shell.execute_reply":"2022-12-01T10:31:51.002169Z"},"trusted":true},"execution_count":4,"outputs":[{"name":"stdout","text":"Downloading and preparing dataset csv/scikit-learn--auto-mpg to /root/.cache/huggingface/datasets/csv/scikit-learn--auto-mpg-a20aa45e3b31b7e3/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Downloading data files: 0%| | 0/1 [00:00, 
?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"bfb3169dfe4d42f39d57e953ab4e34e4"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading data: 0%| | 0.00/17.7k [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"cfb35911ebcb4cceb317e16ff0ab24de"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Extracting data files: 0%| | 0/1 [00:00, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"94aeae88d6ba4aa9b82b6bdcab0d9ea1"}},"metadata":{}},{"name":"stdout","text":"Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/scikit-learn--auto-mpg-a20aa45e3b31b7e3/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":" 0%| | 0/1 [00:00, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"0bcc9ce76b8342e5a40739307af181d5"}},"metadata":{}}]},{"cell_type":"markdown","source":"# Preprocessing data","metadata":{}},{"cell_type":"code","source":"df.head()","metadata":{"execution":{"iopub.status.busy":"2022-12-01T10:31:51.005413Z","iopub.execute_input":"2022-12-01T10:31:51.006894Z","iopub.status.idle":"2022-12-01T10:31:51.032966Z","shell.execute_reply.started":"2022-12-01T10:31:51.006832Z","shell.execute_reply":"2022-12-01T10:31:51.031271Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":" mpg cylinders displacement horsepower weight acceleration model year \\\n0 18.0 8 307.0 130 3504 12.0 70 \n1 15.0 8 350.0 165 3693 11.5 70 \n2 18.0 8 318.0 150 3436 11.0 70 \n3 16.0 8 304.0 150 3433 12.0 70 \n4 17.0 8 302.0 140 3449 10.5 70 \n\n origin car name \n0 1 chevrolet chevelle malibu \n1 1 buick skylark 320 \n2 1 
plymouth satellite \n3 1 amc rebel sst \n4 1 ford torino ","text/html":"
\n | mpg | \ncylinders | \ndisplacement | \nhorsepower | \nweight | \nacceleration | \nmodel year | \norigin | \ncar name | \n
---|---|---|---|---|---|---|---|---|---|
0 | \n18.0 | \n8 | \n307.0 | \n130 | \n3504 | \n12.0 | \n70 | \n1 | \nchevrolet chevelle malibu | \n
1 | \n15.0 | \n8 | \n350.0 | \n165 | \n3693 | \n11.5 | \n70 | \n1 | \nbuick skylark 320 | \n
2 | \n18.0 | \n8 | \n318.0 | \n150 | \n3436 | \n11.0 | \n70 | \n1 | \nplymouth satellite | \n
3 | \n16.0 | \n8 | \n304.0 | \n150 | \n3433 | \n12.0 | \n70 | \n1 | \namc rebel sst | \n
4 | \n17.0 | \n8 | \n302.0 | \n140 | \n3449 | \n10.5 | \n70 | \n1 | \nford torino | \n
\n | c0 | \nc1 | \nc2 | \nmse_test | \ni | \n
---|---|---|---|---|---|
0 | \n0.000001 | \n0.000012 | \n0.000443 | \n224.463539 | \n0 | \n
1 | \n0.000003 | \n0.000018 | \n0.000598 | \n240.792368 | \n108 | \n
2 | \n0.000005 | \n0.000024 | \n0.000753 | \n531.940399 | \n352 | \n
3 | \n0.000005 | \n0.000026 | \n0.000788 | \n627.109381 | \n167 | \n
4 | \n0.000003 | \n0.000017 | \n0.000501 | \n226.493117 | \n60 | \n
... | \n... | \n... | \n... | \n... | \n... | \n
289 | \n0.000132 | \n-0.000018 | \n-0.020651 | \n261.402379 | \n214 | \n
290 | \n0.000135 | \n-0.000005 | \n-0.020280 | \n333.857844 | \n183 | \n
291 | \n0.000135 | \n-0.000007 | \n-0.020347 | \n255.845520 | \n340 | \n
292 | \n0.000134 | \n-0.000009 | \n-0.020401 | \n209.395166 | \n234 | \n
293 | \n0.000138 | \n0.000007 | \n-0.020044 | \n1102.661045 | \n349 | \n
294 rows × 5 columns
\n