{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "51a9bb64-969a-4fc0-aa76-4bd42b08c21a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from catboost import CatBoostRegressor, Pool\n",
    "from datasets import load_dataset\n",
    "from dotenv import load_dotenv\n",
    "from huggingface_hub import HfFolder, login\n",
    "from sklearn.metrics import (\n",
    "    mean_absolute_error,\n",
    "    mean_squared_error,\n",
    "    r2_score,\n",
    "    root_mean_squared_error,\n",
    ")\n",
    "from transformers import (\n",
    "    AutoModelForSequenceClassification,\n",
    "    AutoTokenizer,\n",
    "    Trainer,\n",
    "    TrainingArguments,\n",
    ")\n",
    "\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9b53c782-5dbb-4dd6-b541-8e4fab3f3ddf",
   "metadata": {},
   "outputs": [],
   "source": [
    "login(token=os.getenv(\"HUGGINGFACE_API_KEY\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87f58ec1-231d-4c17-8af2-c366af55e375",
   "metadata": {},
   "source": [
    "### Dataset prep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0ea777d8-988b-421a-8d76-b0f9256ab61b",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_dataset = load_dataset(\"Forecast-ing/email-clickthrough\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0c8441e7-1606-4c61-8f6c-34ebe1e107c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_dataset = raw_dataset.rename_column(\"label\", \"labels\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "201b806d-6c94-4053-98b6-4d22dcdda08a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3292"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_dataset[\"train\"].to_pandas()[\"text\"].str.len().max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "cbd4d6ec-b293-49f1-925f-239549dab61e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.2427007299270073"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(raw_dataset[\"train\"].to_pandas()[\"text\"].str.len() > 2048).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1101c038-3f83-4055-b938-3861ac43cf8f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    548.000000\n",
       "mean       2.879635\n",
       "std        2.423870\n",
       "min        0.450000\n",
       "25%        1.510000\n",
       "50%        2.025000\n",
       "75%        3.267500\n",
       "max       25.370000\n",
       "Name: labels, dtype: float64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_dataset[\"train\"].to_pandas()[\"labels\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7546577f-4d7f-41b5-a68f-095fc0e8eec4",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, seed=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a0c28ab6-20f8-47dc-8ee9-179ea15830e0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train dataset size: 493\n",
      "Test dataset size: 55\n"
     ]
    }
   ],
   "source": [
    "print(f\"Train dataset size: {len(raw_dataset['train'])}\")\n",
    "print(f\"Test dataset size: {len(raw_dataset['test'])}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a2c3e7c3-e31d-4d8f-8d7d-e62050a9ae9d",
   "metadata": {},
   "source": [
    "### Catboost Benchmark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "01aaea26-e1df-4493-b9cd-732c3b7a76a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "catboost_train = raw_dataset[\"train\"].to_pandas()\n",
    "catboost_test = raw_dataset[\"test\"].to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0243e07d-69ba-41e5-b54d-d0f4988bbf9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "text_columns = [\"text\"]\n",
    "label = \"labels\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ba17040c-5882-47dc-a8af-a7557356840f",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_pool = Pool(\n",
    "    data=catboost_train[text_columns],\n",
    "    label=catboost_train[label],\n",
    "    text_features=text_columns,\n",
    ")\n",
    "test_pool = Pool(\n",
    "    data=catboost_test[text_columns],\n",
    "    label=catboost_test[label],\n",
    "    text_features=text_columns,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "d8b3768e-6f30-41ce-a209-bd915a997d8a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Learning rate set to 0.045569\n",
      "0:\tlearn: 2.4332854\ttest: 1.8670741\tbest: 1.8670741 (0)\ttotal: 60.5ms\tremaining: 1m\n",
      "100:\tlearn: 1.4972558\ttest: 1.6247590\tbest: 1.6048404 (59)\ttotal: 2.5s\tremaining: 22.2s\n",
      "200:\tlearn: 1.1104040\ttest: 1.6015944\tbest: 1.5975296 (197)\ttotal: 4.91s\tremaining: 19.5s\n",
      "300:\tlearn: 0.8568033\ttest: 1.6102309\tbest: 1.5975296 (197)\ttotal: 7.33s\tremaining: 17s\n",
      "400:\tlearn: 0.7096792\ttest: 1.6090190\tbest: 1.5975296 (197)\ttotal: 9.72s\tremaining: 14.5s\n",
      "500:\tlearn: 0.6056532\ttest: 1.6083240\tbest: 1.5975296 (197)\ttotal: 12.1s\tremaining: 12s\n",
      "600:\tlearn: 0.5298016\ttest: 1.6175366\tbest: 1.5975296 (197)\ttotal: 14.5s\tremaining: 9.64s\n",
      "700:\tlearn: 0.4701467\ttest: 1.6262668\tbest: 1.5975296 (197)\ttotal: 16.9s\tremaining: 7.23s\n",
      "800:\tlearn: 0.4233732\ttest: 1.6199203\tbest: 1.5975296 (197)\ttotal: 19.4s\tremaining: 4.81s\n",
      "900:\tlearn: 0.3837074\ttest: 1.6104091\tbest: 1.5975296 (197)\ttotal: 21.8s\tremaining: 2.39s\n",
      "999:\tlearn: 0.3501113\ttest: 1.6131207\tbest: 1.5975296 (197)\ttotal: 24.2s\tremaining: 0us\n",
      "\n",
      "bestTest = 1.597529566\n",
      "bestIteration = 197\n",
      "\n",
      "Shrink model to first 198 iterations.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<catboost.core.CatBoostRegressor at 0x7fb1061c5bb0>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = CatBoostRegressor(loss_function=\"RMSE\", verbose=100)\n",
    "\n",
    "model.fit(train_pool, eval_set=test_pool)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "837b22a8-241d-49b3-a1ae-915893121319",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(test_pool)\n",
    "y_val = catboost_test[label]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "478521bf-85be-49a1-8461-020587f146d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def smape(y_true, y_pred):\n",
    "    return 100 * np.mean(\n",
    "        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))\n",
    "    )\n",
    "\n",
    "\n",
    "def calculate_metrics(y_val, y_pred):\n",
    "    mse = mean_squared_error(y_val, y_pred)\n",
    "    rmse = np.sqrt(mse)\n",
    "    mae = mean_absolute_error(y_val, y_pred)\n",
    "    r2 = r2_score(y_val, y_pred)\n",
    "    smape_value = smape(y_val, y_pred)\n",
    "    return {\n",
    "        \"mse\": mse,\n",
    "        \"rmse\": rmse,\n",
    "        \"mae\": mae,\n",
    "        \"r2\": r2,\n",
    "        \"smape\": smape_value,\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "91ea95e2-1818-45a6-8725-3a1353cb5b97",
   "metadata": {},
   "outputs": [],
   "source": [
    "catboost_metrics = calculate_metrics(y_val, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "e28e359e-f69c-4ee8-9bbd-e7afafafcd26",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'mse': 2.552100633998035,\n",
       " 'rmse': 1.5975295408843102,\n",
       " 'mae': 1.1439370629666958,\n",
       " 'r2': 0.30127932054387174,\n",
       " 'smape': 37.63064694052479}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "catboost_metrics"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7afac97a-1e69-47e4-9ecd-ece7ebe7b48f",
   "metadata": {},
   "source": [
    "### Fine Tuning Modern Bert"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "031df047-2c18-4ec9-a498-596a7cf965b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_id = \"answerdotai/ModernBERT-base\"\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
    "tokenizer.model_max_length = 2048\n",
    "\n",
    "def tokenize(batch):\n",
    "    return tokenizer(\n",
    "        batch[\"text\"], padding=\"max_length\", truncation=True, return_tensors=\"pt\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "bbb711e8-8da6-401b-a2bd-c1637731f6c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_dataset = raw_dataset.map(tokenize, batched=True, remove_columns=[\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "ca4c2942-bf9e-4579-82b9-654fbee85b54",
   "metadata": {},
   "outputs": [],
   "source": [
    "def model_init(trial):\n",
    "    model = AutoModelForSequenceClassification.from_pretrained(\n",
    "        model_id, num_labels=1, ignore_mismatched_sizes=True, problem_type=\"regression\"\n",
    "    )\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a2005499-e139-4151-9ebe-2759710149b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def gen_training_args(additional_args={}):\n",
    "    default_args = {\n",
    "        \"output_dir\": \"./modernBERT-content-regression\",\n",
    "        \"per_device_eval_batch_size\": 4,\n",
    "        \"per_device_train_batch_size\": 4,\n",
    "        \"num_train_epochs\": 5,\n",
    "        \"bf16\": True,  # bfloat16 training\n",
    "        \"optim\": \"adamw_torch_fused\",  # improved optimizer\n",
    "        \"logging_strategy\": \"steps\",\n",
    "        \"logging_steps\": 1,\n",
    "        \"evaluation_strategy\": \"epoch\",\n",
    "        \"save_strategy\": \"epoch\",\n",
    "        \"save_total_limit\": 1,\n",
    "        \"metric_for_best_model\": \"rmse\",\n",
    "        \"greater_is_better\": False,\n",
    "        \"report_to\": \"tensorboard\",\n",
    "        \"push_to_hub\": True,\n",
    "        \"hub_private_repo\": True,\n",
    "        \"hub_strategy\": \"every_save\",\n",
    "        \"hub_token\": HfFolder.get_token(),\n",
    "    }\n",
    "    training_args = TrainingArguments(**default_args, **additional_args)\n",
    "    return training_args"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "e7e0ee17-9a1c-4789-8a4b-d2469a88837b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_metrics_for_regression(eval_pred):\n",
    "    predictions, labels = eval_pred\n",
    "    predictions = predictions.reshape(-1, 1)\n",
    "    results =  calculate_metrics(labels, predictions)\n",
    "    return results\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "10981164-fffc-4128-89ae-41c9238074cd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/home/robin/Development/modernbert-content-regression/.venv/lib/python3.12/site-packages/transformers/training_args.py:1573: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
      "  warnings.warn(\n",
      "/tmp/ipykernel_22314/2727960756.py:1: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
      "  hp_trainer = Trainer(\n",
      "Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "hp_trainer = Trainer(\n",
    "    model=None,\n",
    "    args=gen_training_args(),\n",
    "    train_dataset=tokenized_dataset[\"train\"],\n",
    "    eval_dataset=tokenized_dataset[\"test\"],\n",
    "    tokenizer=tokenizer,\n",
    "    compute_metrics=compute_metrics_for_regression,\n",
    "    model_init=model_init,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "261a4988-e5f9-469f-b8ae-ef51ae6a95df",
   "metadata": {},
   "outputs": [],
   "source": [
    "def optuna_hp_space(trial):\n",
    "    return {\n",
    "        \"learning_rate\": trial.suggest_float(\"learning_rate\", 5e-7, 5e-5, log=True),\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "7e7fb4f6-4eb4-4a4c-931d-a97c1614a5b9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[I 2025-01-09 12:16:25,726] A new study created in memory with name: no-name-2f3f9073-d130-4bb1-9447-7262f2b7bd75\n",
      "Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='620' max='620' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [620/620 03:27, Epoch 5/5]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Mse</th>\n",
       "      <th>Rmse</th>\n",
       "      <th>Mae</th>\n",
       "      <th>R2</th>\n",
       "      <th>Smape</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.238000</td>\n",
       "      <td>4.573008</td>\n",
       "      <td>4.573008</td>\n",
       "      <td>2.138459</td>\n",
       "      <td>1.324540</td>\n",
       "      <td>-0.252010</td>\n",
       "      <td>54.242009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>3.768500</td>\n",
       "      <td>4.093452</td>\n",
       "      <td>4.093452</td>\n",
       "      <td>2.023228</td>\n",
       "      <td>1.458057</td>\n",
       "      <td>-0.120716</td>\n",
       "      <td>53.770840</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>27.661000</td>\n",
       "      <td>3.361875</td>\n",
       "      <td>3.361874</td>\n",
       "      <td>1.833541</td>\n",
       "      <td>1.126670</td>\n",
       "      <td>0.079577</td>\n",
       "      <td>52.641284</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.092300</td>\n",
       "      <td>2.759459</td>\n",
       "      <td>2.759459</td>\n",
       "      <td>1.661162</td>\n",
       "      <td>1.040074</td>\n",
       "      <td>0.244508</td>\n",
       "      <td>53.009331</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.020300</td>\n",
       "      <td>2.733250</td>\n",
       "      <td>2.733250</td>\n",
       "      <td>1.653254</td>\n",
       "      <td>1.078653</td>\n",
       "      <td>0.251684</td>\n",
       "      <td>54.187167</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[I 2025-01-09 12:19:57,000] Trial 0 finished with value: 1.6532543369745685 and parameters: {'learning_rate': 1.9437267223645173e-05}. Best is trial 0 with value: 1.6532543369745685.\n",
      "Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='620' max='620' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [620/620 03:30, Epoch 5/5]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Mse</th>\n",
       "      <th>Rmse</th>\n",
       "      <th>Mae</th>\n",
       "      <th>R2</th>\n",
       "      <th>Smape</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.033500</td>\n",
       "      <td>3.730757</td>\n",
       "      <td>3.730757</td>\n",
       "      <td>1.931517</td>\n",
       "      <td>1.167591</td>\n",
       "      <td>-0.021416</td>\n",
       "      <td>46.438679</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>3.021100</td>\n",
       "      <td>3.532418</td>\n",
       "      <td>3.532420</td>\n",
       "      <td>1.879473</td>\n",
       "      <td>1.171051</td>\n",
       "      <td>0.032885</td>\n",
       "      <td>48.273236</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>32.454400</td>\n",
       "      <td>3.670944</td>\n",
       "      <td>3.670944</td>\n",
       "      <td>1.915971</td>\n",
       "      <td>1.159171</td>\n",
       "      <td>-0.005041</td>\n",
       "      <td>48.529482</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.074300</td>\n",
       "      <td>3.690546</td>\n",
       "      <td>3.690546</td>\n",
       "      <td>1.921079</td>\n",
       "      <td>1.179955</td>\n",
       "      <td>-0.010407</td>\n",
       "      <td>49.107727</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.098800</td>\n",
       "      <td>3.677439</td>\n",
       "      <td>3.677439</td>\n",
       "      <td>1.917665</td>\n",
       "      <td>1.188619</td>\n",
       "      <td>-0.006819</td>\n",
       "      <td>49.251461</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[I 2025-01-09 12:23:31,566] Trial 1 finished with value: 1.91766510403085 and parameters: {'learning_rate': 1.5810058165067856e-06}. Best is trial 0 with value: 1.6532543369745685.\n",
      "Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='620' max='620' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [620/620 03:28, Epoch 5/5]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Mse</th>\n",
       "      <th>Rmse</th>\n",
       "      <th>Mae</th>\n",
       "      <th>R2</th>\n",
       "      <th>Smape</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.311500</td>\n",
       "      <td>4.090590</td>\n",
       "      <td>4.090590</td>\n",
       "      <td>2.022521</td>\n",
       "      <td>1.229977</td>\n",
       "      <td>-0.119932</td>\n",
       "      <td>50.514507</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2.652800</td>\n",
       "      <td>4.852318</td>\n",
       "      <td>4.852319</td>\n",
       "      <td>2.202798</td>\n",
       "      <td>1.465739</td>\n",
       "      <td>-0.328480</td>\n",
       "      <td>54.715651</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>24.626400</td>\n",
       "      <td>3.331610</td>\n",
       "      <td>3.331610</td>\n",
       "      <td>1.825270</td>\n",
       "      <td>1.143937</td>\n",
       "      <td>0.087863</td>\n",
       "      <td>51.898420</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.289600</td>\n",
       "      <td>2.353773</td>\n",
       "      <td>2.353773</td>\n",
       "      <td>1.534201</td>\n",
       "      <td>1.079125</td>\n",
       "      <td>0.355578</td>\n",
       "      <td>55.779856</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.001400</td>\n",
       "      <td>2.629261</td>\n",
       "      <td>2.629261</td>\n",
       "      <td>1.621500</td>\n",
       "      <td>1.166006</td>\n",
       "      <td>0.280154</td>\n",
       "      <td>57.977718</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[I 2025-01-09 12:27:05,020] Trial 2 finished with value: 1.6214995462309338 and parameters: {'learning_rate': 2.479942619764035e-05}. Best is trial 2 with value: 1.6214995462309338.\n",
      "Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='620' max='620' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [620/620 03:25, Epoch 5/5]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Mse</th>\n",
       "      <th>Rmse</th>\n",
       "      <th>Mae</th>\n",
       "      <th>R2</th>\n",
       "      <th>Smape</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.008000</td>\n",
       "      <td>3.590378</td>\n",
       "      <td>3.590379</td>\n",
       "      <td>1.894829</td>\n",
       "      <td>1.149898</td>\n",
       "      <td>0.017017</td>\n",
       "      <td>46.445611</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2.704000</td>\n",
       "      <td>3.476464</td>\n",
       "      <td>3.476464</td>\n",
       "      <td>1.864528</td>\n",
       "      <td>1.125000</td>\n",
       "      <td>0.048205</td>\n",
       "      <td>47.319812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>32.099300</td>\n",
       "      <td>3.543669</td>\n",
       "      <td>3.543668</td>\n",
       "      <td>1.882463</td>\n",
       "      <td>1.123369</td>\n",
       "      <td>0.029805</td>\n",
       "      <td>47.717217</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.058200</td>\n",
       "      <td>3.590872</td>\n",
       "      <td>3.590872</td>\n",
       "      <td>1.894960</td>\n",
       "      <td>1.142273</td>\n",
       "      <td>0.016882</td>\n",
       "      <td>48.410091</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.084600</td>\n",
       "      <td>3.600572</td>\n",
       "      <td>3.600573</td>\n",
       "      <td>1.897517</td>\n",
       "      <td>1.145824</td>\n",
       "      <td>0.014226</td>\n",
       "      <td>48.548377</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[I 2025-01-09 12:30:33,965] Trial 3 finished with value: 1.8975174797770824 and parameters: {'learning_rate': 1.1750268648920993e-06}. Best is trial 2 with value: 1.6214995462309338.\n",
      "Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='620' max='620' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [620/620 03:27, Epoch 5/5]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Mse</th>\n",
       "      <th>Rmse</th>\n",
       "      <th>Mae</th>\n",
       "      <th>R2</th>\n",
       "      <th>Smape</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.085600</td>\n",
       "      <td>3.761341</td>\n",
       "      <td>3.761341</td>\n",
       "      <td>1.939418</td>\n",
       "      <td>1.156432</td>\n",
       "      <td>-0.029790</td>\n",
       "      <td>46.601269</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2.913400</td>\n",
       "      <td>3.756832</td>\n",
       "      <td>3.756831</td>\n",
       "      <td>1.938255</td>\n",
       "      <td>1.238454</td>\n",
       "      <td>-0.028555</td>\n",
       "      <td>49.874967</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>32.276600</td>\n",
       "      <td>3.654472</td>\n",
       "      <td>3.654473</td>\n",
       "      <td>1.911668</td>\n",
       "      <td>1.135091</td>\n",
       "      <td>-0.000531</td>\n",
       "      <td>48.732340</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.083000</td>\n",
       "      <td>3.665871</td>\n",
       "      <td>3.665871</td>\n",
       "      <td>1.914646</td>\n",
       "      <td>1.162767</td>\n",
       "      <td>-0.003652</td>\n",
       "      <td>49.439710</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.055800</td>\n",
       "      <td>3.610057</td>\n",
       "      <td>3.610057</td>\n",
       "      <td>1.900015</td>\n",
       "      <td>1.183222</td>\n",
       "      <td>0.011629</td>\n",
       "      <td>49.474382</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[I 2025-01-09 12:34:05,271] Trial 4 finished with value: 1.9000149676084739 and parameters: {'learning_rate': 2.308984942228097e-06}. Best is trial 2 with value: 1.6214995462309338.\n"
     ]
    }
   ],
   "source": [
    "best_trial = hp_trainer.hyperparameter_search(\n",
    "    direction=\"minimize\",\n",
    "    backend=\"optuna\",\n",
    "    hp_space=optuna_hp_space,\n",
    "    n_trials=5,\n",
    "    compute_objective=lambda x: x['eval_rmse'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "c9c39fa6-3f84-4082-879a-7efd5d21e174",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BestRun(run_id='2', objective=1.6214995462309338, hyperparameters={'learning_rate': 2.479942619764035e-05}, run_summary=None)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "best_trial"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f511f354-5c3e-4c62-8063-f769a6c1b9ca",
   "metadata": {},
   "source": [
    "### Fit and upload the best Model\n",
    "We re-fit the model with the best hyperparameters in accordaince with this [forum post](https://discuss.huggingface.co/t/how-to-save-the-best-trials-model-using-trainer-hyperparameter-search/8783/4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "ad4e4cf2-286e-4b4a-9c3b-2024be1769b8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "/var/home/robin/Development/modernbert-content-regression/.venv/lib/python3.12/site-packages/transformers/training_args.py:1573: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "best_trainer = Trainer(\n",
    "    model=model_init(None),\n",
    "    args=gen_training_args({**best_trial.hyperparameters}),\n",
    "    train_dataset=tokenized_dataset[\"train\"],\n",
    "    eval_dataset=tokenized_dataset[\"test\"],\n",
    "    compute_metrics=compute_metrics_for_regression,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "a031f566-4be1-440a-8481-f23e609ac3b3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='620' max='620' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [620/620 03:25, Epoch 5/5]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Mse</th>\n",
       "      <th>Rmse</th>\n",
       "      <th>Mae</th>\n",
       "      <th>R2</th>\n",
       "      <th>Smape</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.115200</td>\n",
       "      <td>4.084211</td>\n",
       "      <td>4.084211</td>\n",
       "      <td>2.020943</td>\n",
       "      <td>1.219903</td>\n",
       "      <td>-0.118186</td>\n",
       "      <td>49.023473</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1.239000</td>\n",
       "      <td>3.803578</td>\n",
       "      <td>3.803578</td>\n",
       "      <td>1.950276</td>\n",
       "      <td>1.289222</td>\n",
       "      <td>-0.041354</td>\n",
       "      <td>52.775413</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>27.825600</td>\n",
       "      <td>3.245966</td>\n",
       "      <td>3.245967</td>\n",
       "      <td>1.801657</td>\n",
       "      <td>1.102216</td>\n",
       "      <td>0.111311</td>\n",
       "      <td>51.747030</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.000100</td>\n",
       "      <td>2.413429</td>\n",
       "      <td>2.413429</td>\n",
       "      <td>1.553521</td>\n",
       "      <td>1.081085</td>\n",
       "      <td>0.339245</td>\n",
       "      <td>52.221513</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.166600</td>\n",
       "      <td>2.462405</td>\n",
       "      <td>2.462406</td>\n",
       "      <td>1.569205</td>\n",
       "      <td>1.182182</td>\n",
       "      <td>0.325836</td>\n",
       "      <td>56.614470</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=620, training_loss=4.329616037725622, metrics={'train_runtime': 205.4329, 'train_samples_per_second': 11.999, 'train_steps_per_second': 3.018, 'total_flos': 3359849068769280.0, 'train_loss': 4.329616037725622, 'epoch': 5.0})"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "best_trainer.train() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "09a0f8e2-7986-4171-902e-c08bcd5d6088",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='14' max='14' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [14/14 00:01]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 2.4624054431915283,\n",
       " 'eval_mse': 2.4624056816101074,\n",
       " 'eval_rmse': 1.5692054300218654,\n",
       " 'eval_mae': 1.182181715965271,\n",
       " 'eval_r2': 0.325836181640625,\n",
       " 'eval_smape': 56.61447048187256,\n",
       " 'eval_runtime': 1.3489,\n",
       " 'eval_samples_per_second': 40.774,\n",
       " 'eval_steps_per_second': 10.379,\n",
       " 'epoch': 5.0}"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "best_trainer.evaluate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "6360b8dd-c456-4c0f-a79f-b1fb7a99ad19",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "49576b7f5f6b4ea781dd6198df4f33f7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "events.out.tfevents.1736455080.bazzite:   0%|          | 0.00/40.0 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/Forecast-ing/modernBERT-content-regression/commit/16f1dc87782b2735f8fef84a5b10807b6cbe5565', commit_message='End of training', commit_description='', oid='16f1dc87782b2735f8fef84a5b10807b6cbe5565', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Forecast-ing/modernBERT-content-regression', endpoint='https://huggingface.co', repo_type='model', repo_id='Forecast-ing/modernBERT-content-regression'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.save_pretrained(\"modernBERT-content-regression\")\n",
    "best_trainer.create_model_card()\n",
    "best_trainer.push_to_hub()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c9971d5-5db6-48ac-be23-4131f75a961c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "083db2d4-2601-44ef-aa87-5f4a1d66f8e3",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9db9a38d-653d-4b71-8cc2-e0f65ae065bd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}