{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Connection closed.\n",
      "Connected. Call `.close()` to terminate connection gracefully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1160344\n",
      "Connected. Call `.close()` to terminate connection gracefully.\n",
      "Connected. Call `.close()` to terminate connection gracefully.\n"
     ]
    }
   ],
   "source": [
    "import datetime\n",
    "import pandas as pd\n",
    "from xgboost import XGBRegressor\n",
    "import hopsworks\n",
    "import json\n",
    "from functions import util\n",
    "import os\n",
    "\n",
    "# Set up\n",
    "\n",
    "api_key = os.getenv('HOPSWORKS_API_KEY')\n",
    "project_name = os.getenv('HOPSWORKS_PROJECT')\n",
    "\n",
    "project = hopsworks.login(project=project_name, api_key_value=api_key)\n",
    "fs = project.get_feature_store() \n",
    "secrets = util.secrets_api(project.name)\n",
    "location_str = secrets.get_secret(\"SENSOR_LOCATION_JSON\").value\n",
    "location = json.loads(location_str)\n",
    "country=location['country']\n",
    "city=location['city']\n",
    "street=location['street']\n",
    "\n",
    "AQI_API_KEY = secrets.get_secret(\"AQI_API_KEY\").value\n",
    "location_str = secrets.get_secret(\"SENSOR_LOCATION_JSON\").value\n",
    "location = json.loads(location_str)\n",
    "\n",
    "today = datetime.datetime.now() - datetime.timedelta(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Connected. Call `.close()` to terminate connection gracefully.\n",
      "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.02s) \n"
     ]
    }
   ],
   "source": [
    "### Retreive model\n",
    "\n",
    "mr = project.get_model_registry()\n",
    "\n",
    "retrieved_model = mr.get_model(\n",
    "    name=\"air_quality_xgboost_model\",\n",
    "    version=1,\n",
    ")\n",
    "\n",
    "saved_model_dir = retrieved_model.download()\n",
    "retrieved_xgboost_model = XGBRegressor()\n",
    "retrieved_xgboost_model.load_model(saved_model_dir + \"/model.json\")\n",
    "\n",
    "### Retrieve features \n",
    "\n",
    "weather_fg = fs.get_feature_group(\n",
    "    name='weather',\n",
    "    version=1,\n",
    ")\n",
    "\n",
    "today_timestamp = pd.to_datetime(today)\n",
    "batch_data = weather_fg.filter(weather_fg.date >= today_timestamp ).read().sort_values(by=['date'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>temperature_2m_mean</th>\n",
       "      <th>precipitation_sum</th>\n",
       "      <th>wind_speed_10m_max</th>\n",
       "      <th>wind_direction_10m_dominant</th>\n",
       "      <th>city</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2024-11-21 00:00:00+00:00</td>\n",
       "      <td>21.700001</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.138420</td>\n",
       "      <td>71.564964</td>\n",
       "      <td>lahore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2024-11-22 00:00:00+00:00</td>\n",
       "      <td>21.850000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.610250</td>\n",
       "      <td>128.659836</td>\n",
       "      <td>lahore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2024-11-23 00:00:00+00:00</td>\n",
       "      <td>22.250000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.091168</td>\n",
       "      <td>44.999897</td>\n",
       "      <td>lahore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2024-11-24 00:00:00+00:00</td>\n",
       "      <td>21.400000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.334974</td>\n",
       "      <td>318.366547</td>\n",
       "      <td>lahore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2024-11-25 00:00:00+00:00</td>\n",
       "      <td>20.750000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>6.439876</td>\n",
       "      <td>296.564972</td>\n",
       "      <td>lahore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2024-11-26 00:00:00+00:00</td>\n",
       "      <td>20.750000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.680000</td>\n",
       "      <td>270.000000</td>\n",
       "      <td>lahore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2024-11-27 00:00:00+00:00</td>\n",
       "      <td>20.350000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.104631</td>\n",
       "      <td>37.875053</td>\n",
       "      <td>lahore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2024-11-28 00:00:00+00:00</td>\n",
       "      <td>19.799999</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.189795</td>\n",
       "      <td>9.462248</td>\n",
       "      <td>lahore</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       date  temperature_2m_mean  precipitation_sum  \\\n",
       "1 2024-11-21 00:00:00+00:00            21.700001                0.0   \n",
       "4 2024-11-22 00:00:00+00:00            21.850000                0.0   \n",
       "7 2024-11-23 00:00:00+00:00            22.250000                0.0   \n",
       "6 2024-11-24 00:00:00+00:00            21.400000                0.0   \n",
       "5 2024-11-25 00:00:00+00:00            20.750000                0.0   \n",
       "2 2024-11-26 00:00:00+00:00            20.750000                0.0   \n",
       "0 2024-11-27 00:00:00+00:00            20.350000                0.0   \n",
       "3 2024-11-28 00:00:00+00:00            19.799999                0.0   \n",
       "\n",
       "   wind_speed_10m_max  wind_direction_10m_dominant    city  \n",
       "1            1.138420                    71.564964  lahore  \n",
       "4            4.610250                   128.659836  lahore  \n",
       "7            5.091168                    44.999897  lahore  \n",
       "6            4.334974                   318.366547  lahore  \n",
       "5            6.439876                   296.564972  lahore  \n",
       "2            4.680000                   270.000000  lahore  \n",
       "0            4.104631                    37.875053  lahore  \n",
       "3            2.189795                     9.462248  lahore  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "batch_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "feature_names mismatch: ['past_air_quality', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant'] ['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']\nexpected past_air_quality in input data",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[6], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m### Predict and upload\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m batch_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpredicted_pm25\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mretrieved_xgboost_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      4\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbatch_data\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtemperature_2m_mean\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mprecipitation_sum\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mwind_speed_10m_max\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mwind_direction_10m_dominant\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m      6\u001b[0m batch_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstreet\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m street\n\u001b[1;32m      7\u001b[0m batch_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcity\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m city\n",
      "File \u001b[0;32m~/Documents/scalable-ml/lab1-new/hbg-weather/.venv/lib/python3.12/site-packages/xgboost/sklearn.py:1168\u001b[0m, in \u001b[0;36mXGBModel.predict\u001b[0;34m(self, X, output_margin, validate_features, base_margin, iteration_range)\u001b[0m\n\u001b[1;32m   1166\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_can_use_inplace_predict():\n\u001b[1;32m   1167\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1168\u001b[0m         predts \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_booster\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minplace_predict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1169\u001b[0m \u001b[43m            \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1170\u001b[0m \u001b[43m            \u001b[49m\u001b[43miteration_range\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43miteration_range\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1171\u001b[0m \u001b[43m            \u001b[49m\u001b[43mpredict_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmargin\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43moutput_margin\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mvalue\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1172\u001b[0m \u001b[43m            \u001b[49m\u001b[43mmissing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmissing\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1173\u001b[0m \u001b[43m            \u001b[49m\u001b[43mbase_margin\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbase_margin\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1174\u001b[0m \u001b[43m            \u001b[49m\u001b[43mvalidate_features\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalidate_features\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1175\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1176\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m _is_cupy_array(predts):\n\u001b[1;32m   1177\u001b[0m             \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mcupy\u001b[39;00m  \u001b[38;5;66;03m# pylint: disable=import-error\u001b[39;00m\n",
      "File \u001b[0;32m~/Documents/scalable-ml/lab1-new/hbg-weather/.venv/lib/python3.12/site-packages/xgboost/core.py:2418\u001b[0m, in \u001b[0;36mBooster.inplace_predict\u001b[0;34m(self, data, iteration_range, predict_type, missing, validate_features, base_margin, strict_shape)\u001b[0m\n\u001b[1;32m   2416\u001b[0m     data, fns, _ \u001b[38;5;241m=\u001b[39m _transform_pandas_df(data, enable_categorical)\n\u001b[1;32m   2417\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m validate_features:\n\u001b[0;32m-> 2418\u001b[0m         \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_features\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2419\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _is_list(data) \u001b[38;5;129;01mor\u001b[39;00m _is_tuple(data):\n\u001b[1;32m   2420\u001b[0m     data \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray(data)\n",
      "File \u001b[0;32m~/Documents/scalable-ml/lab1-new/hbg-weather/.venv/lib/python3.12/site-packages/xgboost/core.py:2970\u001b[0m, in \u001b[0;36mBooster._validate_features\u001b[0;34m(self, feature_names)\u001b[0m\n\u001b[1;32m   2964\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m my_missing:\n\u001b[1;32m   2965\u001b[0m     msg \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m   2966\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mtraining data did not have the following fields: \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   2967\u001b[0m         \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mstr\u001b[39m(s) \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m my_missing)\n\u001b[1;32m   2968\u001b[0m     )\n\u001b[0;32m-> 2970\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeature_names, feature_names))\n",
      "\u001b[0;31mValueError\u001b[0m: feature_names mismatch: ['past_air_quality', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant'] ['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']\nexpected past_air_quality in input data"
     ]
    }
   ],
   "source": [
    "### Predict and upload\n",
    "\n",
    "batch_data['predicted_pm25'] = retrieved_xgboost_model.predict(\n",
    "    batch_data[['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']])\n",
    "\n",
    "batch_data['street'] = street\n",
    "batch_data['city'] = city\n",
    "batch_data['country'] = country\n",
    "# Fill in the number of days before the date on which you made the forecast (base_date)\n",
    "batch_data['days_before_forecast_day'] = range(1, len(batch_data)+1)\n",
    "batch_data = batch_data.sort_values(by=['date'])\n",
    "#batch_data['date'] = batch_data['date'].dt.tz_convert(None).astype('datetime64[ns]')\n",
    "\n",
    "plt = util.plot_air_quality_forecast(city, street, batch_data, file_path=\"./img/pm25_forecast.png\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batch data:                        date  temperature_2m_mean  precipitation_sum  \\\n",
      "0 2024-11-21 00:00:00+00:00                 3.40                0.2   \n",
      "3 2024-11-22 00:00:00+00:00                 4.05                0.7   \n",
      "2 2024-11-23 00:00:00+00:00                 5.45                0.0   \n",
      "1 2024-11-24 00:00:00+00:00                 5.60                0.0   \n",
      "\n",
      "   wind_speed_10m_max  wind_direction_10m_dominant         city  \\\n",
      "0           19.995398                   246.665939  Helsingborg   \n",
      "3           23.540806                   246.571289  Helsingborg   \n",
      "2           30.631746                   240.422256  Helsingborg   \n",
      "1           13.755580                   276.008911  Helsingborg   \n",
      "\n",
      "   predicted_pm25          street country  days_before_forecast_day  \n",
      "0       39.168438  Drottninggatan  Sweden                         1  \n",
      "3       20.740093  Drottninggatan  Sweden                         2  \n",
      "2       46.448105  Drottninggatan  Sweden                         3  \n",
      "1       61.713448  Drottninggatan  Sweden                         4  \n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0c3e8fd8c8f545a597e504acf5f077e8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading Dataframe: 0.00% |          | Rows 0/4 | Elapsed Time: 00:00 | Remaining Time: ?"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Launching job: aq_predictions_1_offline_fg_materialization\n",
      "Job started successfully, you can follow the progress at \n",
      "https://c.app.hopsworks.ai/p/1160340/jobs/named/aq_predictions_1_offline_fg_materialization/executions\n",
      "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.95s) \n",
      "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.85s) \n"
     ]
    }
   ],
   "source": [
    "monitor_fg = fs.get_or_create_feature_group(\n",
    "    name='aq_predictions',\n",
    "    description='Air Quality prediction monitoring',\n",
    "    version=1,\n",
    "    primary_key=['city','street','date','days_before_forecast_day'],\n",
    "    event_time=\"date\"\n",
    ")\n",
    "\n",
    "print(f\"Batch data: {batch_data}\")\n",
    "\n",
    "monitor_fg.insert(batch_data, write_options={\"wait_for_job\": True})\n",
    "monitoring_df = monitor_fg.filter(monitor_fg.days_before_forecast_day == 1).read()\n",
    "\n",
    "# Hindcast monitoring\n",
    "\n",
    "air_quality_fg = fs.get_feature_group(\n",
    "    name='air_quality',\n",
    "    version=1,\n",
    ")\n",
    "air_quality_df = air_quality_fg.read()\n",
    "\n",
    "outcome_df = air_quality_df[['date', 'pm25']]\n",
    "preds_df =  monitoring_df[['date', 'predicted_pm25']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "You are trying to merge on datetime64[us, UTC] and object columns for key 'date'. If you wish to proceed you should use pd.concat",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m hindcast_df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmerge\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpreds_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutcome_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mon\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdate\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m      2\u001b[0m hindcast_df \u001b[38;5;241m=\u001b[39m hindcast_df\u001b[38;5;241m.\u001b[39msort_values(by\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdate\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m      4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(hindcast_df) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
      "File \u001b[0;32m~/Documents/scalable-ml/lab1-new/hbg-weather/.venv/lib/python3.12/site-packages/pandas/core/reshape/merge.py:169\u001b[0m, in \u001b[0;36mmerge\u001b[0;34m(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001b[0m\n\u001b[1;32m    154\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m _cross_merge(\n\u001b[1;32m    155\u001b[0m         left_df,\n\u001b[1;32m    156\u001b[0m         right_df,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    166\u001b[0m         copy\u001b[38;5;241m=\u001b[39mcopy,\n\u001b[1;32m    167\u001b[0m     )\n\u001b[1;32m    168\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 169\u001b[0m     op \u001b[38;5;241m=\u001b[39m \u001b[43m_MergeOperation\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    170\u001b[0m \u001b[43m        \u001b[49m\u001b[43mleft_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    171\u001b[0m \u001b[43m        \u001b[49m\u001b[43mright_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    172\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhow\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    173\u001b[0m \u001b[43m        \u001b[49m\u001b[43mon\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    174\u001b[0m \u001b[43m        \u001b[49m\u001b[43mleft_on\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mleft_on\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    175\u001b[0m \u001b[43m        \u001b[49m\u001b[43mright_on\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mright_on\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    176\u001b[0m \u001b[43m        \u001b[49m\u001b[43mleft_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mleft_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    177\u001b[0m \u001b[43m        \u001b[49m\u001b[43mright_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mright_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    178\u001b[0m \u001b[43m        \u001b[49m\u001b[43msort\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    179\u001b[0m \u001b[43m        \u001b[49m\u001b[43msuffixes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msuffixes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    180\u001b[0m \u001b[43m        \u001b[49m\u001b[43mindicator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindicator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    181\u001b[0m \u001b[43m        \u001b[49m\u001b[43mvalidate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalidate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    182\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    183\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m op\u001b[38;5;241m.\u001b[39mget_result(copy\u001b[38;5;241m=\u001b[39mcopy)\n",
      "File \u001b[0;32m~/Documents/scalable-ml/lab1-new/hbg-weather/.venv/lib/python3.12/site-packages/pandas/core/reshape/merge.py:804\u001b[0m, in \u001b[0;36m_MergeOperation.__init__\u001b[0;34m(self, left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, indicator, validate)\u001b[0m\n\u001b[1;32m    800\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_tolerance(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mleft_join_keys)\n\u001b[1;32m    802\u001b[0m \u001b[38;5;66;03m# validate the merge keys dtypes. We may need to coerce\u001b[39;00m\n\u001b[1;32m    803\u001b[0m \u001b[38;5;66;03m# to avoid incompatible dtypes\u001b[39;00m\n\u001b[0;32m--> 804\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_maybe_coerce_merge_keys\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    806\u001b[0m \u001b[38;5;66;03m# If argument passed to validate,\u001b[39;00m\n\u001b[1;32m    807\u001b[0m \u001b[38;5;66;03m# check if columns specified as unique\u001b[39;00m\n\u001b[1;32m    808\u001b[0m \u001b[38;5;66;03m# are in fact unique.\u001b[39;00m\n\u001b[1;32m    809\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m validate \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[0;32m~/Documents/scalable-ml/lab1-new/hbg-weather/.venv/lib/python3.12/site-packages/pandas/core/reshape/merge.py:1483\u001b[0m, in \u001b[0;36m_MergeOperation._maybe_coerce_merge_keys\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1481\u001b[0m \u001b[38;5;66;03m# datetimelikes must match exactly\u001b[39;00m\n\u001b[1;32m   1482\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m needs_i8_conversion(lk\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m needs_i8_conversion(rk\u001b[38;5;241m.\u001b[39mdtype):\n\u001b[0;32m-> 1483\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg)\n\u001b[1;32m   1484\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m needs_i8_conversion(lk\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m needs_i8_conversion(rk\u001b[38;5;241m.\u001b[39mdtype):\n\u001b[1;32m   1485\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg)\n",
      "\u001b[0;31mValueError\u001b[0m: You are trying to merge on datetime64[us, UTC] and object columns for key 'date'. If you wish to proceed you should use pd.concat"
     ]
    }
   ],
   "source": [
    "hindcast_df = pd.merge(preds_df, outcome_df, on=\"date\")\n",
    "hindcast_df = hindcast_df.sort_values(by=['date'])\n",
    "\n",
    "if len(hindcast_df) == 0:\n",
    "    hindcast_df = util.backfill_predictions_for_monitoring(weather_fg, air_quality_df, monitor_fg, retrieved_xgboost_model)\n",
    "\n",
    "plt = util.plot_air_quality_forecast(city, street, hindcast_df, file_path=\"./img/pm25_hindcast_1day.png\", hindcast=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2024-11-20 14:23:12,559 WARNING: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "scale = 5\n",
    "outcome_df['predicted_pm25'] = outcome_df['pm25'] + scale * np.random.uniform(-1, 1, outcome_df.shape[0])\n",
    "outcome_df.sort_values(by=['date'])\n",
    "outcome_df.to_pickle('outcome_df.pkl')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}