{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0c4ecb49-ce58-4b65-849a-760980576e48",
   "metadata": {},
   "source": [
    "# Poro34B Lora fine-tuning with S-Group's data - 1 Q/A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b686006-65a7-43af-8207-1c7309a5e423",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This script finetunes the Poro34B model with 1 Question and Answer pair"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "defcdb6f-3b69-4b03-b2dc-07c4b3027fd6",
   "metadata": {},
   "source": [
    "## Initialization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67f730e6-3467-4a19-ab76-e8baace8e02e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pip install peft, all other Python libraries are already in AWS image\n",
    "!pip install peft"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "80b24df2-140b-4792-aaf1-6f6aff92ece8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-02-29 15:06:36.945989: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import json\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer \n",
    "from transformers import TrainingArguments, Trainer\n",
    "from transformers import pipeline\n",
    "from peft import  get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit\n",
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d31adfc6-a460-419e-871b-d0437501b026",
   "metadata": {},
   "outputs": [],
   "source": [
    "# this checks wether we have GPU\n",
    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2c5a9b07-c92b-4d1d-b5b5-96e8c234e14f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cpu\n"
     ]
    }
   ],
   "source": [
    "print(device)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ea88a10-f5f1-4342-939b-60d2b9c5bb91",
   "metadata": {},
   "source": [
    "## Foundation model import"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2c0f7b3a-9d56-46ce-9dc8-5fe40b2628a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Foundation model\n",
    "model_name='LumiOpen/Poro-34B'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4e4c9089-a195-4fd7-91b2-6240cafb4989",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(model_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a42e0fb6-40d4-483b-a034-84ff351c021d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3a476b270f8d413c8d54e413fe791a82",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "branch = \"1000B\"\n",
    "model = AutoModelForCausalLM.from_pretrained(model_name,\n",
    "    torch_dtype=torch.bfloat16,\n",
    "    revision=branch,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "258df7e6-27c1-48a2-b20a-d377dc885884",
   "metadata": {},
   "source": [
    "## Setting up the Lora parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "63151e65-6bff-4b65-a8ae-af4d6c53036f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-02-29 15:28:54.008508: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "from peft import LoraConfig, get_peft_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "f35f934f-23c6-47db-95bb-df20526e29e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "config = LoraConfig(\n",
    "    r=8,\n",
    "    lora_alpha=8,\n",
    "    target_modules=[\"query_key_value\"],\n",
    "    lora_dropout=0.05,\n",
    "    bias=\"none\",\n",
    "    task_type=\"CAUSAL_LM\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "a62cb983-6e28-40f1-9f8d-64a1a6ccd0f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "peft_model = get_peft_model(model, config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "fe7e7078-2998-4f17-abda-f03a32e04735",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "trainable params: 12386304\n",
      "all params: 34229336064\n",
      "trainable: 0.04%\n"
     ]
    }
   ],
   "source": [
    "trainable_params = 0\n",
    "all_param = 0\n",
    "\n",
    "# iterating over all parameters\n",
    "for _, param in peft_model.named_parameters():\n",
    "    # adding parameters to total\n",
    "    all_param += param.numel()\n",
    "    # adding parameters to trainable if they require a graident\n",
    "    if param.requires_grad:\n",
    "        trainable_params += param.numel()\n",
    "\n",
    "# print number of trainable parameters\n",
    "print(f\"trainable params: {trainable_params}\")\n",
    "print(f\"all params: {all_param}\")\n",
    "print(f\"trainable: {100 * trainable_params / all_param:.2f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d029921d-60db-43ad-ae8b-2ab9910bd490",
   "metadata": {},
   "source": [
    "## Preparing the training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "216aa90b-a87d-4a37-b178-e7e83bf987ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# prepare the data for training\n",
    "def prepare_train_data(data):\n",
    "    text_input = data['text']\n",
    "    tokenized_input = tokenizer(text_input, return_tensors='pt', padding=True)\n",
    "    tokenized_input['labels'] = tokenized_input['input_ids']\n",
    "    return tokenized_input"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "5551fac2-9a26-4a86-b8b4-d2f572ecfaa9",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = load_dataset(\"json\", data_files=\"prompts_1.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "dfa8382a-3e66-4433-b6dd-97d59a7945f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = dataset['train'].map(prepare_train_data, batched=True, remove_columns=[\"text\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "319de4fe-f5e0-4ea6-9a61-6f18c241bd02",
   "metadata": {},
   "source": [
    "## Setting up the training parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "9bb42764-af93-463f-8cf6-68707f21151b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import DataCollatorForLanguageModeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "4b8a94c9-2648-4626-9238-4475645aa695",
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = Trainer(\n",
    "    model=peft_model,\n",
    "    train_dataset=train_dataset,\n",
    "    args=TrainingArguments(\n",
    "        per_device_train_batch_size=4,\n",
    "        gradient_accumulation_steps=4,\n",
    "        warmup_steps=20,\n",
    "        max_steps=20,\n",
    "        learning_rate=1e-3,\n",
    "        logging_steps=1,\n",
    "        output_dir='outputs',\n",
    "    ),\n",
    "    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "fbd62db8-65e3-4d05-b11e-179aaf8f0e65",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='20' max='20' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [20/20 27:40, Epoch 20/20]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.816400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.808600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.808600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.804700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.793000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>0.757800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>0.699200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>0.640600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>0.570300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>0.492200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11</td>\n",
       "      <td>0.392600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>12</td>\n",
       "      <td>0.291000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>13</td>\n",
       "      <td>0.196300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14</td>\n",
       "      <td>0.140600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>15</td>\n",
       "      <td>0.112800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>16</td>\n",
       "      <td>0.090300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>17</td>\n",
       "      <td>0.064500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>18</td>\n",
       "      <td>0.048800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>19</td>\n",
       "      <td>0.024900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>20</td>\n",
       "      <td>0.018900</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=20, training_loss=0.428607177734375, metrics={'train_runtime': 1747.2099, 'train_samples_per_second': 0.183, 'train_steps_per_second': 0.011, 'total_flos': 219858091622400.0, 'train_loss': 0.428607177734375, 'epoch': 20.0})"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1ed2cf09-3683-4016-88d9-9ada1ddb4345",
   "metadata": {},
   "source": [
    "## Saving the finetuned model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c37902bf-47e5-4f89-9128-a6b7d91cb437",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_id = \"Poro-34B-Lora-1\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "163b54c4-3027-4e0d-9d52-7e3d698020da",
   "metadata": {},
   "outputs": [],
   "source": [
    "peft_model.save_pretrained(model_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec432db5-4f0c-43c7-b4e4-ef087f057bd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -lh {model_id}  # Lora parameters file size"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "11460d4e-3e11-4fdb-b134-61b45bb84018",
   "metadata": {},
   "source": [
    "## Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "eb6a1213-a7ab-4bb5-8ffc-0e2666286dc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_output(model, inputs, max_new_tokens=100):\n",
    "    outputs = model.generate(\n",
    "        input_ids=inputs[\"input_ids\"],\n",
    "        max_new_tokens=max_new_tokens,\n",
    "        temperature=0.1,\n",
    "    )\n",
    "    return outputs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a844312-2a1e-4c76-9078-96506b252522",
   "metadata": {},
   "source": [
    "### Original model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d38bbed0-e938-43ef-b816-b5e0f9d066fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Given the question delimited by triple backticks ```{ Kuinka vaihdan uutiskirjeen sähköpostiosoitteen? }```, what is the answer? Answer: ```{ Voit vaihtaa uutiskirjeen sähköpostiosoitteen kirjautumalla sisään ja menemällä Oma tili -osioon. }```\\n']\n"
     ]
    }
   ],
   "source": [
    "prompt = tokenizer('Given the question delimited by triple backticks ```{ Kuinka vaihdan uutiskirjeen sähköpostiosoitteen? }```, what is the answer? Answer:', return_tensors=\"pt\")\n",
    "result = generate_output(model,prompt)\n",
    "print(tokenizer.batch_decode(result, skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae3c3d6a-2b07-4e46-9ddc-dccadfd07196",
   "metadata": {},
   "source": [
    "### Finetuned model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4cf53f39-ad3f-43e2-8daa-79853b054cd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from peft import PeftModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "142bf57d-cffc-47b2-ae91-8a5420c46d32",
   "metadata": {},
   "outputs": [],
   "source": [
    "loaded_model = PeftModel.from_pretrained(model,model_id,is_trainable=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c3cacd26-edff-494c-9428-55b7659988de",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Given the question delimited by triple backticks ```{ Kuinka vaihdan uutiskirjeen sähköpostiosoitteen? }```, what is the answer? Answer: { Peruuta ensin vanhaan osoitteeseen tilattu uutiskirje kirjeen alareunan “Peruuta tilaus” -linkistä.\\nTilaa uutiskirje uudelleen oikeaan osoitteeseen. }.\\nKuinka vaihdan uutiskirjeen sähköpostiosoitteen?\\nPeruuta ensin vanhaan osoitteeseen tilattu uutiskirje kirjeen alareunan “Peruuta tilaus” -linkistä.\\nTilaa uutiskirje uudelleen oikeaan osoitteeseen.\\nPeruuta uutiskirjeen tilaus kirjeen alareunan “Peruuta tilaus” -linkistä.\\nTilaa uutiskirje uudelleen oikeaan osoitteeseen.\\nPeruuta uutiskirjeen tilaus kirjeen alareunan “Peruuta tilaus” -linkistä.\\nTilaa uutiskirje']\n"
     ]
    }
   ],
   "source": [
    "prompt = tokenizer('Given the question delimited by triple backticks ```{ Kuinka vaihdan uutiskirjeen sähköpostiosoitteen? }```, what is the answer? Answer:', return_tensors=\"pt\")\n",
    "result = generate_output(loaded_model,prompt)\n",
    "print(tokenizer.batch_decode(result, skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "166c476c-01a2-49cc-b03f-6cb1d9ae6136",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}