{ "cells": [ { "cell_type": "markdown", "id": "983e6b42-12bb-4513-af47-c8c4e34e3177", "metadata": {}, "source": [ "# Poro34B Lora fine-tuning with S-Group's data - 2 Q/A" ] }, { "cell_type": "code", "execution_count": null, "id": "e5f5a80c-0501-41a1-80a9-fb5792b45fea", "metadata": {}, "outputs": [], "source": [ "# This script finetunes the Poro34B model with 1 Question and Answer pair" ] }, { "cell_type": "markdown", "id": "6441fdf4-ed64-447a-b2c6-542738dc2658", "metadata": {}, "source": [ "## Initialization" ] }, { "cell_type": "code", "execution_count": 1, "id": "67f730e6-3467-4a19-ab76-e8baace8e02e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: peft in /opt/conda/lib/python3.10/site-packages (0.9.0)\n", "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from peft) (1.26.3)\n", "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from peft) (23.2)\n", "Requirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from peft) (5.9.8)\n", "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from peft) (6.0.1)\n", "Requirement already satisfied: torch>=1.13.0 in /opt/conda/lib/python3.10/site-packages (from peft) (2.0.0.post101)\n", "Requirement already satisfied: transformers in /opt/conda/lib/python3.10/site-packages (from peft) (4.31.0)\n", "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from peft) (4.66.1)\n", "Requirement already satisfied: accelerate>=0.21.0 in /opt/conda/lib/python3.10/site-packages (from peft) (0.21.0)\n", "Requirement already satisfied: safetensors in /opt/conda/lib/python3.10/site-packages (from peft) (0.3.3)\n", "Requirement already satisfied: huggingface-hub>=0.17.0 in /opt/conda/lib/python3.10/site-packages (from peft) (0.20.2)\n", "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft) (3.13.1)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft) (2023.6.0)\n", "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft) (2.31.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft) (4.5.0)\n", "Requirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft) (1.12)\n", "Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft) (3.2.1)\n", "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft) (3.1.3)\n", "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers->peft) (2023.12.25)\n", "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /opt/conda/lib/python3.10/site-packages (from transformers->peft) (0.13.3)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.13.0->peft) (2.1.4)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft) (3.6)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft) (1.26.18)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft) (2023.11.17)\n", "Requirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.13.0->peft) (1.3.0)\n" ] } ], "source": [ "!pip install peft" ] }, { "cell_type": "code", "execution_count": 2, "id": "80b24df2-140b-4792-aaf1-6f6aff92ece8", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-02-29 16:09:52.067525: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] } ], "source": [ "import torch\n", "import json\n", "from transformers import AutoModelForCausalLM, AutoTokenizer \n", "from transformers import TrainingArguments, Trainer\n", "from transformers import pipeline\n", "from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit\n", "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": 3, "id": "d31adfc6-a460-419e-871b-d0437501b026", "metadata": {}, "outputs": [], "source": [ "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "2c5a9b07-c92b-4d1d-b5b5-96e8c234e14f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cpu\n" ] } ], "source": [ "print(device)" ] }, { "cell_type": "markdown", "id": "4b1ec235-cdd1-4b68-b9b0-fe214cbeb2be", "metadata": {}, "source": [ "## Foundation model import" ] }, { "cell_type": "code", "execution_count": 3, "id": "2c0f7b3a-9d56-46ce-9dc8-5fe40b2628a6", "metadata": {}, "outputs": [], "source": [ "model_name='LumiOpen/Poro-34B'" ] }, { "cell_type": "code", "execution_count": 4, "id": "4e4c9089-a195-4fd7-91b2-6240cafb4989", "metadata": {}, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained(model_name)" ] }, { "cell_type": "code", "execution_count": null, "id": "a42e0fb6-40d4-483b-a034-84ff351c021d", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2784e3e4025e44d4aa5edb0aa58a2aaa", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/14 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Tracking run with wandb version 0.16.2" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run data is saved locally in /home/sagemaker-user/wandb/run-20240229_153102-95aeur18" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Syncing run brisk-fog-8 to Weights & Biases (docs)
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View project at https://wandb.ai/timo-au-laine/huggingface" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View run at https://wandb.ai/timo-au-laine/huggingface/runs/95aeur18" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [20/20 29:32, Epoch 20/20]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
10.687500
20.683600
30.683600
40.683600
50.683600
60.671900
70.656200
80.617200
90.585900
100.543000
110.488300
120.423800
130.357400
140.281200
150.204100
160.137700
170.088400
180.058100
190.043500
200.027600

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=20, training_loss=0.43031005859375, metrics={'train_runtime': 1884.4748, 'train_samples_per_second': 0.17, 'train_steps_per_second': 0.011, 'total_flos': 1415086626078720.0, 'train_loss': 0.43031005859375, 'epoch': 20.0})" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer.train()" ] }, { "cell_type": "markdown", "id": "efc3ef10-e402-481a-8abd-4174444d668a", "metadata": {}, "source": [ "## Saving the finetuned model" ] }, { "cell_type": "code", "execution_count": 18, "id": "c37902bf-47e5-4f89-9128-a6b7d91cb437", "metadata": {}, "outputs": [], "source": [ "model_id2 = \"Poro-34B-Lora-2\"" ] }, { "cell_type": "code", "execution_count": 19, "id": "163b54c4-3027-4e0d-9d52-7e3d698020da", "metadata": {}, "outputs": [], "source": [ "peft_model.save_pretrained(model_id2)" ] }, { "cell_type": "code", "execution_count": null, "id": "ec432db5-4f0c-43c7-b4e4-ef087f057bd0", "metadata": {}, "outputs": [], "source": [ "!ls -lh {model_id2}" ] }, { "cell_type": "markdown", "id": "059b6ee5-bfc0-4dcf-901c-ef869bedbb90", "metadata": {}, "source": [ "## Testing" ] }, { "cell_type": "code", "execution_count": null, "id": "eb6a1213-a7ab-4bb5-8ffc-0e2666286dc6", "metadata": {}, "outputs": [], "source": [ "def generate_output(model, inputs, max_new_tokens=100):\n", " outputs = model.generate(\n", " input_ids=inputs[\"input_ids\"],\n", " max_new_tokens=max_new_tokens,\n", " temperature=0.1,\n", " )\n", " return outputs" ] }, { "cell_type": "markdown", "id": "619f7e85-3310-40d1-8706-1289676b98ca", "metadata": {}, "source": [ "### Original model" ] }, { "cell_type": "code", "execution_count": null, "id": "d38bbed0-e938-43ef-b816-b5e0f9d066fd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Given the question delimited by triple backticks ```{ Kuinka vaihdan uutiskirjeen sähköpostiosoitteen? }```, what is the answer? Answer: ```{ Voit vaihtaa uutiskirjeen sähköpostiosoitteen kirjautumalla sisään ja menemällä Oma tili -osioon. }```\\n']\n" ] } ], "source": [ "prompt = tokenizer('Given the question delimited by triple backticks ```{ Kuinka vaihdan uutiskirjeen sähköpostiosoitteen? }```, what is the answer? Answer:', return_tensors=\"pt\")\n", "result = generate_output(model,prompt)\n", "print(tokenizer.batch_decode(result, skip_special_tokens=True))" ] }, { "cell_type": "code", "execution_count": null, "id": "6d354261-2dbb-4ecd-a75e-3af454c3c051", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Given the question delimited by triple backticks ```{ Miksi sähköpostiosoite tulee vahvistaa? }```, what is the answer? Answer: Because the email address needs to be confirmed.']\n" ] } ], "source": [ "prompt = tokenizer('Given the question delimited by triple backticks ```{ Miksi sähköpostiosoite tulee vahvistaa? }```, what is the answer? Answer:', return_tensors=\"pt\")\n", "result = generate_output(model,prompt)\n", "print(tokenizer.batch_decode(result, skip_special_tokens=True))" ] }, { "cell_type": "markdown", "id": "3afd691c-b853-41e1-abc5-3e462dd8f9a0", "metadata": {}, "source": [ "### Finetuned model" ] }, { "cell_type": "code", "execution_count": 23, "id": "4cf53f39-ad3f-43e2-8daa-79853b054cd2", "metadata": {}, "outputs": [], "source": [ "from peft import PeftModel" ] }, { "cell_type": "code", "execution_count": 24, "id": "142bf57d-cffc-47b2-ae91-8a5420c46d32", "metadata": {}, "outputs": [], "source": [ "loaded_model = PeftModel.from_pretrained(model,model_id2,is_trainable=False)" ] }, { "cell_type": "code", "execution_count": 25, "id": "c3cacd26-edff-494c-9428-55b7659988de", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Given the question delimited by triple backticks ```{ Kuinka vaihdan uutiskirjeen sähköpostiosoitteen? }```, what is the answer? Answer: { Peruuta ensin vanhaan osoitteeseen tilattu uutiskirje kirjeen alareunan “Peruuta tilaus” -linkistä.\\nTilaa uutiskirje uudelleen oikeaan osoitteeseen. }\\n\\n### Answering a Given Question\\n\\nGiven a question delimited by triple backticks ```{ Kuinka vaihdan uutiskirjeen sähköpostiosoitteen? }```, what is the answer? Answer: { Peruuta ensin vanhaan osoitteeseen tilattu uutiskirje kirjeen alareunan “Peruuta tilaus” -linkistä.\\nTilaa uutiskirje uudelleen oikeaan osoitteeseen. }\\n\\n## Data Sources\\n\\nThe data for currently supported']\n" ] } ], "source": [ "prompt = tokenizer('Given the question delimited by triple backticks ```{ Kuinka vaihdan uutiskirjeen sähköpostiosoitteen? }```, what is the answer? Answer:', return_tensors=\"pt\")\n", "result = generate_output(loaded_model,prompt)\n", "print(tokenizer.batch_decode(result, skip_special_tokens=True))" ] }, { "cell_type": "code", "execution_count": 26, "id": "b1ad5b3e-8f22-4763-b3e1-91b604731048", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Given the question delimited by triple backticks ```{ Miksi sähköpostiosoite tulee vahvistaa? }```, what is the answer? Answer: {Sähköpostiosoitteiden vahvistaminen on yleisesti käytössä oleva tapa varmistua siitä, että henkilöllä itsellään on pääsy hänen tiedoissaan olevaan sähköpostiosoitteeseen.\\n\\nSähköpostiosoite tulee vahvistaa itse, joko S-mobiilissa tai samalla kun luot itsellesi S-käyttäjätilin. Kun lähetät vahvistusviestin omissa tiedoissasi näkyvään sähköpostiosoitteeseen ja vahvistat itse osoitteen oikeaksi sähköpostiisi lähetetyllä vahvistuskoodilla, saamme varmistuksen, että osoitteesi on voimassa ja kuuluu juuri sinulle.\\n\\nJos asiakastiedoissasi olevasta sähköpostiosoitteesta puuttuu vielä vahvistus, näkyy']\n" ] } ], "source": [ "prompt = tokenizer('Given the question delimited by triple backticks ```{ Miksi sähköpostiosoite tulee vahvistaa? }```, what is the answer? Answer:', return_tensors=\"pt\")\n", "result = generate_output(loaded_model,prompt)\n", "print(tokenizer.batch_decode(result, skip_special_tokens=True))" ] }, { "cell_type": "code", "execution_count": null, "id": "166c476c-01a2-49cc-b03f-6cb1d9ae6136", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "09523d41-51e5-4cc6-b68b-c848063e5095", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }