{ "cells": [
 { "cell_type": "markdown", "id": "0c4ecb49-ce58-4b65-849a-760980576e48", "metadata": {}, "source": [
   "# Poro-34B LoRA fine-tuning with S-Group's data - 1 Q/A" ] },
 { "cell_type": "code", "execution_count": null, "id": "5b686006-65a7-43af-8207-1c7309a5e423", "metadata": {}, "outputs": [], "source": [
   "# This notebook fine-tunes the Poro-34B model with LoRA on 1 question and answer pair" ] },
 { "cell_type": "markdown", "id": "defcdb6f-3b69-4b03-b2dc-07c4b3027fd6", "metadata": {}, "source": [
   "## Initialization" ] },
 { "cell_type": "code", "execution_count": null, "id": "67f730e6-3467-4a19-ab76-e8baace8e02e", "metadata": {}, "outputs": [], "source": [
   "# Install peft; all other Python libraries are already in the AWS image.\n",
   "# %pip (instead of !pip) installs into the environment of the running kernel.\n",
   "%pip install peft" ] },
 { "cell_type": "code", "execution_count": 2, "id": "80b24df2-140b-4792-aaf1-6f6aff92ece8", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
   "2024-02-29 15:06:36.945989: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
   "To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] } ], "source": [
   "# All imports used by the notebook: standard library first, then third-party\n",
   "import json\n",
   "\n",
   "import torch\n",
   "from datasets import load_dataset\n",
   "from peft import (\n",
   "    LoraConfig,\n",
   "    PromptTuningConfig,\n",
   "    PromptTuningInit,\n",
   "    TaskType,\n",
   "    get_peft_model,\n",
   ")\n",
   "from transformers import (\n",
   "    AutoModelForCausalLM,\n",
   "    AutoTokenizer,\n",
   "    DataCollatorForLanguageModeling,\n",
   "    Trainer,\n",
   "    TrainingArguments,\n",
   "    pipeline,\n",
   ")" ] },
 { "cell_type": "code", "execution_count": 3, "id": "d31adfc6-a460-419e-871b-d0437501b026", "metadata": {}, "outputs": [], "source": [
   "# Use the GPU when one is available, otherwise fall back to the CPU\n",
   "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" ] },
 { "cell_type": "code", "execution_count": 4, "id": "2c5a9b07-c92b-4d1d-b5b5-96e8c234e14f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cpu\n" ] } ], "source": [
   "print(device)" ] },
 { "cell_type": "markdown", "id": "6ea88a10-f5f1-4342-939b-60d2b9c5bb91", "metadata": {}, "source": [
   "## Foundation model import" ] },
 { "cell_type": "code", "execution_count": 5, "id": "2c0f7b3a-9d56-46ce-9dc8-5fe40b2628a6", "metadata": {}, "outputs": [], "source": [
   "# Identifier of the foundation model passed to from_pretrained\n",
   "model_name = \"LumiOpen/Poro-34B\"" ] },
 { "cell_type": "code", "execution_count": 6, "id": "4e4c9089-a195-4fd7-91b2-6240cafb4989", "metadata": {}, "outputs": [], "source": [
   "tokenizer = AutoTokenizer.from_pretrained(model_name)" ] },
 { "cell_type": "code", "execution_count": 7, "id": "a42e0fb6-40d4-483b-a034-84ff351c021d", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3a476b270f8d413c8d54e413fe791a82", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/14 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
   "# Load the \"1000B\" revision of the model with bfloat16 weights\n",
   "branch = \"1000B\"\n",
   "model = AutoModelForCausalLM.from_pretrained(\n",
   "    model_name,\n",
   "    revision=branch,\n",
   "    torch_dtype=torch.bfloat16,\n",
   ")" ] },
 { "cell_type": "markdown", "id": "258df7e6-27c1-48a2-b20a-d377dc885884", "metadata": {}, "source": [
   "## Setting up the LoRA parameters" ] },
 { "cell_type": "code", "execution_count": 1, "id": "63151e65-6bff-4b65-a8ae-af4d6c53036f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
   "2024-02-29 15:28:54.008508: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
   "To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] } ], "source": [
   "from peft import LoraConfig, get_peft_model" ] },
 { "cell_type": "code", "execution_count": 38, "id": "f35f934f-23c6-47db-95bb-df20526e29e7", "metadata": {}, "outputs": [], "source": [
   "# LoRA adapter settings for causal language modelling\n",
   "config = LoraConfig(\n",
   "    task_type=\"CAUSAL_LM\",\n",
   "    target_modules=[\"query_key_value\"],  # modules that receive LoRA adapters\n",
   "    r=8,\n",
   "    lora_alpha=8,\n",
   "    lora_dropout=0.05,\n",
   "    bias=\"none\",\n",
   ")" ] },
 { "cell_type": "code", "execution_count": 39, "id": "a62cb983-6e28-40f1-9f8d-64a1a6ccd0f3", "metadata": {}, "outputs": [], "source": [
   "peft_model = get_peft_model(model, config)" ] },
 { "cell_type": "code", "execution_count": 40, "id": "fe7e7078-2998-4f17-abda-f03a32e04735", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "trainable params: 12386304\n", "all params: 34229336064\n", "trainable: 0.04%\n" ] } ], "source": [
   "# (element count, requires-gradient flag) for every parameter of the wrapped model\n",
   "param_stats = [(p.numel(), p.requires_grad) for p in peft_model.parameters()]\n",
   "\n",
   "# total size, and the part that requires a gradient (i.e. will be trained)\n",
   "all_param = sum(count for count, _ in param_stats)\n",
   "trainable_params = sum(count for count, needs_grad in param_stats if needs_grad)\n",
   "\n",
   "# print number of trainable parameters\n",
   "print(f\"trainable params: {trainable_params}\")\n",
   "print(f\"all params: {all_param}\")\n",
   "print(f\"trainable: {100 * trainable_params / all_param:.2f}%\")" ] },
 { "cell_type": "markdown", "id": "d029921d-60db-43ad-ae8b-2ab9910bd490", "metadata": {}, "source": [
   "## Preparing the training data" ] },
 { "cell_type": "code", "execution_count": 10, "id": "216aa90b-a87d-4a37-b178-e7e83bf987ce", "metadata": {}, "outputs": [], "source": [
   "def prepare_train_data(data):\n",
   "    \"\"\"Tokenize a batch of examples for training.\n",
   "\n",
   "    Uses the notebook-level ``tokenizer``.\n",
   "\n",
   "    Args:\n",
   "        data: mapping whose ``text`` entry holds the raw training text(s).\n",
   "\n",
   "    Returns:\n",
   "        The tokenizer output (padded, PyTorch tensors) with an added\n",
   "        ``labels`` entry that refers to the same values as ``input_ids``.\n",
   "    \"\"\"\n",
   "    encoded = tokenizer(data[\"text\"], return_tensors=\"pt\", padding=True)\n",
   "    encoded[\"labels\"] = encoded[\"input_ids\"]\n",
   "    return encoded" ] },
 { "cell_type": "code", "execution_count": 41, "id": "5551fac2-9a26-4a86-b8b4-d2f572ecfaa9", "metadata": {}, "outputs": [], "source": [ "dataset = 
load_dataset(\"json\", data_files=\"prompts_1.json\")" ] },
 { "cell_type": "code", "execution_count": 42, "id": "dfa8382a-3e66-4433-b6dd-97d59a7945f6", "metadata": {}, "outputs": [], "source": [
   "# Tokenize the train split in batches and drop the raw text column\n",
   "train_dataset = dataset[\"train\"].map(\n",
   "    prepare_train_data,\n",
   "    batched=True,\n",
   "    remove_columns=[\"text\"],\n",
   ")" ] },
 { "cell_type": "markdown", "id": "319de4fe-f5e0-4ea6-9a61-6f18c241bd02", "metadata": {}, "source": [
   "## Setting up the training parameters" ] },
 { "cell_type": "code", "execution_count": 50, "id": "9bb42764-af93-463f-8cf6-68707f21151b", "metadata": {}, "outputs": [], "source": [
   "from transformers import DataCollatorForLanguageModeling" ] },
 { "cell_type": "code", "execution_count": 53, "id": "4b8a94c9-2648-4626-9238-4475645aa695", "metadata": {}, "outputs": [], "source": [
   "# Hyperparameters of the fine-tuning run.\n",
   "# NOTE(review): warmup_steps equals max_steps, so the schedule presumably\n",
   "# spends the whole run in warmup - confirm this is intended.\n",
   "training_args = TrainingArguments(\n",
   "    output_dir=\"outputs\",\n",
   "    max_steps=20,\n",
   "    warmup_steps=20,\n",
   "    learning_rate=1e-3,\n",
   "    per_device_train_batch_size=4,\n",
   "    gradient_accumulation_steps=4,\n",
   "    logging_steps=1,\n",
   ")\n",
   "\n",
   "# mlm=False: batches are collated for causal rather than masked language modelling\n",
   "data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)\n",
   "\n",
   "trainer = Trainer(\n",
   "    model=peft_model,\n",
   "    args=training_args,\n",
   "    train_dataset=train_dataset,\n",
   "    data_collator=data_collator,\n",
   ")" ] },
 { "cell_type": "code", "execution_count": 54, "id": "fbd62db8-65e3-4d05-b11e-179aaf8f0e65", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
Step | \n", "Training Loss | \n", "
---|---|
1 | \n", "0.816400 | \n", "
2 | \n", "0.808600 | \n", "
3 | \n", "0.808600 | \n", "
4 | \n", "0.804700 | \n", "
5 | \n", "0.793000 | \n", "
6 | \n", "0.757800 | \n", "
7 | \n", "0.699200 | \n", "
8 | \n", "0.640600 | \n", "
9 | \n", "0.570300 | \n", "
10 | \n", "0.492200 | \n", "
11 | \n", "0.392600 | \n", "
12 | \n", "0.291000 | \n", "
13 | \n", "0.196300 | \n", "
14 | \n", "0.140600 | \n", "
15 | \n", "0.112800 | \n", "
16 | \n", "0.090300 | \n", "
17 | \n", "0.064500 | \n", "
18 | \n", "0.048800 | \n", "
19 | \n", "0.024900 | \n", "
20 | \n", "0.018900 | \n", "
"
],
"text/plain": [
"