{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "Q-bj6K7Qv4ft" }, "source": [ "# Fine-Tuning a Generative Pretrained Transformer (`GPT`)\n", "\n", "1. Install required libraries." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SBWCrz5GfBXo", "outputId": "c3897ecc-56b3-48fc-b9cb-1f8bb2809fbe" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting transformers\n", " Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m78.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting datasets\n", " Downloading datasets-2.13.1-py3-none-any.whl (486 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m486.2/486.2 kB\u001b[0m \u001b[31m47.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting codecarbon\n", " Downloading codecarbon-2.2.4-py3-none-any.whl (176 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m176.0/176.0 kB\u001b[0m \u001b[31m24.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", "Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)\n", " Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m31.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages 
(from transformers) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)\n", " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m115.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting safetensors>=0.3.1 (from transformers)\n", " Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m87.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n", "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n", "Collecting dill<0.3.7,>=0.3.0 (from datasets)\n", " Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m16.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", "Collecting xxhash (from datasets)\n", " Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.5/212.5 kB\u001b[0m \u001b[31m27.9 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting multiprocess (from datasets)\n", " Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.3/134.3 kB\u001b[0m \u001b[31m20.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.8.4)\n", "Collecting arrow (from codecarbon)\n", " Downloading arrow-1.2.3-py3-none-any.whl (66 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.4/66.4 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting pynvml (from codecarbon)\n", " Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from codecarbon) (5.9.5)\n", "Requirement already satisfied: py-cpuinfo in /usr/local/lib/python3.10/dist-packages (from codecarbon) (9.0.0)\n", "Collecting fuzzywuzzy (from codecarbon)\n", " Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from codecarbon) (8.1.3)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n", "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.0.12)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n", "Requirement already 
satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.2)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.2)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.3)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.6.3)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.16)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.5.7)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: python-dateutil>=2.7.0 in /usr/local/lib/python3.10/dist-packages (from arrow->codecarbon) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2022.7.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7.0->arrow->codecarbon) (1.16.0)\n", "Installing collected packages: tokenizers, safetensors, fuzzywuzzy, xxhash, pynvml, dill, multiprocess, huggingface-hub, arrow, transformers, codecarbon, datasets\n", "Successfully installed arrow-1.2.3 codecarbon-2.2.4 datasets-2.13.1 dill-0.3.6 fuzzywuzzy-0.18.0 huggingface-hub-0.15.1 multiprocess-0.70.14 pynvml-11.5.0 safetensors-0.3.1 tokenizers-0.13.3 transformers-4.30.2 xxhash-3.2.0\n" ] } ], "source": [ "# Versions are pinned to the releases recorded in this cell's saved install log,\n", "# so a fresh run installs the same releases of these three libraries.\n", "# %pip (rather than !pip) installs into the environment of the running kernel.\n", "%pip install transformers==4.30.2 datasets==2.13.1 codecarbon==2.2.4" ] }, { "cell_type": "markdown", "metadata": { "id": 
"y5XnfvSH7w4z" }, "source": [ "2. Load the data from the hub." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 702, "referenced_widgets": [ "34e200081e97446c8cd7856137d2ed39", "3e9822a5f0d046728156b5a0c3e3c082", "6c0f0718a27944a49df4feeb5eb7f0a6", "94ce8d2f7bf24a3cb3487ea88094beda", "579d1717cad5475e9f772e84bdbabfc1", "1120e50796944b58af0f4dbd639482d2", "53f08c1231c74d14a81bd9485ba96deb", "5456362a6850407db4bebc1ded3cbe1a", "f3aaec6e08094e19bff0f311e6a64158", "485ffa98fc3a473096e4af30be5aa60a", "78fa4f26ce174a108c43033b0dd5f2b6", "158650cfa6d14e00ab4825953fabe91e", "16d718b78ba240d7b930ff422c9cbc67", "1092f96bf4b2456893e925e26d200d57", "02a61e2a34b3458a8b9313608812ca7d", "a6d3f7f2b090401f9cba84ccb242c5ec", "f1f0f70d41b64c4892da63d0aa6ec3c9", "edb44c8f27af400cab27572250043b11", "9b37e0bc0c91405a9ce5a925fb46ba7e", "8e07a97bcc0548fbb8681ad1bbd1dcbe", "857f65e34e8a4cee92857a61fb3febb2", "e0f81131b319404d8f48a1ba4edac5bc", "a39380581e804f5fb96018c6058ec4ac", "29e76265e50842c3b3aa3e68825c1cc9", "4622678ef1c440eaba2b21dabaef8a4f", "dac43af304db4a2da06c7b18201413f9", "00485f3c3dbc40b5be49c8f4fd49c3d7", "095609fe73cb4e9cbd5838422855c951", "7bc85c49a9154847b716aeaaf47d4558", "aba8b6027ea447a8b927b8674a3c78c7", "54995b0a7e01403d816d4c5dcea3ec39", "9cfd52a8054f436fa406dd12657d697c", "380d234d63644d31b43286726d4346e0", "1962a1b46f7d4c0bac70d45435b1b0b8", "476c1fd9e1644ceda44275a4630cff71", "c56b207b5c554646b09ee80b6c6f87cc", "295050c536e04607be4944fa8827a66d", "7848eb314e39456faac551687946b12b", "9d9e50a4097f4aa38e07d69728e47298", "7b3c5ea2a594482a9d6e60510d9e335f", "8c988598365c414a9fb20a7f6b37365e", "0377a5e70f2540878f435c3c13b915fc", "20ec5a844483409f8df19a37e41cb0c9", "d5b5905850d74cf8b1e8f2625658b13e", "380e286c3a844a91aa34bb28439cc430", "5771f51108984384a0d4a458320f74f3", "c0ecbbd2b8c3474f848a017f34a1bbaa", "41260544bc6c4d25986e0819ae970219", "09b056bc424b4322ab06b9a5adc53f8e", 
"bb2e0757b2f549b4af6695711ab9b3f0", "57dc02294d124354a292ec6df2d3e95e", "e7e1e30d208d4179a09f66c47d2397b2", "067fcad17d364367ae4047fdb63ef59f", "6c0b0dbd3d6b46959bcab23a67319f6b", "c62afc791f8241eaaf41e7f3fc84cd81", "d8e16763af2a46f0852402390bb7ce96", "dae11f5653c643368c98ee4bcf2d8991", "0a71aa8a8a394e4faec5deac27fcd796", "348e188ace234032bd59e83782586986", "123686d65ba4401c95d67007f30f502a", "12149f11baaf41aeb063d336df1276bb", "c943036f9a384fcd8e6cc5c19b3883ed", "95350ac6a725432892a41949a7af1cc5", "6b421333f2b64bd29d0dadca8de348f7", "a536609d06b745818bb6da478dcc62c3", "7bffb7e759f2478e9da398d30d399474", "76153959dfc74095b0412eb96a2a6b57", "5741589d2c0b4149ae8ef52ba6262002", "f2c3bde660014bbc8885ff8f133ad0a8", "521f6b271d504c59971e181b4c6087ce", "524dee0cb1b145b1a1f7a22f4f77c20c", "f47d8748244b4c0e8f69e62960b60114", "b2569b4d9ee340f2a042f3fcf1a9479e", "df88461423e0442dbb5db3032685cee4", "430d29dfadd04147ba3a0e15ffa37e00", "b22bfb10324d4d138c4e48bb853e54a6", "853ab6f785124189987407cebeeb36ec", "3cca4d9f55024c8e950a1f011d2a9e71", "746b4ff75fe549d294822ac0c17b9eea", "094bfddc08c143adb3ac8f92b8d8a35c", "b78b1570f36f4591aa565d06d5c97aa2", "0edd2a6262c84cedb8639689f12e3cb8", "00b3c8a77c174b33ac6df812bd38caf8", "c585a27add9a4d28b4fbfbfd0d546639", "33c26d62ae80471d92d40b19729ea0a1", "6e0bd73d8bf34f05ad9337d44f683f69", "943e960e06fa4e179f0f8f24d80c75c5", "9c67ef19d64d4496baa7a3ecaa65ac40", "093d5f2f31d248e49b53e13e34a39d19", "56f5e6f6f1994855b71defe58c3caf73", "6503cd0b1071414cb852faa395891b66", "3fa36692e1154c08bf1bf50fa4a276f4", "48a7142c00144de1b49a1c60166332c1", "54b3145407814cb6909b6a42e0d995c1", "b7d2a893077a42578d701d5e2cb04146", "3ad920375fa84ec5a1c8ef906a132f66", "a029c61ef45d483f855f9c0360b58d9f", "f55dff66b5ab47649623dec665bab8c7", "f16dd62059ba49c18d57231013da91c8", "02b3bb2540344873bdc2621a283e4b9d", "83a86345acec4ad1b53dd850a61a28c8", "7ef01e51e8574be18c5e61fd15885dc4", "2a77fed3b2c44741be4b3798561701ab", "85f79d571cf74369b95d456e3aa71b10", 
"ad6e4edc3c684930bec491e5f146cce0", "87ff0aba75554acc9ca5631a3a8eb86b", "e1095519890e463f8bd1028784e926a7", "b5543a0ad1da48c5b21cd21f77b78913", "e212138cf44649e7a116a7395d574e50", "4d792f196b3348a6afa7be27296b2e8e", "6831afd511fc4a468023a637abef4ffc", "0db1a095cb6f4506b57afe4478ea760f", "68de400689c94eaaab36d4a03177291e", "545a9aeba4934b5394f782b3f7805484", "b2cbefc0ba8d4b869a531cbcd1dd1279", "8ebf592281764403b1e7a71be6ab4179", "4c480e7f0ae04afeb728c25829d85b35", "c5b458f29c6d4932926f33171d21922d", "fae6fa9f6f6b4273851008bde9e21e9a", "838feb07922241239a29e6640de20f72", "dc91135f04184e29817d1d8f26637e17" ] }, "id": "7MbpXGu-v4f1", "outputId": "f0085c70-8f57-47f3-c719-16f95e40fa40" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "34e200081e97446c8cd7856137d2ed39", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading readme: 0%| | 0.00/5.12k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/nicholasKluge___parquet/nicholasKluge--fine-tuning-instruct-aira-4077fd700c38fc36/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "158650cfa6d14e00ab4825953fabe91e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/4 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a39380581e804f5fb96018c6058ec4ac", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/1.71M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1962a1b46f7d4c0bac70d45435b1b0b8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 
0%| | 0.00/13.5M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "380e286c3a844a91aa34bb28439cc430", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/14.5M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d8e16763af2a46f0852402390bb7ce96", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/1.84M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "76153959dfc74095b0412eb96a2a6b57", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Extracting data files: 0%| | 0/4 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3cca4d9f55024c8e950a1f011d2a9e71", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_english split: 0%| | 0/9183 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "093d5f2f31d248e49b53e13e34a39d19", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_instruct_english split: 0%| | 0/48666 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "02b3bb2540344873bdc2621a283e4b9d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_instruct_portuguese split: 0%| | 0/48571 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6831afd511fc4a468023a637abef4ffc", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_portuguese split: 0%| | 0/9204 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/nicholasKluge___parquet/nicholasKluge--fine-tuning-instruct-aira-4077fd700c38fc36/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " | prompt | \n", "completion | \n", "
---|---|---|
0 | \n", "Como é ser a última pessoa na terra? | \n", "Você pensaria que o mundo seria seu para explo... | \n", "
1 | \n", "Você pode explicar o conceito de Anarquismo? | \n", "O anarquismo é uma filosofia política que defe... | \n", "
2 | \n", "Qual é a melhor maneira de pendurar uma pintur... | \n", "A melhor maneira de pendurar uma pintura com s... | \n", "
3 | \n", "Qual é a relação entre o problema de controle ... | \n", "A noção de controle se manifesta quando contem... | \n", "
4 | \n", "Identifique qual instrumento é corda ou percus... | \n", "Liuqin é corda, Nagareh é percussão. | \n", "
... | \n", "... | \n", "... | \n", "
48566 | \n", "Qual é a história da luta de sumô. | \n", "A luta de sumô é uma forma antiga de arte marc... | \n", "
48567 | \n", "Como posso melhorar falar em público. | \n", "Comece estando preparado. Pesquise o tópico, p... | \n", "
48568 | \n", "Como faço para navegar até um determinado loca... | \n", "Para navegar até um determinado local usando t... | \n", "
48569 | \n", "Quais são algumas dicas de etiqueta de mensage... | \n", "Apresente-se - Deixe a pessoa saber quem você ... | \n", "
48570 | \n", "Estou pensando em participar de um concurso de... | \n", "Eu recomendaria que você se concentrasse em su... | \n", "
48571 rows × 2 columns
\n", "