{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "Q-bj6K7Qv4ft" }, "source": [ "# Fine-Tuning a Generative Pretrained Transformer (`GPT`)\n", "\n", "1. Install required libraries." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SBWCrz5GfBXo", "outputId": "71632b4a-5582-4ebb-cdaf-015edaadc079" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting transformers\n", " Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m61.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting datasets\n", " Downloading datasets-2.13.1-py3-none-any.whl (486 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m486.2/486.2 kB\u001b[0m \u001b[31m44.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting codecarbon\n", " Downloading codecarbon-2.2.4-py3-none-any.whl (176 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m176.0/176.0 kB\u001b[0m \u001b[31m21.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", "Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)\n", " Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m28.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in 
/usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)\n", " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m93.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting safetensors>=0.3.1 (from transformers)\n", " Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m75.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n", "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n", "Collecting dill<0.3.7,>=0.3.0 (from datasets)\n", " Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", "Collecting xxhash (from datasets)\n", " Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.5/212.5 kB\u001b[0m \u001b[31m23.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting multiprocess (from datasets)\n", " 
Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.3/134.3 kB\u001b[0m \u001b[31m18.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.8.4)\n", "Collecting arrow (from codecarbon)\n", " Downloading arrow-1.2.3-py3-none-any.whl (66 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.4/66.4 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting pynvml (from codecarbon)\n", " Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from codecarbon) (5.9.5)\n", "Requirement already satisfied: py-cpuinfo in /usr/local/lib/python3.10/dist-packages (from codecarbon) (9.0.0)\n", "Collecting fuzzywuzzy (from codecarbon)\n", " Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from codecarbon) (8.1.3)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n", "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.0.12)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from 
aiohttp->datasets) (4.0.2)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.2)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.3)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.6.3)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.16)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.5.7)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: python-dateutil>=2.7.0 in /usr/local/lib/python3.10/dist-packages (from arrow->codecarbon) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2022.7.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7.0->arrow->codecarbon) (1.16.0)\n", "Installing collected packages: tokenizers, safetensors, fuzzywuzzy, xxhash, pynvml, dill, multiprocess, huggingface-hub, arrow, transformers, codecarbon, datasets\n", "Successfully installed arrow-1.2.3 codecarbon-2.2.4 datasets-2.13.1 dill-0.3.6 fuzzywuzzy-0.18.0 huggingface-hub-0.15.1 multiprocess-0.70.14 pynvml-11.5.0 safetensors-0.3.1 tokenizers-0.13.3 transformers-4.30.2 xxhash-3.2.0\n" ] } ], "source": [ "# Pin the versions this notebook was last run with (see the recorded install log)\n", "# so a fresh runtime reproduces the same environment; %pip targets the kernel's env.\n", "%pip install transformers==4.30.2 datasets==2.13.1 codecarbon==2.2.4" ] }, { "cell_type": "markdown", "metadata": { "id": "y5XnfvSH7w4z" }, "source": [ "2. Load the data from the hub." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 702, "referenced_widgets": [ "8e487319ea88415e91602a6f9820f4d5", "396414917dd644e38e36a8c7240a234a", "3cff65adb3104a219c1cc1fcdb899598", "55016b2997934c919bbe10c2b4a0ca64", "bb537e4986c2408abfb0d78bc933846f", "17ca377d4bc34da3b6086bdb50e9d13a", "f5fd0df9de8c46ecb9dd5b030e40a6f2", "9783d51a5dfa4ba39091c702d4b17eac", "224a97dd5fb5458ca16e7b59ab537f47", "e78962716ecc42b3ac39a64a6b15f6d9", "cd37dbce516740578ef65b21a9c4c67c", "1d9f64595ea94de99455bfa2756f576b", "9089827f7c964e568099dd644f9bf866", "9848b009f303407ea15de664ba6912ee", "c1a7b8cce90d4196baf136c87bb99839", "3c76c2cba00843eabcb0f9b051a7f0de", "29fc4531fe4a43bfbabbf08a03e2c92c", "7a5d8f16f25045f7ba449473eca61b5c", "8a1da8f0b0974e41bf10c639ddf377bf", "2f79ef9e2c0d4675b013fb73d664d0e6", "029c87d4b9364bc586caa9401a91cfea", "20152bedb4284b3ebe6d42adfd372915", "8a0dc2cf014042858355a478974eb83b", "1e29596501214a57b3247f42488660d0", "31730399da5d4eeeb1b09256ec0d4ca8", "a5b66b872e1549b58fbc59fa9c8cd1c7", "9b0e10f07be646f5aad690dd27e5e228", "4077e325ffa141e0a9314272418e32b7", "dec391b8a2294567a5ec5b84ce4fdfa4", "db946d6f117e4f9186b242640cbb298e", "c300d2d1622643ba8089a77871693e00", "7d0a9a8fa09543aab40425ac19adfdb0", "a841048400c64b069cdfb0da00d660a9", "53903eb29af54226b61963544f0c7c22", "e3caba493e3e41e89ab9ceb5920beb0d", "a937aa8f92ac4e009a51049429ce4df4", "c2b7d2984ae44ad0b6d257d990e8367a", "bb5ab37ea8914974b41fdf0d9b401b75", "b0d8e1bd7f5e4a41a8a092ed0526ce4a", "dabf8238f2ce41448c30f2df647d9755", "8c72141899d6480184ae59ee516db519", "f4e7586c00444c12b81fec4d47247076", "d6527b8737d94491b49bb23a8f7b2309", "fcf083caa45c49ceacdb1d296adada0b", "55f6f9660fb64744b3b4794adb386a80", "5b26b54602874476ac1e33e517ced94f", "ef646af727014122a709581a5877c1ab", "9cf8439b97ae4e71b9dadc37db3cecb1", "051bbaeec414408a849d09be76a33e16", "32c77d63eeb34b84b67bb71d3b1dd8c2", "563c21a7658c4d50aeec369f332de879", 
"e289b4fba89e4a48b353e34df07286d4", "438b281d640746a8a5db088336a807f7", "3427272349944d4692ee166a580da497", "6ca80252237141a79f424bfc627fb6bc", "dc4e291bf94245f7b11be71dbc9ffd10", "46d2413560d643c2814fb99eff80f464", "8b61a7b2883641a987b19c75eaa42a1a", "e43f01ac59f549748e0bafb78b1b05bb", "fb3606b854c44c9097b5be06d19c39bf", "b62438ac81b246739e6077432a7aa83d", "a4b0fd8a19b44908ab204f90cf52f77a", "57dde2a42f95487f8d5463505995065d", "d62f7772d4c842268d5f73cd4095bf81", "57c28687259d44aa84d48ed8249f36a6", "666179f07a4f44ab82562dd8aa8ab9ed", "1321f5ab2dfd48ea83019267e2a5d442", "b2be32c7331c4979bf34262b55f8be95", "c0bb60c6dadc4e2e9db0a25724aa5798", "8306fa8ddaba4388bc6d284f781e227d", "64fe558b22964b399991e55a9e5ef78a", "1f3b8e0c2eb7472fa18852118b81eae2", "974d8d08eaeb4a2899aaabe0e049cbbc", "e7322494e0fa46b4b3ad046af2938450", "dc01848055844384badd233227acb13d", "de17e3905fb449039cbf3b492b20d05f", "46ee1bda293c4b379e5efbb58c75ba5e", "7bfb9e59c4a243f5837d5dc0c69dc6d4", "64af67338f6d40fa84882b342e26a24f", "ba4df13eb47846e08a6893d8fa03b06b", "18ca578fb6934754887cc8441febdfa9", "950d2ebd690c4faeba1e9cd84d81ff7c", "accac6999fa241c8b92dcdbedf072420", "e12bc77f106e4b87b497078838e3f123", "66af1e64128647ad9c46b2ed0ecf588d", "f865555cdf9049fb94db287b02cbb79c", "1554871002044ec7b6eac838c0cca770", "3821aa65e332455bb423d95456f5c493", "ed3d550e3e664c139053063669b2b47c", "84a9d75e7d4b47d9a531b394326a0210", "c29432790c4945a297613986893df6c7", "de3f29257d96444685b599418381a4cf", "e61d4525dd9c45a39ac3ce18baa81219", "7b94af7594e242d1b85896af2df387a5", "0409a9b4564d4f02a2ca61e379db7aa2", "19dbf8fc5dbe46e6b7805ed6f76d191c", "f6491321f2bc47cfa7bdded510e7c890", "c124ad03902c41e19f11ac6f9bdb93d2", "10fb158948cc43c1b174b45d41ab278c", "24e619a197b045c2b59fe16bb1e5a60b", "3cdc835cf43c44f482b238c061f157fe", "b116d0589ada4ee69e2762978a514c54", "b1f6ac3ea3b2431281cd35e2dad767a5", "e9b3281921144f95965afc7372ad3ab6", "ddb9e67905e64e639c3fbb5f407dc027", "d1890ba06dda4986bbe5996f8269fe3b", 
"d64b7eef01fc4ffd9c322179b30e2afc", "a804ad87c240425b8071eec9750d6f6f", "d6e1eebb30d947bcad2ddf5c30cdd09c", "84ba45b3f1dd4df29523003ad6cec5af", "d4bc02fa889a4aadacf3c5afaf47cbf6", "8dce15c298d143dcb0fe5e897890157a", "b29cf5a42b434c85bf509760f0e63037", "065a42b2d3bd4ddaa5458cc87b747f30", "3e97ef8e831a4d5a9b19e2856fb6eaba", "480fdcef380f47a393817756fb172ff4", "2439a2edd1564c8d8491d65e33cfe252", "5fd3bca2b4c04131b88474fd8d39d5b2", "384449391b614c348dd31a8dcbde67a3", "98cb528bccf641e784f9aaa3f024747e", "4f0e3de1ad804e1e9bb22def9f260b51" ] }, "id": "7MbpXGu-v4f1", "outputId": "fce01dc1-509a-4b15-b686-4384c20f0765" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8e487319ea88415e91602a6f9820f4d5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading readme: 0%| | 0.00/5.12k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/nicholasKluge___parquet/nicholasKluge--fine-tuning-instruct-aira-4077fd700c38fc36/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1d9f64595ea94de99455bfa2756f576b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/4 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8a0dc2cf014042858355a478974eb83b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/1.71M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "53903eb29af54226b61963544f0c7c22", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/13.5M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": 
"display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "55f6f9660fb64744b3b4794adb386a80", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/14.5M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dc4e291bf94245f7b11be71dbc9ffd10", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/1.84M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1321f5ab2dfd48ea83019267e2a5d442", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Extracting data files: 0%| | 0/4 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7bfb9e59c4a243f5837d5dc0c69dc6d4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_english split: 0%| | 0/9183 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ed3d550e3e664c139053063669b2b47c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_instruct_english split: 0%| | 0/48666 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "24e619a197b045c2b59fe16bb1e5a60b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_instruct_portuguese split: 0%| | 0/48571 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d4bc02fa889a4aadacf3c5afaf47cbf6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_portuguese split: 0%| | 0/9204 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/nicholasKluge___parquet/nicholasKluge--fine-tuning-instruct-aira-4077fd700c38fc36/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " | prompt | \n", "completion | \n", "
---|---|---|
0 | \n", "Why do philosophers use the concept of a philo... | \n", "A philosophical concept known as a p-zombie re... | \n", "
1 | \n", "Can you provide a definition for expert iterat... | \n", "ExIt stands as a magnificent reinforcement lea... | \n", "
2 | \n", "What are the different types of insurance and ... | \n", "The different types of insurance include auto ... | \n", "
3 | \n", "What is the best way to organize a large event. | \n", "The best way to organize a large event is to p... | \n", "
4 | \n", "How do I get started with yoga. | \n", "To get started with yoga, the best thing to do... | \n", "
... | \n", "... | \n", "... | \n", "
48661 | \n", "What role do AI developers have in promoting t... | \n", "The principle of truthfulness requires that AI... | \n", "
48662 | \n", "Who create Ben & Jerry's icecream? | \n", "Ben & Jerry's ice cream was founded by Ben Coh... | \n", "
48663 | \n", "What is the capital of Finland and when was it... | \n", "The capital of Finland is Helsinki and it was ... | \n", "
48664 | \n", "What are some tips for overcoming stress. | \n", "Take time for yourself to relax and unwind. \\n... | \n", "
48665 | \n", "How can I increase my savings. | \n", "One way to increase your savings is to create ... | \n", "
48666 rows × 2 columns
\n", "