{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "fcb5ed0e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/administrator/.local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "#!pip install torch\n", "import torch\n", "device = torch.device('cuda')" ] }, { "cell_type": "code", "execution_count": 2, "id": "0f8058d5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "device(type='cuda')" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "device" ] }, { "cell_type": "code", "execution_count": 3, "id": "1b13da51", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mon Jul 18 14:47:09 2022 \r\n", "+-----------------------------------------------------------------------------+\r\n", "| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6 |\r\n", "|-------------------------------+----------------------+----------------------+\r\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", "| | | MIG M. |\r\n", "|===============================+======================+======================|\r\n", "| 0 NVIDIA GeForce ... On | 00000000:01:00.0 On | N/A |\r\n", "| 0% 42C P8 31W / 320W | 3209MiB / 10240MiB | 11% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", " \r\n", "+-----------------------------------------------------------------------------+\r\n", "| Processes: |\r\n", "| GPU GI CI PID Type Process name GPU Memory |\r\n", "| ID ID Usage |\r\n", "|=============================================================================|\r\n", "| 0 N/A N/A 1061 G /usr/lib/xorg/Xorg 171MiB |\r\n", "| 0 N/A N/A 1510 G cinnamon 35MiB |\r\n", "| 0 N/A N/A 2542 G /usr/lib/firefox/firefox 150MiB |\r\n", "| 0 N/A N/A 2899 C /usr/bin/python3 2843MiB |\r\n", "| 0 N/A N/A 3957 G /usr/lib/firefox/firefox-bin 3MiB |\r\n", "+-----------------------------------------------------------------------------+\r\n" ] } ], "source": [ "!nvidia-smi" ] }, { "cell_type": "code", "execution_count": 4, "id": "1513fee7", "metadata": {}, "outputs": [], "source": [ "#!pip install pandas\n", "import numpy as np \n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 5, "id": "d33202d1", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.10) or chardet (3.0.4) doesn't match a supported version!\n", " warnings.warn(\"urllib3 ({}) or chardet ({}) doesn't match a supported \"\n", "Reusing dataset imdb (/home/administrator/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)\n", "100%|███████████████████████████████████████████| 3/3 [00:00<00:00, 1113.04it/s]\n" ] } ], "source": [ "#!pip install datasets\n", "from datasets import load_dataset\n", "imdb = load_dataset(\"imdb\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "5624c9e3", "metadata": {}, "outputs": [], "source": [ "#!pip install transformers\n", "from transformers import AutoTokenizer\n", "tokenizer = 
AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "8af0934e", "metadata": {}, "outputs": [], "source": [ "def preprocess_function(examples):\n", " return tokenizer(examples[\"text\"], truncation=True)" ] }, { "cell_type": "code", "execution_count": 8, "id": "dec51d8c", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Parameter 'function'= of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", "100%|███████████████████████████████████████████| 25/25 [00:02<00:00, 9.68ba/s]\n", "100%|███████████████████████████████████████████| 25/25 [00:02<00:00, 9.94ba/s]\n", "100%|███████████████████████████████████████████| 50/50 [00:05<00:00, 9.56ba/s]\n" ] } ], "source": [ "tokenized_imdb = imdb.map(preprocess_function, batched=True)" ] }, { "cell_type": "code", "execution_count": 9, "id": "b8665b3c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\\'t match the background, and painfully one-dimensional characters cannot be overcome with a \\'sci-fi\\' setting. (I\\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\\'s not. It\\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\\'s rubbish as they have to always say \"Gene Roddenberry\\'s Earth...\" otherwise people would not continue watching. Roddenberry\\'s ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! 
Dallas all over again.',\n", " 'label': 0}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "imdb['test'][0]" ] }, { "cell_type": "code", "execution_count": 10, "id": "3c916b03", "metadata": {}, "outputs": [], "source": [ "from transformers import DataCollatorWithPadding\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" ] }, { "cell_type": "code", "execution_count": 11, "id": "4c8e9902", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']\n", "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n", "model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)" ] }, { "cell_type": "code", "execution_count": 12, "id": "0d162112", "metadata": {}, "outputs": [], "source": [ "import pickle\n", "pickle.dump(model, open(\"dist_bert_uncased.pkl\",\"wb\"))" ] }, { "cell_type": "code", "execution_count": 13, "id": "25c57b94", "metadata": {}, "outputs": [], "source": [ "#!pip3 install --upgrade tensorflow-gpu" ] }, { "cell_type": "code", "execution_count": 14, "id": "fe48e099", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Num GPUs Available: 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2022-07-18 14:47:35.224460: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", "2022-07-18 14:47:35.321191: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory\n", "2022-07-18 14:47:35.321211: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. 
Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n", "Skipping registering GPU devices...\n" ] } ], "source": [ "#!pip install tensorflow\n", "import tensorflow as tf\n", "print(\"Num GPUs Available: \", len(tf.config.list_physical_devices('GPU')))" ] }, { "cell_type": "code", "execution_count": 15, "id": "23929dda", "metadata": {}, "outputs": [], "source": [ "import gc\n", "gc.collect()\n", "torch.cuda.empty_cache()" ] }, { "cell_type": "code", "execution_count": 16, "id": "7adddbb6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'|===========================================================================|\\n| PyTorch CUDA memory summary, device ID 0 |\\n|---------------------------------------------------------------------------|\\n| CUDA OOMs: 0 | cudaMalloc retries: 0 |\\n|===========================================================================|\\n| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed |\\n|---------------------------------------------------------------------------|\\n| Allocated memory | 0 B | 0 B | 0 B | 0 B |\\n| from large pool | 0 B | 0 B | 0 B | 0 B |\\n| from small pool | 0 B | 0 B | 0 B | 0 B |\\n|---------------------------------------------------------------------------|\\n| Active memory | 0 B | 0 B | 0 B | 0 B |\\n| from large pool | 0 B | 0 B | 0 B | 0 B |\\n| from small pool | 0 B | 0 B | 0 B | 0 B |\\n|---------------------------------------------------------------------------|\\n| GPU reserved memory | 0 B | 0 B | 0 B | 0 B |\\n| from large pool | 0 B | 0 B | 0 B | 0 B |\\n| from small pool | 0 B | 0 B | 0 B | 0 B |\\n|---------------------------------------------------------------------------|\\n| Non-releasable memory | 0 B | 0 B | 0 B | 0 B |\\n| from large pool | 0 B | 0 B | 0 B | 0 B |\\n| from small pool | 0 B | 0 B | 0 B | 0 B |\\n|---------------------------------------------------------------------------|\\n| Allocations | 0 | 0 | 0 | 0 |\\n| from large pool | 0 | 0 | 0 | 0 |\\n| from small pool | 0 | 0 | 0 | 0 |\\n|---------------------------------------------------------------------------|\\n| Active allocs | 0 | 0 | 0 | 0 |\\n| from large pool | 0 | 0 | 0 | 0 |\\n| from small pool | 0 | 0 | 0 | 0 |\\n|---------------------------------------------------------------------------|\\n| GPU reserved segments | 0 | 0 | 0 | 0 |\\n| from large pool | 0 | 0 | 0 | 0 |\\n| from small pool | 0 | 0 | 0 | 0 |\\n|---------------------------------------------------------------------------|\\n| Non-releasable allocs | 0 | 0 | 0 | 0 |\\n| from large pool | 0 | 0 | 0 | 0 |\\n| from small pool | 0 | 0 | 0 | 0 |\\n|---------------------------------------------------------------------------|\\n| Oversize allocations | 0 | 0 | 0 | 0 |\\n|---------------------------------------------------------------------------|\\n| Oversize GPU segments | 0 | 0 | 0 | 0 |\\n|===========================================================================|\\n'" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "torch.cuda.memory_summary(device=None, abbreviated=False)" ] }, { "cell_type": "code", "execution_count": 17, "id": "f6e1ca9d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. 
If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", "/home/administrator/.local/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n", "***** Running training *****\n", " Num examples = 25000\n", " Num Epochs = 5\n", " Instantaneous batch size per device = 4\n", " Total train batch size (w. parallel, distributed & accumulation) = 4\n", " Gradient Accumulation steps = 1\n", " Total optimization steps = 31250\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [31250/31250 28:05, Epoch 5/5]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
5000.444300
10000.400200
15000.415500
20000.388900
25000.379600
30000.396100
35000.356700
40000.388000
45000.317500
50000.364600
55000.353900
60000.349500
65000.299400
70000.237400
75000.223600
80000.206500
85000.268500
90000.192800
95000.235800
100000.237400
105000.197800
110000.251500
115000.217100
120000.201600
125000.223200
130000.095300
135000.116100
140000.101100
145000.092600
150000.118600
155000.114800
160000.096000
165000.105600
170000.088300
175000.136200
180000.111900
185000.135800
190000.081800
195000.041100
200000.058100
205000.043600
210000.051800
215000.019900
220000.064000
225000.041500
230000.051200
235000.059900
240000.044900
245000.043300
250000.038700
255000.016400
260000.016100
265000.017300
270000.035000
275000.017800
280000.025900
285000.023900
290000.008400
295000.034100
300000.009600
305000.007300
310000.009600

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "Saving model checkpoint to ./results/checkpoint-500\n", "Configuration saved in ./results/checkpoint-500/config.json\n", "Model weights saved in ./results/checkpoint-500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-1000\n", "Configuration saved in ./results/checkpoint-1000/config.json\n", "Model weights saved in ./results/checkpoint-1000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-1500\n", "Configuration saved in ./results/checkpoint-1500/config.json\n", "Model weights saved in ./results/checkpoint-1500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-1500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-2000\n", "Configuration saved in ./results/checkpoint-2000/config.json\n", "Model weights saved in ./results/checkpoint-2000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-2000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-2000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-2500\n", "Configuration saved in ./results/checkpoint-2500/config.json\n", "Model weights saved in ./results/checkpoint-2500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-2500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-2500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-3000\n", "Configuration saved in ./results/checkpoint-3000/config.json\n", "Model weights saved in ./results/checkpoint-3000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-3000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-3000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-3500\n", "Configuration saved in ./results/checkpoint-3500/config.json\n", "Model weights saved in ./results/checkpoint-3500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-3500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-3500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-4000\n", "Configuration saved in ./results/checkpoint-4000/config.json\n", "Model weights saved in ./results/checkpoint-4000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-4000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-4000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-4500\n", "Configuration saved in ./results/checkpoint-4500/config.json\n", "Model weights saved in ./results/checkpoint-4500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-4500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-4500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-5000\n", "Configuration saved in ./results/checkpoint-5000/config.json\n", "Model weights saved in 
./results/checkpoint-5000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-5000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-5000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-5500\n", "Configuration saved in ./results/checkpoint-5500/config.json\n", "Model weights saved in ./results/checkpoint-5500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-5500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-5500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-6000\n", "Configuration saved in ./results/checkpoint-6000/config.json\n", "Model weights saved in ./results/checkpoint-6000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-6000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-6000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-6500\n", "Configuration saved in ./results/checkpoint-6500/config.json\n", "Model weights saved in ./results/checkpoint-6500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-6500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-6500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-7000\n", "Configuration saved in ./results/checkpoint-7000/config.json\n", "Model weights saved in ./results/checkpoint-7000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-7000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-7000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-7500\n", "Configuration saved in ./results/checkpoint-7500/config.json\n", "Model weights saved in ./results/checkpoint-7500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-7500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-7500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-8000\n", "Configuration saved in ./results/checkpoint-8000/config.json\n", "Model weights saved in ./results/checkpoint-8000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-8000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-8000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-8500\n", "Configuration saved in ./results/checkpoint-8500/config.json\n", "Model weights saved in ./results/checkpoint-8500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-8500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-8500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-9000\n", "Configuration saved in ./results/checkpoint-9000/config.json\n", "Model weights saved in ./results/checkpoint-9000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-9000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-9000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-9500\n", "Configuration saved in ./results/checkpoint-9500/config.json\n", "Model weights saved in ./results/checkpoint-9500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-9500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-9500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-10000\n", 
"Configuration saved in ./results/checkpoint-10000/config.json\n", "Model weights saved in ./results/checkpoint-10000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-10000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-10000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-10500\n", "Configuration saved in ./results/checkpoint-10500/config.json\n", "Model weights saved in ./results/checkpoint-10500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-10500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-10500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-11000\n", "Configuration saved in ./results/checkpoint-11000/config.json\n", "Model weights saved in ./results/checkpoint-11000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-11000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-11000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-11500\n", "Configuration saved in ./results/checkpoint-11500/config.json\n", "Model weights saved in ./results/checkpoint-11500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-11500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-11500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-12000\n", "Configuration saved in ./results/checkpoint-12000/config.json\n", "Model weights saved in ./results/checkpoint-12000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-12000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-12000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-12500\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Configuration saved in ./results/checkpoint-12500/config.json\n", "Model weights saved in ./results/checkpoint-12500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-12500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-12500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-13000\n", "Configuration saved in ./results/checkpoint-13000/config.json\n", "Model weights saved in ./results/checkpoint-13000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-13000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-13000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-13500\n", "Configuration saved in ./results/checkpoint-13500/config.json\n", "Model weights saved in ./results/checkpoint-13500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-13500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-13500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-14000\n", "Configuration saved in ./results/checkpoint-14000/config.json\n", "Model weights saved in ./results/checkpoint-14000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-14000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-14000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-14500\n", "Configuration saved in ./results/checkpoint-14500/config.json\n", "Model weights saved in ./results/checkpoint-14500/pytorch_model.bin\n", "tokenizer config file saved in 
./results/checkpoint-14500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-14500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-15000\n", "Configuration saved in ./results/checkpoint-15000/config.json\n", "Model weights saved in ./results/checkpoint-15000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-15000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-15000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-15500\n", "Configuration saved in ./results/checkpoint-15500/config.json\n", "Model weights saved in ./results/checkpoint-15500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-15500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-15500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-16000\n", "Configuration saved in ./results/checkpoint-16000/config.json\n", "Model weights saved in ./results/checkpoint-16000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-16000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-16000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-16500\n", "Configuration saved in ./results/checkpoint-16500/config.json\n", "Model weights saved in ./results/checkpoint-16500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-16500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-16500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-17000\n", "Configuration saved in ./results/checkpoint-17000/config.json\n", "Model weights saved in ./results/checkpoint-17000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-17000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-17000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-17500\n", "Configuration saved in ./results/checkpoint-17500/config.json\n", "Model weights saved in ./results/checkpoint-17500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-17500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-17500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-18000\n", "Configuration saved in ./results/checkpoint-18000/config.json\n", "Model weights saved in ./results/checkpoint-18000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-18000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-18000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-18500\n", "Configuration saved in ./results/checkpoint-18500/config.json\n", "Model weights saved in ./results/checkpoint-18500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-18500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-18500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-19000\n", "Configuration saved in ./results/checkpoint-19000/config.json\n", "Model weights saved in ./results/checkpoint-19000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-19000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-19000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-19500\n", "Configuration saved in 
./results/checkpoint-19500/config.json\n", "Model weights saved in ./results/checkpoint-19500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-19500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-19500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-20000\n", "Configuration saved in ./results/checkpoint-20000/config.json\n", "Model weights saved in ./results/checkpoint-20000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-20000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-20000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-20500\n", "Configuration saved in ./results/checkpoint-20500/config.json\n", "Model weights saved in ./results/checkpoint-20500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-20500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-20500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-21000\n", "Configuration saved in ./results/checkpoint-21000/config.json\n", "Model weights saved in ./results/checkpoint-21000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-21000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-21000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-21500\n", "Configuration saved in ./results/checkpoint-21500/config.json\n", "Model weights saved in ./results/checkpoint-21500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-21500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-21500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-22000\n", "Configuration saved in ./results/checkpoint-22000/config.json\n", "Model weights saved in ./results/checkpoint-22000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-22000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-22000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-22500\n", "Configuration saved in ./results/checkpoint-22500/config.json\n", "Model weights saved in ./results/checkpoint-22500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-22500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-22500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-23000\n", "Configuration saved in ./results/checkpoint-23000/config.json\n", "Model weights saved in ./results/checkpoint-23000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-23000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-23000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-23500\n", "Configuration saved in ./results/checkpoint-23500/config.json\n", "Model weights saved in ./results/checkpoint-23500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-23500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-23500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-24000\n", "Configuration saved in ./results/checkpoint-24000/config.json\n", "Model weights saved in ./results/checkpoint-24000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-24000/tokenizer_config.json\n", "Special tokens file saved in 
./results/checkpoint-24000/special_tokens_map.json\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Saving model checkpoint to ./results/checkpoint-24500\n", "Configuration saved in ./results/checkpoint-24500/config.json\n", "Model weights saved in ./results/checkpoint-24500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-24500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-24500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-25000\n", "Configuration saved in ./results/checkpoint-25000/config.json\n", "Model weights saved in ./results/checkpoint-25000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-25000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-25000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-25500\n", "Configuration saved in ./results/checkpoint-25500/config.json\n", "Model weights saved in ./results/checkpoint-25500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-25500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-25500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-26000\n", "Configuration saved in ./results/checkpoint-26000/config.json\n", "Model weights saved in ./results/checkpoint-26000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-26000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-26000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-26500\n", "Configuration saved in ./results/checkpoint-26500/config.json\n", "Model weights saved in ./results/checkpoint-26500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-26500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-26500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-27000\n", "Configuration saved in ./results/checkpoint-27000/config.json\n", "Model weights saved in ./results/checkpoint-27000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-27000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-27000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-27500\n", "Configuration saved in ./results/checkpoint-27500/config.json\n", "Model weights saved in ./results/checkpoint-27500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-27500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-27500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-28000\n", "Configuration saved in ./results/checkpoint-28000/config.json\n", "Model weights saved in ./results/checkpoint-28000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-28000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-28000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-28500\n", "Configuration saved in ./results/checkpoint-28500/config.json\n", "Model weights saved in ./results/checkpoint-28500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-28500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-28500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-29000\n", "Configuration saved in 
./results/checkpoint-29000/config.json\n", "Model weights saved in ./results/checkpoint-29000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-29000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-29000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-29500\n", "Configuration saved in ./results/checkpoint-29500/config.json\n", "Model weights saved in ./results/checkpoint-29500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-29500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-29500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-30000\n", "Configuration saved in ./results/checkpoint-30000/config.json\n", "Model weights saved in ./results/checkpoint-30000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-30000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-30000/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-30500\n", "Configuration saved in ./results/checkpoint-30500/config.json\n", "Model weights saved in ./results/checkpoint-30500/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-30500/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-30500/special_tokens_map.json\n", "Saving model checkpoint to ./results/checkpoint-31000\n", "Configuration saved in ./results/checkpoint-31000/config.json\n", "Model weights saved in ./results/checkpoint-31000/pytorch_model.bin\n", "tokenizer config file saved in ./results/checkpoint-31000/tokenizer_config.json\n", "Special tokens file saved in ./results/checkpoint-31000/special_tokens_map.json\n", "\n", "\n", "Training completed. Do not forget to share your model on huggingface.co/models =)\n", "\n", "\n" ] }, { "data": { "text/plain": [ "TrainOutput(global_step=31250, training_loss=0.155623394241333, metrics={'train_runtime': 1685.1863, 'train_samples_per_second': 74.176, 'train_steps_per_second': 18.544, 'total_flos': 1.3845206874696768e+16, 'train_loss': 0.155623394241333, 'epoch': 5.0})" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_args = TrainingArguments(\n", "    output_dir=\"./results\",\n", "    learning_rate=2e-5,\n", "    per_device_train_batch_size=4,\n", "    per_device_eval_batch_size=4,\n", "    num_train_epochs=5,\n", "    weight_decay=0.01,\n", ")\n", "\n", "trainer = Trainer(\n", "    model=model,\n", "    args=training_args,\n", "    train_dataset=tokenized_imdb[\"train\"],\n", "    eval_dataset=tokenized_imdb[\"test\"],\n", "    tokenizer=tokenizer,\n", "    data_collator=data_collator,\n", ")\n", "\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": 18, "id": "bdf45691", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", "    features: ['text', 'label', 'input_ids', 'attention_mask'],\n", "    num_rows: 25000\n", "})" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenized_imdb['test']" ] },
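{ "cell_type": "markdown", "id": "9f1c0a11", "metadata": {}, "source": [ "A minimal sketch of persisting the fine-tuned model in the standard `save_pretrained` format (via `trainer.save_model`), which is more portable than the `pickle.dump` call earlier in this notebook. The `./results/final_model` path is an assumption, not something the original run produced." ] }, { "cell_type": "code", "execution_count": null, "id": "3d0e7c42", "metadata": {}, "outputs": [], "source": [ "# Sketch (assumed path): write weights + config + tokenizer files, then reload without pickle.\n", "trainer.save_model(\"./results/final_model\")\n", "tokenizer.save_pretrained(\"./results/final_model\")\n", "# reloaded = AutoModelForSequenceClassification.from_pretrained(\"./results/final_model\").to(device)" ] }, { "cell_type": "code", "execution_count": null, "id": "e2409c78", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. 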
If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", "***** Running Prediction *****\n", " Num examples = 25000\n", " Batch size = 4\n" ] }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [2987/6250 00:40 < 00:43, 74.22 it/s]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trainer.predict(tokenized_imdb[\"test\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "958d92c2", "metadata": {}, "outputs": [], "source": [ "from transformers import TextClassificationPipeline" ] }, { "cell_type": "code", "execution_count": null, "id": "a6eca3fe", "metadata": {}, "outputs": [], "source": [ "import os\n", "os.environ['CUDA_LAUNCH_BLOCKING'] = \"1\"" ] }, { "cell_type": "code", "execution_count": null, "id": "e5179a95", "metadata": {}, "outputs": [], "source": [ "sent = 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\\'t match the background, and painfully one-dimensional characters cannot be overcome with a \\'sci-fi\\' setting. (I\\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\\'s not. It\\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\\'s rubbish as they have to always say \"Gene Roddenberry\\'s Earth...\" otherwise people would not continue watching. Roddenberry\\'s ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.'" ] }, { "cell_type": "code", "execution_count": null, "id": "c600300b", "metadata": {}, "outputs": [], "source": [ "pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, device = 0)\n", "pipe(sent)\n", "# outputs a list of dicts like [[{'label': 'NEGATIVE', 'score': 0.0001223755971295759}, {'label': 'POSITIVE', 'score': 0.9998776316642761}]]\n", "# pipe(\"I love this movie!\")" ] }, { "cell_type": "code", "execution_count": null, "id": "f36983b0", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "05f21b21", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }