{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 11742, "status": "ok", "timestamp": 1730569728288, "user": { "displayName": "Ismat Samadov", "userId": "13714662825869203427" }, "user_tz": -240 }, "id": "5v8KnAaD-z9t", "outputId": "e58efe1c-a3de-4271-8e43-ecb8168a35de" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n", "Collecting datasets\n", " Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)\n", "Collecting seqeval\n", " Downloading seqeval-1.2.2.tar.gz (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.24.7)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.1)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n", "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n", "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.5)\n", "Collecting pyarrow>=15.0.0 (from datasets)\n", " Downloading pyarrow-18.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)\n", "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n", " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.1.4)\n", "Collecting xxhash (from datasets)\n", " Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", "Collecting multiprocess<0.70.17 (from datasets)\n", " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n", "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.6.1)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.10.5)\n", "Requirement already satisfied: scikit-learn>=0.21.3 in /usr/local/lib/python3.10/dist-packages (from seqeval) (1.5.2)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.12.2)\n", "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.0)\n", "Requirement already satisfied: aiosignal>=1.1.2 in 
/usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.11.1)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.2.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.8.30)\n", "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.13.1)\n", "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.4.2)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (3.5.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n", "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", "Downloading datasets-3.1.0-py3-none-any.whl (480 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m31.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pyarrow-18.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.0 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.0/40.0 MB\u001b[0m \u001b[31m56.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hBuilding wheels for collected packages: seqeval\n", " Building wheel 
for seqeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=0d4c765f01269491e0baddd39f053312bc7aa62fa5ead5b3e0dcb0bea5f36694\n", " Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa\n", "Successfully built seqeval\n", "Installing collected packages: xxhash, pyarrow, dill, multiprocess, seqeval, datasets\n", " Attempting uninstall: pyarrow\n", " Found existing installation: pyarrow 14.0.2\n", " Uninstalling pyarrow-14.0.2:\n", " Successfully uninstalled pyarrow-14.0.2\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 18.0.0 which is incompatible.\n", "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 18.0.0 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed datasets-3.1.0 dill-0.3.8 multiprocess-0.70.16 pyarrow-18.0.0 seqeval-1.2.2 xxhash-3.5.0\n" ] } ], "source": [ "!pip install transformers datasets seqeval huggingface_hub\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "amREIFSH-z7r" }, "outputs": [], "source": [ "# Standard library imports\n", "import os # Provides functions for interacting with the operating system\n", "import warnings # Used to handle or suppress warnings\n", "import numpy as np # Essential for numerical operations and array manipulation\n", "import torch # PyTorch library for tensor computations and model handling\n", "import ast # Used for safe evaluation of strings to Python objects (e.g., parsing tokens)\n", "\n", "# Hugging Face and Transformers imports\n", "from datasets import load_dataset # Loads datasets for model training and evaluation\n", "from transformers import (\n", " AutoTokenizer, # Initializes a tokenizer from a pre-trained model\n", " DataCollatorForTokenClassification, # Handles padding and formatting of token classification data\n", " TrainingArguments, # Defines training parameters like batch size and learning rate\n", " Trainer, # High-level API for managing training and evaluation\n", " AutoModelForTokenClassification, # Loads a pre-trained model for token classification tasks\n", " get_linear_schedule_with_warmup, # Learning rate scheduler for gradual warm-up and linear decay\n", " EarlyStoppingCallback # Callback to stop training if validation performance plateaus\n", ")\n", "\n", "# Hugging Face Hub\n", "from huggingface_hub import login # Allows logging in to Hugging Face Hub to upload models\n", "\n", "# seqeval metrics for NER evaluation\n", "from seqeval.metrics import precision_score, recall_score, f1_score, classification_report\n", "# Provides precision, recall, F1-score, and classification report for evaluating NER model performance\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 6, "status": "ok", "timestamp": 1730569738838, "user": { "displayName": "Ismat Samadov", "userId": "13714662825869203427" }, "user_tz": -240 }, "id": "K7adlboI-z4p", "outputId": "a4306ab9-b7d5-4108-e3f6-1edc4899694f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The token has not been saved to the git credentials helper. 
Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n", "Token is valid (permission: fineGrained).\n", "Your token has been saved to /root/.cache/huggingface/token\n", "Login successful\n" ] } ], "source": [ "# Log in to Hugging Face Hub\n", "# Read the token from the HF_TOKEN environment variable (or fall back to the interactive prompt)\n", "# instead of hard-coding a secret access token in the notebook\n", "login(token=os.environ.get(\"HF_TOKEN\"))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Qccgsjfs-zzA" }, "outputs": [], "source": [ "# Disable WandB (Weights & Biases) logging to avoid unwanted log outputs during training\n", "os.environ[\"WANDB_DISABLED\"] = \"true\"\n", "\n", "# Suppress warning messages to keep output clean, especially during training and evaluation\n", "warnings.filterwarnings(\"ignore\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 576, "referenced_widgets": [ "7de65e247932425882a27257bf77b913", "228bb22c294f4fd1a08910c080586e65", "1b3d4b4f685f47f5b4cd1a83e76330c0", "3d96ee9792cf4ba5b57b7858678985bb", "84a7d8ad32da45c792d570a918f136d6", "c8b59cd42f04425e8de6f832acff43a4", "5546cbf48870406ca3e93e6ebed6487b", "f553dfdc8cf441a6990a8dc2126e55e5", "dcfd93ce24474f639418c7f2b56ce101", "7a14fd2365fa40ac92e8851019639a46", "804273498d974d388585d2f8769db2cb", "a0d5b69380ab4cccbe9fa9555e733d29", "a5f12e46915c4c7dbd92a8e6866a2136", "9d2dd1835a6d436abdda27377c39951d", "bde988b2464c49e38d9d660124e1a1ca", "6786dbb578764ecbb07549f99284e872", "1a4a5ae499224227b594f7e44ec05626", "d71e12cac7384779b7cf6deccdec6205", "6dd8528c7a8c4cf59a8be94ea2eec2ac", "e53b7766d65f4f31a000d433fe1718d8", "ee5a83f45127475496dbb8347c2e3b9e", "998183d1e43d4b58b6d49796dd6c65d2", "e2baeb32f3bb47aeab3d6ebf3a5e8876", "6d2ed7f8c8e54d32b94a410de93bebb4", "b42331f0c0ca417ea779764f7c7b91a6", "77528b7a72024e3ba2c03f495188b091", "4d770190eebf44e59e4c85598c48bb5e", "f1247d8b523e4d03bf802ea761090255", "dc951346b3d742d7861929d4d4bd69db", "2e185b6e38e348919accedcf54fe32ec", "daa89fa3059144ceaf95c8d8d209c59d", "371faab019ee49d78fa2547a65fc68b9", "30f20a8152e3450cb5c0c5300f2a677b", "fba60dcd62c04b53acba3c371d926149", "844335f761134693ae7aff60803dc62a", "0835d840cd7a4444b477187796df6ab1", "0b430c3c4ff641a38e4a6603b30dfdf0", "e68c8857489c4b5e83c52b12d32b71e6", "9a688924cb2b4a9c984eddee5d6e57d7", "a2681b1b407e45dbb2277685ad082fd3", "c324af888b614caa9cead361c9decafe", "542aeebec2144a79b36baa0fdfb4bc32", "975acf8253a3411293d961167054e7f0", "1c184b2b6c164ac4965182b4dd760faa" ] }, "executionInfo": { "elapsed": 23970, "status": "ok", "timestamp": 1730569762804, "user": { "displayName": "Ismat Samadov", "userId": "13714662825869203427" }, "user_tz": -240 }, "id": "fQ6ttUM8-zwM", "outputId": "a7edc99a-0064-4378-ab18-c31f29c9b727" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7de65e247932425882a27257bf77b913", "version_major": 2, "version_minor": 0 }, "text/plain": [ "README.md: 0%| | 0.00/2.87k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a0d5b69380ab4cccbe9fa9555e733d29", "version_major": 2, "version_minor": 0 }, "text/plain": [ "train-00000-of-00001.parquet: 0%| | 0.00/13.6M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e2baeb32f3bb47aeab3d6ebf3a5e8876", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating train 
split: 0%| | 0/99545 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['index', 'tokens', 'ner_tags'],\n", " num_rows: 99545\n", " })\n", "})\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fba60dcd62c04b53acba3c371d926149", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/99545 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Skipping malformed example: 7171f30e-fa1e-49ec-975e-16c88c9b95e9 due to error: malformed node or string: None\n", "Skipping malformed example: 91dfd97b-2997-4080-8054-00cadec14dfc due to error: malformed node or string: None\n", "Skipping malformed example: cfb8beb4-ae7a-4185-9a54-08b0e85d03d3 due to error: malformed node or string: None\n", "Skipping malformed example: 5f0a2991-38b3-435b-9059-a05382e89a62 due to error: malformed node or string: None\n", "Skipping malformed example: 9d705fde-ce09-4bef-9f4a-9ad1fa452cc9 due to error: malformed node or string: None\n", "Skipping malformed example: 182457fb-c648-4fca-a207-af5a00072d4a due to error: malformed node or string: None\n", "Skipping malformed example: d9205ccd-c692-4cf1-8310-181de8f4cdc8 due to error: malformed node or string: None\n", "Skipping malformed example: dac55265-38cd-4c4b-9e56-a48a77e108d4 due to error: malformed node or string: None\n", "Skipping malformed example: f3d38b45-0035-45ab-b0aa-79ae7c63ba7a due to error: malformed node or string: None\n", "Skipping malformed example: 5ed32762-bf5b-4db4-9dbd-07cd5c0541dc due to error: malformed node or string: None\n", "Skipping malformed example: 426fc958-8c6b-41d8-acfe-2082a6be6ada due to error: malformed node or string: None\n", "Skipping malformed example: 4b5aa52d-cd5e-43ee-ac4f-7a8da00860e1 due to error: malformed node or string: None\n", "Skipping malformed example: 53b1ce49-1f71-4770-a344-bf1d804fefd4 due to error: malformed node or string: None\n", "Skipping malformed example: 03e9e957-da8f-45dc-84d0-e556bfd023b3 due to error: malformed node or string: None\n", "Skipping malformed example: b7e12634-f7be-42cb-8e76-837af2f2d877 due to error: malformed node or string: None\n", "Skipping malformed example: 0c77b0ac-b1cf-4730-ae3d-d7c59221f181 due to error: malformed node or string: None\n", "Skipping malformed example: b4623202-dfcb-4fa8-9d28-5af818111de2 due to error: malformed node or string: None\n" ] } ], "source": [ "# Load the Azerbaijani NER dataset from Hugging Face\n", "dataset = load_dataset(\"LocalDoc/azerbaijani-ner-dataset\")\n", "print(dataset) # Display dataset structure (e.g., train/validation splits)\n", "\n", "# Preprocessing function to format tokens and NER tags correctly\n", "def preprocess_example(example):\n", " try:\n", " # Convert string of tokens to a list and parse NER tags to integers\n", " example[\"tokens\"] = ast.literal_eval(example[\"tokens\"])\n", " example[\"ner_tags\"] = list(map(int, ast.literal_eval(example[\"ner_tags\"])))\n", " except (ValueError, SyntaxError) as e:\n", " # Skip and log malformed examples, ensuring error resilience\n", " print(f\"Skipping malformed example: {example['index']} due to error: {e}\")\n", " example[\"tokens\"] = []\n", " example[\"ner_tags\"] = []\n", " return example\n", "\n", "# Apply preprocessing to each dataset entry, ensuring consistent formatting\n", "dataset = dataset.map(preprocess_example)\n" ] }, { 
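"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional sanity check (illustrative cell, not part of the original run): confirm that\n", "# preprocess_example turned the string-encoded 'tokens' and 'ner_tags' columns into aligned\n", "# Python lists before tokenization. Assumes the 'train' split and column names used above.\n", "sample = dataset[\"train\"][0] # one preprocessed example\n", "print(sample[\"tokens\"][:10]) # first few tokens as a list of strings\n", "print(sample[\"ner_tags\"][:10]) # matching integer tag ids\n", "print(len(sample[\"tokens\"]), len(sample[\"ner_tags\"])) # lengths should match: one tag per token\n" ] }, {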
"cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 177, "referenced_widgets": [ "f891b1b37df947eeae060bb5daefdea9", "470d93420bfc4d23bca07c308f37c316", "8e2fd00ac8414b2c92f17d565abacb91", "747e1fd9be3240169325520985f51ab0", "6f3022c618f548e0b21d8b031fabdfc8", "0cf2e757689b44ff8dd5b099d4deec99", "e378c4f4d73f42469868b21c0b5168db", "048f42e41b8e45a0847370c4ecec5c75", "c9dee5569d5340a2a7cbf59eb6a3cd1f", "8d8255757c434c74b955ee6ee6118282", "de1715ae4aa04626a4a2243fc7274908", "642afa2917004372804f7069dfde13b6", "a6181461c40040548b4afe1cc25d3eb9", "fd779f16d24a4cf481b32ed6d7d386b4", "ce2f73c695c84899864bd8b90ce61acf", "91d6176ee34e43b6bdc589651a84bb39", "784e878215004b05a056ca81b603e225", "e5d4bdd8daa5443a9fb56198088232bb", "1e8546990a404bffa6cacc8ebcc455be", "b32ad591846a4b2bbdbd5c5294dfa1e3", "014ac61faabf4dacbf8329d9f22a7d74", "c0753a29729549cc916b418d573deff9", "e9f6227be31945afac150d187957647e", "82ed7a9ba8df4adaaf1d231075efb691", "7300805850d246f3835e49a289b02671", "891da1220f74415288167b0f35040b52", "85661d8637c647889b3dafaae662c8c1", "82278ca1ab3e4106ab564b41ed0b927b", "193b019db0f04dbbac49d3b30f82656e", "93e964b3e62b41749b61a73323d71bef", "4bb3f9448e4f4faba78182c611bce084", "0e3c916040074de588a41860861a9516", "1d905e050d3a42149fab7130b62c7d1d", "2f27db2330d7432b8499a888e5821a36", "e0e314c4a33b4e18bc5c2e97afe8c17f", "a56b84f1b5034c8c81d515246fd87887", "4d12a628725640c7ab9e1ba39c2ee6b0", "0137fa7d552b4a00bd816a0f54ebcee3", "c255450a404e439faa7f7ba1b9f9b653", "aa091f0d4006467ba27ad0ca51550699", "fcb683aa15fa426183a93ba7c10b7c74", "6ec43eaa004c4332abee14f0a8b80af0", "cc67d9ac9d994d1382105b0c28aab58f", "21259e62de9c4d20a8d201e0932326b1", "ec4dcd069102401a8986f7a4f2d5ec1f", "a84e04c3028d4aeaaa6538bc42d451a6", "9a61a654eb464427ac2a6b6a39c313cf", "cddf6d30c1da48b3a5a2f3d9fbf2f17e", "95759f5980b24a349c6e5977f697f9d3", "7e73fe17010c474c97bc87da3b074857", "8281d037241b428eaa03670abb52afdb", "9dd78a2d839443219a70975dbeeff3a5", "29e3fcbe8f734b0e88ba3b42cf424c8a", "5b7e4d0f3f4f4a199e9315e7c16da8a0", "d1ee514ad0c0466fb548756d8aefc16d" ] }, "executionInfo": { "elapsed": 55700, "status": "ok", "timestamp": 1730569818501, "user": { "displayName": "Ismat Samadov", "userId": "13714662825869203427" }, "user_tz": -240 }, "id": "-24SJijT-zth", "outputId": "23b9a2b1-b7f3-437c-cecf-a988402b54f3" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f891b1b37df947eeae060bb5daefdea9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/25.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "642afa2917004372804f7069dfde13b6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0%| | 0.00/615 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e9f6227be31945afac150d187957647e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "sentencepiece.bpe.model: 0%| | 0.00/5.07M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2f27db2330d7432b8499a888e5821a36", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer.json: 0%| | 0.00/9.10M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": 
"ec4dcd069102401a8986f7a4f2d5ec1f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/99545 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Initialize the tokenizer for multilingual NER using XLM-RoBERTa\n", "tokenizer = AutoTokenizer.from_pretrained(\"xlm-roberta-base\")\n", "\n", "# Function to tokenize input and align labels with tokenized words\n", "def tokenize_and_align_labels(example):\n", " # Tokenize the sentence while preserving word boundaries for correct NER tag alignment\n", " tokenized_inputs = tokenizer(\n", " example[\"tokens\"], # List of words (tokens) in the sentence\n", " truncation=True, # Truncate sentences longer than max_length\n", " is_split_into_words=True, # Specify that input is a list of words\n", " padding=\"max_length\", # Pad to maximum sequence length\n", " max_length=128, # Set the maximum sequence length to 128 tokens\n", " )\n", "\n", " labels = [] # List to store aligned NER labels\n", " word_ids = tokenized_inputs.word_ids() # Get word IDs for each token\n", " previous_word_idx = None # Initialize previous word index for tracking\n", "\n", " # Loop through word indices to align NER tags with subword tokens\n", " for word_idx in word_ids:\n", " if word_idx is None:\n", " labels.append(-100) # Set padding token labels to -100 (ignored in loss)\n", " elif word_idx != previous_word_idx:\n", " # Assign the label from example's NER tags if word index matches\n", " labels.append(example[\"ner_tags\"][word_idx] if word_idx < len(example[\"ner_tags\"]) else -100)\n", " else:\n", " labels.append(-100) # Label subword tokens with -100 to avoid redundant labels\n", " previous_word_idx = word_idx # Update previous word index\n", "\n", " tokenized_inputs[\"labels\"] = labels # Add labels to tokenized inputs\n", " return tokenized_inputs\n", "\n", "# Apply tokenization and label alignment function to the dataset\n", "tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=False)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 4, "status": "ok", "timestamp": 1730569818502, "user": { "displayName": "Ismat Samadov", "userId": "13714662825869203427" }, "user_tz": -240 }, "id": "DA7mW2it-zoo", "outputId": "d4830123-d1c4-4056-8fcf-01d50ed580e9" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['index', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],\n", " num_rows: 89590\n", " })\n", " test: Dataset({\n", " features: ['index', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],\n", " num_rows: 9955\n", " })\n", "})\n" ] } ], "source": [ "# Create a 90-10 split of the dataset for training and validation\n", "tokenized_datasets = tokenized_datasets[\"train\"].train_test_split(test_size=0.1)\n", "print(tokenized_datasets) # Output structure of split datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-lVHfKEE-zmm" }, "outputs": [], "source": [ "# Define a list of entity labels for NER tagging with B- (beginning) and I- (inside) markers\n", "label_list = [\n", " \"O\", # Outside of a named entity\n", " \"B-PERSON\", \"I-PERSON\", # Person name (e.g., \"John\" in \"John Doe\")\n", " \"B-LOCATION\", \"I-LOCATION\", # Geographical location (e.g., \"Paris\")\n", " \"B-ORGANISATION\", \"I-ORGANISATION\", # Organization name (e.g., \"UNICEF\")\n", " \"B-DATE\", 
\"I-DATE\", # Date entity (e.g., \"2024-11-05\")\n", " \"B-TIME\", \"I-TIME\", # Time (e.g., \"12:00 PM\")\n", " \"B-MONEY\", \"I-MONEY\", # Monetary values (e.g., \"$20\")\n", " \"B-PERCENTAGE\", \"I-PERCENTAGE\", # Percentage values (e.g., \"20%\")\n", " \"B-FACILITY\", \"I-FACILITY\", # Physical facilities (e.g., \"Airport\")\n", " \"B-PRODUCT\", \"I-PRODUCT\", # Product names (e.g., \"iPhone\")\n", " \"B-EVENT\", \"I-EVENT\", # Named events (e.g., \"Olympics\")\n", " \"B-ART\", \"I-ART\", # Works of art (e.g., \"Mona Lisa\")\n", " \"B-LAW\", \"I-LAW\", # Laws and legal documents (e.g., \"Article 50\")\n", " \"B-LANGUAGE\", \"I-LANGUAGE\", # Languages (e.g., \"Azerbaijani\")\n", " \"B-GPE\", \"I-GPE\", # Geopolitical entities (e.g., \"Europe\")\n", " \"B-NORP\", \"I-NORP\", # Nationalities, religious groups, political groups\n", " \"B-ORDINAL\", \"I-ORDINAL\", # Ordinal indicators (e.g., \"first\", \"second\")\n", " \"B-CARDINAL\", \"I-CARDINAL\", # Cardinal numbers (e.g., \"three\")\n", " \"B-DISEASE\", \"I-DISEASE\", # Diseases (e.g., \"COVID-19\")\n", " \"B-CONTACT\", \"I-CONTACT\", # Contact info (e.g., email or phone number)\n", " \"B-ADAGE\", \"I-ADAGE\", # Common sayings or adages\n", " \"B-QUANTITY\", \"I-QUANTITY\", # Quantities (e.g., \"5 km\")\n", " \"B-MISCELLANEOUS\", \"I-MISCELLANEOUS\", # Miscellaneous entities not fitting other categories\n", " \"B-POSITION\", \"I-POSITION\", # Job titles or positions (e.g., \"CEO\")\n", " \"B-PROJECT\", \"I-PROJECT\" # Project names (e.g., \"Project Apollo\")\n", "]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 105, "referenced_widgets": [ "61375655521643eba759627e20a995e2", "6838baef09dc472392da22f0df45da05", "d2bff630e99949acbf1f96ec82891b0b", "130a9da286934112b0621a1bda3a91db", "5de921168c404ce78dcb72b68b9ef61f", "fdc06ee8fd15441491cf4233afacc864", "ae18a99e3f9941b78220e5207f3b6f7c", "9e5b9c0b3d7a41eaa39a237ce23a262a", "64e949b60f8d45918329760e07f0c94c", "2a3b340c95c9443ca1396686173b534a", "995621d110a34982960177b9e3d6e5af" ] }, "executionInfo": { "elapsed": 6243, "status": "ok", "timestamp": 1730569844595, "user": { "displayName": "Ismat Samadov", "userId": "13714662825869203427" }, "user_tz": -240 }, "id": "jUfWCaen-zjr", "outputId": "5c2a4470-3c9b-4f80-d841-24bff349c52f" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "61375655521643eba759627e20a995e2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "model.safetensors: 0%| | 0.00/1.12G [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "# Initialize a data collator to handle padding and formatting for token classification\n", "data_collator = DataCollatorForTokenClassification(tokenizer)\n", "\n", "# Load a pre-trained model for token classification, adapted for NER tasks\n", "model = AutoModelForTokenClassification.from_pretrained(\n", " \"xlm-roberta-large\", # Base model (multilingual XLM-RoBERTa) for NER\n", " num_labels=len(label_list) # Set the number of output labels to match NER categories\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": { "id": "9b7EajE_-zhS" }, "outputs": [], "source": [ "# Define a function to compute evaluation metrics for the model's predictions\n", "def compute_metrics(p):\n", " predictions, labels = p # Unpack predictions and true labels from the input\n", "\n", " # Convert logits to predicted label indices by taking the argmax along the last axis\n", " predictions = np.argmax(predictions, axis=2)\n", "\n", " # Filter out special padding labels (-100) and convert indices to label names\n", " true_labels = [[label_list[l] for l in label if l != -100] for label in labels]\n", " true_predictions = [\n", " [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n", " for prediction, label in zip(predictions, labels)\n", " ]\n", "\n", " # Print a detailed classification report for each label category\n", " print(classification_report(true_labels, true_predictions))\n", "\n", " # Calculate and return key evaluation metrics\n", " return {\n", " # Precision measures the accuracy of predicted positive instances\n", " # Important in NER to ensure entity predictions are correct and reduce false positives.\n", " \"precision\": precision_score(true_labels, true_predictions),\n", "\n", " # Recall measures the model's ability to capture all relevant entities\n", " # Essential in NER to ensure the model captures all entities, reducing false negatives.\n", " \"recall\": recall_score(true_labels, true_predictions),\n", "\n", " # F1-score is the harmonic mean of precision and recall, balancing both metrics\n", " # Useful in NER for providing an overall performance measure, especially when precision and recall are both important.\n", " \"f1\": f1_score(true_labels, true_predictions),\n", " }\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PmJTMpp6-zew" }, "outputs": [], "source": [ "# Set up training arguments for model training, defining essential training configurations\n", "training_args = TrainingArguments(\n", " output_dir=\"./results\", # Directory to save model checkpoints and final outputs\n", " evaluation_strategy=\"epoch\", # Evaluate model on the validation set at the end of each epoch\n", " save_strategy=\"epoch\", # Save model checkpoints at the end of each epoch\n", " learning_rate=2e-5, # Set a low learning rate to ensure stable training for fine-tuning\n", " per_device_train_batch_size=128, # Number of examples per batch during training, balancing speed and memory\n", " per_device_eval_batch_size=128, # Number of examples per batch during evaluation\n", " num_train_epochs=12, # Number of full training passes over the dataset\n", " weight_decay=0.005, # Regularization term to prevent overfitting by penalizing large weights\n", " fp16=True, # Use 16-bit floating point for faster and memory-efficient training\n", " logging_dir='./logs', # Directory to store training logs\n", " save_total_limit=2, # Keep only the 2 latest model checkpoints to save storage space\n", " load_best_model_at_end=True, # Load the best model based on metrics at the end of training\n", " metric_for_best_model=\"f1\", # Use F1-score to determine the best model checkpoint\n", " report_to=\"none\" # Disable reporting to external services (useful in local runs)\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "WqoF7QJy-zb2" }, "outputs": [], "source": [ "# Initialize the Trainer class to manage the training loop with all necessary components\n", "trainer = Trainer(\n", " model=model, # The pre-trained model to be fine-tuned\n", " args=training_args, # 
Training configuration parameters defined in TrainingArguments\n", " train_dataset=tokenized_datasets[\"train\"], # Tokenized training dataset\n", " eval_dataset=tokenized_datasets[\"test\"], # Tokenized validation dataset\n", " tokenizer=tokenizer, # Tokenizer used for processing input text\n", " data_collator=data_collator, # Data collator for padding and batching during training\n", " compute_metrics=compute_metrics, # Function to calculate evaluation metrics like precision, recall, F1\n", " callbacks=[EarlyStoppingCallback(early_stopping_patience=5)] # Stop training early if the validation F1 doesn't improve for 5 consecutive evaluation epochs\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "executionInfo": { "elapsed": 1179788, "status": "ok", "timestamp": 1730571069005, "user": { "displayName": "Ismat Samadov", "userId": "13714662825869203427" }, "user_tz": -240 }, "id": "QveYYwvA-zUR", "outputId": "a96f9658-96c1-479f-fa66-385d2e527103" }, "outputs": [ { "data": { "text/html": [ "\n", "
<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Epoch</th>\n",
" <th>Training Loss</th>\n",
" <th>Validation Loss</th>\n",
" <th>Precision</th>\n",
" <th>Recall</th>\n",
" <th>F1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr><td>1</td><td>0.323100</td><td>0.275503</td><td>0.775799</td><td>0.694886</td><td>0.733117</td></tr>\n",
" <tr><td>2</td><td>0.272500</td><td>0.262481</td><td>0.739266</td><td>0.739900</td><td>0.739583</td></tr>\n",
" <tr><td>3</td><td>0.248600</td><td>0.252498</td><td>0.751478</td><td>0.741152</td><td>0.746280</td></tr>\n",
" <tr><td>4</td><td>0.236800</td><td>0.249968</td><td>0.754882</td><td>0.741449</td><td>0.748105</td></tr>\n",
" <tr><td>5</td><td>0.223800</td><td>0.252187</td><td>0.764390</td><td>0.740460</td><td>0.752235</td></tr>\n",
" <tr><td>6</td><td>0.218600</td><td>0.249887</td><td>0.756352</td><td>0.741646</td><td>0.748927</td></tr>\n",
" <tr><td>7</td><td>0.209700</td><td>0.250748</td><td>0.760696</td><td>0.739438</td><td>0.749916</td></tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"