amanpatkar committed on
Commit
2c45582
•
1 Parent(s): 2735c3f

Training and pushing NER

Files changed (1)
  1. NER-Tagging.ipynb +861 -0
NER-Tagging.ipynb ADDED
@@ -0,0 +1,861 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 16,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import numpy as np\n",
10
+ "import pandas as pd\n",
11
+ "import transformers\n",
12
+ "from transformers import AutoTokenizer\n",
13
+ "import datasets"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 3,
19
+ "metadata": {},
20
+ "outputs": [
21
+ {
22
+ "name": "stderr",
23
+ "output_type": "stream",
24
+ "text": [
25
+ "Downloading data: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 983k/983k [00:00<00:00, 1.13MB/s] \n",
26
+ "Generating train split: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 14041/14041 [00:01<00:00, 7095.93 examples/s]\n",
27
+ "Generating validation split: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3250/3250 [00:00<00:00, 6601.81 examples/s]\n",
28
+ "Generating test split: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3453/3453 [00:00<00:00, 7659.18 examples/s]\n"
29
+ ]
30
+ }
31
+ ],
32
+ "source": [
33
+ "#Load datasets\n",
34
+ "data = datasets.load_dataset(\"conll2003\",trust_remote_code=True)"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 4,
40
+ "metadata": {},
41
+ "outputs": [
42
+ {
43
+ "data": {
44
+ "text/plain": [
45
+ "DatasetDict({\n",
46
+ " train: Dataset({\n",
47
+ " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n",
48
+ " num_rows: 14041\n",
49
+ " })\n",
50
+ " validation: Dataset({\n",
51
+ " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n",
52
+ " num_rows: 3250\n",
53
+ " })\n",
54
+ " test: Dataset({\n",
55
+ " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n",
56
+ " num_rows: 3453\n",
57
+ " })\n",
58
+ "})"
59
+ ]
60
+ },
61
+ "execution_count": 4,
62
+ "metadata": {},
63
+ "output_type": "execute_result"
64
+ }
65
+ ],
66
+ "source": [
67
+ "data"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 15,
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "data": {
77
+ "text/plain": [
78
+ "['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']"
79
+ ]
80
+ },
81
+ "execution_count": 15,
82
+ "metadata": {},
83
+ "output_type": "execute_result"
84
+ }
85
+ ],
86
+ "source": [
87
+ "# label_names\n",
88
+ "label_names = data['train'].features['ner_tags'].feature.names\n",
89
+ "label_names"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 17,
95
+ "metadata": {},
96
+ "outputs": [
97
+ {
98
+ "name": "stderr",
99
+ "output_type": "stream",
100
+ "text": [
101
+ "c:\\Users\\Aman\\anaconda3\\envs\\huggingfacev3\\Lib\\site-packages\\huggingface_hub\\file_download.py:157: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\Aman\\.cache\\huggingface\\hub\\models--distilbert-base-cased. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
102
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
103
+ " warnings.warn(message)\n",
104
+ "c:\\Users\\Aman\\anaconda3\\envs\\huggingfacev3\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
105
+ " warnings.warn(\n"
106
+ ]
107
+ }
108
+ ],
109
+ "source": [
110
+ "# Tokenizer\n",
111
+ "checkpoint = \"distilbert-base-cased\"\n",
112
+ "tokenizer = AutoTokenizer.from_pretrained(checkpoint)"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": 20,
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "data": {
122
+ "text/plain": [
123
+ "['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']"
124
+ ]
125
+ },
126
+ "execution_count": 20,
127
+ "metadata": {},
128
+ "output_type": "execute_result"
129
+ }
130
+ ],
131
+ "source": [
132
+ "data['train'][0]['tokens'] # Already in tokens"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 21,
138
+ "metadata": {},
139
+ "outputs": [],
140
+ "source": [
141
+ "t = tokenizer(data['train'][0]['tokens'], is_split_into_words=True)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": 22,
147
+ "metadata": {},
148
+ "outputs": [
149
+ {
150
+ "data": {
151
+ "text/plain": [
152
+ "{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
153
+ ]
154
+ },
155
+ "execution_count": 22,
156
+ "metadata": {},
157
+ "output_type": "execute_result"
158
+ }
159
+ ],
160
+ "source": [
161
+ "t"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": 24,
167
+ "metadata": {},
168
+ "outputs": [
169
+ {
170
+ "data": {
171
+ "text/plain": [
172
+ "[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]"
173
+ ]
174
+ },
175
+ "execution_count": 24,
176
+ "metadata": {},
177
+ "output_type": "execute_result"
178
+ }
179
+ ],
180
+ "source": [
181
+ "t.word_ids()"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "markdown",
186
+ "metadata": {},
187
+ "source": [
188
+ "### Target Alignment\n",
189
+ "* Like for Shantanu, this tokenizer can tokenize further based on sub word like Shan & ####tanu, so we need something like B-PER, I-PER for this"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 25,
195
+ "metadata": {},
196
+ "outputs": [],
197
+ "source": [
198
+ "# ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'] \n",
199
+ "begin2Inside = {\n",
200
+ " 1:2,\n",
201
+ " 3:4,\n",
202
+ " 5:6,\n",
203
+ " 7:8\n",
204
+ "}"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 45,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "def align_target(labels, word_ids):\n",
214
+ " aligned_labels=[]\n",
215
+ " last_word = None\n",
216
+ " for word in word_ids:\n",
217
+ " if word is None:\n",
218
+ " label = -100 # Assigning -100 for [CLS] [PAD] special tokens\n",
219
+ " elif word!=last_word:\n",
220
+ " label = labels[word]\n",
221
+ " else:\n",
222
+ " label = labels[word]\n",
223
+ " #Change B-<tag> to I-<tag>\n",
224
+ " if label in begin2Inside:\n",
225
+ " label=begin2Inside[label]\n",
226
+ " aligned_labels.append(label)\n",
227
+ " last_word=word\n",
228
+ " return aligned_labels"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": 46,
234
+ "metadata": {},
235
+ "outputs": [],
236
+ "source": [
237
+ "# Tokenize for both input and target(label)\n",
238
+ "def tokenize_fn(batch):\n",
239
+ " # Tokenize the input seq first\n",
240
+ " # It will populate inputs_ids, attention_mask etc\n",
241
+ " tokenized_inputs = tokenizer(batch['tokens'], is_split_into_words=True, truncation=True)\n",
242
+ "\n",
243
+ " labels_batch = batch['ner_tags'] #original Targets\n",
244
+ " aligned_label_batch = []\n",
245
+ " for i, lables in enumerate(labels_batch):\n",
246
+ " words_ids = tokenized_inputs.word_ids(i)\n",
247
+ " aligned_label_batch.append(align_target(labels=lables,word_ids=words_ids))\n",
248
+ "\n",
249
+ " tokenized_inputs['labels'] = aligned_label_batch\n",
250
+ "\n",
251
+ " return tokenized_inputs\n"
252
+ ]
253
+ },
254
+ {
255
+ "cell_type": "code",
256
+ "execution_count": 47,
257
+ "metadata": {},
258
+ "outputs": [
259
+ {
260
+ "data": {
261
+ "text/plain": [
262
+ "['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']"
263
+ ]
264
+ },
265
+ "execution_count": 47,
266
+ "metadata": {},
267
+ "output_type": "execute_result"
268
+ }
269
+ ],
270
+ "source": [
271
+ "data['train'].column_names"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 48,
277
+ "metadata": {},
278
+ "outputs": [
279
+ {
280
+ "name": "stderr",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 14041/14041 [00:01<00:00, 8967.65 examples/s]\n",
284
+ "Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3250/3250 [00:00<00:00, 7560.07 examples/s]\n",
285
+ "Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3453/3453 [00:00<00:00, 10502.31 examples/s]\n"
286
+ ]
287
+ }
288
+ ],
289
+ "source": [
290
+ "tokenized_datasets = data.map(\n",
291
+ " tokenize_fn,\n",
292
+ " batched=True,\n",
293
+ " remove_columns=data['train'].column_names # Removing column other than input_ids, attention_mask, labels\n",
294
+ ")"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": 53,
300
+ "metadata": {},
301
+ "outputs": [
302
+ {
303
+ "data": {
304
+ "text/plain": [
305
+ "{'input_ids': [101, 1943, 14428, 102],\n",
306
+ " 'attention_mask': [1, 1, 1, 1],\n",
307
+ " 'labels': [-100, 1, 2, -100]}"
308
+ ]
309
+ },
310
+ "execution_count": 53,
311
+ "metadata": {},
312
+ "output_type": "execute_result"
313
+ }
314
+ ],
315
+ "source": [
316
+ "tokenized_datasets['train'][1]"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": 52,
322
+ "metadata": {},
323
+ "outputs": [
324
+ {
325
+ "data": {
326
+ "text/plain": [
327
+ "'[CLS] Peter Blackburn [SEP]'"
328
+ ]
329
+ },
330
+ "execution_count": 52,
331
+ "metadata": {},
332
+ "output_type": "execute_result"
333
+ }
334
+ ],
335
+ "source": [
336
+ "tokenizer.decode(tokenized_datasets['train'][1]['input_ids'])"
337
+ ]
338
+ },
339
+ {
340
+ "cell_type": "markdown",
341
+ "metadata": {},
342
+ "source": [
343
+ "### Metric"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "execution_count": 59,
349
+ "metadata": {},
350
+ "outputs": [],
351
+ "source": [
352
+ "from datasets import load_metric\n",
353
+ "metric = load_metric('seqeval',trust_remote_code=True) # This metric is just for NER"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "execution_count": 60,
359
+ "metadata": {},
360
+ "outputs": [],
361
+ "source": [
362
+ "def compute_metric(logits_and_labels):\n",
363
+ " logists, labels = logits_and_labels\n",
364
+ " preds = np.argmax(logists,axis=-1)\n",
365
+ "\n",
366
+ " #Remove -100 from label and pred\n",
367
+ " # and convert the label_ids to label_names\n",
368
+ " str_labels = [[label_names[t] for t in label if t!=-100] for label in labels]\n",
369
+ "\n",
370
+ " str_preds = [[label_names[t] for p, t in zip(pred, target) if t!=-100] for pred, target in zip(preds, labels)]\n",
371
+ "\n",
372
+ " the_metrics = metric.compute(predictions=str_preds,references=str_labels)\n",
373
+ " \n",
374
+ " return {\n",
375
+ " \"precision\":the_metrics['overall_precision'],\n",
376
+ " \"recall\":the_metrics['overall_recall'],\n",
377
+ " \"f1\":the_metrics['overall_f1'],\n",
378
+ " \"accuracy\":the_metrics['overall_accuracy']\n",
379
+ " }\n",
380
+ "\n"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "code",
385
+ "execution_count": 61,
386
+ "metadata": {},
387
+ "outputs": [
388
+ {
389
+ "data": {
390
+ "text/plain": [
391
+ "['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']"
392
+ ]
393
+ },
394
+ "execution_count": 61,
395
+ "metadata": {},
396
+ "output_type": "execute_result"
397
+ }
398
+ ],
399
+ "source": [
400
+ "label_names"
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "code",
405
+ "execution_count": 64,
406
+ "metadata": {},
407
+ "outputs": [],
408
+ "source": [
409
+ "id2label = {k:val for k, val in enumerate(label_names)}\n",
410
+ "label2id = {val:k for k, val in id2label.items()}"
411
+ ]
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": 68,
416
+ "metadata": {},
417
+ "outputs": [
418
+ {
419
+ "name": "stderr",
420
+ "output_type": "stream",
421
+ "text": [
422
+ "Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
423
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
424
+ ]
425
+ }
426
+ ],
427
+ "source": [
428
+ "from transformers import AutoModelForTokenClassification\n",
429
+ "\n",
430
+ "model = AutoModelForTokenClassification.from_pretrained(\n",
431
+ " checkpoint,\n",
432
+ " id2label = id2label,\n",
433
+ " label2id = label2id\n",
434
+ ")"
435
+ ]
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "execution_count": 70,
440
+ "metadata": {},
441
+ "outputs": [],
442
+ "source": [
443
+ "from transformers import TrainingArguments\n",
444
+ "\n",
445
+ "train_args = TrainingArguments(\n",
446
+ " \"distilbert-finetuned-ner\",\n",
447
+ " evaluation_strategy=\"epoch\",\n",
448
+ " save_strategy=\"epoch\",\n",
449
+ " learning_rate=2e-5,\n",
450
+ " num_train_epochs=3,\n",
451
+ " weight_decay=0.1\n",
452
+ ")"
453
+ ]
454
+ },
455
+ {
456
+ "cell_type": "code",
457
+ "execution_count": 71,
458
+ "metadata": {},
459
+ "outputs": [],
460
+ "source": [
461
+ "from transformers import DataCollatorForTokenClassification\n",
462
+ "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)"
463
+ ]
464
+ },
465
+ {
466
+ "cell_type": "code",
467
+ "execution_count": 73,
468
+ "metadata": {},
469
+ "outputs": [],
470
+ "source": [
471
+ "from transformers import Trainer\n",
472
+ "trainer = Trainer(\n",
473
+ " model = model,\n",
474
+ " args = train_args,\n",
475
+ " train_dataset=tokenized_datasets['train'],\n",
476
+ " eval_dataset=tokenized_datasets['validation'],\n",
477
+ " data_collator=data_collator,\n",
478
+ " compute_metrics=compute_metric,\n",
479
+ " tokenizer=tokenizer\n",
480
+ ")"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "code",
485
+ "execution_count": 74,
486
+ "metadata": {},
487
+ "outputs": [
488
+ {
489
+ "name": "stderr",
490
+ "output_type": "stream",
491
+ "text": [
492
+ " 10%|β–‰ | 501/5268 [01:34<13:16, 5.99it/s] "
493
+ ]
494
+ },
495
+ {
496
+ "name": "stdout",
497
+ "output_type": "stream",
498
+ "text": [
499
+ "{'loss': 0.2966, 'grad_norm': 8.722156524658203, 'learning_rate': 1.810174639331815e-05, 'epoch': 0.28}\n"
500
+ ]
501
+ },
502
+ {
503
+ "name": "stderr",
504
+ "output_type": "stream",
505
+ "text": [
506
+ " 19%|β–ˆβ–‰ | 1001/5268 [03:00<13:00, 5.47it/s]"
507
+ ]
508
+ },
509
+ {
510
+ "name": "stdout",
511
+ "output_type": "stream",
512
+ "text": [
513
+ "{'loss': 0.129, 'grad_norm': 0.30502110719680786, 'learning_rate': 1.6203492786636296e-05, 'epoch': 0.57}\n"
514
+ ]
515
+ },
516
+ {
517
+ "name": "stderr",
518
+ "output_type": "stream",
519
+ "text": [
520
+ " 28%|β–ˆβ–ˆβ–Š | 1501/5268 [04:26<09:43, 6.45it/s]"
521
+ ]
522
+ },
523
+ {
524
+ "name": "stdout",
525
+ "output_type": "stream",
526
+ "text": [
527
+ "{'loss': 0.0908, 'grad_norm': 4.793717861175537, 'learning_rate': 1.4305239179954442e-05, 'epoch': 0.85}\n"
528
+ ]
529
+ },
530
+ {
531
+ "name": "stderr",
532
+ "output_type": "stream",
533
+ "text": [
534
+ " \n",
535
+ " 33%|β–ˆβ–ˆβ–ˆβ–Ž | 1756/5268 [05:27<08:45, 6.68it/s]"
536
+ ]
537
+ },
538
+ {
539
+ "name": "stdout",
540
+ "output_type": "stream",
541
+ "text": [
542
+ "{'eval_loss': 0.088701531291008, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 17.0771, 'eval_samples_per_second': 190.314, 'eval_steps_per_second': 23.833, 'epoch': 1.0}\n"
543
+ ]
544
+ },
545
+ {
546
+ "name": "stderr",
547
+ "output_type": "stream",
548
+ "text": [
549
+ " 38%|β–ˆβ–ˆβ–ˆβ–Š | 2001/5268 [06:14<08:48, 6.18it/s] "
550
+ ]
551
+ },
552
+ {
553
+ "name": "stdout",
554
+ "output_type": "stream",
555
+ "text": [
556
+ "{'loss': 0.0749, 'grad_norm': 0.12880899012088776, 'learning_rate': 1.240698557327259e-05, 'epoch': 1.14}\n"
557
+ ]
558
+ },
559
+ {
560
+ "name": "stderr",
561
+ "output_type": "stream",
562
+ "text": [
563
+ " 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2501/5268 [07:40<06:25, 7.17it/s]"
564
+ ]
565
+ },
566
+ {
567
+ "name": "stdout",
568
+ "output_type": "stream",
569
+ "text": [
570
+ "{'loss': 0.0541, 'grad_norm': 7.310864448547363, 'learning_rate': 1.0508731966590738e-05, 'epoch': 1.42}\n"
571
+ ]
572
+ },
573
+ {
574
+ "name": "stderr",
575
+ "output_type": "stream",
576
+ "text": [
577
+ " 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 3001/5268 [09:03<06:24, 5.90it/s]"
578
+ ]
579
+ },
580
+ {
581
+ "name": "stdout",
582
+ "output_type": "stream",
583
+ "text": [
584
+ "{'loss': 0.0547, 'grad_norm': 2.9379518032073975, 'learning_rate': 8.610478359908885e-06, 'epoch': 1.71}\n"
585
+ ]
586
+ },
587
+ {
588
+ "name": "stderr",
589
+ "output_type": "stream",
590
+ "text": [
591
+ " 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 3501/5268 [10:27<05:16, 5.58it/s]"
592
+ ]
593
+ },
594
+ {
595
+ "name": "stdout",
596
+ "output_type": "stream",
597
+ "text": [
598
+ "{'loss': 0.0467, 'grad_norm': 0.3557382822036743, 'learning_rate': 6.712224753227031e-06, 'epoch': 1.99}\n"
599
+ ]
600
+ },
601
+ {
602
+ "name": "stderr",
603
+ "output_type": "stream",
604
+ "text": [
605
+ " \n",
606
+ " 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 3512/5268 [10:47<04:07, 7.10it/s]"
607
+ ]
608
+ },
609
+ {
610
+ "name": "stdout",
611
+ "output_type": "stream",
612
+ "text": [
613
+ "{'eval_loss': 0.07127507776021957, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 18.2863, 'eval_samples_per_second': 177.728, 'eval_steps_per_second': 22.257, 'epoch': 2.0}\n"
614
+ ]
615
+ },
616
+ {
617
+ "name": "stderr",
618
+ "output_type": "stream",
619
+ "text": [
620
+ " 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4001/5268 [12:12<03:37, 5.83it/s] "
621
+ ]
622
+ },
623
+ {
624
+ "name": "stdout",
625
+ "output_type": "stream",
626
+ "text": [
627
+ "{'loss': 0.0289, 'grad_norm': 3.5150299072265625, 'learning_rate': 4.8139711465451785e-06, 'epoch': 2.28}\n"
628
+ ]
629
+ },
630
+ {
631
+ "name": "stderr",
632
+ "output_type": "stream",
633
+ "text": [
634
+ " 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4501/5268 [13:44<02:10, 5.88it/s]"
635
+ ]
636
+ },
637
+ {
638
+ "name": "stdout",
639
+ "output_type": "stream",
640
+ "text": [
641
+ "{'loss': 0.0266, 'grad_norm': 5.119720935821533, 'learning_rate': 2.9157175398633257e-06, 'epoch': 2.56}\n"
642
+ ]
643
+ },
644
+ {
645
+ "name": "stderr",
646
+ "output_type": "stream",
647
+ "text": [
648
+ " 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 5001/5268 [15:12<00:48, 5.50it/s]"
649
+ ]
650
+ },
651
+ {
652
+ "name": "stdout",
653
+ "output_type": "stream",
654
+ "text": [
655
+ "{'loss': 0.0276, 'grad_norm': 0.05973776802420616, 'learning_rate': 1.0174639331814731e-06, 'epoch': 2.85}\n"
656
+ ]
657
+ },
658
+ {
659
+ "name": "stderr",
660
+ "output_type": "stream",
661
+ "text": [
662
+ " \n",
663
+ "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5268/5268 [16:15<00:00, 6.31it/s]"
664
+ ]
665
+ },
666
+ {
667
+ "name": "stdout",
668
+ "output_type": "stream",
669
+ "text": [
670
+ "{'eval_loss': 0.07107102125883102, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 17.759, 'eval_samples_per_second': 183.006, 'eval_steps_per_second': 22.918, 'epoch': 3.0}\n"
671
+ ]
672
+ },
673
+ {
674
+ "name": "stderr",
675
+ "output_type": "stream",
676
+ "text": [
677
+ "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5268/5268 [16:21<00:00, 5.36it/s]"
678
+ ]
679
+ },
680
+ {
681
+ "name": "stdout",
682
+ "output_type": "stream",
683
+ "text": [
684
+ "{'train_runtime': 981.963, 'train_samples_per_second': 42.897, 'train_steps_per_second': 5.365, 'train_loss': 0.08021244997315635, 'epoch': 3.0}\n"
685
+ ]
686
+ },
687
+ {
688
+ "name": "stderr",
689
+ "output_type": "stream",
690
+ "text": [
691
+ "\n"
692
+ ]
693
+ },
694
+ {
695
+ "data": {
696
+ "text/plain": [
697
+ "TrainOutput(global_step=5268, training_loss=0.08021244997315635, metrics={'train_runtime': 981.963, 'train_samples_per_second': 42.897, 'train_steps_per_second': 5.365, 'total_flos': 460431563935266.0, 'train_loss': 0.08021244997315635, 'epoch': 3.0})"
698
+ ]
699
+ },
700
+ "execution_count": 74,
701
+ "metadata": {},
702
+ "output_type": "execute_result"
703
+ }
704
+ ],
705
+ "source": [
706
+ "trainer.train()"
707
+ ]
708
+ },
709
+ {
710
+ "cell_type": "code",
711
+ "execution_count": 75,
712
+ "metadata": {},
713
+ "outputs": [],
714
+ "source": [
715
+ "trainer.save_model(\"my_saved_model\")"
716
+ ]
717
+ },
718
+ {
719
+ "cell_type": "code",
720
+ "execution_count": 79,
721
+ "metadata": {},
722
+ "outputs": [
723
+ {
724
+ "name": "stderr",
725
+ "output_type": "stream",
726
+ "text": [
727
+ "training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5.11k/5.11k [00:01<00:00, 4.19kB/s]\n"
728
+ ]
729
+ },
730
+ {
731
+ "data": {
732
+ "text/plain": [
733
+ "CommitInfo(commit_url='https://huggingface.co/amanpatkar/distilbert-finetuned-ner/commit/8276ef3336762d679ee7e10218fe8518eab8e4aa', commit_message='amanpatkar/distilbert-finetuned-ner', commit_description='', oid='8276ef3336762d679ee7e10218fe8518eab8e4aa', pr_url=None, pr_revision=None, pr_num=None)"
734
+ ]
735
+ },
736
+ "execution_count": 79,
737
+ "metadata": {},
738
+ "output_type": "execute_result"
739
+ }
740
+ ],
741
+ "source": [
742
+ "trainer.push_to_hub(\"amanpatkar/distilbert-finetuned-ner\", token = \"<>\")"
743
+ ]
744
+ },
745
+ {
746
+ "cell_type": "code",
747
+ "execution_count": 80,
748
+ "metadata": {},
749
+ "outputs": [
750
+ {
751
+ "name": "stderr",
752
+ "output_type": "stream",
753
+ "text": [
754
+ "c:\\Users\\Aman\\anaconda3\\envs\\huggingfacev3\\Lib\\site-packages\\huggingface_hub\\file_download.py:157: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\Aman\\.cache\\huggingface\\hub\\models--amanpatkar--distilbert-finetuned-ner. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
755
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
756
+ " warnings.warn(message)\n"
757
+ ]
758
+ },
759
+ {
760
+ "data": {
761
+ "text/plain": [
762
+ "CommitInfo(commit_url='https://huggingface.co/amanpatkar/distilbert-finetuned-ner/commit/4c7253e599d89d1f7af6827260b567a44f4a9741', commit_message='Upload tokenizer', commit_description='', oid='4c7253e599d89d1f7af6827260b567a44f4a9741', pr_url=None, pr_revision=None, pr_num=None)"
763
+ ]
764
+ },
765
+ "execution_count": 80,
766
+ "metadata": {},
767
+ "output_type": "execute_result"
768
+ }
769
+ ],
770
+ "source": [
771
+ "tokenizer.push_to_hub(\"amanpatkar/distilbert-finetuned-ner\", token = \"<>\")"
772
+ ]
773
+ },
774
+ {
775
+ "cell_type": "code",
776
+ "execution_count": 76,
777
+ "metadata": {},
778
+ "outputs": [],
779
+ "source": [
780
+ "from transformers import pipeline"
781
+ ]
782
+ },
783
+ {
784
+ "cell_type": "code",
785
+ "execution_count": 84,
786
+ "metadata": {},
787
+ "outputs": [],
788
+ "source": [
789
+ "ner = pipeline(\n",
790
+ " \"token-classification\",\n",
791
+ " model = \"amanpatkar/distilbert-finetuned-ner\",\n",
792
+ " aggregation_strategy = \"simple\",\n",
793
+ " device = 0\n",
794
+ ")"
795
+ ]
796
+ },
797
+ {
798
+ "cell_type": "code",
799
+ "execution_count": 85,
800
+ "metadata": {},
801
+ "outputs": [
802
+ {
803
+ "data": {
804
+ "text/plain": [
805
+ "[{'entity_group': 'PER',\n",
806
+ " 'score': np.float32(0.9989685),\n",
807
+ " 'word': 'Aman Patkar',\n",
808
+ " 'start': 0,\n",
809
+ " 'end': 11},\n",
810
+ " {'entity_group': 'ORG',\n",
811
+ " 'score': np.float32(0.99077755),\n",
812
+ " 'word': 'Honda KTM',\n",
813
+ " 'start': 21,\n",
814
+ " 'end': 30},\n",
815
+ " {'entity_group': 'LOC',\n",
816
+ " 'score': np.float32(0.9992505),\n",
817
+ " 'word': 'India',\n",
818
+ " 'start': 43,\n",
819
+ " 'end': 48}]"
820
+ ]
821
+ },
822
+ "execution_count": 85,
823
+ "metadata": {},
824
+ "output_type": "execute_result"
825
+ }
826
+ ],
827
+ "source": [
828
+ "s = \"Aman Patkar owns the Honda KTM showroom in India. He is a boy.\"\n",
829
+ "ner(s)"
830
+ ]
831
+ },
832
+ {
833
+ "cell_type": "code",
834
+ "execution_count": null,
835
+ "metadata": {},
836
+ "outputs": [],
837
+ "source": []
838
+ }
839
+ ],
840
+ "metadata": {
841
+ "kernelspec": {
842
+ "display_name": "tensorflowgpu1",
843
+ "language": "python",
844
+ "name": "python3"
845
+ },
846
+ "language_info": {
847
+ "codemirror_mode": {
848
+ "name": "ipython",
849
+ "version": 3
850
+ },
851
+ "file_extension": ".py",
852
+ "mimetype": "text/x-python",
853
+ "name": "python",
854
+ "nbconvert_exporter": "python",
855
+ "pygments_lexer": "ipython3",
856
+ "version": "3.12.4"
857
+ }
858
+ },
859
+ "nbformat": 4,
860
+ "nbformat_minor": 2
861
+ }