amanpatkar committed on
Commit
2c45582
•
1 Parent(s): 2735c3f

Training and pushing NER

Files changed (1)
  1. NER-Tagging.ipynb +861 -0
NER-Tagging.ipynb ADDED
@@ -0,0 +1,861 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 16,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import numpy as np\n",
10
+ "import pandas as pd\n",
11
+ "import transformers\n",
12
+ "from transformers import AutoTokenizer\n",
13
+ "import datasets"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 3,
19
+ "metadata": {},
20
+ "outputs": [
21
+ {
22
+ "name": "stderr",
23
+ "output_type": "stream",
24
+ "text": [
25
+ "Downloading data: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 983k/983k [00:00<00:00, 1.13MB/s] \n",
26
+ "Generating train split: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 14041/14041 [00:01<00:00, 7095.93 examples/s]\n",
27
+ "Generating validation split: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3250/3250 [00:00<00:00, 6601.81 examples/s]\n",
28
+ "Generating test split: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3453/3453 [00:00<00:00, 7659.18 examples/s]\n"
29
+ ]
30
+ }
31
+ ],
32
+ "source": [
33
+ "#Load datasets\n",
34
+ "data = datasets.load_dataset(\"conll2003\",trust_remote_code=True)"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 4,
40
+ "metadata": {},
41
+ "outputs": [
42
+ {
43
+ "data": {
44
+ "text/plain": [
45
+ "DatasetDict({\n",
46
+ " train: Dataset({\n",
47
+ " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n",
48
+ " num_rows: 14041\n",
49
+ " })\n",
50
+ " validation: Dataset({\n",
51
+ " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n",
52
+ " num_rows: 3250\n",
53
+ " })\n",
54
+ " test: Dataset({\n",
55
+ " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n",
56
+ " num_rows: 3453\n",
57
+ " })\n",
58
+ "})"
59
+ ]
60
+ },
61
+ "execution_count": 4,
62
+ "metadata": {},
63
+ "output_type": "execute_result"
64
+ }
65
+ ],
66
+ "source": [
67
+ "data"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 15,
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "data": {
77
+ "text/plain": [
78
+ "['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']"
79
+ ]
80
+ },
81
+ "execution_count": 15,
82
+ "metadata": {},
83
+ "output_type": "execute_result"
84
+ }
85
+ ],
86
+ "source": [
87
+ "# label_names\n",
88
+ "label_names = data['train'].features['ner_tags'].feature.names\n",
89
+ "label_names"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 17,
95
+ "metadata": {},
96
+ "outputs": [
97
+ {
98
+ "name": "stderr",
99
+ "output_type": "stream",
100
+ "text": [
101
+ "c:\\Users\\Aman\\anaconda3\\envs\\huggingfacev3\\Lib\\site-packages\\huggingface_hub\\file_download.py:157: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\Aman\\.cache\\huggingface\\hub\\models--distilbert-base-cased. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
102
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
103
+ " warnings.warn(message)\n",
104
+ "c:\\Users\\Aman\\anaconda3\\envs\\huggingfacev3\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
105
+ " warnings.warn(\n"
106
+ ]
107
+ }
108
+ ],
109
+ "source": [
110
+ "# Tokenizer\n",
111
+ "checkpoint = \"distilbert-base-cased\"\n",
112
+ "tokenizer = AutoTokenizer.from_pretrained(checkpoint)"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": 20,
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "data": {
122
+ "text/plain": [
123
+ "['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']"
124
+ ]
125
+ },
126
+ "execution_count": 20,
127
+ "metadata": {},
128
+ "output_type": "execute_result"
129
+ }
130
+ ],
131
+ "source": [
132
+ "data['train'][0]['tokens'] # Already in tokens"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 21,
138
+ "metadata": {},
139
+ "outputs": [],
140
+ "source": [
141
+ "t = tokenizer(data['train'][0]['tokens'], is_split_into_words=True)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": 22,
147
+ "metadata": {},
148
+ "outputs": [
149
+ {
150
+ "data": {
151
+ "text/plain": [
152
+ "{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
153
+ ]
154
+ },
155
+ "execution_count": 22,
156
+ "metadata": {},
157
+ "output_type": "execute_result"
158
+ }
159
+ ],
160
+ "source": [
161
+ "t"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": 24,
167
+ "metadata": {},
168
+ "outputs": [
169
+ {
170
+ "data": {
171
+ "text/plain": [
172
+ "[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]"
173
+ ]
174
+ },
175
+ "execution_count": 24,
176
+ "metadata": {},
177
+ "output_type": "execute_result"
178
+ }
179
+ ],
180
+ "source": [
181
+ "t.word_ids()"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "markdown",
186
+ "metadata": {},
187
+ "source": [
188
+ "### Target Alignment\n",
189
+ "* Like for Shantanu, this tokenizer can tokenize further based on sub word like Shan & ####tanu, so we need something like B-PER, I-PER for this"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 25,
195
+ "metadata": {},
196
+ "outputs": [],
197
+ "source": [
198
+ "# ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'] \n",
199
+ "begin2Inside = {\n",
200
+ " 1:2,\n",
201
+ " 3:4,\n",
202
+ " 5:6,\n",
203
+ " 7:8\n",
204
+ "}"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 45,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "def align_target(labels, word_ids):\n",
214
+ " aligned_labels=[]\n",
215
+ " last_word = None\n",
216
+ " for word in word_ids:\n",
217
+ " if word is None:\n",
218
+ " label = -100 # Assigning -100 for [CLS] [PAD] special tokens\n",
219
+ " elif word!=last_word:\n",
220
+ " label = labels[word]\n",
221
+ " else:\n",
222
+ " label = labels[word]\n",
223
+ " #Change B-<tag> to I-<tag>\n",
224
+ " if label in begin2Inside:\n",
225
+ " label=begin2Inside[label]\n",
226
+ " aligned_labels.append(label)\n",
227
+ " last_word=word\n",
228
+ " return aligned_labels"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": 46,
234
+ "metadata": {},
235
+ "outputs": [],
236
+ "source": [
237
+ "# Tokenize for both input and target(label)\n",
238
+ "def tokenize_fn(batch):\n",
239
+ " # Tokenize the input seq first\n",
240
+ " # It will populate inputs_ids, attention_mask etc\n",
241
+ " tokenized_inputs = tokenizer(batch['tokens'], is_split_into_words=True, truncation=True)\n",
242
+ "\n",
243
+ " labels_batch = batch['ner_tags'] #original Targets\n",
244
+ " aligned_label_batch = []\n",
245
+ " for i, lables in enumerate(labels_batch):\n",
246
+ " words_ids = tokenized_inputs.word_ids(i)\n",
247
+ " aligned_label_batch.append(align_target(labels=lables,word_ids=words_ids))\n",
248
+ "\n",
249
+ " tokenized_inputs['labels'] = aligned_label_batch\n",
250
+ "\n",
251
+ " return tokenized_inputs\n"
252
+ ]
253
+ },
254
+ {
255
+ "cell_type": "code",
256
+ "execution_count": 47,
257
+ "metadata": {},
258
+ "outputs": [
259
+ {
260
+ "data": {
261
+ "text/plain": [
262
+ "['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']"
263
+ ]
264
+ },
265
+ "execution_count": 47,
266
+ "metadata": {},
267
+ "output_type": "execute_result"
268
+ }
269
+ ],
270
+ "source": [
271
+ "data['train'].column_names"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 48,
277
+ "metadata": {},
278
+ "outputs": [
279
+ {
280
+ "name": "stderr",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 14041/14041 [00:01<00:00, 8967.65 examples/s]\n",
284
+ "Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3250/3250 [00:00<00:00, 7560.07 examples/s]\n",
285
+ "Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3453/3453 [00:00<00:00, 10502.31 examples/s]\n"
286
+ ]
287
+ }
288
+ ],
289
+ "source": [
290
+ "tokenized_datasets = data.map(\n",
291
+ " tokenize_fn,\n",
292
+ " batched=True,\n",
293
+ " remove_columns=data['train'].column_names # Removing column other than input_ids, attention_mask, labels\n",
294
+ ")"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": 53,
300
+ "metadata": {},
301
+ "outputs": [
302
+ {
303
+ "data": {
304
+ "text/plain": [
305
+ "{'input_ids': [101, 1943, 14428, 102],\n",
306
+ " 'attention_mask': [1, 1, 1, 1],\n",
307
+ " 'labels': [-100, 1, 2, -100]}"
308
+ ]
309
+ },
310
+ "execution_count": 53,
311
+ "metadata": {},
312
+ "output_type": "execute_result"
313
+ }
314
+ ],
315
+ "source": [
316
+ "tokenized_datasets['train'][1]"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": 52,
322
+ "metadata": {},
323
+ "outputs": [
324
+ {
325
+ "data": {
326
+ "text/plain": [
327
+ "'[CLS] Peter Blackburn [SEP]'"
328
+ ]
329
+ },
330
+ "execution_count": 52,
331
+ "metadata": {},
332
+ "output_type": "execute_result"
333
+ }
334
+ ],
335
+ "source": [
336
+ "tokenizer.decode(tokenized_datasets['train'][1]['input_ids'])"
337
+ ]
338
+ },
339
+ {
340
+ "cell_type": "markdown",
341
+ "metadata": {},
342
+ "source": [
343
+ "### Metric"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "execution_count": 59,
349
+ "metadata": {},
350
+ "outputs": [],
351
+ "source": [
352
+ "from datasets import load_metric\n",
353
+ "metric = load_metric('seqeval',trust_remote_code=True) # This metric is just for NER"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "execution_count": 60,
359
+ "metadata": {},
360
+ "outputs": [],
361
+ "source": [
362
+ "def compute_metric(logits_and_labels):\n",
363
+ " logists, labels = logits_and_labels\n",
364
+ " preds = np.argmax(logists,axis=-1)\n",
365
+ "\n",
366
+ " #Remove -100 from label and pred\n",
367
+ " # and convert the label_ids to label_names\n",
368
+ " str_labels = [[label_names[t] for t in label if t!=-100] for label in labels]\n",
369
+ "\n",
370
+ " str_preds = [[label_names[t] for p, t in zip(pred, target) if t!=-100] for pred, target in zip(preds, labels)]\n",
371
+ "\n",
372
+ " the_metrics = metric.compute(predictions=str_preds,references=str_labels)\n",
373
+ " \n",
374
+ " return {\n",
375
+ " \"precision\":the_metrics['overall_precision'],\n",
376
+ " \"recall\":the_metrics['overall_recall'],\n",
377
+ " \"f1\":the_metrics['overall_f1'],\n",
378
+ " \"accuracy\":the_metrics['overall_accuracy']\n",
379
+ " }\n",
380
+ "\n"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "code",
385
+ "execution_count": 61,
386
+ "metadata": {},
387
+ "outputs": [
388
+ {
389
+ "data": {
390
+ "text/plain": [
391
+ "['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']"
392
+ ]
393
+ },
394
+ "execution_count": 61,
395
+ "metadata": {},
396
+ "output_type": "execute_result"
397
+ }
398
+ ],
399
+ "source": [
400
+ "label_names"
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "code",
405
+ "execution_count": 64,
406
+ "metadata": {},
407
+ "outputs": [],
408
+ "source": [
409
+ "id2label = {k:val for k, val in enumerate(label_names)}\n",
410
+ "label2id = {val:k for k, val in id2label.items()}"
411
+ ]
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": 68,
416
+ "metadata": {},
417
+ "outputs": [
418
+ {
419
+ "name": "stderr",
420
+ "output_type": "stream",
421
+ "text": [
422
+ "Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
423
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
424
+ ]
425
+ }
426
+ ],
427
+ "source": [
428
+ "from transformers import AutoModelForTokenClassification\n",
429
+ "\n",
430
+ "model = AutoModelForTokenClassification.from_pretrained(\n",
431
+ " checkpoint,\n",
432
+ " id2label = id2label,\n",
433
+ " label2id = label2id\n",
434
+ ")"
435
+ ]
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "execution_count": 70,
440
+ "metadata": {},
441
+ "outputs": [],
442
+ "source": [
443
+ "from transformers import TrainingArguments\n",
444
+ "\n",
445
+ "train_args = TrainingArguments(\n",
446
+ " \"distilbert-finetuned-ner\",\n",
447
+ " evaluation_strategy=\"epoch\",\n",
448
+ " save_strategy=\"epoch\",\n",
449
+ " learning_rate=2e-5,\n",
450
+ " num_train_epochs=3,\n",
451
+ " weight_decay=0.1\n",
452
+ ")"
453
+ ]
454
+ },
455
+ {
456
+ "cell_type": "code",
457
+ "execution_count": 71,
458
+ "metadata": {},
459
+ "outputs": [],
460
+ "source": [
461
+ "from transformers import DataCollatorForTokenClassification\n",
462
+ "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)"
463
+ ]
464
+ },
465
+ {
466
+ "cell_type": "code",
467
+ "execution_count": 73,
468
+ "metadata": {},
469
+ "outputs": [],
470
+ "source": [
471
+ "from transformers import Trainer\n",
472
+ "trainer = Trainer(\n",
473
+ " model = model,\n",
474
+ " args = train_args,\n",
475
+ " train_dataset=tokenized_datasets['train'],\n",
476
+ " eval_dataset=tokenized_datasets['validation'],\n",
477
+ " data_collator=data_collator,\n",
478
+ " compute_metrics=compute_metric,\n",
479
+ " tokenizer=tokenizer\n",
480
+ ")"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "code",
485
+ "execution_count": 74,
486
+ "metadata": {},
487
+ "outputs": [
488
+ {
489
+ "name": "stderr",
490
+ "output_type": "stream",
491
+ "text": [
492
+ " 10%|β–‰ | 501/5268 [01:34<13:16, 5.99it/s] "
493
+ ]
494
+ },
495
+ {
496
+ "name": "stdout",
497
+ "output_type": "stream",
498
+ "text": [
499
+ "{'loss': 0.2966, 'grad_norm': 8.722156524658203, 'learning_rate': 1.810174639331815e-05, 'epoch': 0.28}\n"
500
+ ]
501
+ },
502
+ {
503
+ "name": "stderr",
504
+ "output_type": "stream",
505
+ "text": [
506
+ " 19%|β–ˆβ–‰ | 1001/5268 [03:00<13:00, 5.47it/s]"
507
+ ]
508
+ },
509
+ {
510
+ "name": "stdout",
511
+ "output_type": "stream",
512
+ "text": [
513
+ "{'loss': 0.129, 'grad_norm': 0.30502110719680786, 'learning_rate': 1.6203492786636296e-05, 'epoch': 0.57}\n"
514
+ ]
515
+ },
516
+ {
517
+ "name": "stderr",
518
+ "output_type": "stream",
519
+ "text": [
520
+ " 28%|β–ˆβ–ˆβ–Š | 1501/5268 [04:26<09:43, 6.45it/s]"
521
+ ]
522
+ },
523
+ {
524
+ "name": "stdout",
525
+ "output_type": "stream",
526
+ "text": [
527
+ "{'loss': 0.0908, 'grad_norm': 4.793717861175537, 'learning_rate': 1.4305239179954442e-05, 'epoch': 0.85}\n"
528
+ ]
529
+ },
530
+ {
531
+ "name": "stderr",
532
+ "output_type": "stream",
533
+ "text": [
534
+ " \n",
535
+ " 33%|β–ˆβ–ˆβ–ˆβ–Ž | 1756/5268 [05:27<08:45, 6.68it/s]"
536
+ ]
537
+ },
538
+ {
539
+ "name": "stdout",
540
+ "output_type": "stream",
541
+ "text": [
542
+ "{'eval_loss': 0.088701531291008, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 17.0771, 'eval_samples_per_second': 190.314, 'eval_steps_per_second': 23.833, 'epoch': 1.0}\n"
543
+ ]
544
+ },
545
+ {
546
+ "name": "stderr",
547
+ "output_type": "stream",
548
+ "text": [
549
+ " 38%|β–ˆβ–ˆβ–ˆβ–Š | 2001/5268 [06:14<08:48, 6.18it/s] "
550
+ ]
551
+ },
552
+ {
553
+ "name": "stdout",
554
+ "output_type": "stream",
555
+ "text": [
556
+ "{'loss': 0.0749, 'grad_norm': 0.12880899012088776, 'learning_rate': 1.240698557327259e-05, 'epoch': 1.14}\n"
557
+ ]
558
+ },
559
+ {
560
+ "name": "stderr",
561
+ "output_type": "stream",
562
+ "text": [
563
+ " 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2501/5268 [07:40<06:25, 7.17it/s]"
564
+ ]
565
+ },
566
+ {
567
+ "name": "stdout",
568
+ "output_type": "stream",
569
+ "text": [
570
+ "{'loss': 0.0541, 'grad_norm': 7.310864448547363, 'learning_rate': 1.0508731966590738e-05, 'epoch': 1.42}\n"
571
+ ]
572
+ },
573
+ {
574
+ "name": "stderr",
575
+ "output_type": "stream",
576
+ "text": [
577
+ " 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 3001/5268 [09:03<06:24, 5.90it/s]"
578
+ ]
579
+ },
580
+ {
581
+ "name": "stdout",
582
+ "output_type": "stream",
583
+ "text": [
584
+ "{'loss': 0.0547, 'grad_norm': 2.9379518032073975, 'learning_rate': 8.610478359908885e-06, 'epoch': 1.71}\n"
585
+ ]
586
+ },
587
+ {
588
+ "name": "stderr",
589
+ "output_type": "stream",
590
+ "text": [
591
+ " 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 3501/5268 [10:27<05:16, 5.58it/s]"
592
+ ]
593
+ },
594
+ {
595
+ "name": "stdout",
596
+ "output_type": "stream",
597
+ "text": [
598
+ "{'loss': 0.0467, 'grad_norm': 0.3557382822036743, 'learning_rate': 6.712224753227031e-06, 'epoch': 1.99}\n"
599
+ ]
600
+ },
601
+ {
602
+ "name": "stderr",
603
+ "output_type": "stream",
604
+ "text": [
605
+ " \n",
606
+ " 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 3512/5268 [10:47<04:07, 7.10it/s]"
607
+ ]
608
+ },
609
+ {
610
+ "name": "stdout",
611
+ "output_type": "stream",
612
+ "text": [
613
+ "{'eval_loss': 0.07127507776021957, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 18.2863, 'eval_samples_per_second': 177.728, 'eval_steps_per_second': 22.257, 'epoch': 2.0}\n"
614
+ ]
615
+ },
616
+ {
617
+ "name": "stderr",
618
+ "output_type": "stream",
619
+ "text": [
620
+ " 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4001/5268 [12:12<03:37, 5.83it/s] "
621
+ ]
622
+ },
623
+ {
624
+ "name": "stdout",
625
+ "output_type": "stream",
626
+ "text": [
627
+ "{'loss': 0.0289, 'grad_norm': 3.5150299072265625, 'learning_rate': 4.8139711465451785e-06, 'epoch': 2.28}\n"
628
+ ]
629
+ },
630
+ {
631
+ "name": "stderr",
632
+ "output_type": "stream",
633
+ "text": [
634
+ " 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4501/5268 [13:44<02:10, 5.88it/s]"
635
+ ]
636
+ },
637
+ {
638
+ "name": "stdout",
639
+ "output_type": "stream",
640
+ "text": [
641
+ "{'loss': 0.0266, 'grad_norm': 5.119720935821533, 'learning_rate': 2.9157175398633257e-06, 'epoch': 2.56}\n"
642
+ ]
643
+ },
644
+ {
645
+ "name": "stderr",
646
+ "output_type": "stream",
647
+ "text": [
648
+ " 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 5001/5268 [15:12<00:48, 5.50it/s]"
649
+ ]
650
+ },
651
+ {
652
+ "name": "stdout",
653
+ "output_type": "stream",
654
+ "text": [
655
+ "{'loss': 0.0276, 'grad_norm': 0.05973776802420616, 'learning_rate': 1.0174639331814731e-06, 'epoch': 2.85}\n"
656
+ ]
657
+ },
658
+ {
659
+ "name": "stderr",
660
+ "output_type": "stream",
661
+ "text": [
662
+ " \n",
663
+ "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5268/5268 [16:15<00:00, 6.31it/s]"
664
+ ]
665
+ },
666
+ {
667
+ "name": "stdout",
668
+ "output_type": "stream",
669
+ "text": [
670
+ "{'eval_loss': 0.07107102125883102, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 17.759, 'eval_samples_per_second': 183.006, 'eval_steps_per_second': 22.918, 'epoch': 3.0}\n"
671
+ ]
672
+ },
673
+ {
674
+ "name": "stderr",
675
+ "output_type": "stream",
676
+ "text": [
677
+ "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5268/5268 [16:21<00:00, 5.36it/s]"
678
+ ]
679
+ },
680
+ {
681
+ "name": "stdout",
682
+ "output_type": "stream",
683
+ "text": [
684
+ "{'train_runtime': 981.963, 'train_samples_per_second': 42.897, 'train_steps_per_second': 5.365, 'train_loss': 0.08021244997315635, 'epoch': 3.0}\n"
685
+ ]
686
+ },
687
+ {
688
+ "name": "stderr",
689
+ "output_type": "stream",
690
+ "text": [
691
+ "\n"
692
+ ]
693
+ },
694
+ {
695
+ "data": {
696
+ "text/plain": [
697
+ "TrainOutput(global_step=5268, training_loss=0.08021244997315635, metrics={'train_runtime': 981.963, 'train_samples_per_second': 42.897, 'train_steps_per_second': 5.365, 'total_flos': 460431563935266.0, 'train_loss': 0.08021244997315635, 'epoch': 3.0})"
698
+ ]
699
+ },
700
+ "execution_count": 74,
701
+ "metadata": {},
702
+ "output_type": "execute_result"
703
+ }
704
+ ],
705
+ "source": [
706
+ "trainer.train()"
707
+ ]
708
+ },
709
+ {
710
+ "cell_type": "code",
711
+ "execution_count": 75,
712
+ "metadata": {},
713
+ "outputs": [],
714
+ "source": [
715
+ "trainer.save_model(\"my_saved_model\")"
716
+ ]
717
+ },
718
+ {
719
+ "cell_type": "code",
720
+ "execution_count": 79,
721
+ "metadata": {},
722
+ "outputs": [
723
+ {
724
+ "name": "stderr",
725
+ "output_type": "stream",
726
+ "text": [
727
+ "training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5.11k/5.11k [00:01<00:00, 4.19kB/s]\n"
728
+ ]
729
+ },
730
+ {
731
+ "data": {
732
+ "text/plain": [
733
+ "CommitInfo(commit_url='https://huggingface.co/amanpatkar/distilbert-finetuned-ner/commit/8276ef3336762d679ee7e10218fe8518eab8e4aa', commit_message='amanpatkar/distilbert-finetuned-ner', commit_description='', oid='8276ef3336762d679ee7e10218fe8518eab8e4aa', pr_url=None, pr_revision=None, pr_num=None)"
734
+ ]
735
+ },
736
+ "execution_count": 79,
737
+ "metadata": {},
738
+ "output_type": "execute_result"
739
+ }
740
+ ],
741
+ "source": [
742
+ "trainer.push_to_hub(\"amanpatkar/distilbert-finetuned-ner\", token = \"<>\")"
743
+ ]
744
+ },
745
+ {
746
+ "cell_type": "code",
747
+ "execution_count": 80,
748
+ "metadata": {},
749
+ "outputs": [
750
+ {
751
+ "name": "stderr",
752
+ "output_type": "stream",
753
+ "text": [
754
+ "c:\\Users\\Aman\\anaconda3\\envs\\huggingfacev3\\Lib\\site-packages\\huggingface_hub\\file_download.py:157: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\Aman\\.cache\\huggingface\\hub\\models--amanpatkar--distilbert-finetuned-ner. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
755
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
756
+ " warnings.warn(message)\n"
757
+ ]
758
+ },
759
+ {
760
+ "data": {
761
+ "text/plain": [
762
+ "CommitInfo(commit_url='https://huggingface.co/amanpatkar/distilbert-finetuned-ner/commit/4c7253e599d89d1f7af6827260b567a44f4a9741', commit_message='Upload tokenizer', commit_description='', oid='4c7253e599d89d1f7af6827260b567a44f4a9741', pr_url=None, pr_revision=None, pr_num=None)"
763
+ ]
764
+ },
765
+ "execution_count": 80,
766
+ "metadata": {},
767
+ "output_type": "execute_result"
768
+ }
769
+ ],
770
+ "source": [
771
+ "tokenizer.push_to_hub(\"amanpatkar/distilbert-finetuned-ner\", token = \"<>\")"
772
+ ]
773
+ },
774
+ {
775
+ "cell_type": "code",
776
+ "execution_count": 76,
777
+ "metadata": {},
778
+ "outputs": [],
779
+ "source": [
780
+ "from transformers import pipeline"
781
+ ]
782
+ },
783
+ {
784
+ "cell_type": "code",
785
+ "execution_count": 84,
786
+ "metadata": {},
787
+ "outputs": [],
788
+ "source": [
789
+ "ner = pipeline(\n",
790
+ " \"token-classification\",\n",
791
+ " model = \"amanpatkar/distilbert-finetuned-ner\",\n",
792
+ " aggregation_strategy = \"simple\",\n",
793
+ " device = 0\n",
794
+ ")"
795
+ ]
796
+ },
797
+ {
798
+ "cell_type": "code",
799
+ "execution_count": 85,
800
+ "metadata": {},
801
+ "outputs": [
802
+ {
803
+ "data": {
804
+ "text/plain": [
805
+ "[{'entity_group': 'PER',\n",
806
+ " 'score': np.float32(0.9989685),\n",
807
+ " 'word': 'Aman Patkar',\n",
808
+ " 'start': 0,\n",
809
+ " 'end': 11},\n",
810
+ " {'entity_group': 'ORG',\n",
811
+ " 'score': np.float32(0.99077755),\n",
812
+ " 'word': 'Honda KTM',\n",
813
+ " 'start': 21,\n",
814
+ " 'end': 30},\n",
815
+ " {'entity_group': 'LOC',\n",
816
+ " 'score': np.float32(0.9992505),\n",
817
+ " 'word': 'India',\n",
818
+ " 'start': 43,\n",
819
+ " 'end': 48}]"
820
+ ]
821
+ },
822
+ "execution_count": 85,
823
+ "metadata": {},
824
+ "output_type": "execute_result"
825
+ }
826
+ ],
827
+ "source": [
828
+ "s = \"Aman Patkar owns the Honda KTM showroom in India. He is a boy.\"\n",
829
+ "ner(s)"
830
+ ]
831
+ },
832
+ {
833
+ "cell_type": "code",
834
+ "execution_count": null,
835
+ "metadata": {},
836
+ "outputs": [],
837
+ "source": []
838
+ }
839
+ ],
840
+ "metadata": {
841
+ "kernelspec": {
842
+ "display_name": "tensorflowgpu1",
843
+ "language": "python",
844
+ "name": "python3"
845
+ },
846
+ "language_info": {
847
+ "codemirror_mode": {
848
+ "name": "ipython",
849
+ "version": 3
850
+ },
851
+ "file_extension": ".py",
852
+ "mimetype": "text/x-python",
853
+ "name": "python",
854
+ "nbconvert_exporter": "python",
855
+ "pygments_lexer": "ipython3",
856
+ "version": "3.12.4"
857
+ }
858
+ },
859
+ "nbformat": 4,
860
+ "nbformat_minor": 2
861
+ }