In [16]:
import numpy as np
import pandas as pd
import transformers
from transformers import AutoTokenizer
import datasets

In [3]:
#Load datasets
data = datasets.load_dataset("conll2003",trust_remote_code=True)

Downloading data: 100%|██████████| 983k/983k [00:00<00:00, 1.13MB/s] 
Generating train split: 100%|██████████| 14041/14041 [00:01<00:00, 7095.93 examples/s]
Generating validation split: 100%|██████████| 3250/3250 [00:00<00:00, 6601.81 examples/s]
Generating test split: 100%|██████████| 3453/3453 [00:00<00:00, 7659.18 examples/s]


In [4]:
data

DatasetDict({
 train: Dataset({
 features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
 num_rows: 14041
 })
 validation: Dataset({
 features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
 num_rows: 3250
 })
 test: Dataset({
 features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
 num_rows: 3453
 })
})

In [15]:
# label_names
label_names = data['train'].features['ner_tags'].feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [17]:
# Tokenizer
checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [20]:
data['train'][0]['tokens'] # Already in tokens

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [21]:
t = tokenizer(data['train'][0]['tokens'], is_split_into_words=True)

In [22]:
t

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [24]:
t.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

### Target Alignment
* Like for Shantanu, this tokenizer can tokenize further based on sub word like Shan & ####tanu, so we need something like B-PER, I-PER for this

In [25]:
# ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'] 
begin2Inside = {
 1:2,
 3:4,
 5:6,
 7:8
}

In [45]:
def align_target(labels, word_ids):
 aligned_labels=[]
 last_word = None
 for word in word_ids:
 if word is None:
 label = -100 # Assigning -100 for [CLS] [PAD] special tokens
 elif word!=last_word:
 label = labels[word]
 else:
 label = labels[word]
 #Change B- to I-
 if label in begin2Inside:
 label=begin2Inside[label]
 aligned_labels.append(label)
 last_word=word
 return aligned_labels

In [46]:
# Tokenize for both input and target(label)
def tokenize_fn(batch):
 # Tokenize the input seq first
 # It will populate inputs_ids, attention_mask etc
 tokenized_inputs = tokenizer(batch['tokens'], is_split_into_words=True, truncation=True)

 labels_batch = batch['ner_tags'] #original Targets
 aligned_label_batch = []
 for i, lables in enumerate(labels_batch):
 words_ids = tokenized_inputs.word_ids(i)
 aligned_label_batch.append(align_target(labels=lables,word_ids=words_ids))

 tokenized_inputs['labels'] = aligned_label_batch

 return tokenized_inputs


In [47]:
data['train'].column_names

['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']

In [48]:
tokenized_datasets = data.map(
 tokenize_fn,
 batched=True,
 remove_columns=data['train'].column_names # Removing column other than input_ids, attention_mask, labels
)

Map: 100%|██████████| 14041/14041 [00:01<00:00, 8967.65 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 7560.07 examples/s]
Map: 100%|██████████| 3453/3453 [00:00<00:00, 10502.31 examples/s]


In [53]:
tokenized_datasets['train'][1]

{'input_ids': [101, 1943, 14428, 102],
 'attention_mask': [1, 1, 1, 1],
 'labels': [-100, 1, 2, -100]}

In [52]:
tokenizer.decode(tokenized_datasets['train'][1]['input_ids'])

'[CLS] Peter Blackburn [SEP]'

### Metric

In [59]:
from datasets import load_metric
metric = load_metric('seqeval',trust_remote_code=True) # This metric is just for NER

In [60]:
def compute_metric(logits_and_labels):
 logists, labels = logits_and_labels
 preds = np.argmax(logists,axis=-1)

 #Remove -100 from label and pred
 # and convert the label_ids to label_names
 str_labels = [[label_names[t] for t in label if t!=-100] for label in labels]

 str_preds = [[label_names[t] for p, t in zip(pred, target) if t!=-100] for pred, target in zip(preds, labels)]

 the_metrics = metric.compute(predictions=str_preds,references=str_labels)
 
 return {
 "precision":the_metrics['overall_precision'],
 "recall":the_metrics['overall_recall'],
 "f1":the_metrics['overall_f1'],
 "accuracy":the_metrics['overall_accuracy']
 }



In [61]:
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [64]:
id2label = {k:val for k, val in enumerate(label_names)}
label2id = {val:k for k, val in id2label.items()}

In [68]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
 checkpoint,
 id2label = id2label,
 label2id = label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
from transformers import TrainingArguments

train_args = TrainingArguments(
 "distilbert-finetuned-ner",
 evaluation_strategy="epoch",
 save_strategy="epoch",
 learning_rate=2e-5,
 num_train_epochs=3,
 weight_decay=0.1
)

In [71]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [73]:
from transformers import Trainer
trainer = Trainer(
 model = model,
 args = train_args,
 train_dataset=tokenized_datasets['train'],
 eval_dataset=tokenized_datasets['validation'],
 data_collator=data_collator,
 compute_metrics=compute_metric,
 tokenizer=tokenizer
)

In [74]:
trainer.train()

 10%|▉ | 501/5268 [01:34<13:16, 5.99it/s] 

{'loss': 0.2966, 'grad_norm': 8.722156524658203, 'learning_rate': 1.810174639331815e-05, 'epoch': 0.28}


 19%|█▉ | 1001/5268 [03:00<13:00, 5.47it/s]

{'loss': 0.129, 'grad_norm': 0.30502110719680786, 'learning_rate': 1.6203492786636296e-05, 'epoch': 0.57}


 28%|██▊ | 1501/5268 [04:26<09:43, 6.45it/s]

{'loss': 0.0908, 'grad_norm': 4.793717861175537, 'learning_rate': 1.4305239179954442e-05, 'epoch': 0.85}


 
 33%|███▎ | 1756/5268 [05:27<08:45, 6.68it/s]

{'eval_loss': 0.088701531291008, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 17.0771, 'eval_samples_per_second': 190.314, 'eval_steps_per_second': 23.833, 'epoch': 1.0}


 38%|███▊ | 2001/5268 [06:14<08:48, 6.18it/s] 

{'loss': 0.0749, 'grad_norm': 0.12880899012088776, 'learning_rate': 1.240698557327259e-05, 'epoch': 1.14}


 47%|████▋ | 2501/5268 [07:40<06:25, 7.17it/s]

{'loss': 0.0541, 'grad_norm': 7.310864448547363, 'learning_rate': 1.0508731966590738e-05, 'epoch': 1.42}


 57%|█████▋ | 3001/5268 [09:03<06:24, 5.90it/s]

{'loss': 0.0547, 'grad_norm': 2.9379518032073975, 'learning_rate': 8.610478359908885e-06, 'epoch': 1.71}


 66%|██████▋ | 3501/5268 [10:27<05:16, 5.58it/s]

{'loss': 0.0467, 'grad_norm': 0.3557382822036743, 'learning_rate': 6.712224753227031e-06, 'epoch': 1.99}


 
 67%|██████▋ | 3512/5268 [10:47<04:07, 7.10it/s]

{'eval_loss': 0.07127507776021957, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 18.2863, 'eval_samples_per_second': 177.728, 'eval_steps_per_second': 22.257, 'epoch': 2.0}


 76%|███████▌ | 4001/5268 [12:12<03:37, 5.83it/s] 

{'loss': 0.0289, 'grad_norm': 3.5150299072265625, 'learning_rate': 4.8139711465451785e-06, 'epoch': 2.28}


 85%|████████▌ | 4501/5268 [13:44<02:10, 5.88it/s]

{'loss': 0.0266, 'grad_norm': 5.119720935821533, 'learning_rate': 2.9157175398633257e-06, 'epoch': 2.56}


 95%|█████████▍| 5001/5268 [15:12<00:48, 5.50it/s]

{'loss': 0.0276, 'grad_norm': 0.05973776802420616, 'learning_rate': 1.0174639331814731e-06, 'epoch': 2.85}


 
100%|██████████| 5268/5268 [16:15<00:00, 6.31it/s]

{'eval_loss': 0.07107102125883102, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 17.759, 'eval_samples_per_second': 183.006, 'eval_steps_per_second': 22.918, 'epoch': 3.0}


100%|██████████| 5268/5268 [16:21<00:00, 5.36it/s]

{'train_runtime': 981.963, 'train_samples_per_second': 42.897, 'train_steps_per_second': 5.365, 'train_loss': 0.08021244997315635, 'epoch': 3.0}





TrainOutput(global_step=5268, training_loss=0.08021244997315635, metrics={'train_runtime': 981.963, 'train_samples_per_second': 42.897, 'train_steps_per_second': 5.365, 'total_flos': 460431563935266.0, 'train_loss': 0.08021244997315635, 'epoch': 3.0})

In [75]:
trainer.save_model("my_saved_model")

In [79]:
trainer.push_to_hub("amanpatkar/distilbert-finetuned-ner", token = "<>")

training_args.bin: 100%|██████████| 5.11k/5.11k [00:01<00:00, 4.19kB/s]


CommitInfo(commit_url='https://huggingface.co/amanpatkar/distilbert-finetuned-ner/commit/8276ef3336762d679ee7e10218fe8518eab8e4aa', commit_message='amanpatkar/distilbert-finetuned-ner', commit_description='', oid='8276ef3336762d679ee7e10218fe8518eab8e4aa', pr_url=None, pr_revision=None, pr_num=None)

In [80]:
tokenizer.push_to_hub("amanpatkar/distilbert-finetuned-ner", token = "<>")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/amanpatkar/distilbert-finetuned-ner/commit/4c7253e599d89d1f7af6827260b567a44f4a9741', commit_message='Upload tokenizer', commit_description='', oid='4c7253e599d89d1f7af6827260b567a44f4a9741', pr_url=None, pr_revision=None, pr_num=None)

In [76]:
from transformers import pipeline

In [84]:
ner = pipeline(
 "token-classification",
 model = "amanpatkar/distilbert-finetuned-ner",
 aggregation_strategy = "simple",
 device = 0
)

In [85]:
s = "Aman Patkar owns the Honda KTM showroom in India. He is a boy."
ner(s)

[{'entity_group': 'PER',
 'score': np.float32(0.9989685),
 'word': 'Aman Patkar',
 'start': 0,
 'end': 11},
 {'entity_group': 'ORG',
 'score': np.float32(0.99077755),
 'word': 'Honda KTM',
 'start': 21,
 'end': 30},
 {'entity_group': 'LOC',
 'score': np.float32(0.9992505),
 'word': 'India',
 'start': 43,
 'end': 48}]