Spaces:
Sleeping
Sleeping
from transformers import AutoTokenizer, AutoModelForTokenClassification | |
from transformers import pipeline | |
class NamedEntityRecognition:
    """Run a Hugging Face ``ner`` pipeline and convert its output into text segments.

    ``classify`` is the entry point; ``get_annotation`` turns the pipeline's
    predictions into a list that mixes plain strings with
    ``(text, label, "")`` tuples.
    """

    # Checkpoint loaded when the caller does not name one.
    DEFAULT_MODEL = "xlm-roberta-large-finetuned-conll03-english"

    # Predictions whose text is one of these are treated as noise and are
    # emitted as plain text instead of as a labelled entity.
    _EXCLUDED_SPANS = frozenset(['', '.', '. ', ' '])

    def __init__(self, model_name=DEFAULT_MODEL):
        """Load the tokenizer and model and build the NER pipeline.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the model. Defaults to ``DEFAULT_MODEL``.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForTokenClassification.from_pretrained(model_name)
        # NOTE(review): the transformers docs mark ``grouped_entities`` as
        # deprecated in favour of ``aggregation_strategy``; it is kept here so
        # the class keeps working with the installed version -- confirm before
        # upgrading transformers.
        self.nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

    def get_annotation(self, preds, text: str) -> list:
        """Split ``text`` into plain and labelled segments according to ``preds``.

        Each prediction is labelled by its own ``start``/``end`` offsets rather
        than by looking its surface string up in a dict, so repeated mentions
        keep their individual labels and a prediction is still labelled when
        its ``word`` differs from the exact slice of ``text``.

        Args:
            preds: Iterable of dicts with ``start``, ``end`` and
                ``entity_group`` keys (and optionally ``word``). Assumed to be
                ordered by position and non-overlapping -- TODO confirm against
                the pipeline output.
            text: The string the offsets in ``preds`` refer to.

        Returns:
            A list alternating gap text and spans, followed by the remaining
            tail of ``text``. Gaps and the tail are plain strings (possibly
            empty). A span is a ``(span_text, entity_group, "")`` tuple, or a
            plain string when it matches one of the excluded noise values.
        """
        segments = []
        cursor = 0
        for pred in preds:
            start, end = pred['start'], pred['end']
            # Unlabelled text between the previous span and this one; kept even
            # when empty so the segment layout is stable.
            segments.append(text[cursor:start])
            span_text = text[start:end]
            is_noise = (
                span_text in self._EXCLUDED_SPANS
                or pred.get('word') in self._EXCLUDED_SPANS
            )
            if is_noise:
                segments.append(span_text)
            else:
                segments.append((span_text, pred['entity_group'], ""))
            cursor = end
        # Whatever follows the last span (the whole text when preds is empty).
        segments.append(text[cursor:])
        return segments

    def classify(self, text):
        """Run the pipeline on ``text``.

        Returns:
            A ``(preds, ner_annotation)`` tuple: the raw pipeline output and
            the segment list built from it by ``get_annotation``.
        """
        preds = self.nlp(text)
        ner_annotation = self.get_annotation(preds, text)
        return preds, ner_annotation