File size: 1,495 Bytes
b365d1b a2ee67b b365d1b a2ee67b b365d1b a2ee67b b365d1b a2ee67b b365d1b a2ee67b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
from transformers import Pipeline, AutoModelForTokenClassification, AutoTokenizer, pipeline
class AnonymizationPipeline(Pipeline):
    """Pipeline that replaces detected named entities with ``[LABEL]`` tags.

    The heavy lifting is delegated to an inner ``"ner"`` pipeline built from
    the same model and tokenizer; this class only threads the raw text
    through the ``preprocess`` -> ``_forward`` -> ``postprocess`` stages and
    substitutes every located entity span with its label in square brackets.

    Calling the pipeline with a string returns
    ``{"anonymized_text": <text with entity spans replaced>}``.
    """

    def __init__(self, model=None, tokenizer=None, *, aggregation_strategy=None, **kwargs):
        """Build the pipeline, loading the default checkpoint when needed.

        Args:
            model: Token-classification model. When ``None`` the checkpoint
                named by ``self.model_name`` is loaded.
            tokenizer: Matching tokenizer. When ``None`` it is loaded from
                the same checkpoint.
            aggregation_strategy: Optional strategy forwarded to the inner
                NER pipeline (for example ``"simple"``) so that word pieces
                belonging to one entity are merged into a single span. The
                default ``None`` keeps the inner pipeline's own default,
                i.e. the behaviour this class had before the option existed.
            **kwargs: Forwarded unchanged to ``Pipeline.__init__``.
        """
        self.model_name = "JonathanEGP/Beto_Ner"
        if model is None:
            model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        super().__init__(model=model, tokenizer=tokenizer, **kwargs)

        # Only pass the option through when the caller asked for it, so the
        # inner pipeline is constructed exactly as before by default.
        ner_kwargs = {}
        if aggregation_strategy is not None:
            ner_kwargs["aggregation_strategy"] = aggregation_strategy
        self.ner_pipeline = pipeline(
            "ner", model=self.model, tokenizer=self.tokenizer, **ner_kwargs
        )

    def _sanitize_parameters(self, **kwargs):
        """Return empty preprocess/forward/postprocess parameter dicts.

        No call-time options are supported, so any keyword arguments are
        ignored.
        """
        return {}, {}, {}

    def preprocess(self, text):
        """Wrap the raw input text so it can travel through the stages."""
        return {"text": text}

    def _forward(self, model_inputs):
        """Run the inner NER pipeline and return the text with its entities."""
        text = model_inputs["text"]
        entities = self.ner_pipeline(text)
        return {"text": text, "entities": entities}

    def postprocess(self, model_outputs):
        """Replace every located entity span in the text with ``[LABEL]``.

        Args:
            model_outputs: Mapping with ``"text"`` (the original string) and
                ``"entities"`` (the list of entity dicts from ``_forward``).

        Returns:
            ``{"anonymized_text": str}`` with each entity span substituted.
        """
        text = model_outputs["text"]

        # Character offsets may be missing (None) when the tokenizer cannot
        # provide them; such entities cannot be located in the text, so they
        # are skipped instead of crashing the sort or the slicing below.
        located = [
            entity
            for entity in model_outputs["entities"]
            if entity.get("start") is not None and entity.get("end") is not None
        ]

        # Work on a sorted copy (the forward output is left untouched) and go
        # right-to-left so earlier offsets stay valid after each substitution.
        for entity in sorted(located, key=lambda item: item["end"], reverse=True):
            # Aggregated results expose "entity_group"; token-level results
            # expose "entity".
            if "entity_group" in entity:
                label = entity["entity_group"]
            else:
                label = entity["entity"]
            text = text[: entity["start"]] + f"[{label}]" + text[entity["end"]:]

        return {"anonymized_text": text}

    def __call__(self, text, **kwargs):
        """Anonymize ``text`` by delegating to ``Pipeline.__call__``."""
        return super().__call__(text, **kwargs)