|
from transformers import Pipeline, AutoModelForTokenClassification, AutoTokenizer, pipeline |
|
|
|
class AnonymizationPipeline(Pipeline):
    """Pipeline that replaces detected named entities with ``[LABEL]`` markers.

    Entity detection is delegated to an inner ``"ner"`` pipeline built from
    the same model and tokenizer as this pipeline. This class only routes the
    raw text through it and rewrites the reported character spans.

    Calling the pipeline on a string yields ``{"anonymized_text": ...}``.
    """

    # Checkpoint loaded when the caller does not supply a model/tokenizer.
    DEFAULT_MODEL_NAME = "JonathanEGP/RoBERTa_base_bn_ner"

    def __init__(self, model=None, tokenizer=None, **kwargs):
        """Build the pipeline, loading the default checkpoint where needed.

        Args:
            model: Token-classification model; loaded from
                ``DEFAULT_MODEL_NAME`` when ``None``.
            tokenizer: Matching tokenizer; loaded from
                ``DEFAULT_MODEL_NAME`` when ``None``.
            **kwargs: Forwarded unchanged to ``Pipeline.__init__``.
        """
        self.model_name = self.DEFAULT_MODEL_NAME

        if model is None:
            model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        super().__init__(model=model, tokenizer=tokenizer, **kwargs)

        # Reuse the objects held by the base class so the inner NER pipeline
        # and this pipeline share a single model/tokenizer instance.
        self.ner_pipeline = pipeline("ner", model=self.model, tokenizer=self.tokenizer)

    def _sanitize_parameters(self, **kwargs):
        """Return empty kwargs for preprocess, forward and postprocess.

        This pipeline exposes no call-time options, so any keyword arguments
        are ignored here.
        """
        return {}, {}, {}

    def preprocess(self, text):
        """Wrap the raw input so it can be passed on to ``_forward``."""
        return {"text": text}

    def _forward(self, model_inputs):
        """Run the inner NER pipeline and return the text with its entities."""
        text = model_inputs["text"]
        entities = self.ner_pipeline(text)
        return {"text": text, "entities": entities}

    def postprocess(self, model_outputs):
        """Replace every detected entity span with a ``[LABEL]`` placeholder.

        Args:
            model_outputs: Mapping with the original ``"text"`` and the
                ``"entities"`` list produced by ``_forward``. Each entity
                needs ``"start"``/``"end"`` offsets and an ``"entity"`` (or
                ``"entity_group"``) label.

        Returns:
            ``{"anonymized_text": <text with entity spans replaced>}``.

        Raises:
            ValueError: If an entity carries no character offsets, because the
                text could then not be anonymized reliably.
        """
        text = model_outputs["text"]
        entities = model_outputs["entities"]

        # The NER pipeline documents start/end as optional (they depend on the
        # tokenizer providing offsets). Slicing with None would silently leave
        # the original text in the output, so fail loudly instead.
        for entity in entities:
            if entity.get("start") is None or entity.get("end") is None:
                raise ValueError(
                    "Entity has no character offsets; cannot anonymize text. "
                    "Use a tokenizer that provides offset mappings."
                )

        # Work right-to-left so that replacing one span does not shift the
        # offsets of the spans that are still to be processed. sorted() keeps
        # the caller's list untouched.
        for entity in sorted(entities, key=lambda item: item["end"], reverse=True):
            start = entity["start"]
            end = entity["end"]
            # Plain NER output uses "entity"; aggregated output uses
            # "entity_group" instead.
            if "entity" in entity:
                entity_type = entity["entity"]
            else:
                entity_type = entity["entity_group"]
            text = f"{text[:start]}[{entity_type}]{text[end:]}"

        return {"anonymized_text": text}

    def __call__(self, text, **kwargs):
        """Anonymize ``text`` by running it through the base pipeline call."""
        return super().__call__(text, **kwargs)