from transformers import (
    Pipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
    pipeline,
)


class AnonymizationPipeline(Pipeline):
    """Replace named entities in a text with bracketed entity-label tags.

    The pipeline wraps a token-classification ("ner") pipeline built from
    the same model and tokenizer. Every entity the inner pipeline reports
    is substituted, in the original text, by ``[<entity label>]``.
    """

    # Checkpoint used when the caller does not supply a model/tokenizer.
    DEFAULT_MODEL_NAME = "JonathanEGP/RoBERTa_base_bn_ner"

    def __init__(self, model=None, tokenizer=None, **kwargs):
        """Build the pipeline, loading the default checkpoint if needed.

        Args:
            model: Token-classification model. When ``None`` it is loaded
                with ``AutoModelForTokenClassification.from_pretrained``
                from ``DEFAULT_MODEL_NAME``.
            tokenizer: Matching tokenizer. When ``None`` it is loaded with
                ``AutoTokenizer.from_pretrained`` from ``DEFAULT_MODEL_NAME``.
            **kwargs: Forwarded unchanged to ``Pipeline.__init__``.
        """
        self.model_name = self.DEFAULT_MODEL_NAME
        if model is None:
            model = AutoModelForTokenClassification.from_pretrained(
                self.model_name
            )
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        super().__init__(model=model, tokenizer=tokenizer, **kwargs)
        # The actual entity detection is delegated to a stock "ner" pipeline
        # that shares this pipeline's model and tokenizer.
        self.ner_pipeline = pipeline(
            "ner", model=self.model, tokenizer=self.tokenizer
        )

    def _sanitize_parameters(self, **kwargs):
        """Return empty preprocess/forward/postprocess parameter dicts.

        No call-time parameters are supported, so any ``kwargs`` are ignored.
        """
        return {}, {}, {}

    def preprocess(self, text):
        """Wrap the raw input text in a dict for ``_forward``."""
        return {"text": text}

    def _forward(self, model_inputs):
        """Run the inner NER pipeline on the text.

        Returns:
            A dict with the original ``"text"`` and the ``"entities"``
            reported by the inner pipeline.
        """
        text = model_inputs["text"]
        entities = self.ner_pipeline(text)
        return {"text": text, "entities": entities}

    def postprocess(self, model_outputs):
        """Substitute each detected entity span with ``[<label>]``.

        Args:
            model_outputs: Dict with ``"text"`` (the original string) and
                ``"entities"`` (dicts carrying ``"start"``, ``"end"`` and
                ``"entity"``).

        Returns:
            ``{"anonymized_text": <text with entity spans replaced>}``.

        Raises:
            ValueError: If any entity lacks character offsets. Without
                offsets the spans cannot be located, and slicing with
                ``None`` would emit the un-anonymized text, so the
                pipeline fails closed instead.
        """
        text = model_outputs["text"]
        entities = model_outputs["entities"]

        # NOTE(review): the ner pipeline is documented to report None
        # offsets when the tokenizer cannot provide them (e.g. a slow
        # tokenizer passed in by the caller) - confirm for your version.
        if any(
            entity.get("start") is None or entity.get("end") is None
            for entity in entities
        ):
            raise ValueError(
                "Entities without character offsets cannot be anonymized; "
                "use a fast tokenizer that provides offset mappings."
            )

        # Replace from the end of the text backwards so that the offsets of
        # the not-yet-processed (earlier) entities stay valid. ``sorted``
        # keeps the caller's entity list untouched.
        for entity in sorted(entities, key=lambda e: e["end"], reverse=True):
            start = entity["start"]
            end = entity["end"]
            entity_type = entity["entity"]
            text = text[:start] + f"[{entity_type}]" + text[end:]
        return {"anonymized_text": text}

    def __call__(self, text, **kwargs):
        """Anonymize ``text``; delegates to ``Pipeline.__call__``."""
        return super().__call__(text, **kwargs)