from transformers import AutoModelForTokenClassification, AutoTokenizer, Pipeline, pipeline


class AnonymizationPipeline(Pipeline):
    """Pipeline that anonymizes text by replacing detected named entities
    with their entity-type labels, e.g. "Juan vive en Madrid" ->
    "[PER] vive en [LOC]".

    Defaults to the "JonathanEGP/Beto_Ner" token-classification model when
    no model/tokenizer is supplied.
    """

    def __init__(self, model=None, tokenizer=None, **kwargs):
        # Resolve the defaults *before* calling super().__init__:
        # transformers' Pipeline base class needs a concrete model and
        # tokenizer at construction time, so passing None through and
        # patching self.model afterwards (as the original did) breaks.
        if model is None:
            model = AutoModelForTokenClassification.from_pretrained("JonathanEGP/Beto_Ner")
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained("JonathanEGP/Beto_Ner")
        super().__init__(model=model, tokenizer=tokenizer, **kwargs)
        # BUG FIX: the original called Pipeline("ner", ...). Pipeline is the
        # abstract base class, not a factory — the task-string factory is the
        # lowercase pipeline() function.
        self.ner_pipeline = pipeline("ner", model=self.model, tokenizer=self.tokenizer)

    def _sanitize_parameters(self, **kwargs):
        # No preprocess/forward/postprocess parameters are supported yet.
        return {}, {}, {}

    def preprocess(self, text):
        """Wrap the raw input string for the forward pass."""
        return {"text": text}

    def _forward(self, model_inputs):
        """Run NER over the input text and carry both through to postprocess."""
        text = model_inputs["text"]
        entities = self.ner_pipeline(text)
        return {"text": text, "entities": entities}

    def postprocess(self, model_outputs):
        """Replace each detected entity span with its "[LABEL]" tag.

        Returns a dict with a single key, "anonymized_text".
        """
        text = model_outputs["text"]
        entities = model_outputs["entities"]

        # Sort entities from last to first so that replacing a span does not
        # shift the character offsets of the spans still to be replaced.
        entities.sort(key=lambda entity: entity["end"], reverse=True)

        # Splice the entity label in place of the original span.
        # NOTE(review): without an aggregation strategy the NER pipeline emits
        # one entry per sub-token (keys "entity"/"start"/"end"), so multi-token
        # names are tagged piecewise — confirm whether aggregation is wanted.
        for entity in entities:
            start = entity["start"]
            end = entity["end"]
            entity_type = entity["entity"]
            text = text[:start] + f"[{entity_type}]" + text[end:]

        return {"anonymized_text": text}

    def __call__(self, text, **kwargs):
        return super().__call__(text, **kwargs)