|
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    Pipeline,
    pipeline,
)
|
|
|
class AnonymizationPipeline(Pipeline):
    """Replace named entities found in a text with bracketed label placeholders.

    The text is run through an inner token-classification ("ner") pipeline and
    every detected span is substituted by ``[<label>]``, where ``<label>`` is
    the ``entity`` field reported for that span.
    """

    # Checkpoint used for both model and tokenizer when the caller gives none.
    DEFAULT_MODEL_ID = "JonathanEGP/Beto_Ner"

    def __init__(self, model=None, tokenizer=None, **kwargs):
        """Build the pipeline, loading the default checkpoint where needed.

        Args:
            model: Token-classification model; when ``None`` the default
                checkpoint is loaded.
            tokenizer: Tokenizer matching ``model``; when ``None`` the default
                checkpoint's tokenizer is loaded.
            **kwargs: Forwarded unchanged to ``Pipeline.__init__``.
        """
        # Resolve defaults *before* delegating: the base initializer is handed
        # the model directly, so it must already be a real object.
        if model is None:
            model = AutoModelForTokenClassification.from_pretrained(
                self.DEFAULT_MODEL_ID
            )
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(self.DEFAULT_MODEL_ID)

        super().__init__(model=model, tokenizer=tokenizer, **kwargs)

        # Use the ``pipeline`` factory (not the ``Pipeline`` base class) so the
        # "ner" task is resolved to a concrete token-classification pipeline.
        self.ner_pipeline = pipeline(
            "ner", model=self.model, tokenizer=self.tokenizer
        )

    def _sanitize_parameters(self, **kwargs):
        """Return empty preprocess/forward/postprocess parameter dicts.

        No call-time parameters are supported yet, so ``kwargs`` is ignored.
        """
        return {}, {}, {}

    def preprocess(self, text):
        """Wrap the raw input text in the dict consumed by ``_forward``."""
        return {"text": text}

    def _forward(self, model_inputs):
        """Run the inner NER pipeline and return the text with its entities."""
        text = model_inputs["text"]
        entities = self.ner_pipeline(text)
        return {"text": text, "entities": entities}

    def postprocess(self, model_outputs):
        """Substitute each detected entity span with ``[<label>]``.

        Args:
            model_outputs: Dict with the original ``"text"`` and the list of
                ``"entities"``; each entity provides ``start``, ``end`` and
                ``entity``.

        Returns:
            ``{"anonymized_text": <text with entities replaced>}``.

        Raises:
            ValueError: If an entity has no character offsets.
        """
        text = model_outputs["text"]
        entities = model_outputs["entities"]

        # A ``None`` offset would make both slices below span the whole text,
        # silently duplicating it instead of anonymizing anything.
        if any(ent["start"] is None or ent["end"] is None for ent in entities):
            raise ValueError(
                "Entities have no character offsets; cannot anonymize the text."
            )

        # Process entities from the end of the text towards the beginning so
        # that earlier offsets stay valid after each replacement. ``sorted``
        # leaves the list stored in ``model_outputs`` untouched.
        ordered = sorted(entities, key=lambda ent: ent["end"], reverse=True)

        # Replace every entity span with its label.
        for entity in ordered:
            start, end = entity["start"], entity["end"]
            text = f"{text[:start]}[{entity['entity']}]{text[end:]}"

        return {"anonymized_text": text}

    def __call__(self, text, **kwargs):
        """Anonymize ``text`` by delegating to ``Pipeline.__call__``."""
        return super().__call__(text, **kwargs)