Anonimizador_Ner / Anonimizador_Ner.txt
JonathanEGP's picture
Update Anonimizador_Ner.txt
a2ee67b verified
raw
history blame
1.64 kB
from transformers import AutoModelForTokenClassification, AutoTokenizer, Pipeline, pipeline
class AnonymizationPipeline(Pipeline):
    """Pipeline that replaces named entities in a text with their NER labels.

    The input text is run through a token-classification ("ner") pipeline and
    every detected span is substituted by ``[<label>]``, where ``<label>`` is
    the value of the ``entity`` field reported for that span.
    """

    # Checkpoint used for both the model and the tokenizer when the caller
    # does not supply them.
    _DEFAULT_CHECKPOINT = "JonathanEGP/Beto_Ner"

    def __init__(self, model=None, tokenizer=None, **kwargs):
        """Build the pipeline, loading the default checkpoint where needed.

        Args:
            model: Token-classification model. When ``None`` the default
                checkpoint is loaded.
            tokenizer: Tokenizer matching ``model``. When ``None`` the default
                checkpoint's tokenizer is loaded.
            **kwargs: Forwarded unchanged to ``Pipeline.__init__``.
        """
        # Resolve the defaults BEFORE delegating to the base constructor, so
        # that it never receives ``model=None``.
        if model is None:
            model = AutoModelForTokenClassification.from_pretrained(
                self._DEFAULT_CHECKPOINT
            )
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(self._DEFAULT_CHECKPOINT)

        super().__init__(model=model, tokenizer=tokenizer, **kwargs)

        # Use the ``pipeline`` factory: ``Pipeline`` is the base class and
        # does not take a task name as its first argument.
        self.ner_pipeline = pipeline(
            "ner", model=self.model, tokenizer=self.tokenizer
        )

    def _sanitize_parameters(self, **kwargs):
        """Return empty preprocess/forward/postprocess parameter dicts."""
        return {}, {}, {}  # No additional parameters needed for now

    def preprocess(self, text):
        """Wrap the raw text so it can be carried through to ``_forward``."""
        return {"text": text}

    def _forward(self, model_inputs):
        """Run the NER pipeline and return the text with its entities."""
        text = model_inputs["text"]
        entities = self.ner_pipeline(text)
        return {"text": text, "entities": entities}

    def postprocess(self, model_outputs):
        """Replace every detected entity span with its bracketed label.

        Args:
            model_outputs: Dict with the original ``text`` and the list of
                ``entities``; each entity provides ``start``, ``end`` and
                ``entity``.

        Returns:
            ``{"anonymized_text": <text with entities replaced>}``.
        """
        text = model_outputs["text"]
        # Process spans from the end of the text towards the beginning so the
        # offsets of the spans still to be replaced stay valid. ``sorted``
        # leaves the list produced by ``_forward`` untouched.
        entities = sorted(
            model_outputs["entities"],
            key=lambda item: item["end"],
            reverse=True,
        )
        # Substitute each entity span with its label.
        for entity in entities:
            start = entity["start"]
            end = entity["end"]
            entity_type = entity["entity"]
            text = text[:start] + f"[{entity_type}]" + text[end:]
        return {"anonymized_text": text}

    def __call__(self, text, **kwargs):
        """Anonymize ``text``; delegates to ``Pipeline.__call__``."""
        return super().__call__(text, **kwargs)