Anonimizador_Ner / Anonimizador_Ner.py
JonathanEGP's picture
Update Anonimizador_Ner.py
df891cf verified
raw
history blame
1.51 kB
from transformers import Pipeline, AutoModelForTokenClassification, AutoTokenizer, pipeline
class AnonymizationPipeline(Pipeline):
    """Pipeline that replaces detected named entities with bracketed labels.

    A token-classification ("ner") pipeline is built once in ``__init__`` and
    reused for every call.  Each entity it reports is replaced in the input
    text by ``[<label>]``, where ``<label>`` is the value of the entity's
    ``entity`` key.  The result is returned as ``{"anonymized_text": str}``.
    """

    def __init__(self, model=None, tokenizer=None, **kwargs):
        """Build the pipeline, loading the default checkpoint when needed.

        Args:
            model: Token-classification model to use.  When ``None`` it is
                loaded from ``self.model_name``.
            tokenizer: Tokenizer to use.  When ``None`` it is loaded from
                ``self.model_name``.
            **kwargs: Forwarded unchanged to ``Pipeline.__init__``.
        """
        self.model_name = "JonathanEGP/RoBERTa_base_bn_ner"
        if model is None:
            model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        super().__init__(model=model, tokenizer=tokenizer, **kwargs)
        # The actual entity detection is delegated to a stock "ner" pipeline
        # that shares this pipeline's model and tokenizer.
        self.ner_pipeline = pipeline("ner", model=self.model, tokenizer=self.tokenizer)

    def _sanitize_parameters(self, **kwargs):
        """Return three empty parameter dicts; all keyword arguments are ignored."""
        return {}, {}, {}

    def preprocess(self, text):
        """Wrap the raw input text in the dict consumed by ``_forward``."""
        return {"text": text}

    def _forward(self, model_inputs):
        """Run the inner NER pipeline and return the text with its entities."""
        text = model_inputs["text"]
        entities = self.ner_pipeline(text)
        return {"text": text, "entities": entities}

    def postprocess(self, model_outputs):
        """Replace every detected entity span in the text with ``[<label>]``.

        Args:
            model_outputs: Dict with the original ``"text"`` and the list of
                ``"entities"`` produced by ``_forward``.  Each entity is read
                through its ``start``, ``end`` and ``entity`` keys.

        Returns:
            ``{"anonymized_text": str}`` with all entity spans replaced.

        Raises:
            ValueError: If any entity lacks integer ``start``/``end`` offsets
                (the "ner" pipeline reports ``None`` when the tokenizer cannot
                provide an offset mapping).  Slicing with ``None`` would
                otherwise duplicate the text instead of redacting it.
        """
        text = model_outputs["text"]
        entities = model_outputs["entities"]

        # Validate before sorting: comparing ``None`` offsets would fail with
        # an unhelpful TypeError, and a single ``None`` span would silently
        # corrupt the output.
        if any(
            item.get("start") is None or item.get("end") is None
            for item in entities
        ):
            raise ValueError(
                "NER entities have no character offsets; "
                "a tokenizer that returns offset mappings is required to anonymize text."
            )

        # Work from the end of the text towards the beginning so that
        # replacing one span never shifts the offsets of the spans still to
        # be processed.  ``sorted`` leaves the caller's list untouched.
        for item in sorted(entities, key=lambda ent: ent["end"], reverse=True):
            label = item["entity"]
            text = text[: item["start"]] + f"[{label}]" + text[item["end"]:]

        return {"anonymized_text": text}

    def __call__(self, text, **kwargs):
        """Anonymize ``text`` by delegating to ``Pipeline.__call__``."""
        return super().__call__(text, **kwargs)