from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer | |
class AnonymizationPipeline:
    """Replace named entities detected in a text with ``[LABEL]`` placeholders.

    Detection is delegated to a Hugging Face token-classification ("ner")
    pipeline built from a pretrained checkpoint.
    """

    def __init__(self, model_name="JonathanEGP/Beto_Ner"):
        """Load the model and tokenizer and build the NER pipeline.

        Args:
            model_name: Identifier or path of the pretrained
                token-classification checkpoint. The same value is used for
                both the model and its tokenizer. Defaults to the checkpoint
                this class was originally hard-wired to.
        """
        # Honour the caller's choice: previously this argument was accepted
        # but ignored in favour of a hard-coded checkpoint name.
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.ner_pipeline = pipeline(
            "ner", model=self.model, tokenizer=self.tokenizer
        )

    def anonymize(self, text):
        """Return ``text`` with every detected entity span replaced by its label.

        Args:
            text: The string to anonymize.

        Returns:
            A new string in which each span reported by the NER pipeline is
            replaced by ``[<entity label>]``.
        """
        # Each detection is read through its 'start', 'end' and 'entity' keys.
        # NOTE(review): no aggregation strategy is configured, so the pipeline
        # presumably reports one item per token and a multi-token name may
        # produce several adjacent placeholders -- confirm this is intended.
        detections = self.ner_pipeline(text)

        # Replace from the rightmost span to the leftmost so that the offsets
        # of spans not yet processed stay valid while the string changes
        # length. sorted() leaves the pipeline's own result list untouched.
        for entity in sorted(detections, key=lambda d: d['end'], reverse=True):
            start = entity['start']
            end = entity['end']
            entity_type = entity['entity']
            text = text[:start] + f"[{entity_type}]" + text[end:]
        return text