# Zamanonymize3 / anonymize_file_clear.py
# Author: jfrery-zama
# Commit d0b1031: update anonymize file in clear with roberta + update uuid map with query id
# (Hugging Face file-viewer metadata: raw / history / blame / No virus / 3.21 kB)
import argparse
import json
import re
import uuid
from pathlib import Path
import gensim
from concrete.ml.common.serialization.loaders import load
from transformers import AutoTokenizer, AutoModel
from utils_demo import get_batch_text_representation
def load_models():
    """Load the embedding model, its tokenizer, and the FHE NER detector.

    Returns:
        tuple: (embeddings_model, tokenizer, fhe_ner_detection) where the
        first two come from the "obi/deid_roberta_i2b2" Hugging Face
        checkpoint and the last is a Concrete-ML logistic-regression model
        deserialized from models/cml_logreg.model next to this script.
    """
    models_dir = Path(__file__).parent / "models"
    checkpoint = "obi/deid_roberta_i2b2"

    # De-identification RoBERTa checkpoint provides both tokenizer and encoder.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    embeddings_model = AutoModel.from_pretrained(checkpoint)

    # Concrete-ML models are serialized as text, hence the "r" mode.
    with (models_dir / "cml_logreg.model").open("r") as model_file:
        fhe_ner_detection = load(file=model_file)

    return embeddings_model, tokenizer, fhe_ner_detection
def anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection):
    """Replace word tokens classified as named entities with short UUID tags.

    Each word-like token is embedded via *embeddings_model*/*tokenizer* and
    scored by *fhe_ner_detection*; tokens with P(entity) >= 0.5 are replaced
    by an 8-character UUID fragment. Repeated tokens reuse the same UUID so
    the mapping is invertible.

    Args:
        text: Input document as a single string.
        embeddings_model: Transformer model producing token embeddings.
        tokenizer: Matching tokenizer for *embeddings_model*.
        fhe_ner_detection: Classifier exposing ``predict_proba``.

    Returns:
        tuple[str, dict]: (anonymized_text, uuid_map) where uuid_map maps
        original tokens to their UUID replacements.
    """
    # Word tokens may contain . / - @ (emails, dates, paths). The separator
    # alternative is \W+ so that EVERY character of the input is captured:
    # the previous explicit punctuation class ([\s,.!?;:'"-]+) silently
    # dropped anything not listed (parentheses, brackets, '&', '*', ...).
    token_pattern = r"(\b[\w\.\/\-@]+\b|\W+)"
    tokens = re.findall(token_pattern, text)

    uuid_map = {}
    processed_tokens = []

    for token in tokens:
        if token.strip() and re.match(r"\w+", token):  # Word-like token
            x = get_batch_text_representation([token], embeddings_model, tokenizer)
            prediction_proba = fhe_ner_detection.predict_proba(x)
            # Column 1 is the probability of the "entity" class.
            probability = prediction_proba[0][1]
            if probability >= 0.5:
                # Reuse the same UUID for repeated occurrences of a token.
                if token not in uuid_map:
                    uuid_map[token] = str(uuid.uuid4())[:8]
                processed_tokens.append(uuid_map[token])
            else:
                processed_tokens.append(token)
        else:
            processed_tokens.append(token)  # Preserve separators verbatim

    anonymized_text = ''.join(processed_tokens)
    return anonymized_text, uuid_map
def main():
    """Anonymize a text file given on the command line.

    Reads the input file, saves a copy of the original and the anonymized
    version under ``files/`` next to this script, and writes the
    token -> UUID mapping to ``<input-stem>_uuid_mapping.json`` in the
    current working directory.
    """
    parser = argparse.ArgumentParser(
        description="Anonymize named entities in a text file and save the mapping to a JSON file."
    )
    parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
    args = parser.parse_args()

    embeddings_model, tokenizer, fhe_ner_detection = load_models()

    # Read the input file.
    text = Path(args.file_path).read_text(encoding="utf-8")

    # Ensure the output directory exists — without this the writes below
    # raise FileNotFoundError on a fresh checkout.
    files_dir = Path(__file__).parent / "files"
    files_dir.mkdir(parents=True, exist_ok=True)

    # Save the original text to its specified file.
    original_file_path = files_dir / "original_document.txt"
    original_file_path.write_text(text, encoding="utf-8")

    # Anonymize the text.
    anonymized_text, uuid_map = anonymize_text(
        text, embeddings_model, tokenizer, fhe_ner_detection
    )

    # Save the anonymized text to its specified file.
    anonymized_file_path = files_dir / "anonymized_document.txt"
    anonymized_file_path.write_text(anonymized_text, encoding="utf-8")

    # Save the UUID mapping to a JSON file next to the current working dir.
    mapping_path = Path(args.file_path).stem + "_uuid_mapping.json"
    with open(mapping_path, 'w', encoding='utf-8') as file:
        json.dump(uuid_map, file, indent=4, sort_keys=True)

    print(f"Original text saved to {original_file_path}")
    print(f"Anonymized text saved to {anonymized_file_path}")
    print(f"UUID mapping saved to {mapping_path}")
# Script entry point: run the anonymization pipeline on the file path
# supplied on the command line.
if __name__ == "__main__":
    main()