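"""Anonymize named entities in a text file and save the token-to-UUID mapping.

Word tokens flagged as entities by a Concrete ML logistic regression (over
"obi/deid_roberta_i2b2" embeddings) are replaced with short UUIDs. Note that
predict_proba is called without an `fhe` argument here, so the classifier is
evaluated in the clear rather than under FHE.

Usage (the script name below is illustrative):

    python anonymize_file.py path/to/document.txt
"""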
import argparse
import json
import re
import uuid
from pathlib import Path

from concrete.ml.common.serialization.loaders import load
from transformers import AutoTokenizer, AutoModel

from utils_demo import get_batch_text_representation


def load_models():
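    """Load the embeddings model, its tokenizer, and the serialized entity classifier."""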
    base_dir = Path(__file__).parent / "models"

    # Load the tokenizer and the embeddings model
    tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
    embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")

    # Load the serialized Concrete ML logistic regression used for entity detection
    with open(base_dir / "cml_logreg.model", "r") as model_file:
        fhe_ner_detection = load(file=model_file)

    return embeddings_model, tokenizer, fhe_ner_detection


def anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection):
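    """Replace tokens detected as named entities with short UUIDs.

    Returns the anonymized text and the token-to-UUID mapping used to produce it.
    """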
    # Match either word-like tokens (including '.', '/', '-', '@', e.g. emails or paths)
    # or runs of whitespace/punctuation, so the text can be rebuilt exactly
    token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
    tokens = re.findall(token_pattern, text)
    uuid_map = {}
    processed_tokens = []

    for token in tokens:
        if token.strip() and re.match(r"\w+", token):  # If the token is a word
            x = get_batch_text_representation([token], embeddings_model, tokenizer)
            prediction_proba = fhe_ner_detection.predict_proba(x)
            probability = prediction_proba[0][1]  # Probability of the "entity" class
            prediction = probability >= 0.5
            if prediction:
                # Reuse the same UUID for repeated tokens so the mapping stays consistent
                if token not in uuid_map:
                    uuid_map[token] = str(uuid.uuid4())[:8]
                processed_tokens.append(uuid_map[token])
            else:
                processed_tokens.append(token)
        else:
            processed_tokens.append(token)  # Preserve punctuation and spaces as is

    anonymized_text = "".join(processed_tokens)
    return anonymized_text, uuid_map


def main():
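    """Parse the command line, anonymize the input file, and write all outputs."""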
    parser = argparse.ArgumentParser(
        description="Anonymize named entities in a text file and save the mapping to a JSON file."
    )
    parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
    args = parser.parse_args()

    embeddings_model, tokenizer, fhe_ner_detection = load_models()

    # Read the input file
    with open(args.file_path, "r", encoding="utf-8") as file:
        text = file.read()
    # Save a copy of the original text to its specified file
    original_file_path = Path(__file__).parent / "files" / "original_document.txt"
    original_file_path.parent.mkdir(parents=True, exist_ok=True)  # Ensure the output directory exists
    with open(original_file_path, "w", encoding="utf-8") as original_file:
        original_file.write(text)
    # Anonymize the text
    anonymized_text, uuid_map = anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection)

    # Save the anonymized text to its specified file
    anonymized_file_path = Path(__file__).parent / "files" / "anonymized_document.txt"
    with open(anonymized_file_path, "w", encoding="utf-8") as anonymized_file:
        anonymized_file.write(anonymized_text)

    # Save the UUID mapping to a JSON file (relative path, i.e. the current working directory)
    mapping_path = Path(args.file_path).stem + "_uuid_mapping.json"
    with open(mapping_path, "w", encoding="utf-8") as file:
        json.dump(uuid_map, file, indent=4, sort_keys=True)

    print(f"Original text saved to {original_file_path}")
    print(f"Anonymized text saved to {anonymized_file_path}")
    print(f"UUID mapping saved to {mapping_path}")


if __name__ == "__main__":
    main()