Zamanonymize3 / anonymize_file_clear.py
jfrery-zama's picture
use without pronoun model
628fe8f
raw
history blame
2.97 kB
import argparse
import json
import re
import uuid
from pathlib import Path
import gensim
from concrete.ml.common.serialization.loaders import load
def load_models():
    """Load the embedding model and the entity classifier from ``models/``.

    Both model files are looked up in the ``models`` directory that sits next
    to this script.

    Returns:
        A ``(embeddings_model, fhe_ner_detection)`` tuple: the FastText model
        loaded through gensim, and the object deserialized by Concrete ML's
        ``load`` from the XGBoost model file.
    """
    models_dir = Path(__file__).parent / "models"
    embeddings_path = models_dir / "without_pronoun_embedded_model.model"
    classifier_path = models_dir / "without_pronoun_cml_xgboost.model"

    # gensim expects a plain string path rather than a Path object here.
    embeddings_model = gensim.models.FastText.load(str(embeddings_path))

    # The Concrete ML loader reads from an open text-mode file handle.
    with classifier_path.open("r") as handle:
        fhe_ner_detection = load(file=handle)

    return embeddings_model, fhe_ner_detection
def anonymize_text(text, embeddings_model, fhe_ner_detection):
    """Replace tokens classified as named entities with short random identifiers.

    The text is split into word-like tokens and runs of separators. Each
    word-like token is embedded with ``embeddings_model.wv`` and scored with
    ``fhe_ner_detection.predict_proba``; a token whose class-1 probability is
    at least 0.5 is replaced by the first 8 characters of a UUID4. Within one
    call the same token always maps to the same identifier.

    Characters that match neither token pattern (for example ``(``, ``)``,
    ``#`` or ``&``) are copied to the output unchanged, so everything that is
    not anonymized is preserved verbatim.

    Args:
        text: The input string to anonymize.
        embeddings_model: Object exposing ``wv[token]``; the returned vector
            is indexed with ``[None]`` before prediction (presumably a NumPy
            array, in which case this adds a leading batch axis).
        fhe_ner_detection: Classifier exposing ``predict_proba(x)``; the value
            at ``[0][1]`` is used as the probability that the token is an
            entity.

    Returns:
        A tuple ``(anonymized_text, uuid_map)`` where ``uuid_map`` maps each
        replaced token to its identifier.
    """
    token_pattern = re.compile(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)")
    word_start = re.compile(r"\w")

    uuid_map = {}
    processed_tokens = []
    cursor = 0  # End offset of the last piece of text already emitted.

    for match in token_pattern.finditer(text):
        # Text between two matches is something the pattern does not cover
        # (brackets, symbols, ...). Keep it instead of silently dropping it.
        if match.start() > cursor:
            processed_tokens.append(text[cursor:match.start()])
        cursor = match.end()

        token = match.group(0)
        if not word_start.match(token):
            # Punctuation and whitespace runs are preserved as is.
            processed_tokens.append(token)
            continue

        x = embeddings_model.wv[token][None]
        probability = fhe_ner_detection.predict_proba(x)[0][1]
        if probability >= 0.5:
            if token not in uuid_map:
                uuid_map[token] = str(uuid.uuid4())[:8]
            processed_tokens.append(uuid_map[token])
        else:
            processed_tokens.append(token)

    # Trailing text after the last match (or the whole text if nothing matched).
    processed_tokens.append(text[cursor:])

    anonymized_text = "".join(processed_tokens)
    return anonymized_text, uuid_map
def main():
    """Anonymize a text file from the command line and write the results to disk.

    Reads the file given as the ``file_path`` argument and writes:

    * a copy of the original text to ``files/original_document.txt`` and the
      anonymized text to ``files/anonymized_document.txt``, both inside the
      ``files`` directory next to this script (created if missing);
    * the token-to-identifier mapping to ``<input stem>_uuid_mapping.json``,
      as a relative path (resolved against the current working directory).
    """
    parser = argparse.ArgumentParser(description="Anonymize named entities in a text file and save the mapping to a JSON file.")
    parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
    args = parser.parse_args()

    embeddings_model, fhe_ner_detection = load_models()

    # Read the input file
    text = Path(args.file_path).read_text(encoding="utf-8")

    # Make sure the output directory exists; without this the writes below
    # fail with FileNotFoundError on a fresh checkout.
    output_dir = Path(__file__).parent / "files"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save the original text to its specified file
    original_file_path = output_dir / "original_document.txt"
    original_file_path.write_text(text, encoding="utf-8")

    # Anonymize the text
    anonymized_text, uuid_map = anonymize_text(text, embeddings_model, fhe_ner_detection)

    # Save the anonymized text to its specified file
    anonymized_file_path = output_dir / "anonymized_document.txt"
    anonymized_file_path.write_text(anonymized_text, encoding="utf-8")

    # Save the UUID mapping to a JSON file
    mapping_path = Path(args.file_path).stem + "_uuid_mapping.json"
    with open(mapping_path, "w", encoding="utf-8") as mapping_file:
        json.dump(uuid_map, mapping_file, indent=4, sort_keys=True)

    print(f"Original text saved to {original_file_path}")
    print(f"Anonymized text saved to {anonymized_file_path}")
    print(f"UUID mapping saved to {mapping_path}")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()