Zamanonymize3

Sleeping

App Files Files Community

Zamanonymize3 / anonymize_file_clear.py

jfrery-zama

use without pronoun model

628fe8f 8 months ago

raw

history blame

2.97 kB

	import argparse
	import json
	import re
	import uuid
	from pathlib import Path
	import gensim
	from concrete.ml.common.serialization.loaders import load

	def load_models():
	    """Load the embedding model and the entity classifier shipped with the script.

	    Both files are read from the ``models`` directory located next to this
	    source file.

	    Returns:
	        A ``(embeddings_model, fhe_ner_detection)`` tuple: the object returned
	        by ``gensim.models.FastText.load`` and the object returned by the
	        Concrete ML ``load`` helper (judging by the file name, presumably an
	        XGBoost-based classifier -- confirm against the training code).
	    """
	    models_dir = Path(__file__).parent / "models"
	    fasttext_path = models_dir / "without_pronoun_embedded_model.model"
	    classifier_path = models_dir / "without_pronoun_cml_xgboost.model"

	    # gensim is handed a plain string path rather than a Path object.
	    word_embeddings = gensim.models.FastText.load(str(fasttext_path))

	    # The Concrete ML loader reads from an already opened text-mode file.
	    with open(classifier_path, "r") as handle:
	        ner_classifier = load(file=handle)

	    return word_embeddings, ner_classifier

	def anonymize_text(text, embeddings_model, fhe_ner_detection):
	    """Replace every token flagged by the classifier with a short random identifier.

	    The text is split into word-like tokens, runs of whitespace/punctuation,
	    and any other single character. Joining the processed tokens therefore
	    reproduces the input exactly, except for the words that were replaced.

	    Args:
	        text: The string to anonymize.
	        embeddings_model: Object exposing ``wv[token]``. The looked-up value
	            is indexed with ``[None]`` before being handed to the classifier
	            (presumably a gensim FastText model returning NumPy vectors --
	            confirm against the caller).
	        fhe_ner_detection: Classifier exposing ``predict_proba``. Column 1 of
	            the first returned row is read as the probability that the token
	            must be hidden.

	    Returns:
	        A ``(anonymized_text, uuid_map)`` tuple. ``uuid_map`` maps each
	        replaced token to the 8-character identifier substituted for it; a
	        token that occurs several times always gets the same identifier.
	    """
	    # Alternatives, in priority order:
	    #   1. word-like tokens (letters/digits plus . / - @ inside the word),
	    #   2. runs of whitespace and common punctuation,
	    #   3. any other single character, so nothing from the input is dropped.
	    # The separator between alternatives must be a bare "|": an escaped "\|"
	    # would only match a literal pipe character.
	    token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+|.)"
	    starts_like_word = re.compile(r"\w")

	    uuid_map = {}
	    processed_tokens = []

	    for token in re.findall(token_pattern, text):
	        # Whitespace, punctuation and other symbols are copied through untouched.
	        if not starts_like_word.match(token):
	            processed_tokens.append(token)
	            continue

	        # [None] adds a leading axis so the classifier receives a single-row batch.
	        x = embeddings_model.wv[token][None]
	        probability = fhe_ner_detection.predict_proba(x)[0][1]
	        if probability < 0.5:
	            processed_tokens.append(token)
	            continue

	        # Reuse the identifier of a token seen before so repeated mentions
	        # stay consistent; otherwise keep the first 8 characters of a UUID4.
	        if token not in uuid_map:
	            uuid_map[token] = str(uuid.uuid4())[:8]
	        processed_tokens.append(uuid_map[token])

	    anonymized_text = ''.join(processed_tokens)
	    return anonymized_text, uuid_map

	def main():
	    """Command-line entry point: anonymize one text file and write the results.

	    Reads the file given as the single positional argument, then writes:

	    * a copy of the original text to ``files/original_document.txt``,
	    * the anonymized text to ``files/anonymized_document.txt``
	      (both under the directory containing this script),
	    * the token-to-identifier mapping to ``<input stem>_uuid_mapping.json``
	      in the current working directory.

	    Raises:
	        OSError: If the input file cannot be read or an output cannot be written.
	    """
	    parser = argparse.ArgumentParser(description="Anonymize named entities in a text file and save the mapping to a JSON file.")
	    parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
	    args = parser.parse_args()

	    embeddings_model, fhe_ner_detection = load_models()

	    # Read the input file
	    with open(args.file_path, 'r', encoding='utf-8') as file:
	        text = file.read()

	    # Make sure the output directory exists; otherwise the writes below fail
	    # with FileNotFoundError on a fresh checkout.
	    output_dir = Path(__file__).parent / "files"
	    output_dir.mkdir(parents=True, exist_ok=True)

	    # Save the original text to its specified file
	    original_file_path = output_dir / "original_document.txt"
	    with open(original_file_path, 'w', encoding='utf-8') as original_file:
	        original_file.write(text)

	    # Anonymize the text
	    anonymized_text, uuid_map = anonymize_text(text, embeddings_model, fhe_ner_detection)

	    # Save the anonymized text to its specified file
	    anonymized_file_path = output_dir / "anonymized_document.txt"
	    with open(anonymized_file_path, 'w', encoding='utf-8') as anonymized_file:
	        anonymized_file.write(anonymized_text)

	    # Save the UUID mapping to a JSON file named after the input file
	    mapping_path = Path(args.file_path).stem + "_uuid_mapping.json"
	    with open(mapping_path, 'w', encoding='utf-8') as mapping_file:
	        json.dump(uuid_map, mapping_file, indent=4, sort_keys=True)

	    print(f"Original text saved to {original_file_path}")
	    print(f"Anonymized text saved to {anonymized_file_path}")
	    print(f"UUID mapping saved to {mapping_path}")

	# Run the command-line interface only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()