import json
import os
import pickle as pkl
import re
import shutil
import string
from collections import Counter
from pathlib import Path

import numpy as np
import torch

MAX_USER_QUERY_LEN = 80

# List of example queries for easy access
DEFAULT_QUERIES = {
    "Example Query 1": "Who visited microsoft.com on September 18?",
    "Example Query 2": "Does Kate have a driving licence?",
    "Example Query 3": "What's David Johnson's phone number?",
}

CURRENT_DIR = Path(__file__).parent

DATA_PATH = CURRENT_DIR / "files"
LOGREG_MODEL_PATH = CURRENT_DIR / "models" / "cml_logreg.model"
DEPLOYMENT_DIR = CURRENT_DIR / "deployment"
KEYS_DIR = DEPLOYMENT_DIR / ".fhe_keys"

ORIGINAL_FILE_PATH = DATA_PATH / "original_document.txt"
ANONYMIZED_FILE_PATH = DATA_PATH / "anonymized_document.txt"
MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
MAPPING_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"

ALL_DIRS = [KEYS_DIR]

# Punctuation characters to filter out, excluding "%" and "$"
PUNCTUATION_LIST = list(string.punctuation)
PUNCTUATION_LIST.remove("%")
PUNCTUATION_LIST.remove("$")
PUNCTUATION_LIST = "".join(PUNCTUATION_LIST)


def clean_directory() -> None:
    """Clear the working directories listed in `ALL_DIRS` and re-create them."""
    print("Cleaning...\n")
    for target_dir in ALL_DIRS:
        if os.path.exists(target_dir) and os.path.isdir(target_dir):
            shutil.rmtree(target_dir)
        target_dir.mkdir(exist_ok=True, parents=True)


def get_batch_text_representation(texts, model, tokenizer, batch_size=1):
    """Get mean-pooled representations of the given texts, in batches."""
    mean_pooled_batch = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i : i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=False)
        last_hidden_states = outputs.last_hidden_state

        # Mean-pool the token embeddings, ignoring padded positions
        input_mask_expanded = (
            inputs["attention_mask"].unsqueeze(-1).expand(last_hidden_states.size()).float()
        )
        sum_embeddings = torch.sum(last_hidden_states * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        mean_pooled = sum_embeddings / sum_mask

        mean_pooled_batch.extend(mean_pooled.cpu().detach().numpy())
    return np.array(mean_pooled_batch)


def is_user_query_valid(user_query: str) -> bool:
    """Check whether the `user_query` must be rejected.

    A query is accepted if it is one of the default queries, or if it is a
    non-None string of at most `MAX_USER_QUERY_LEN` characters.

    Args:
        user_query (str): The input text to be checked.

    Returns:
        bool: True if the `user_query` must be rejected, False otherwise.
    """
    # Check if the query is one of the predefined default queries
    is_default_query = user_query in DEFAULT_QUERIES.values()

    # Check if the query is a non-None string within the length limit
    is_within_length_limit = user_query is not None and len(user_query) <= MAX_USER_QUERY_LEN

    return not is_default_query and not is_within_length_limit


def compare_texts_ignoring_extra_spaces(original_text, modified_text):
    """Check if `modified_text` is identical to `original_text` up to extra spaces.

    Args:
        original_text (str): The original text for comparison.
        modified_text (str): The modified text to compare against the original.

    Returns:
        bool: True if `modified_text` is the same as `original_text` except for
            additional spaces, False otherwise.
    """
    normalized_original = " ".join(original_text.split())
    normalized_modified = " ".join(modified_text.split())
    return normalized_original == normalized_modified


def is_strict_deletion_only(original_text, modified_text):
    """Check that `modified_text` only deletes words from `original_text`.

    Returns True if every token of `modified_text` appears in `original_text`
    at least as many times, i.e., the modification removes words without
    adding or repeating any.
    """
    # Define a regex pattern that matches a word character next to a
    # punctuation mark, or a punctuation mark next to a word character,
    # without a space between them
pattern = r"(?<=[\w])(?=[^\w\s])|(?<=[^\w\s])(?=[\w])" # Replace instances found by the pattern with a space original_text = re.sub(pattern, " ", original_text) modified_text = re.sub(pattern, " ", modified_text) # Tokenize the texts into words, considering also punctuation original_words = Counter(original_text.lower().split()) modified_words = Counter(modified_text.lower().split()) base_words = all(item in original_words.keys() for item in modified_words.keys()) base_count = all(original_words[k] >= v for k, v in modified_words.items()) return base_words and base_count def read_txt(file_path): """Read text from a file.""" with open(file_path, "r", encoding="utf-8") as file: return file.read() def write_txt(file_path, data): """Write text to a file.""" with open(file_path, "w", encoding="utf-8") as file: file.write(data) def write_pickle(file_path, data): """Save data to a pickle file.""" with open(file_path, "wb") as f: pkl.dump(data, f) def read_pickle(file_name): """Load data from a pickle file.""" with open(file_name, "rb") as file: return pkl.load(file) def read_json(file_name): """Load data from a json file.""" with open(file_name, "r") as file: return json.load(file) def write_json(file_name, data): """Save data to a json file.""" with open(file_name, "w", encoding="utf-8") as file: json.dump(data, file, indent=4, sort_keys=True)