kcelia committed on
Commit 67fa189
1 Parent(s): cf6aebf

chore: update with marketing comments + add uvicorn server

Files changed (4)
  1. app.py +300 -76
  2. files/original_document_uuid_mapping.json +19 -1
  3. server.py +105 -0
  4. utils_demo.py +39 -14
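
This commit splits the demo into a Gradio client (app.py) and a FastAPI server (server.py) run under Uvicorn on localhost:8000. A minimal sketch of starting the new server by hand and probing its root route, assuming this repo's dependencies are installed and port 8000 (the SERVER_URL default in utils_demo.py) is free:

```python
# Hedged sketch: launch the FastAPI app added in this commit and check its
# root endpoint. Mirrors the subprocess.Popen call used in app.py.
import subprocess
import time

import requests

proc = subprocess.Popen(["uvicorn", "server:app"])
time.sleep(3)  # give the server a moment to bind, as app.py does

resp = requests.get("http://localhost:8000/")
print(resp.json())  # {"message": "Welcome to your encrypted anonymization use-case with FHE!"}

proc.terminate()
```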
app.py CHANGED
@@ -1,33 +1,48 @@
 """A Gradio app for anonymizing text data using FHE."""
 
+import base64
 import os
 import re
+import subprocess
+import time
+import uuid
 from typing import Dict, List
-import numpy
+
 import gradio as gr
+import numpy
 import pandas as pd
+import requests
 from fhe_anonymizer import FHEAnonymizer
 from openai import OpenAI
 from utils_demo import *
+
 from concrete.ml.deployment import FHEModelClient
 
 
-ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
-ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
-MAPPING_SENTENCES = read_pickle(MAPPING_SENTENCES_PATH)
+# Ensure the directory is clean before starting processes or reading files
+clean_directory()
 
+anonymizer = FHEAnonymizer()
+client = OpenAI(api_key=os.environ.get("openaikey"))
+
+# Start the Uvicorn server hosting the FastAPI app
 subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
 time.sleep(3)
 
-clean_directory()
-
-anonymizer = FHEAnonymizer()
-
-client = OpenAI(api_key=os.environ.get("openaikey"))
-
-# Generate a random user ID
-user_id = numpy.random.randint(0, 2**32)
-print(f"Your user ID is: {user_id}....")
+# Load data from files required for the application
+UUID_MAP = read_json(MAPPING_UUID_PATH)
+ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
+MAPPING_SENTENCES = read_pickle(MAPPING_SENTENCES_PATH)
+ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
+
+# 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
+
+# 5. Utilizing External Services or APIs
+# (Assuming client initialization and anonymizer setup are parts of using external services or application-specific logic)
+
+# Generate a random user ID for this session
+USER_ID = numpy.random.randint(0, 2**32)
 
 
 def select_static_sentences_fn(selected_sentences: List):
 
@@ -41,14 +56,14 @@ def select_static_sentences_fn(selected_sentences: List):
 
 
 def key_gen_fn() -> Dict:
-    """Generate keys for a given user.
+    """Generate keys for a given user."""
+
+    print("------------ Step 1: Key Generation:")
 
-    Returns:
-        dict: A dictionary containing the generated keys and related information.
-    """
-    print("Step 1: Key Generation:")
+    print(f"Your user ID is: {USER_ID}....")
 
-    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{user_id}")
+
+    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
     client.load()
 
     # Creates the private and evaluation keys on the client side
@@ -59,10 +74,9 @@ def key_gen_fn() -> Dict:
     assert isinstance(serialized_evaluation_keys, bytes)
 
     # Save the evaluation key
-    evaluation_key_path = KEYS_DIR / f"{user_id}/evaluation_key"
-
-    with evaluation_key_path.open("wb") as f:
-        f.write(serialized_evaluation_keys)
+    evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
+
+    write_bytes(evaluation_key_path, serialized_evaluation_keys)
 
     # anonymizer.generate_key()
 
@@ -73,39 +87,43 @@ def key_gen_fn() -> Dict:
         print(error_message)
         return {gen_key_btn: gr.update(value=error_message)}
     else:
+        print("Keys have been generated ✅")
         return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
 
 
 def encrypt_query_fn(query):
 
-    print(f"Step 2 Query encryption: {query=}")
-
-    evaluation_key_path = KEYS_DIR / f"{user_id}/evaluation_key"
+    print(f"\n------------ Step 2: Query encryption: {query=}")
 
-    if not evaluation_key_path.is_file():
-        error_message = "Error ❌: Please generate the key first!"
-        return {output_encrypted_box: gr.update(value=error_message)}
+    if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
+        return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!")}
 
     if is_user_query_valid(query):
-        error_msg = (
-            "Unable to process ❌: The request exceeds the length limit or falls "
-            "outside the scope of this document. Please refine your query."
-        )
-        print(error_msg)
-        return {query_box: gr.update(value=error_msg)}
+        return {
+            query_box: gr.update(
+                value=(
+                    "Unable to process ❌: The request exceeds the length limit or falls "
+                    "outside the scope of this document. Please refine your query."
+                )
+            )
+        }
 
     # Retrieve the client API
-    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{user_id}")
+    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
     client.load()
 
+    encrypted_tokens = []
+
     # Pattern to identify words and non-words (including punctuation, spaces, etc.)
     tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query)
-    encrypted_tokens = []
 
     for token in tokens:
+
+        # 1- Ignore non-word tokens
         if bool(re.match(r"^\s+$", token)):
             continue
-        # Directly append non-word tokens or whitespace to processed_tokens
+
+        # 2- Directly append non-word tokens or whitespace to processed_tokens
 
         # Prediction for each word
         emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
@@ -114,36 +132,220 @@ def encrypt_query_fn(query):
 
         encrypted_tokens.append(encrypted_x)
 
-    write_pickle(KEYS_DIR / f"{user_id}/encrypted_input", encrypted_tokens)
+    print("Data encrypted ✅ on Client Side")
+
+    assert len({len(token) for token in encrypted_tokens}) == 1
 
+    write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input", b"".join(encrypted_tokens))
+    write_bytes(
+        KEYS_DIR / f"{USER_ID}/encrypted_input_len", len(encrypted_tokens[0]).to_bytes(10, "big")
+    )
 
-    #anonymizer.encrypt_query(query)
-
-    encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens]
+    encrypted_quant_tokens_hex = [token.hex()[500:675] for token in encrypted_tokens]
 
-    return {output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex))}
+    return {
+        output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex)),
+        anonymized_text_output: gr.update(visible=True, value=None),
+        identified_words_output_df: gr.update(visible=False, value=None),
+    }
 
 
-def run_fhe_fn(query_box):
+def send_input_fn(query) -> Dict:
+    """Send the encrypted data and the evaluation key to the server."""
 
-    evaluation_key_path = KEYS_DIR / "evaluation_key"
+    print("------------ Step 3.1: Send encrypted_data to the Server")
+
+    evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
+    encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
+    encrypted_input_len_path = KEYS_DIR / f"{USER_ID}/encrypted_input_len"
+
     if not evaluation_key_path.is_file():
-        error_message = "Error ❌: Please generate the key first!"
+        error_message = (
+            "Error Encountered While Sending Data to the Server: "
+            f"The key has not been generated correctly - {evaluation_key_path.is_file()=}"
+        )
         return {anonymized_text_output: gr.update(value=error_message)}
 
-    encryted_query_path = KEYS_DIR / "encrypted_quantized_query"
-    if not encryted_query_path.is_file():
-        error_message = "Error ❌: Please encrypt your query first!"
+    if not encrypted_input_path.is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: The data has not been encrypted "
+            f"correctly on the client side - {encrypted_input_path.is_file()=}"
+        )
         return {anonymized_text_output: gr.update(value=error_message)}
 
-    anonymizer.run_server_and_decrypt_output(query_box)
+    # Define the data and files to post
+    data = {"user_id": USER_ID, "input": query}
 
-    anonymized_text = read_pickle(KEYS_DIR / "reconstructed_sentence")
-
-    # Removing Spaces Before Punctuation:
-    anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", anonymized_text)
+    files = [
+        ("files", open(evaluation_key_path, "rb")),
+        ("files", open(encrypted_input_path, "rb")),
+        ("files", open(encrypted_input_len_path, "rb")),
+    ]
+
+    # Send the encrypted input and evaluation key to the server
+    url = SERVER_URL + "send_input"
+
+    with requests.post(
+        url=url,
+        data=data,
+        files=files,
+    ) as resp:
+        print("Data sent to the server ✅" if resp.ok else "Error ❌ in sending data to the server")
+
+
+def run_fhe_in_server_fn() -> Dict:
+    """Run in FHE the anonymization of the query."""
+
+    print("------------ Step 3.2: Run in FHE on the Server Side")
+
+    evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
+    encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
+
+    if not evaluation_key_path.is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: "
+            f"The key has not been generated correctly - {evaluation_key_path.is_file()=}"
+        )
+        return {anonymized_text_output: gr.update(value=error_message)}
+
+    if not encrypted_input_path.is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: The data has not been encrypted "
+            f"correctly on the client side - {encrypted_input_path.is_file()=}"
+        )
+        return {anonymized_text_output: gr.update(value=error_message)}
+
+    data = {
+        "user_id": USER_ID,
+    }
+
+    url = SERVER_URL + "run_fhe"
+
+    with requests.post(
+        url=url,
+        data=data,
+    ) as response:
+        if not response.ok:
+            return {
+                anonymized_text_output: gr.update(
+                    value=(
+                        "⚠️ An error occurred on the Server Side. "
+                        "Please check connectivity and data transmission."
+                    ),
+                ),
+            }
+        else:
+            time.sleep(1)
+            print(f"The query anonymization was computed in {response.json():.2f} s per token.")
+
+
+def get_output_fn() -> Dict:
+
+    print("------------ Step 3.3: Get the output from the Server Side")
+
+    if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: "
+            "The key has not been generated correctly"
+        )
+        return {anonymized_text_output: gr.update(value=error_message)}
+
+    if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: "
+            "The data has not been encrypted correctly on the client side"
+        )
+        return {anonymized_text_output: gr.update(value=error_message)}
+
+    data = {
+        "user_id": USER_ID,
+    }
+
+    # Retrieve the encrypted output
+    url = SERVER_URL + "get_output"
+    with requests.post(
+        url=url,
+        data=data,
+    ) as response:
+        if response.ok:
+            print("Data received ✅ from the remote Server")
+            response_data = response.json()
+            encrypted_output_base64 = response_data["encrypted_output"]
+            length_encrypted_output_base64 = response_data["length"]
+
+            # Decode the base64 encoded data
+            encrypted_output = base64.b64decode(encrypted_output_base64)
+            length_encrypted_output = base64.b64decode(length_encrypted_output_base64)
+
+            # Save the encrypted output to bytes in a file as it is too large to pass through
+            # regular Gradio buttons (see https://github.com/gradio-app/gradio/issues/1877)
+            write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output", encrypted_output)
+            write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len", length_encrypted_output)
+
+        else:
+            print("Error ❌ in getting data from the server")
+
+
+def decrypt_fn(text) -> Dict:
+    """Decrypt the data on the `Client Side`."""
+
+    print("------------ Step 4: Decrypt the data on the `Client Side`")
+
+    # Get the encrypted output path
+    encrypted_output_path = CLIENT_DIR / f"{USER_ID}_encrypted_output"
+
+    if not encrypted_output_path.is_file():
+        error_message = """⚠️ Please ensure that: \n
+            - the connectivity \n
+            - the query has been submitted \n
+            - the evaluation key has been generated \n
+            - the server processed the encrypted data \n
+            - the Client received the data from the Server before decrypting the prediction
+            """
+        print(error_message)
+
+        return error_message, None
+
+    # Retrieve the client API
+    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
+    client.load()
+
+    # Load the encrypted output as bytes
+    encrypted_output = read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output")
+    length = int.from_bytes(read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len"), "big")
+
+    tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", text)
+
+    decrypted_output, identified_words_with_prob = [], []
+
+    i = 0
+    for token in tokens:
+
+        # Directly append non-word tokens or whitespace to processed_tokens
+        if bool(re.match(r"^\s+$", token)):
+            continue
+        else:
+            encrypted_token = encrypted_output[i : i + length]
+            prediction_proba = client.deserialize_decrypt_dequantize(encrypted_token)
+            probability = prediction_proba[0][1]
+            i += length
+
+            if probability >= 0.77:
+                identified_words_with_prob.append((token, probability))
+
+                # Use the existing UUID if available, otherwise generate a new one
+                tmp_uuid = UUID_MAP.get(token, str(uuid.uuid4())[:8])
+                decrypted_output.append(tmp_uuid)
+                UUID_MAP[token] = tmp_uuid
+            else:
+                decrypted_output.append(token)
+
+    # Update the UUID map with the query.
+    write_json(MAPPING_UUID_PATH, UUID_MAP)
+
+    # Removing Spaces Before Punctuation:
+    anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", " ".join(decrypted_output))
 
-    identified_words_with_prob = read_pickle(KEYS_DIR / "identified_words_with_prob")
-
     # Convert the list of identified words and probabilities into a DataFrame
     if identified_words_with_prob:
@@ -152,9 +354,30 @@ def run_fhe_fn(query_box):
         )
     else:
         identified_df = pd.DataFrame(columns=["Identified Words", "Probability"])
+
+    print("Decryption done ✅ on Client Side")
+
     return anonymized_text, identified_df
 
 
+def anonymization_with_fn(query):
+
+    encrypt_query_fn(query)
+
+    send_input_fn(query)
+
+    run_fhe_in_server_fn()
+
+    get_output_fn()
+
+    anonymized_text, identified_df = decrypt_fn(query)
+
+    return {
+        anonymized_text_output: gr.update(value=anonymized_text),
+        identified_words_output_df: gr.update(value=identified_df, visible=True),
+    }
+
+
 def query_chatgpt_fn(anonymized_query, anonymized_document):
 
     evaluation_key_path = KEYS_DIR / "evaluation_key"
@@ -250,7 +473,7 @@ with demo:
 
     with gr.Accordion("What is encrypted anonymization?", open=False):
         gr.Markdown(
-            """
+            """
            Anonymization is the process of removing personally identifiable information (PII)
            from data to protect individual privacy.
 
@@ -268,7 +491,6 @@ with demo:
 
    gr.Markdown(
        "## Step 1: Key generation\n\n"
-
        """In FHE schemes, two sets of keys are generated. First, the secret keys which are used for
        encrypting and decrypting data owned by the client. Second, the evaluation keys that allow
        a server to blindly process the encrypted data.
@@ -297,7 +519,7 @@ with demo:
            """
        )
        with gr.Column():
-            gr.Markdown("**Anonymized document:**")
+            gr.Markdown("**Anonymized document:**")
            gr.Markdown(
                """You can see below the anonymized text, replaced with hexadecimal strings, that
                will be sent to ChatGPT.
@@ -309,12 +531,14 @@ with demo:
    with gr.Row():
        with gr.Column():
            original_sentences_box = gr.CheckboxGroup(
-                ORIGINAL_DOCUMENT, value=ORIGINAL_DOCUMENT, show_label=False,
+                ORIGINAL_DOCUMENT,
+                value=ORIGINAL_DOCUMENT,
+                show_label=False,
            )
 
        with gr.Column():
-            anonymized_doc_box = gr.Textbox(show_label=False,
-                value=ANONYMIZED_DOCUMENT, interactive=False, lines=11
+            anonymized_doc_box = gr.Textbox(
+                show_label=False, value=ANONYMIZED_DOCUMENT, interactive=False, lines=11
            )
 
        original_sentences_box.change(
@@ -357,27 +581,16 @@ with demo:
        )
 
        with gr.Column(scale=1, min_width=6):
-            gr.HTML("<div style='height: 25px;'></div>")
-
-            gr.Markdown(
-                """
-                <p align="center">
-                    Encrypt the query locally with FHE
-                </p>
-                """
-            )
-            encrypt_btn = gr.Button("Encrypt query”")
-            gr.HTML("<div style='height: 25px;'></div>")
+            gr.HTML("<div style='height: 77px;'></div>")
+            encrypt_btn = gr.Button("Encrypt query")
+            # gr.HTML("<div style='height: 50px;'></div>")
 
        with gr.Column(scale=5):
            output_encrypted_box = gr.Textbox(
-                label="Encrypted anonymized query that will be sent to the anonymization server:", lines=8
+                label="Encrypted anonymized query that will be sent to the anonymization server:",
+                lines=8,
            )
 
-    encrypt_btn.click(
-        fn=encrypt_query_fn, inputs=[query_box], outputs=[query_box, output_encrypted_box]
-    )
-
    ########################## FHE processing Part ##########################
 
    gr.Markdown("<hr />")
@@ -395,12 +608,23 @@ with demo:
        label="Decrypted anonymized query that will be sent to ChatGPT:", lines=1, interactive=True
    )
 
-    identified_words_output = gr.Dataframe(label="Identified words:", visible=False)
+    identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
+
+    encrypt_btn.click(
+        fn=encrypt_query_fn,
+        inputs=[query_box],
+        outputs=[
+            query_box,
+            output_encrypted_box,
+            anonymized_text_output,
+            identified_words_output_df,
+        ],
+    )
 
    run_fhe_btn.click(
-        run_fhe_fn,
+        anonymization_with_fn,
        inputs=[query_box],
-        outputs=[anonymized_text_output, identified_words_output],
    )
+        outputs=[anonymized_text_output, identified_words_output_df],
 
    ########################## ChatGpt Part ##########################
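
The new client/server exchange rests on a simple framing convention: every encrypted token serializes to the same byte length, so the client concatenates them into one blob and ships the per-token length alongside it as a 10-byte big-endian integer (`encrypted_input_len`); the server and `decrypt_fn` then re-split the blob with plain slicing. A minimal sketch of that round trip with dummy payloads standing in for real ciphertexts:

```python
# Hedged sketch of the fixed-size framing shared by encrypt_query_fn,
# server.py and decrypt_fn, using dummy bytes instead of FHE ciphertexts.
tokens = [b"\x01" * 32, b"\x02" * 32, b"\x03" * 32]  # stand-ins for encrypted tokens

# Client side: all serialized tokens must share one length (asserted in app.py)
assert len({len(t) for t in tokens}) == 1
blob = b"".join(tokens)
length_bytes = len(tokens[0]).to_bytes(10, "big")  # same 10-byte header as the commit

# Server/decrypt side: recover the per-token length, then slice the blob
length = int.from_bytes(length_bytes, "big")
recovered = [blob[i : i + length] for i in range(0, len(blob), length)]
assert recovered == tokens
```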
files/original_document_uuid_mapping.json CHANGED
@@ -1 +1,19 @@
-{"078-05-1126": "d8da62f1", "1234567A": "5e63c327", "16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ": "ac41d58b", "191280342": "59a83e41", "192.168.0.1": "116fe81e", "212": "144a2acc", "4095-2609-9393-4932": "e5b499b0", "555-1234": "d9e5704e", "954567876544": "9eb07461", "David": "ebe99761", "IL150120690000003111111": "5ca977a4", "International": "71d0f51c", "Johnson": "53a9291d", "Kate": "b474d794", "Maine": "6337f12f", "microsoft.com": "0d574451", "test@presidio.site": "1f78e797"}
+{
+    "078-05-1126": "d8da62f1",
+    "1234567A": "5e63c327",
+    "16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ": "ac41d58b",
+    "191280342": "59a83e41",
+    "192.168.0.1": "116fe81e",
+    "212": "144a2acc",
+    "4095-2609-9393-4932": "e5b499b0",
+    "555-1234": "d9e5704e",
+    "954567876544": "9eb07461",
+    "David": "ebe99761",
+    "IL150120690000003111111": "5ca977a4",
+    "International": "71d0f51c",
+    "Johnson": "53a9291d",
+    "Kate": "b474d794",
+    "Maine": "6337f12f",
+    "microsoft.com": "0d574451",
+    "test@presidio.site": "1f78e797"
+}
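
This reformatted mapping is the persistent store behind `decrypt_fn`: each token flagged as PII is replaced by a stable 8-character UUID prefix, reusing the existing entry when the token has been seen before. A small sketch of that lookup-or-create pattern, using an in-memory dict in place of the JSON file:

```python
# Hedged sketch of the UUID_MAP lookup used in decrypt_fn (in-memory stand-in
# for files/original_document_uuid_mapping.json).
import uuid

uuid_map = {"Kate": "b474d794"}  # entry carried over from the mapping file

for token in ["Kate", "David"]:
    # Reuse the existing UUID if available, otherwise generate a new 8-char one
    tmp_uuid = uuid_map.get(token, str(uuid.uuid4())[:8])
    uuid_map[token] = tmp_uuid
    print(token, "->", tmp_uuid)
```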
server.py ADDED
@@ -0,0 +1,105 @@
+"""Server that will listen for GET and POST requests from the client."""
+
+import base64
+import time
+from typing import List
+
+import numpy
+from fastapi import FastAPI, File, Form, UploadFile
+from fastapi.responses import JSONResponse
+from utils_demo import *
+from utils_demo import SERVER_DIR
+
+from concrete.ml.deployment import FHEModelServer
+
+# Load the FHE server
+FHE_SERVER = FHEModelServer(DEPLOYMENT_DIR)
+
+# Initialize an instance of FastAPI
+app = FastAPI()
+
+# Define the default route
+@app.get("/")
+def root():
+    """
+    Root endpoint of the anonymization API.
+    Returns:
+        dict: The welcome message.
+    """
+    return {"message": "Welcome to your encrypted anonymization use-case with FHE!"}
+
+
+@app.post("/send_input")
+def send_input(
+    user_id: str = Form(),
+    files: List[UploadFile] = File(),
+):
+    """Send the inputs to the server."""
+
+    # Save the files using the above paths
+    write_bytes(SERVER_DIR / f"{user_id}_evaluation_key", files[0].file.read())
+    write_bytes(SERVER_DIR / f"{user_id}_encrypted_input", files[1].file.read())
+    write_bytes(SERVER_DIR / f"{user_id}_encrypted_len_input", files[2].file.read())
+
+
+@app.post("/run_fhe")
+def run_fhe(
+    user_id: str = Form(),
+):
+    """Inference in FHE."""
+
+    evaluation_key_path = SERVER_DIR / f"{user_id}_evaluation_key"
+    encrypted_input_path = SERVER_DIR / f"{user_id}_encrypted_input"
+    encrypted_input_len_path = SERVER_DIR / f"{user_id}_encrypted_len_input"
+
+    # Read the files (evaluation key + encrypted input) using the above paths
+    with encrypted_input_path.open("rb") as encrypted_output_file, evaluation_key_path.open(
+        "rb"
+    ) as evaluation_key_file, encrypted_input_len_path.open("rb") as length_file:
+        evaluation_key = evaluation_key_file.read()
+        encrypted_tokens = encrypted_output_file.read()
+        length = int.from_bytes(length_file.read(), "big")
+
+    timing, encrypted_output = [], []
+    for i in range(0, len(encrypted_tokens), length):
+        enc_x = encrypted_tokens[i : i + length]
+        start_time = time.time()
+        enc_y = FHE_SERVER.run(enc_x, evaluation_key)
+        timing.append(round(time.time() - start_time, 2))
+        encrypted_output.append(enc_y)
+
+    # Write the files
+    write_bytes(SERVER_DIR / f"{user_id}_encrypted_output", b"".join(encrypted_output))
+    write_bytes(
+        SERVER_DIR / f"{user_id}_encrypted_output_len", len(encrypted_output[0]).to_bytes(10, "big")
+    )
+
+    return JSONResponse(content=numpy.mean(timing))
+
+
+@app.post("/get_output")
+def get_output(user_id: str = Form()):
+    """Retrieve the encrypted output from the server."""
+
+    # Path where the encrypted output is saved
+    encrypted_output_path = SERVER_DIR / f"{user_id}_encrypted_output"
+    encrypted_output_len_path = SERVER_DIR / f"{user_id}_encrypted_output_len"
+
+    # Read the file using the above path
+    with encrypted_output_path.open("rb") as f:
+        encrypted_output = f.read()
+
+    # Read the file using the above path
+    with encrypted_output_len_path.open("rb") as f:
+        length = f.read()
+
+    time.sleep(1)
+
+    # Encode the binary data to a format suitable for JSON serialization
+    content = {
+        "encrypted_output": base64.b64encode(encrypted_output).decode("utf-8"),
+        "length": base64.b64encode(length).decode("utf-8"),
+    }
+
+    # Send the encrypted output
+    return JSONResponse(content)
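
Taken together, the three POST routes mirror the client helpers in app.py (`send_input_fn`, `run_fhe_in_server_fn`, `get_output_fn`). A hedged sketch of driving them directly with requests, assuming the server is running on localhost:8000 and the key/input files have already been produced by the client (file names here are hypothetical local paths):

```python
# Hedged sketch: exercise the three endpoints added in server.py by hand.
# Assumes a running `uvicorn server:app` and client-side files for `user_id`.
import base64

import requests

SERVER_URL = "http://localhost:8000/"
user_id = "1234"

# 1. Upload the evaluation key, the encrypted blob, and the length header
files = [
    ("files", open("evaluation_key", "rb")),      # hypothetical local paths
    ("files", open("encrypted_input", "rb")),
    ("files", open("encrypted_input_len", "rb")),
]
requests.post(SERVER_URL + "send_input", data={"user_id": user_id}, files=files)

# 2. Run the FHE inference; the response body is the mean per-token timing
timing = requests.post(SERVER_URL + "run_fhe", data={"user_id": user_id}).json()
print(f"{timing:.2f} s per token")

# 3. Fetch the base64-encoded encrypted output and decode it
payload = requests.post(SERVER_URL + "get_output", data={"user_id": user_id}).json()
encrypted_output = base64.b64decode(payload["encrypted_output"])
```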
utils_demo.py CHANGED
@@ -6,38 +6,51 @@ import shutil
 import string
 from collections import Counter
 from pathlib import Path
-from transformers import AutoModel, AutoTokenizer
 
 import numpy as np
 import torch
+from transformers import AutoModel, AutoTokenizer
 
-MAX_USER_QUERY_LEN = 80
 
-# List of example queries for easy access
-DEFAULT_QUERIES = {
-    "Example Query 1": "Who visited microsoft.com on September 18?",
-    "Example Query 2": "Does Kate have a driving licence?",
-    "Example Query 3": "What's David Johnson's phone number?",
-}
+from pathlib import Path
 
+# Core Application URL
+SERVER_URL = "http://localhost:8000/"
 
-CURRENT_DIR = Path(__file__).parent
+# Maximum length for user queries
+MAX_USER_QUERY_LEN = 80
 
-DATA_PATH = CURRENT_DIR / "files"
-LOGREG_MODEL_PATH = CURRENT_DIR / "models" / "cml_logreg.model"
+# Base Directories
+CURRENT_DIR = Path(__file__).parent
 DEPLOYMENT_DIR = CURRENT_DIR / "deployment"
-KEYS_DIR = DEPLOYMENT_DIR / "fhe_keys"
+DATA_PATH = CURRENT_DIR / "files"
+
+# Deployment Directories
+CLIENT_DIR = DEPLOYMENT_DIR / "client_dir"
+SERVER_DIR = DEPLOYMENT_DIR / "server_dir"
+KEYS_DIR = DEPLOYMENT_DIR / ".fhe_keys"
 
+# All Directories
+ALL_DIRS = [KEYS_DIR, CLIENT_DIR, SERVER_DIR]
+
+# Model and Data Files
+LOGREG_MODEL_PATH = CURRENT_DIR / "models" / "cml_logreg.model"
 ORIGINAL_FILE_PATH = DATA_PATH / "original_document.txt"
 ANONYMIZED_FILE_PATH = DATA_PATH / "anonymized_document.txt"
 MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
 MAPPING_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
 PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
 
-ALL_DIRS = [KEYS_DIR]
+
+# List of example queries for easy access
+DEFAULT_QUERIES = {
+    "Example Query 1": "Who visited microsoft.com on September 18?",
+    "Example Query 2": "Does Kate have a driving licence?",
+    "Example Query 3": "What's David Johnson's phone number?",
+}
 
 # Load tokenizer and model
-TOKENIZER = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
+TOKENIZER = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
 EMBEDDINGS_MODEL = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
 
 PUNCTUATION_LIST = list(string.punctuation)
@@ -163,3 +176,15 @@ def write_json(file_name, data):
     """Save data to a json file."""
     with open(file_name, "w", encoding="utf-8") as file:
         json.dump(data, file, indent=4, sort_keys=True)
+
+
+def write_bytes(path, data):
+    """Save binary data."""
+    with path.open("wb") as f:
+        f.write(data)
+
+
+def read_bytes(path):
+    """Load data from a binary file."""
+    with path.open("rb") as f:
+        return f.read()
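
The two new helpers complete the pathlib-based binary I/O used throughout this commit (evaluation keys, encrypted inputs, and outputs all travel through them). A quick round-trip check, assuming any writable directory:

```python
# Hedged usage sketch for the new write_bytes/read_bytes helpers.
from pathlib import Path

from utils_demo import read_bytes, write_bytes

path = Path("deployment") / "example.bin"  # hypothetical location
path.parent.mkdir(parents=True, exist_ok=True)

write_bytes(path, b"\x00\x01\x02")
assert read_bytes(path) == b"\x00\x01\x02"
```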