Spaces:
Running
Running
victormiller
committed on
Commit
•
44f2e3d
1
Parent(s):
752d87b
Update common.py
Browse files
common.py
CHANGED
@@ -46,6 +46,27 @@ r"[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:(?:[A
|
|
46 |
ip_address_code = """
|
47 |
r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
|
48 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
global_div = Div(
|
50 |
Section(
|
51 |
H2("Global Steps"),
|
@@ -100,12 +121,13 @@ global_div = Div(
|
|
100 |
H2("PII Removal"),
|
101 |
H3("Motivation Behind PII Removal"),
|
102 |
P("PII refers to any information that can be used to identify an individual, such as names, addresses, phone numbers, email addresses, and social security numbers. PII removal is essential for data privacy and security, as well as for compliance with global regulations. By removing PII from the training data, we can reduce the risk of data breaches and unauthorized access to sensitive information. Additionally, models can also generate PII during inference time."),
|
|
|
103 |
),
|
104 |
Section(
|
105 |
H3("Removing PII"),
|
106 |
P("We have removed two types of PII from the dataset: email address and IP address. Regular expressions are used to identify and replace these PII with a generic placeholder. Below is an example of how we removed email addresses from the dataset:"),
|
107 |
P("We have used the following regular expressions to identify and replace PII:"),
|
108 |
-
Ul(Li("Email:"), Li(email_code, style="list-style-type: none"), Li("IP Address:
|
109 |
),
|
110 |
Section(
|
111 |
H2("Normalization Form C (NFC)"),
|
|
|
46 |
ip_address_code = """
|
47 |
r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
|
48 |
"""
|
49 |
+
|
50 |
+
pii_table = pd.DataFrame(
|
51 |
+
{
|
52 |
+
"PII Type": [
|
53 |
+
"Email",
|
54 |
+
"IP Address",
|
55 |
+
],
|
56 |
+
"Examples": [
|
57 |
+
"john.doe@llm360.ai",
|
58 |
+
"172.217.164.110",
|
59 |
+
],
|
60 |
+
"Target": [
|
61 |
+
"firstname.lastname@example.com",
|
62 |
+
'["22.214.171.124" , ...]',
|
63 |
+
],
|
64 |
+
}
|
65 |
+
)
|
66 |
+
|
67 |
+
table_html_pii = pii_table.to_html(index=False, border=0)
|
68 |
+
table_div_pii = Div(NotStr(table_html_pii), style="margin: 40px;")
|
69 |
+
|
70 |
global_div = Div(
|
71 |
Section(
|
72 |
H2("Global Steps"),
|
|
|
121 |
H2("PII Removal"),
|
122 |
H3("Motivation Behind PII Removal"),
|
123 |
P("PII refers to any information that can be used to identify an individual, such as names, addresses, phone numbers, email addresses, and social security numbers. PII removal is essential for data privacy and security, as well as for compliance with global regulations. By removing PII from the training data, we can reduce the risk of data breaches and unauthorized access to sensitive information. Additionally, models can also generate PII during inference time."),
|
124 |
+
table_div_pii,
|
125 |
),
|
126 |
Section(
|
127 |
H3("Removing PII"),
|
128 |
P("We have removed two types of PII from the dataset: email address and IP address. Regular expressions are used to identify and replace these PII with a generic placeholder. Below is an example of how we removed email addresses from the dataset:"),
|
129 |
P("We have used the following regular expressions to identify and replace PII:"),
|
130 |
+
Ul(Li("Email:"), Li(email_code, style="list-style-type: none"), Li("IP Address:"), Li(ip_address_code, style="list-style-type: none")),
|
131 |
),
|
132 |
Section(
|
133 |
H2("Normalization Form C (NFC)"),
|