Spaces:
Runtime error
Runtime error
ziggycross
committed on
Commit
•
a51662f
1
Parent(s):
6c3e9dd
Implemented k_anonymizer.
Browse files- modules.py +55 -2
modules.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
|
3 |
SUPPORTED_TYPES = [".csv", ".json", ".xlsx"]
|
@@ -53,5 +55,56 @@ def data_cleaner(df, drop_missing=False, remove_duplicates=True):
|
|
53 |
if remove_duplicates: df = df.drop_duplicates()
|
54 |
return df
|
55 |
|
56 |
-
def
|
57 |
-
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from itertools import combinations
|
2 |
+
import numpy as np
|
3 |
import pandas as pd
|
4 |
|
5 |
SUPPORTED_TYPES = [".csv", ".json", ".xlsx"]
|
|
|
55 |
if remove_duplicates: df = df.drop_duplicates()
|
56 |
return df
|
57 |
|
58 |
+
def unique_ratio(df, col):
    """Return the fraction of non-null values in ``df[col]`` that are unique.

    A ratio near 1.0 marks a highly identifying column — the prime
    candidate for anonymization (binning).

    Parameters
    ----------
    df : pandas.DataFrame
    col : hashable
        Column label to inspect.

    Returns
    -------
    float
        ``nunique / count`` over non-null entries, or 0.0 when the column
        has no non-null values (the original raised ZeroDivisionError).
    """
    non_null = df[col].count()  # count() skips NaN, matching nunique()
    if non_null == 0:
        return 0.0
    return df[col].nunique() / non_null
|
60 |
+
|
61 |
+
def bin_numeric(df, name_col: str, num_bins: int):
    """Coarsen a numeric column into ``num_bins`` range labels.

    The sorted column is split into roughly equal-frequency chunks and
    each value is replaced by the string ``"<lo> - <hi>"`` naming the
    chunk it falls in, reducing the column's uniqueness for
    k-anonymization.  ``df`` is modified in place and also returned.

    Bug fixed: every bin previously used a strict upper bound, so the
    column maximum matched no bin and silently stayed numeric; the last
    bin is now inclusive of its upper edge.
    """
    snapshot = df[name_col].copy()  # numeric values, taken before any overwrite
    ordered = snapshot.sort_values()
    lo, hi = ordered.min(), ordered.max()

    # Equal-frequency split; the first element of each later chunk is a pivot.
    chunks = np.array_split(ordered.values, num_bins)
    pivots = [lo] + [c[0] for c in chunks[1:]] + [hi]
    edges = [(pivots[i], pivots[i + 1]) for i in range(num_bins)]

    # Labels are strings, so widen the column dtype up front (avoids the
    # incompatible-dtype setitem deprecation when writing into an int column).
    df[name_col] = df[name_col].astype(object)

    # One pass per row instead of the original bins-by-rows double scan.
    for row in df.index:
        value = snapshot.loc[row]
        for i, (bin_lo, bin_hi) in enumerate(edges):
            is_last = i == num_bins - 1
            if bin_lo <= value < bin_hi or (is_last and value == bin_hi):
                df.loc[row, name_col] = f"{bin_lo} - {bin_hi}"
                break
    return df
|
78 |
+
|
79 |
+
def get_kanon_false(df, k=2):
    """Find numeric columns that break k-anonymity.

    Examines every k-wise combination of numeric columns; if any value
    combination occurs fewer than ``k`` times, a record can be singled
    out, so every column of that combination is flagged.

    Parameters
    ----------
    df : pandas.DataFrame
    k : int, default 2
        Both the combination size and the minimum allowed group size.

    Returns
    -------
    list[tuple]
        ``(column, uniqueness_ratio)`` pairs sorted by ratio descending —
        the most identifying columns first.

    Bug fixes vs. the original:
    * a combination violates k-anonymity when its count is < k, not only
      when it equals exactly k - 1;
    * all k columns of a violating tuple are flagged, not just the first
      two (the old code assumed k == 2);
    * columns are selected with a list, not a tuple, which .loc would
      treat as a MultiIndex key on the column axis.
    """
    numeric = df.select_dtypes(include=np.number)
    flagged = set()  # (column, uniqueness ratio) pairs needing anonymization

    for combo in combinations(numeric.columns, k):
        # Group size of every distinct value combination for these columns.
        group_sizes = numeric[list(combo)].value_counts()
        if (group_sizes < k).any():  # some group is too small to hide in
            for col in combo:
                # uniqueness ratio = nunique / non-null count (cf. unique_ratio)
                flagged.add((col, numeric[col].nunique() / numeric[col].count()))

    return sorted(flagged, key=lambda pair: pair[1], reverse=True)
|
97 |
+
|
98 |
+
def k_anonymize(df, k=2):
    """Iteratively bin numeric columns until ``df`` is k-anonymous.

    Repeatedly flags columns that violate k-anonymity (see
    ``get_kanon_false``) and coarsens each one by binning its values into
    range labels (see ``bin_numeric``), until no violating column
    remains.  Binned columns become strings and drop out of the numeric
    check, so the loop terminates.  Modifies ``df`` and returns it.

    Bug fixed: ``k`` is now forwarded to ``get_kanon_false``; the old
    code silently used the default k=2 regardless of the argument.
    """
    violations = get_kanon_false(df, k)
    while violations:
        for col, _ratio in violations:
            print(f"Binning {col}")
            df = bin_numeric(df, col, num_bins=15)
        violations = get_kanon_false(df, k)
        print(f"Updated sensitivity: {violations}")
    return df
|
108 |
+
|
109 |
+
def data_anonymizer(df, k=2):
    """Anonymize ``df`` via k-anonymization.

    Thin public entry point that delegates to ``k_anonymize`` so callers
    depend on a stable module-level name.
    """
    anonymized = k_anonymize(df, k)
    return anonymized
|