In [1]:
from itertools import combinations
import numpy as np
import os
import pandas as pd

### Load data from file into a pandas df

In [2]:
DATADIR="data/"
FILENAME=None

while FILENAME is None:
    
    file_candidate = input("Enter file name:")
    if file_candidate == "": break
    
    try:
        print(f"Assesing file '{file_candidate}'...".ljust(120), end="\r")
        file_path = DATADIR + file_candidate
        extension = file_candidate.split(".")[-1] 
        match extension:
            case "csv":
                df = pd.read_csv(file_path)
            case "json":
                df = pd.read_json(file_path)
            case "xlsx":
                df = pd.read_excel(file_path)
            case _:
                print(f"Error: Invalid extension '{extension}'")
                continue
        print(f"File '{file_candidate}' loaded successfully.")
        rows, columns = df.shape
        print(f"Found {rows} rows, {columns} columns")
        FILENAME = file_candidate
    except FileNotFoundError:
        print(f"Error: '{file_candidate}' doesn't exist in {os.getcwd()}/{DATADIR}")
    except Exception as error:
        print(f"Error: Unable to read file '{file_candidate}' ({str(type(error))}: {error})".ljust(120))

File 'hr.csv' loaded successfully.                                                                                      
Found 311 rows, 36 columns


### Clean data to remove duplicates and rows with missing values.

In [3]:
DROP_MISSING = False
REMOVE_DUPLICATES = True

df = df.dropna(how="any" if DROP_MISSING else "all")
if REMOVE_DUPLICATES: df = df.drop_duplicates()

### Anonymize data

In [4]:
K = 2
MAX_CATEGORICAL_SIZE = 50
BIN_SIZE = 20
SENSITIVITY_MINIMUM = 2

def column_combinations(df, k):
    return list(combinations(df.columns, k))

def k_redact(df, k):
    kwise_combinations = column_combinations(df, k) 
    
    for columns in kwise_combinations:
        df_search = df.loc[:, columns]
        sensitive_data = [
            (columns, key)
            for key, value
            in df_search.value_counts().to_dict().items()
            if value == 1
            ]
        if not sensitive_data: continue
        for columns, values in sensitive_data:
            for column, value in zip(columns, values):
                df_search = df_search.loc[df[column] == value]
                if df_search.shape[0] == 1:
                    for column in columns:
                        df_search[column] = None
    
    return df

def sensitive_values(series, sensitivity_minimum):
    return {key
        for key, value
        in series.value_counts().to_dict().items()
        if value < sensitivity_minimum
        }

def drop_sensitive(series, sensitivity_minimum):
    series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None

def bin_numeric(df, to_process, bin_size, sensitivity_minimum):
    processed = set()
    rows, _ = df.shape
    num_bins = rows//bin_size
    for column_name in to_process:
        column = df[column_name]
        if column.dtype.kind not in "biufc": continue
        array = sorted(np.array(column))
        array_min, array_max = array[0], array[-1]
        splits = [array_min] + list(np.array_split(array, num_bins)) + [array_max]
        bins = [
            (np.min(split), np.max(split))
            for split
            in (splits[i] for i in range(num_bins))
            ]
        result = [None] * rows
        for bin_min, bin_max in bins:
            for i, value in enumerate(column):
                if bin_min <= value <= bin_max:
                    result[i] = (bin_min, bin_max)
        df[column_name] = result
        drop_sensitive(df[column_name], sensitivity_minimum)
        processed.add(column_name)
    return df, to_process - processed

def find_categorical(df, to_process, max_categorical_size, sensitivity_minimum):
    processed = set()
    for column_name in to_process:
        column = df[column_name]
        if column.nunique() <= max_categorical_size:
            drop_sensitive(column, sensitivity_minimum)
            processed.add(column_name)
    return df, to_process - processed

def redact(df, to_process, sensitivity_minimum):
    processed = set()
    for column_name in to_process:
        column = df[column_name]
        
        is_object = column.dtype == object
        if not is_object: continue

        # Check if any unique values exist, and redact them
        drop_sensitive(column, sensitivity_minimum)
        processed.add(column_name)

    return df, to_process - processed

def anonymize(df, max_categorical_size, bin_size, sensitivity_minimum):
    to_process = set(df.columns)
    df, to_process = redact(df, to_process, sensitivity_minimum)
    df, to_process = find_categorical(df, to_process, max_categorical_size, sensitivity_minimum)
    df, to_process = bin_numeric(df, to_process, bin_size, sensitivity_minimum)
    return df, to_process

def data_anonymizer(df, k, max_categorical_size, bin_size, sensitivity_minimum):
    start_dtypes = df.dtypes.to_dict()
    df, unprocessed = anonymize(df, max_categorical_size, bin_size, sensitivity_minimum)
    df = k_redact(df, k)
    end_dtypes = df.dtypes.to_dict()

    # Type correction
    for column in df.columns:
        start_type, end_type  = start_dtypes[column], end_dtypes[column]
        if start_type == end_type: continue
        if start_type.kind == "i" and end_type.kind == "f":
            df[column] = df[column].astype("Int64")

    return df, unprocessed

df, unprocessed_columns = data_anonymizer(df, K, MAX_CATEGORICAL_SIZE, BIN_SIZE, SENSITIVITY_MINIMUM)
if unprocessed_columns: print(f"Failed to process columns '{unprocessed_columns}'")
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs

Unnamed: 0,Employee_Name,EmpID,MarriedID,MaritalStatusID,GenderID,EmpStatusID,DeptID,PerfScoreID,FromDiversityJobFairID,Salary,...,ManagerName,ManagerID,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,LastPerformanceReview_Date,DaysLateLast30,Absences
0,,"(10022, 10042)",0,0,1,1,5,4,0,"(62065, 63381)",...,Michael Albert,22.0,LinkedIn,Exceeds,"(4.52, 4.68)",5,0,1/17/2019,0,1
1,,"(10064, 10084)",1,1,1,5,3,3,0,"(92328, 104437)",...,Simon Roup,4.0,Indeed,Fully Meets,"(4.9, 5.0)",3,6,,0,17
2,,"(10190, 10210)",1,1,0,5,5,3,0,"(64816, 66825)",...,Kissy Sullivan,20.0,LinkedIn,Fully Meets,"(2.9, 3.18)",3,0,,0,3
3,,"(10085, 10105)",1,1,0,1,5,3,0,"(64816, 66825)",...,Elijiah Gray,16.0,Indeed,Fully Meets,"(4.7, 4.88)",5,0,1/3/2019,0,15
4,,"(10064, 10084)",0,2,0,5,5,3,0,"(47837, 51259)",...,Webster Butler,39.0,Google Search,Fully Meets,"(5.0, 5.0)",4,0,2/1/2016,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,,"(10127, 10147)",0,0,1,1,5,3,0,"(64816, 66825)",...,Kissy Sullivan,20.0,LinkedIn,Fully Meets,"(3.99, 4.1)",4,0,2/28/2019,0,13
307,,,0,0,0,5,5,1,0,"(47837, 51259)",...,Brannon Miller,12.0,Google Search,PIP,"(3.19, 3.5)",2,0,,5,4
308,,"(10001, 10021)",0,0,0,1,3,4,0,,...,Janet King,2.0,Employee Referral,Exceeds,"(4.52, 4.68)",5,6,2/21/2019,0,16
309,,"(10043, 10063)",0,0,0,1,3,3,0,"(77692, 90100)",...,Simon Roup,4.0,Employee Referral,Fully Meets,"(5.0, 5.0)",3,5,2/1/2019,0,11
