{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from itertools import combinations\n", "import numpy as np\n", "import os\n", "import pandas as pd" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Load data from file into a pandas df" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File 'hr.csv' loaded successfully. \n", "Found 311 rows, 36 columns\n" ] } ], "source": [ "DATADIR=\"data/\"\n", "FILENAME=None\n", "\n", "while FILENAME is None:\n", " \n", " file_candidate = input(\"Enter file name:\")\n", " if file_candidate == \"\": break\n", " \n", " try:\n", " print(f\"Assesing file '{file_candidate}'...\".ljust(120), end=\"\\r\")\n", " file_path = DATADIR + file_candidate\n", " extension = file_candidate.split(\".\")[-1] \n", " match extension:\n", " case \"csv\":\n", " df = pd.read_csv(file_path)\n", " case \"json\":\n", " df = pd.read_json(file_path)\n", " case \"xlsx\":\n", " df = pd.read_excel(file_path)\n", " case _:\n", " print(f\"Error: Invalid extension '{extension}'\")\n", " continue\n", " print(f\"File '{file_candidate}' loaded successfully.\")\n", " rows, columns = df.shape\n", " print(f\"Found {rows} rows, {columns} columns\")\n", " FILENAME = file_candidate\n", " except FileNotFoundError:\n", " print(f\"Error: '{file_candidate}' doesn't exist in {os.getcwd()}/{DATADIR}\")\n", " except Exception as error:\n", " print(f\"Error: Unable to read file '{file_candidate}' ({str(type(error))}: {error})\".ljust(120))" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Clean data to remove duplicates and rows with missing values." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "DROP_MISSING = False\n", "REMOVE_DUPLICATES = True\n", "\n", "df = df.dropna(how=\"any\" if DROP_MISSING else \"all\")\n", "if REMOVE_DUPLICATES: df = df.drop_duplicates()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Anonymize data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Employee_NameEmpIDMarriedIDMaritalStatusIDGenderIDEmpStatusIDDeptIDPerfScoreIDFromDiversityJobFairIDSalary...ManagerNameManagerIDRecruitmentSourcePerformanceScoreEngagementSurveyEmpSatisfactionSpecialProjectsCountLastPerformanceReview_DateDaysLateLast30Absences
0None(10022, 10042)0011540(62065, 63381)...Michael Albert22.0LinkedInExceeds(4.52, 4.68)501/17/201901
1None(10064, 10084)1115330(92328, 104437)...Simon Roup4.0IndeedFully Meets(4.9, 5.0)36None017
2None(10190, 10210)1105530(64816, 66825)...Kissy Sullivan20.0LinkedInFully Meets(2.9, 3.18)30None03
3None(10085, 10105)1101530(64816, 66825)...Elijiah Gray16.0IndeedFully Meets(4.7, 4.88)501/3/2019015
4None(10064, 10084)0205530(47837, 51259)...Webster Butler39.0Google SearchFully Meets(5.0, 5.0)402/1/201602
..................................................................
306None(10127, 10147)0011530(64816, 66825)...Kissy Sullivan20.0LinkedInFully Meets(3.99, 4.1)402/28/2019013
307NoneNone0005510(47837, 51259)...Brannon Miller12.0Google SearchPIP(3.19, 3.5)20None54
308None(10001, 10021)0001340None...Janet King2.0Employee ReferralExceeds(4.52, 4.68)562/21/2019016
309None(10043, 10063)0001330(77692, 90100)...Simon Roup4.0Employee ReferralFully Meets(5.0, 5.0)352/1/2019011
310None(10252, 10271)0401530(45046, 47750)...David Stanley14.0LinkedInFully Meets(4.5, 4.52)501/30/201902
\n", "

311 rows × 36 columns

\n", "
" ], "text/plain": [ " Employee_Name EmpID MarriedID MaritalStatusID GenderID \\\n", "0 None (10022, 10042) 0 0 1 \n", "1 None (10064, 10084) 1 1 1 \n", "2 None (10190, 10210) 1 1 0 \n", "3 None (10085, 10105) 1 1 0 \n", "4 None (10064, 10084) 0 2 0 \n", ".. ... ... ... ... ... \n", "306 None (10127, 10147) 0 0 1 \n", "307 None None 0 0 0 \n", "308 None (10001, 10021) 0 0 0 \n", "309 None (10043, 10063) 0 0 0 \n", "310 None (10252, 10271) 0 4 0 \n", "\n", " EmpStatusID DeptID PerfScoreID FromDiversityJobFairID \\\n", "0 1 5 4 0 \n", "1 5 3 3 0 \n", "2 5 5 3 0 \n", "3 1 5 3 0 \n", "4 5 5 3 0 \n", ".. ... ... ... ... \n", "306 1 5 3 0 \n", "307 5 5 1 0 \n", "308 1 3 4 0 \n", "309 1 3 3 0 \n", "310 1 5 3 0 \n", "\n", " Salary ... ManagerName ManagerID RecruitmentSource \\\n", "0 (62065, 63381) ... Michael Albert 22.0 LinkedIn \n", "1 (92328, 104437) ... Simon Roup 4.0 Indeed \n", "2 (64816, 66825) ... Kissy Sullivan 20.0 LinkedIn \n", "3 (64816, 66825) ... Elijiah Gray 16.0 Indeed \n", "4 (47837, 51259) ... Webster Butler 39.0 Google Search \n", ".. ... ... ... ... ... \n", "306 (64816, 66825) ... Kissy Sullivan 20.0 LinkedIn \n", "307 (47837, 51259) ... Brannon Miller 12.0 Google Search \n", "308 None ... Janet King 2.0 Employee Referral \n", "309 (77692, 90100) ... Simon Roup 4.0 Employee Referral \n", "310 (45046, 47750) ... David Stanley 14.0 LinkedIn \n", "\n", " PerformanceScore EngagementSurvey EmpSatisfaction SpecialProjectsCount \\\n", "0 Exceeds (4.52, 4.68) 5 0 \n", "1 Fully Meets (4.9, 5.0) 3 6 \n", "2 Fully Meets (2.9, 3.18) 3 0 \n", "3 Fully Meets (4.7, 4.88) 5 0 \n", "4 Fully Meets (5.0, 5.0) 4 0 \n", ".. ... ... ... ... \n", "306 Fully Meets (3.99, 4.1) 4 0 \n", "307 PIP (3.19, 3.5) 2 0 \n", "308 Exceeds (4.52, 4.68) 5 6 \n", "309 Fully Meets (5.0, 5.0) 3 5 \n", "310 Fully Meets (4.5, 4.52) 5 0 \n", "\n", " LastPerformanceReview_Date DaysLateLast30 Absences \n", "0 1/17/2019 0 1 \n", "1 None 0 17 \n", "2 None 0 3 \n", "3 1/3/2019 0 15 \n", "4 2/1/2016 0 2 \n", ".. ... ... ... \n", "306 2/28/2019 0 13 \n", "307 None 5 4 \n", "308 2/21/2019 0 16 \n", "309 2/1/2019 0 11 \n", "310 1/30/2019 0 2 \n", "\n", "[311 rows x 36 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "K = 2\n", "MAX_CATEGORICAL_SIZE = 50\n", "BIN_SIZE = 20\n", "SENSITIVITY_MINIMUM = 2\n", "\n", "def column_combinations(df, k):\n", " return list(combinations(df.columns, k))\n", "\n", "def k_redact(df, k):\n", " kwise_combinations = column_combinations(df, k) \n", " \n", " for columns in kwise_combinations:\n", " df_search = df.loc[:, columns]\n", " sensitive_data = [\n", " (columns, key)\n", " for key, value\n", " in df_search.value_counts().to_dict().items()\n", " if value == 1\n", " ]\n", " if not sensitive_data: continue\n", " for columns, values in sensitive_data:\n", " for column, value in zip(columns, values):\n", " df_search = df_search.loc[df[column] == value]\n", " if df_search.shape[0] == 1:\n", " for column in columns:\n", " df_search[column] = None\n", " \n", " return df\n", "\n", "def sensitive_values(series, sensitivity_minimum):\n", " return {key\n", " for key, value\n", " in series.value_counts().to_dict().items()\n", " if value < sensitivity_minimum\n", " }\n", "\n", "def drop_sensitive(series, sensitivity_minimum):\n", " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n", "\n", "def bin_numeric(df, to_process, bin_size, sensitivity_minimum):\n", " processed = set()\n", " rows, _ = df.shape\n", " num_bins = rows//bin_size\n", " for column_name in to_process:\n", " column = df[column_name]\n", " if column.dtype.kind not in \"biufc\": continue\n", " array = sorted(np.array(column))\n", " array_min, array_max = array[0], array[-1]\n", " splits = [array_min] + list(np.array_split(array, num_bins)) + [array_max]\n", " bins = [\n", " (np.min(split), np.max(split))\n", " for split\n", " in (splits[i] for i in range(num_bins))\n", " ]\n", " result = [None] * rows\n", " for bin_min, bin_max in bins:\n", " for i, value in enumerate(column):\n", " if bin_min <= value <= bin_max:\n", " result[i] = (bin_min, bin_max)\n", " df[column_name] = result\n", " drop_sensitive(df[column_name], sensitivity_minimum)\n", " processed.add(column_name)\n", " return df, to_process - processed\n", "\n", "def find_categorical(df, to_process, max_categorical_size, sensitivity_minimum):\n", " processed = set()\n", " for column_name in to_process:\n", " column = df[column_name]\n", " if column.nunique() <= max_categorical_size:\n", " drop_sensitive(column, sensitivity_minimum)\n", " processed.add(column_name)\n", " return df, to_process - processed\n", "\n", "def redact(df, to_process, sensitivity_minimum):\n", " processed = set()\n", " for column_name in to_process:\n", " column = df[column_name]\n", " \n", " is_object = column.dtype == object\n", " if not is_object: continue\n", "\n", " # Check if any unique values exist, and redact them\n", " drop_sensitive(column, sensitivity_minimum)\n", " processed.add(column_name)\n", "\n", " return df, to_process - processed\n", "\n", "def anonymize(df, max_categorical_size, bin_size, sensitivity_minimum):\n", " to_process = set(df.columns)\n", " df, to_process = redact(df, to_process, sensitivity_minimum)\n", " df, to_process = find_categorical(df, to_process, max_categorical_size, sensitivity_minimum)\n", " df, to_process = bin_numeric(df, to_process, bin_size, sensitivity_minimum)\n", " return df, to_process\n", "\n", "def data_anonymizer(df, k, max_categorical_size, bin_size, sensitivity_minimum):\n", " start_dtypes = df.dtypes.to_dict()\n", " df, unprocessed = anonymize(df, max_categorical_size, bin_size, sensitivity_minimum)\n", " df = k_redact(df, k)\n", " end_dtypes = df.dtypes.to_dict()\n", "\n", " # Type correction\n", " for column in df.columns:\n", " start_type, end_type = start_dtypes[column], end_dtypes[column]\n", " if start_type == end_type: continue\n", " if start_type.kind == \"i\" and end_type.kind == \"f\":\n", " df[column] = df[column].astype(\"Int64\")\n", "\n", " return df, unprocessed\n", "\n", "df, unprocessed_columns = data_anonymizer(df, K, MAX_CATEGORICAL_SIZE, BIN_SIZE, SENSITIVITY_MINIMUM)\n", "if unprocessed_columns: print(f\"Failed to process columns '{unprocessed_columns}'\")\n", "df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }