from itertools import combinations
import numpy as np
import pandas as pd
SUPPORTED_TYPES = [".csv", ".json", ".xlsx"]

def hello_world():
    return "hello world!"

def load_file(file):
    """
    Takes a file uploaded through Streamlit and loads it into a DataFrame.
    Returns a DataFrame, metadata, and a result string.

    @param file: File uploaded into Streamlit.
    @rtype: tuple
    @return: A tuple of format (pd.DataFrame, (str, str), str).
    """
    df = None
    if file is None:
        return df, ("", ""), ""
    filename = file.name
    extension = filename.split(".")[-1]
    metadata = (filename, extension)
    import_functions = {
        "csv": pd.read_csv,
        "json": pd.read_json,
        "xlsx": pd.read_excel
    }
    try:
        reader = import_functions.get(extension)
        if reader is None:
            return df, metadata, f"Error: Invalid extension '{extension}'"
        df = reader(file)
        rows, columns = df.shape
        return df, metadata, f"File '{filename}' loaded successfully.\nFound {rows} rows, {columns} columns."
    except Exception as error:
        return df, metadata, f"Error: Unable to read file '{filename}' ({type(error).__name__}: {error})"

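# Usage sketch (an assumption, not part of the original file): load_file expects the
# object returned by Streamlit's file uploader, so a call site might look like this.
# The widget label and variable names are illustrative only.
#
#   import streamlit as st
#   uploaded = st.file_uploader("Upload a dataset", type=["csv", "json", "xlsx"])
#   df, (filename, extension), message = load_file(uploaded)
#   st.text(message)
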
def data_cleaner(df, drop_missing=False, remove_duplicates=True):
    """
    Takes a DataFrame and removes empty and duplicate entries.

    @type df: pd.DataFrame
    @param df: A DataFrame of uncleaned data.
    @type drop_missing: bool
    @param drop_missing: If True, drops rows with any missing values ("any"); otherwise only fully empty rows are dropped ("all").
    @type remove_duplicates: bool
    @param remove_duplicates: Determines if duplicate rows are removed.
    @rtype: pd.DataFrame
    @return: A DataFrame with the requested cleaning applied.
    """
    df = df.dropna(how="any" if drop_missing else "all")
    if remove_duplicates:
        df = df.drop_duplicates()
    return df

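# Example (illustrative assumption): with the defaults, only fully empty rows and exact
# duplicate rows are removed; passing drop_missing=True also drops partially empty rows.
#
#   cleaned = data_cleaner(df, drop_missing=True, remove_duplicates=True)
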
def column_combinations(df, k):
    """Return every combination of k column names from df."""
    return list(combinations(df.columns, k))

def k_redact(df, k):
    """Redact any combination of k column values that uniquely identifies a single row."""
    for columns in column_combinations(df, k):
        df_search = df.loc[:, list(columns)]
        # Value combinations that occur exactly once are potentially identifying.
        sensitive_data = [
            (columns, key)
            for key, value
            in df_search.value_counts().to_dict().items()
            if value == 1
        ]
        if not sensitive_data:
            continue
        for combo_columns, values in sensitive_data:
            # Build a row mask with isin() rather than ==, so tuple-valued (binned)
            # cells compare correctly instead of being broadcast as sequences.
            mask = pd.Series(True, index=df.index)
            for column, value in zip(combo_columns, values):
                mask &= df[column].isin([value])
            # Redact in df itself, but only if the combination still identifies exactly one row.
            if mask.sum() == 1:
                df.loc[mask, list(combo_columns)] = None
    return df

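# Example (illustrative assumption, hypothetical column names): with k=2, a
# (zip_code, birth_year) pair that appears in exactly one row is treated as identifying,
# and both values in that row are set to None.
#
#   df = k_redact(df, k=2)
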
def sensitive_values(series, sensitivity_minimum):
    """Return the set of values that occur fewer than sensitivity_minimum times in series."""
    return {
        key
        for key, value
        in series.value_counts().to_dict().items()
        if value < sensitivity_minimum
    }

def drop_sensitive(series, sensitivity_minimum):
    """Return a copy of series with rare values (fewer than sensitivity_minimum occurrences) replaced by None."""
    # Work on a copy and return it so callers can assign the result back to the DataFrame;
    # mutating df[column] in place is unreliable when the assignment forces a dtype change.
    series = series.copy()
    series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None
    return series

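# Example (illustrative assumption): values occurring fewer than sensitivity_minimum
# times are replaced with None.
#
#   drop_sensitive(pd.Series(["a", "a", "b"]), sensitivity_minimum=2)
#   # -> ["a", "a", None]
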
def bin_numeric(df, to_process, bin_size, sensitivity_minimum):
    """Replace numeric columns with (bin_min, bin_max) range tuples of roughly bin_size values each."""
    processed = set()
    rows, _ = df.shape
    num_bins = max(rows // bin_size, 1)
    for column_name in to_process:
        column = df[column_name]
        # Only numeric columns (bool, int, uint, float, complex) are binned.
        if column.dtype.kind not in "biufc":
            continue
        array = np.sort(column.to_numpy())
        # Split the sorted values into num_bins chunks and record each chunk's value range.
        bins = [
            (split.min(), split.max())
            for split in np.array_split(array, num_bins)
            if split.size > 0
        ]
        result = [None] * rows
        for bin_min, bin_max in bins:
            for i, value in enumerate(column):
                if bin_min <= value <= bin_max:
                    result[i] = (bin_min, bin_max)
        df[column_name] = result
        df[column_name] = drop_sensitive(df[column_name], sensitivity_minimum)
        processed.add(column_name)
    return df, to_process - processed

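# Example (illustrative assumption): with bin_size=2, a six-row column [1, 2, 3, 4, 5, 6]
# is replaced by range tuples such as (1, 2), (3, 4), (5, 6); rare ranges are then
# redacted like any other value.
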
def find_categorical(df, to_process, max_categorical_size, sensitivity_minimum):
    """Treat low-cardinality columns as categorical and redact their rare values."""
    processed = set()
    for column_name in to_process:
        column = df[column_name]
        if column.nunique() <= max_categorical_size:
            df[column_name] = drop_sensitive(column, sensitivity_minimum)
            processed.add(column_name)
    return df, to_process - processed

def redact(df, to_process, sensitivity_minimum):
    """Redact rare values in object (free-text) columns."""
    processed = set()
    for column_name in to_process:
        column = df[column_name]
        if column.dtype != object:
            continue
        # Check if any rare values exist, and redact them
        df[column_name] = drop_sensitive(column, sensitivity_minimum)
        processed.add(column_name)
    return df, to_process - processed

def anonymize(df, max_categorical_size, bin_size, sensitivity_minimum):
    """Run the column-wise anonymization pipeline: text columns, then categorical, then numeric."""
    to_process = set(df.columns)
    df, to_process = redact(df, to_process, sensitivity_minimum)
    df, to_process = find_categorical(df, to_process, max_categorical_size, sensitivity_minimum)
    df, to_process = bin_numeric(df, to_process, bin_size, sensitivity_minimum)
    return df, to_process

def data_anonymizer(df, k, max_categorical_size, bin_size, sensitivity_minimum):
    """Anonymize df column-wise, then apply k-wise redaction, preserving integer dtypes where possible."""
    start_dtypes = df.dtypes.to_dict()
    df, unprocessed = anonymize(df, max_categorical_size, bin_size, sensitivity_minimum)
    df = k_redact(df, k)
    end_dtypes = df.dtypes.to_dict()
    # Type correction: integer columns that gained missing values during redaction were
    # upcast to float, so convert them to the nullable Int64 dtype instead.
    for column in df.columns:
        start_type, end_type = start_dtypes[column], end_dtypes[column]
        if start_type == end_type:
            continue
        if start_type.kind == "i" and end_type.kind == "f":
            df[column] = df[column].astype("Int64")
    return df, unprocessed

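# Hypothetical end-to-end demo (an assumption, not part of the original Space): builds a
# tiny in-memory DataFrame and runs the cleaning and anonymization pipeline. Parameter
# values are illustrative; on a frame this small, most values end up redacted.
if __name__ == "__main__":
    demo = pd.DataFrame({
        "age": [23, 25, 25, 31, 31, 47],
        "city": ["Oslo", "Oslo", "Bergen", "Oslo", "Bergen", "Tromsø"],
        "name": ["A", "B", "C", "D", "E", "F"],
    })
    demo = data_cleaner(demo)
    anonymized, unprocessed = data_anonymizer(
        demo, k=2, max_categorical_size=3, bin_size=2, sensitivity_minimum=2
    )
    print(anonymized)
    print("Columns left unprocessed:", unprocessed)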