File size: 3,751 Bytes
7ad6c98
 
1cddef5
7ad6c98
1cddef5
8f994b4
 
 
003953a
4db793d
1cddef5
 
4db793d
6c3e9dd
1cddef5
aa72b6d
 
1cddef5
aa72b6d
1cddef5
7ad6c98
6c3e9dd
 
1cddef5
 
6c3e9dd
1cddef5
003953a
 
 
aa72b6d
 
1cddef5
aa72b6d
1cddef5
6c3e9dd
1170641
 
 
 
aa72b6d
6c3e9dd
 
 
aa72b6d
 
 
 
 
6c3e9dd
aa72b6d
003953a
aa72b6d
6c3e9dd
aa72b6d
 
 
 
6c3e9dd
 
 
 
 
 
003953a
6c3e9dd
003953a
6c3e9dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cddef5
aa72b6d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import io

import streamlit as st
from streamlit_extras.let_it_rain import rain

import modules

# Options
# Markdown caveat rendered in the sidebar below the download controls.
# Built from adjacent literals (one sentence per piece) so each source line
# stays readable; the leading/trailing newline and indentation are part of
# the value.
DISCLAIMER = (
    "\n"
    "    *This app processes data using 2-anonymity, an implementation of the k-anonymity framework. "
    "While this is a great start to anonymizing your data, it is by no means perfect, and should be used with caution. "
    "For example, some sets of sensitive features which may clearly be identified by a human could be missed by our algorithm. "
    "Please keep this in mind.*\n"
    "    "
)
# The "k" handed to modules.data_anonymizer.
K = 2

# Page Config
st.set_page_config(layout="wide")

### FILE LOADER for sidebar
# Renders the app title and the upload widget, then hands whatever the widget
# returned to modules.load_file, which yields the DataFrame (checked against
# None further down), a (filename, extension) pair, and a result value.
with st.sidebar:
    st.header("🕵️ 2anonymity")
    st.markdown("*Clean and anonymize data*")
    with st.container() as upload:
        # The label is hidden ("collapsed") but still supplied.
        file = st.file_uploader(
            "Upload dataset:",
            type=modules.SUPPORTED_TYPES,
            label_visibility="collapsed",
        )
        df, name_parts, result = modules.load_file(file)
        filename, extension = name_parts

### HELPERS
def _serialize_dataframe(frame, extension):
    """Return *frame* serialized as bytes in the format named by *extension*.

    Args:
        frame: DataFrame to export (must provide ``to_csv``, ``to_json`` and
            ``to_excel``).
        extension: Target format: ".csv", ".json" or ".xlsx".

    Returns:
        The file contents as ``bytes``, ready for ``st.download_button``.

    Raises:
        ValueError: If *extension* is not one of the supported formats.
    """
    if extension == ".csv":
        return frame.to_csv().encode("utf-8")
    if extension == ".json":
        return frame.to_json().encode("utf-8")
    if extension == ".xlsx":
        # to_excel() needs a target to write to and returns None, so the
        # workbook is written into an in-memory buffer and read back as bytes.
        buffer = io.BytesIO()
        frame.to_excel(buffer)
        return buffer.getvalue()
    raise ValueError(f"Unsupported output format: {extension!r}")


def _clean_filename(original_name, extension):
    """Return the download name ``<stem>-clean<extension>``.

    Only the final extension of *original_name* is stripped, so
    "my.data.csv" becomes "my.data-clean.csv". A name without any dot (or
    one that would leave an empty stem, such as ".csv") is used unchanged as
    the stem instead of raising.
    """
    stem = original_name.rsplit(".", 1)[0] or original_name
    return f"{stem}-clean{extension}"


### MAIN
if df is None:  # Await file to be uploaded
    rain("🤠")
else:
    ### PRE-TRANSFORM features for sidebar
    with st.sidebar:
        # Options for data loading
        with st.container():
            st.markdown("### Data loading options:")
            remove_duplicates = st.checkbox("Remove duplicate rows", value=True)
            drop_missing = st.checkbox("Remove rows with missing values", value=False)

        # Options for data optimization
        with st.container():
            st.markdown("### Anonymizing options:")
            max_categorical_size = st.slider(
                "Categorical Variable Threshold", min_value=2, max_value=200, value=50, step=1
            )
            bin_size = st.slider("Bin Size", min_value=2, max_value=200, value=20, step=1)
            # Each redaction label maps to the numeric minimum passed to the
            # anonymizer; dict order defines the order shown in the selectbox.
            sensitivity_levels = {"Low": 2, "Medium": 4, "High": 6, "Extreme": 12}
            redaction_selection = st.selectbox("Redaction strength", list(sensitivity_levels))
            sensitivity_minimum = sensitivity_levels[redaction_selection]

    ### DATA PREVIEW AND TRANSFORM
    # Preview data before transform (red-tinted table)
    with st.container():
        st.dataframe(df.style.set_properties(**{"background-color": "#fce4e4"}))

    # Transform data
    df = modules.data_cleaner(df, drop_missing, remove_duplicates)
    df, unprocessed = modules.data_anonymizer(df, K, max_categorical_size, bin_size, sensitivity_minimum)

    # Preview data after transform (green-tinted table)
    with st.container():
        st.dataframe(df.style.set_properties(**{"background-color": "#e4fce4"}))

    ### POST-TRANSFORM features for sidebar
    with st.sidebar:
        # Options for download
        with st.container():
            st.markdown("### Download options:")
            output_extension = st.selectbox("File type", [".csv", ".json", ".xlsx"])
            # Report whatever the anonymizer returned as unprocessed, if anything.
            if unprocessed:
                st.markdown(f"Error encountered when processing columns {unprocessed!s}")

        # Prepare file for download; only the selected format is serialized
        with st.container():
            output_file = _serialize_dataframe(df, output_extension)
            output_filename = _clean_filename(filename, output_extension)
            st.download_button("Download", output_file, file_name=output_filename)

        # Add a disclaimer for data security
        with st.container():
            st.markdown(
                f"""
                Disclaimer:  
                {DISCLAIMER}
                """
                )

# Attribution
st.sidebar.markdown("Created by team #2hack2furious for the hackthethreat2023")