ziggycross committed on
Commit
7ad6c98
1 Parent(s): 80e8771

Created basic web app for data cleaning.

Browse files
Files changed (2) hide show
  1. app.py +28 -0
  2. modules.py +70 -0
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import modules
2
+ import pandas as pd
3
+ import streamlit as st
4
+
5
# Page header shown above the uploader.
st.markdown(
    """
# 2hack2furious anonymiser

upload a dataset and get a cleaned dataset back.
"""
)

uploaded_file = st.file_uploader("Upload dataset:", type=modules.SUPPORTED_TYPES)

# Until the user uploads something there is nothing to load, clean or
# download, so the whole processing section is skipped on the first render.
if uploaded_file is not None:
    df, (filename, extension), result = modules.load_file(uploaded_file)
    st.text(result)

    st.title("Before:")
    st.dataframe(df)

    st.title("After:")
    df = modules.data_cleaner(df)
    st.dataframe(df)

    # The cleaned data is offered back under the uploaded file's own name.
    st.download_button("Download cleaned data", modules.create_file(df, extension), file_name=filename)

st.markdown("---")
st.text("Created by team #2hack2furious for the hackthethreat2023")
modules.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ SUPPORTED_TYPES = [".csv", ".json", ".xlsx"]
4
+
5
+ def load_file(file):
6
+ """
7
+ Takes a file given by Streamlit and loads into a DataFrame.
8
+ Returns a DataFrame, metadata, and result string.
9
+
10
+ @param file: File uploaded into streamlit.
11
+ @rtype: tuple
12
+ @return: A tuple of format (pd.DataFrame, (str, str), str).
13
+ """
14
+ df = pd.DataFrame()
15
+
16
+ if file is None: return df, ""
17
+
18
+ filename = file.name
19
+ extension = filename.split(".")[-1]
20
+ metadata = (filename, extension)
21
+
22
+ try:
23
+ match extension:
24
+ case "csv":
25
+ df = pd.read_csv(file)
26
+ case "json":
27
+ df = pd.read_json(file)
28
+ case "xlsx":
29
+ df = pd.read_excel(file)
30
+ case _:
31
+ return df, metadata, f"Error: Invalid extension '{extension}'"
32
+ rows, columns = df.shape
33
+ return df, metadata, f"File '{filename}' loaded successfully.\nFound {rows} rows, {columns} columns."
34
+ except Exception as error:
35
+ return df, metadata, f"Error: Unable to read file '{filename}' ({type(error)}: {error})"
36
+
37
+ def create_file(df, extension):
38
+ """
39
+ Prepares a dataframe from streamlit for download.
40
+
41
+ @type df: pd.DataFrame
42
+ @param df: A DataFrame to package into a file.
43
+ @type extension: pd.DataFrame
44
+ @param extension: The desired filetype.
45
+ @return: A file container ready for download.
46
+ """
47
+ match extension:
48
+ case "csv":
49
+ return df.to_csv()
50
+ case "json":
51
+ return df.to_json()
52
+ case "xlsx":
53
+ return df.to_excel()
54
+
55
def data_cleaner(df, drop_missing=False, remove_duplicates=True):
    """
    Cleans a DataFrame by dropping empty rows and, optionally, duplicates.

    @type df: pd.DataFrame
    @param df: The uncleaned data.
    @type drop_missing: bool
    @param drop_missing: If True, rows with any missing value are dropped;
        otherwise only rows where every value is missing are dropped.
    @type remove_duplicates: bool
    @param remove_duplicates: If True, duplicate rows are removed.
    @rtype: pd.DataFrame
    @return: A new DataFrame with the requested cleaning applied.
    """
    if drop_missing:
        cleaned = df.dropna(how="any")
    else:
        cleaned = df.dropna(how="all")

    if not remove_duplicates:
        return cleaned
    return cleaned.drop_duplicates()