File size: 9,274 Bytes
afca08a
 
 
 
 
 
1fef517
 
 
 
 
 
 
 
afca08a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1fef517
 
 
 
afca08a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1fef517
 
 
 
afca08a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import streamlit as st
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Read the full transaction table; the relative path is resolved against the
# directory the app is launched from.
fraud = pd.read_csv('fraud_test.csv')

# Share of rows, in percent, that the charts below are built from.
sample_percentage = 50  # Adjust this percentage as needed

# Draw the random subset; the fixed seed keeps the sample identical across runs.
data = fraud.sample(
    frac=sample_percentage / 100,
    random_state=22,
)

# def annotate_bar(ax, custom_y_func, font_size=14):

#     for p in ax.patches:
#         # Calculate annotation
#         value = str(round(p.get_height(), 1))
#         x = (p.get_x() + p.get_width() / 2) * 0.99
#         y = ((p.get_y() + p.get_height() / 2) * 0.99)
        
#         y = custom_y_func(y)
#         ax.annotate(
#             value,
#             (x, y),
#             color="black",
#             size=font_size, ha='center', va='center'
#         )

def _render_figure(fig):
    """Draw *fig* on the Streamlit page, then close it.

    Figures created with ``plt.subplots`` stay registered with pyplot until
    they are closed, so each one is released as soon as it has been rendered.
    """
    st.pyplot(fig)
    plt.close(fig)


def _fraud_label(flag):
    """Return the display label for an ``is_fraud`` value (1 means fraud)."""
    return "Fraud" if flag == 1 else 'Not Fraud'


def _age_group(age):
    """Bucket an age in years into the group names used by the charts."""
    if age <= 18:
        return 'Teenager'
    if age <= 25:
        return "Young Adult"
    if age <= 64:
        return "Adult"
    return "Elder"


def _annotate_bar(ax, custom_y_func, font_size=14):
    """Write each bar's height (rounded to one decimal) onto the bar.

    Args:
        ax: Axes whose patches (bars) are annotated.
        custom_y_func: Callable that receives the default y position (roughly
            the middle of the bar) and returns the y position to use; lets
            the caller lift labels of very short bars.
        font_size: Font size of the annotation text.
    """
    for bar in ax.patches:
        text = str(round(bar.get_height(), 1))
        x_pos = (bar.get_x() + bar.get_width() / 2) * 0.99
        y_pos = custom_y_func((bar.get_y() + bar.get_height() / 2) * 0.99)
        ax.annotate(text, (x_pos, y_pos), color="black", size=font_size, ha='center', va='center')


def _plot_top10(frame, columns, display_names, heading, summarise, y_label, fraud_labels):
    """Render one two-panel "top 10" bar figure per grouping column.

    Args:
        frame: DataFrame with an ``is_fraud`` column and every column in *columns*.
        columns: Column names to group by, one figure each.
        display_names: Text shown for each column in the subheader and x label.
        heading: Subheader prefix; the display name is appended after a space.
        summarise: Callable turning a ``DataFrameGroupBy`` into the Series to rank.
        y_label: Label for the y axis of every panel.
        fraud_labels: Panel titles; position ``i`` is paired with ``is_fraud == i``.
    """
    for col, shown in zip(columns, display_names):
        st.subheader(f"{heading} {shown}")
        fig, axes = plt.subplots(1, 2, figsize=(30, 5))
        # State codes are short enough to read horizontally; other labels are not.
        rotation = 0 if col == 'state' else 90
        for flag, (axis, title) in enumerate(zip(axes, fraud_labels)):
            subset = frame[frame['is_fraud'] == flag]
            top = summarise(subset.groupby(col)).nlargest(10)
            axis.bar(top.index, top.values, color='#a1c9f4')
            axis.set_title(title)
            axis.set_xlabel(shown)
            axis.set_ylabel(y_label)
            axis.tick_params(axis='x', rotation=rotation)
        _render_figure(fig)


def eda_page():
    """Render the exploratory-data-analysis page of the Streamlit app.

    Reads the module-level ``data`` DataFrame and draws, in order: top-10
    charts by transaction amount and by transaction count, total count and
    amount per fraud class, and distributions by gender, category, age and
    age group, each followed by a written explanation.

    Side effect: converts ``data['dob']`` to datetime and adds the ``age`` and
    ``age_group`` columns to the module-level ``data`` (kept in place in case
    other parts of the app read those columns).
    """
    fraud_labels = ['Not Fraud', 'Fraud']
    group_columns = ['job', 'state', 'city', 'merchant']

    # Set once up front: the top-10 bars use an explicit colour, so this only
    # affects the seaborn/matplotlib default colours of the later charts.
    sns.set_palette("pastel")

    st.title("Eksploratory Data Analysis")
    st.write('Analyze the DataFrame for Better Understanding')
    st.markdown("<h2><b>Top 10 Transaction Amount</b></h2>", unsafe_allow_html=True)

    # TOP Transaction Amount
    _plot_top10(
        data,
        group_columns,
        group_columns,
        "Top 10 transaction amount by",
        lambda grouped: grouped['amt'].sum(),
        'Amount',
        fraud_labels,
    )
    st.write("**Explanation**:")
    markdown_text = """

    * From the top 10 transaction amount by job we can see `Science Writer` have the most fraud with over 10.000 transaction amount meanwhile `Film/Video editor` are the most non fraud with almost 160.000 transaction

    * From the top 10 transaction amount by state we can see `NY` have the most fraud with almost 60.000 transaction amount meanwhile `TX`are the most non fraud with above 1.400.000 transaction

    * From the top 10 transaction amount by city we can see `Camden` have the most fraud with over 10.000 transaction amount meanwhile `Meridian` are the most non fraud with almost 100.000 transaction

    * From the top 10 transaction amount by merchant we can see `Heathcote, Yost and Kertzmann` have most fraud with almost 10.000 transaction amount meanwhile `Killback-LLC` are the most non fraud with over 80.000 transaction

    """
    st.markdown(markdown_text)

    st.markdown("<h2><b>Top 10 Transaction Count</b></h2>", unsafe_allow_html=True)
    # By Transaction count
    _plot_top10(
        data,
        group_columns,
        ['Job', 'State', 'City', 'Merchant'],
        "Top 10 transaction by",
        lambda grouped: grouped.size(),
        'Count',
        fraud_labels,
    )

    st.write("")  # Add a blank line

    st.write("**Explanation**:")
    markdown_text = """

    * From the top 10 transaction by job we can see `Color Technologist` have the most fraud with over 20 transaction meanwhile `Film/Video editor` are the most not fraud with over 2.000 transaction

    * From the top 10 transaction by state we can see `NY` have the most fraud with over 80 transaction meanwhile `TX`are the most not fraud with 20.000 transaction

    * From the top 10 transaction by city we can see `Camden` have the most fraud over 20 transaction meanwhile `Birmingham` are the most not fraud with almost 1.200 transaction

    * From the top 10 transaction by merchant we can see `Healthcore LLC.` have most fraud with 10 transaction meanwhile `Killback LLC.` are the most not fraud with almost 1.000 transaction

    """
    st.markdown(markdown_text)

    st.markdown("<h2><b>Total Number and Amount for Fraud and Non Fraud Transaction</b></h2>", unsafe_allow_html=True)

    # Fraud and Not Fraud Transactions
    st.header("Fraud and Not Fraud Transactions Count")
    # Name both columns explicitly: the names produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x.
    data_fraud_count = (
        data['is_fraud']
        .apply(_fraud_label)
        .value_counts()
        .rename_axis('is_fraud')
        .reset_index(name='count')
    )
    fig, ax = plt.subplots(figsize=(15, 5))
    sns.barplot(data=data_fraud_count, x='is_fraud', y='count', color='#c6def8', ax=ax)
    # Labels of short bars are lifted to a fixed height so they stay readable.
    _annotate_bar(ax, lambda y: 15000 if y < 10000 else y, font_size=14)
    ax.set_title("Total number of transaction for fraud and not fraud transaction", fontsize=12, fontweight='bold')
    ax.set_ylabel("Transaction count")
    ax.tick_params(axis='x', rotation=0)
    _render_figure(fig)

    # Fraud and Not Fraud Amount
    st.header("Fraud and Not Fraud Transactions Amount")
    data_fraud_amount = data.groupby('is_fraud')['amt'].sum().reset_index()
    # Derive the tick labels from the data instead of assuming both classes
    # are present and in a fixed order.
    data_fraud_amount['is_fraud'] = data_fraud_amount['is_fraud'].apply(_fraud_label)
    fig, ax = plt.subplots(figsize=(15, 5))
    sns.barplot(data=data_fraud_amount, x='is_fraud', y='amt', color='#c6def8', ax=ax)
    _annotate_bar(ax, lambda y: 1900000 if y < 1200000 else y, font_size=12)
    ax.set_title("Total transaction amount for fraud and not fraud transaction", fontsize=12, fontweight='bold')
    ax.set_ylabel("Transaction amount")
    ax.tick_params(axis='x', rotation=0)
    _render_figure(fig)

    st.write("**Explanation**:")
    # NOTE(review): the second bullet repeats the fraud *count* (1117) where a
    # fraud *amount* is expected -- confirm the figure against the chart.
    markdown_text = """

    Based on visualisation above:

    * There is 276743 total number of transaction `not fraud` and 1117 `fraud` transaction

    * There is 18745296.5 total transaction amount of `not fraud` and 1117 `fraud` transaction

    """
    st.markdown(markdown_text)

    # Calculate age (relative to the fixed reference year 2020)
    data['dob'] = pd.to_datetime(data['dob'])
    data['age'] = (2020 - data['dob'].dt.year)
    data['age_group'] = data['age'].apply(_age_group)

    # Overview of dataset by age, gender, and category
    st.header("Overview of dataset by Age, gender, and category")
    columns = ['gender', 'category', 'age', 'age_group']
    columns_name = ['gender', 'category', 'age', 'age group']
    # Assumes gender is coded 'F'/'M' -- TODO confirm; other codes are shown as-is.
    gender_names = {'F': 'Female', 'M': 'Male'}

    for col, shown in zip(columns, columns_name):
        st.subheader("Distribution of transaction by " + shown)
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))  # One panel per fraud class
        for flag, (axis, title) in enumerate(zip(axes, fraud_labels)):
            subset = data[data['is_fraud'] == flag]
            if col == 'gender':
                shares = subset[col].value_counts()
                # value_counts orders by frequency, so take the labels from
                # the counted index rather than from a fixed list.
                slice_labels = [gender_names.get(code, code) for code in shares.index]
                axis.pie(shares, labels=slice_labels, autopct='%1.1f%%')
            elif col == 'age_group':
                shares = subset[col].value_counts()
                axis.pie(shares, labels=shares.index, autopct='%1.1f%%')
            elif col == 'category':
                sns.countplot(data=subset, y=col, order=subset[col].value_counts().index, ax=axis)
                # Rotate the count ticks without freezing their label text.
                axis.tick_params(axis='x', rotation=90)
            else:
                sns.histplot(data=subset, x=col, ax=axis)
            axis.set_title(title)
            if col != 'category':
                # The category chart is horizontal: its x axis holds counts,
                # so seaborn's own axis labels are kept there.
                axis.set_xlabel(shown)
        _render_figure(fig)

    st.write("**Explanation**:")
    markdown_text = """

    Based on visualisation above we can see:

    - There is 54,8% transaction of `female` and 45,2% transaction of `male` in `not fraud` and `fraud`

    - Most distribution of `not fraud` transaction by category is from `gas_transport` meanwhile in fraud is from shopping_net

    - In distribution transaction by age mostly between 30-40 in `fraud` and between 45-50 for `not fraud`

    - By age group mostly `not fraud` transaction is from Adult with 73,9% and `fraud` also from Adult with 74,6%

    """
    st.markdown(markdown_text)