# set path
import glob, os, sys; 
sys.path.append('../utils')

#import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from utils.vulnerability_classifier import load_vulnerabilityClassifier, vulnerability_classification
import logging
logger = logging.getLogger(__name__)
from utils.config import get_classifier_params
from utils.preprocessing import paraLengthCheck
from io import BytesIO
import xlsxwriter
import plotly.express as px
import plotly.graph_objects as go
from utils.vulnerability_classifier import label_dict



# Module-level configuration: which classifier this page drives, and its
# parameters (model name, threshold, ...) pulled from the shared config.
classifier_identifier = 'vulnerability'
params  = get_classifier_params(classifier_identifier)

@st.cache_data
def to_excel(df, sectorlist):
    """Serialize *df* to an in-memory Excel workbook with review dropdowns.

    Writes the dataframe to 'Sheet1' (no index) and attaches list-style
    data validation so reviewers can annotate rows directly in Excel:
    column S gets a No/Yes/Discard dropdown, columns T-X get the sector
    choices plus 'Blank'.

    Parameters
    ----------
    df : pd.DataFrame
        Data to export.
    sectorlist : list[str]
        Allowed sector values for the T-X dropdowns.

    Returns
    -------
    bytes
        Binary content of the generated .xlsx file.
    """
    # Header sits in row 1, so data occupies rows 2 .. len(df) + 1.
    last_row = len(df) + 1
    output = BytesIO()
    # Context manager finalizes the workbook on exit; pandas >= 2.0
    # removed the old writer.save() API.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
        worksheet = writer.sheets['Sheet1']
        worksheet.data_validation(
            'S2:S{}'.format(last_row),
            {'validate': 'list',
             'source': ['No', 'Yes', 'Discard']})
        # Columns T-X all share the same sector dropdown.  (The original
        # code spelled these out and had a typo, 'W2:U{}', which produced
        # a wrong range for column W.)
        for col in ('T', 'U', 'V', 'W', 'X'):
            worksheet.data_validation(
                '{0}2:{0}{1}'.format(col, last_row),
                {'validate': 'list',
                 'source': sectorlist + ['Blank']})
    return output.getvalue()

def app():
    """Classify the processed document's paragraphs for vulnerability labels.

    Expects the parsed document under ``st.session_state['key0']``; stores
    the loaded classifier under ``'<identifier>_classifier'`` and the
    labelled dataframe under ``st.session_state['key1']``.
    """

    with st.container():
        # Nothing to do until a document has been processed upstream.
        if 'key0' not in st.session_state:
            return

        paragraphs = st.session_state.key0

        # Load the (cached) classifier and keep a handle for other pages.
        classifier = load_vulnerabilityClassifier(classifier_name=params['model_name'])
        st.session_state['{}_classifier'.format(classifier_identifier)] = classifier

        # Run the predictions and persist the labelled dataframe.
        st.session_state.key1 = vulnerability_classification(
            haystack_doc=paragraphs,
            threshold=params['threshold'])


def vulnerability_display():
    """Render the vulnerability results: summary text plus a bar chart.

    Reads the classified dataframe stored under ``st.session_state['key1']``
    (produced by :func:`app`), reports how many paragraphs reference groups
    in vulnerable situations, and plots the number of references per group.
    """

    # Get the vulnerability df produced by app().
    df = st.session_state['key1']

    # Keep only paragraphs that carry at least one label other than 'Other'.
    # .copy() detaches the slice from df so the rename below cannot raise
    # SettingWithCopyWarning or silently fail to apply.
    df_filtered = df[df['Vulnerability Label'].apply(
        lambda x: len(x) > 0 and 'Other' not in x)].copy()

    # Rename column
    df_filtered.rename(columns={'Vulnerability Label': 'Group(s)'}, inplace=True)

    # Header
    st.subheader("Explore references to vulnerable groups:")

    # Summary text
    num_paragraphs = len(df['Vulnerability Label'])
    num_references = len(df_filtered['Group(s)'])

    st.markdown(f"""<div style="text-align: justify;">The document contains a
            total of <span style="color: red;">{num_paragraphs}</span> paragraphs.
            We identified <span style="color: red;">{num_references}</span>
            references to groups in vulnerable situations.</div>
            <br>
            <div style="text-align: justify;">We are searching for references related 
            to the following groups: (1) Agricultural communities, (2) Children, (3)
            Ethnic, racial and other minorities, (4) Fishery communities, (5) Informal sector
            workers, (6) Members of indigenous and local communities, (7) Migrants and 
            displaced persons, (8) Older persons, (9) Persons living in poverty, (10) 
            Persons living with disabilities, (11) Persons with pre-existing health conditions, 
            (12) Residents of drought-prone regions, (13) Rural populations, (14) Sexual 
            minorities (LGBTQI+), (15) Urban populations, (16) Women and other genders.</div>
            <br>
            <div style="text-align: justify;">The chart below shows the groups for which 
            references were found and the number of references identified.
            For a more detailed view in the text, see the paragraphs and 
            their respective labels in the table underneath.</div>""", unsafe_allow_html=True)

    ### Bar chart

    # All known labels from the classifier's label dictionary.
    df_labels = pd.DataFrame(list(label_dict.items()), columns=['Label ID', 'Label'])

    # Count how often each label appears across the 'Group(s)' lists.
    # Iterate the Series directly instead of iterrows(): same counts,
    # without materializing a row object per paragraph.
    group_counts = {}
    for groups in df_filtered['Group(s)']:
        for group in groups:
            group_counts[group] = group_counts.get(group, 0) + 1

    # Create a new dataframe from group_counts
    df_label_count = pd.DataFrame(list(group_counts.items()), columns=['Label', 'Count'])

    # Left-join so every known label appears (Count is NaN when unseen).
    df_label_count = df_labels.merge(df_label_count, on='Label', how='left')

    # Exclude the "Other" group and all groups without any references.
    df_bar_chart = df_label_count[df_label_count['Label'] != 'Other']
    df_bar_chart = df_bar_chart.dropna(subset=['Count'])

    # Horizontal bar chart of reference counts per group.
    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=df_bar_chart.Label,
        x=df_bar_chart.Count,
        orientation='h',
        marker=dict(color='purple'),
    ))

    # Customize layout
    fig.update_layout(
        title='Number of references identified',
        xaxis_title='Number of references',
        yaxis_title='Group',
    )

    # Show the plot
    st.plotly_chart(fig, use_container_width=True)