cpv_test / appStore /vulnerability_analysis.py
leavoigt's picture
Update appStore/vulnerability_analysis.py
bf7f98d
raw
history blame
5.7 kB
# set path
import glob, os, sys;
sys.path.append('../utils')
#import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from utils.vulnerability_classifier import load_vulnerabilityClassifier, vulnerability_classification
import logging
logger = logging.getLogger(__name__)
from utils.config import get_classifier_params
from utils.preprocessing import paraLengthCheck
from io import BytesIO
import xlsxwriter
import plotly.express as px
from utils.vulnerability_classifier import label_dict
# Module-level configuration shared by the functions below.
# `classifier_identifier` is passed to get_classifier_params() and is also
# used by app() to build the session-state key '<identifier>_classifier'.
# app() reads the 'model_name' and 'threshold' entries from `params`.
classifier_identifier = 'vulnerability'
params = get_classifier_params(classifier_identifier)
@st.cache_data
def to_excel(df, sectorlist):
    """Serialize *df* to an in-memory .xlsx workbook with dropdown validation.

    The dataframe is written to sheet 'Sheet1' without its index. Column S
    gets a 'No' / 'Yes' / 'Discard' dropdown, and columns T to X each get a
    dropdown offering every entry of *sectorlist* plus 'Blank'.

    Args:
        df: DataFrame to export.
        sectorlist: Sequence of option strings for the T-X dropdowns.

    Returns:
        The bytes of the generated .xlsx file.
    """
    # Row 1 holds the header, so data occupies rows 2 .. len(df) + 1.
    last_row = len(df) + 1
    sector_choices = list(sectorlist) + ['Blank']

    output = BytesIO()
    # The context manager finalizes the workbook on exit; ExcelWriter.save()
    # no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
        worksheet = writer.sheets['Sheet1']

        # An empty frame has no data rows, so there is nothing to validate
        # (and 'S2:S1' would be a reversed range).
        if last_row >= 2:
            worksheet.data_validation(
                f'S2:S{last_row}',
                {'validate': 'list', 'source': ['No', 'Yes', 'Discard']},
            )
            # Each sector column gets its own single-column range.
            for column in 'TUVWX':
                worksheet.data_validation(
                    f'{column}2:{column}{last_row}',
                    {'validate': 'list', 'source': sector_choices},
                )

    return output.getvalue()
def app():
    """Run the vulnerability classifier on the processed document.

    Does nothing unless 'key0' is present in the Streamlit session state.
    Otherwise it loads the classifier named by params['model_name'], stores
    it in session state under '<classifier_identifier>_classifier', runs
    vulnerability_classification() on the 'key0' data with
    params['threshold'], and stores the filtered result under 'key1'.
    """
    with st.container():
        # Guard: nothing to classify until a document has been processed.
        if 'key0' not in st.session_state:
            return

        paragraphs = st.session_state.key0

        # Load the model and publish it in session state before classifying.
        session_key = '{}_classifier'.format(classifier_identifier)
        st.session_state[session_key] = load_vulnerabilityClassifier(
            classifier_name=params['model_name'])

        predictions = vulnerability_classification(
            haystack_doc=paragraphs,
            threshold=params['threshold'])

        def has_reference(labels):
            # Keep rows that carry at least one label and no 'Other' label.
            return len(labels) > 0 and 'Other' not in labels

        keep_mask = predictions['Vulnerability Label'].apply(has_reference)

        # Only the paragraphs with references are stored under key1.
        st.session_state.key1 = predictions[keep_mask]
def vulnerability_display():
    """Render the vulnerable-groups overview: summary text, bar chart, table.

    Reads the dataframe stored under ``st.session_state['key0']`` and uses its
    'Vulnerability Label' column, which is treated as a collection of label
    names per paragraph. A paragraph counts as a reference when it has at
    least one label and none of them is 'Other' (the same rule app() uses
    when it filters the dataframe).

    Raises:
        KeyError: if 'key0' is not present in the session state.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    df_vul = st.session_state['key0']
    label_lists = df_vul['Vulnerability Label']

    # Computed once and reused for both the summary count and the table.
    # astype(bool) keeps the mask boolean even when the frame is empty.
    is_reference = label_lists.apply(
        lambda labels: len(labels) > 0 and 'Other' not in labels
    ).astype(bool)

    # Header
    st.subheader("Explore references to vulnerable groups:")

    col1, col2 = st.columns([1, 1])

    with col1:
        num_paragraphs = len(label_lists)
        num_references = int(is_reference.sum())

        # Built from explicit lines so no source-code indentation leaks into
        # the markdown; both paragraphs sit in their own balanced <div>.
        summary_html = (
            f'<div style="text-align: justify;"> The document contains a\n'
            f'total of <span style="color: red;">{num_paragraphs}</span> paragraphs.\n'
            f'We identified <span style="color: red;">{num_references}</span>\n'
            f'references to groups in vulnerable situations.</div>\n'
            f'<br>\n'
            f'<div style="text-align: justify;">In the chart on the right you can see '
            f'how often each group has been referenced.\n'
            f'For a more detailed view in the text, see the paragraphs and\n'
            f'their respective labels in the table below.</div>'
        )
        st.markdown(summary_html, unsafe_allow_html=True)

    with col2:
        # All known labels, so groups without any hit still appear on the axis.
        df_labels = pd.DataFrame(list(label_dict.items()),
                                 columns=['Label ID', 'Label'])

        # Count how often each label occurs across all paragraphs.
        group_counts = Counter(
            label for labels in label_lists for label in labels
        )
        df_label_count = pd.DataFrame(list(group_counts.items()),
                                      columns=['Label', 'Count'])

        # The left merge keeps every known label; labels that never occur
        # come back as NaN, so they are turned into an explicit zero.
        df_label_count = df_labels.merge(df_label_count, on='Label', how='left')
        df_label_count['Count'] = df_label_count['Count'].fillna(0).astype(int)

        # Exclude the "Other" group
        df_bar_chart = df_label_count[df_label_count['Label'] != 'Other']

        fig = px.bar(df_bar_chart,
                     x='Label',
                     y='Count',
                     title='How many references have been found for each group?',
                     labels={'Count': 'Frequency'})

        st.plotly_chart(fig, use_container_width=True)

    # Table of the paragraphs that reference a vulnerable group.
    st.write(df_vul[is_reference])