Spaces:

TeresaK
/

cpv_test

Runtime error

App Files Files Community

cpv_test / appStore /vulnerability_analysis.py

leavoigt

Update appStore/vulnerability_analysis.py

bf7f98d 11 months ago

raw

history blame

5.7 kB

	# set path
	import glob, os, sys;
	sys.path.append('../utils')

	#import needed libraries
	import seaborn as sns
	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import streamlit as st
	from utils.vulnerability_classifier import load_vulnerabilityClassifier, vulnerability_classification
	import logging
	logger = logging.getLogger(__name__)
	from utils.config import get_classifier_params
	from utils.preprocessing import paraLengthCheck
	from io import BytesIO
	import xlsxwriter
	import plotly.express as px
	from utils.vulnerability_classifier import label_dict



	# Declare all the necessary variables
	# Identifier of this classifier: passed to get_classifier_params() below and
	# used by app() to build the '<identifier>_classifier' session-state key.
	classifier_identifier = 'vulnerability'
	# Settings looked up for that identifier; app() reads params['model_name']
	# and params['threshold'] from the returned mapping.
	params = get_classifier_params(classifier_identifier)

	@st.cache_data
	def to_excel(df, sectorlist):
	    """Serialise ``df`` to an in-memory .xlsx file with drop-down cells.

	    The frame is written without its index to a sheet named 'Sheet1'.
	    List validations (drop-downs) are then attached to the data rows:

	    * column S offers 'No', 'Yes' and 'Discard';
	    * columns T to X offer every entry of ``sectorlist`` plus 'Blank'.

	    Args:
	        df: DataFrame to export.
	        sectorlist: Sequence of choices offered in columns T to X.

	    Returns:
	        The bytes of the finished workbook.
	    """
	    # Row 1 holds the header, so the data occupies rows 2 .. len(df) + 1.
	    last_row = len(df) + 1
	    # NOTE(review): Excel limits an inline validation list to 255 characters
	    # in total -- confirm sectorlist stays below that.
	    sector_choices = list(sectorlist) + ['Blank']

	    output = BytesIO()
	    # ExcelWriter.save() no longer exists in pandas 2.x; the context manager
	    # finalises the workbook on every pandas version.
	    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
	        df.to_excel(writer, index=False, sheet_name='Sheet1')
	        worksheet = writer.sheets['Sheet1']

	        # An empty frame has no data rows, hence nothing to validate.
	        if last_row >= 2:
	            worksheet.data_validation(
	                f'S2:S{last_row}',
	                {'validate': 'list',
	                 'source': ['No', 'Yes', 'Discard']})
	            for column in 'TUVWX':
	                # A fresh options dict per call, in case the writer
	                # modifies the one it is given.
	                worksheet.data_validation(
	                    f'{column}2:{column}{last_row}',
	                    {'validate': 'list',
	                     'source': list(sector_choices)})

	    return output.getvalue()

	def app():
	    """Classify the processed document and keep the relevant paragraphs.

	    Does nothing unless ``st.session_state`` contains 'key0'. Otherwise it
	    loads the classifier named by ``params['model_name']``, stores it in the
	    session under '<classifier_identifier>_classifier', runs
	    ``vulnerability_classification`` on the 'key0' value with
	    ``params['threshold']``, and stores under 'key1' the rows whose
	    'Vulnerability Label' entry is non-empty and does not contain 'Other'.
	    """
	    with st.container():
	        # Nothing to classify until a document has been processed.
	        if 'key0' not in st.session_state:
	            return

	        def names_a_group(labels):
	            # At least one label, and none of them is 'Other'.
	            return len(labels) > 0 and 'Other' not in labels

	        paragraphs = st.session_state['key0']

	        # Load the classifier and keep it in the session.
	        session_key = f'{classifier_identifier}_classifier'
	        st.session_state[session_key] = load_vulnerabilityClassifier(
	            classifier_name=params['model_name'])

	        # Get the predictions.
	        predictions = vulnerability_classification(
	            haystack_doc=paragraphs,
	            threshold=params['threshold'])

	        # Keep only the paragraphs with references and store them as key1.
	        has_reference = predictions['Vulnerability Label'].apply(names_a_group)
	        st.session_state['key1'] = predictions[has_reference]


	def vulnerability_display():
	    """Render the overview of references to vulnerable groups.

	    Reads the DataFrame stored under ``st.session_state['key0']`` and, from
	    its 'Vulnerability Label' column (one collection of labels per row):

	    * writes a summary with the number of paragraphs and of references,
	    * draws a bar chart with the number of occurrences of every label in
	      ``label_dict`` except 'Other' (labels never seen are shown as 0),
	    * shows the rows that count as references in a table.

	    A row counts as a reference when it has at least one label and none of
	    them is 'Other' -- the rule app() uses when it fills 'key1'. Returns
	    without rendering anything when 'key0' is not in the session.

	    NOTE(review): this reads 'key0', not the filtered 'key1' written by
	    app(); it relies on 'Vulnerability Label' being present on the 'key0'
	    frame -- confirm against vulnerability_classification.
	    """
	    from collections import Counter

	    # Same guard as app(): nothing to display before a document is processed.
	    if 'key0' not in st.session_state:
	        return

	    df_vul = st.session_state['key0']
	    labels = df_vul['Vulnerability Label']

	    # Header
	    st.subheader("Explore references to vulnerable groups:")

	    col1, col2 = st.columns([1,1])

	    # Computed once and shared by the summary and the table. astype(bool)
	    # keeps the mask boolean even when the frame has no rows.
	    is_reference = labels.apply(
	        lambda x: len(x) > 0 and 'Other' not in x).astype(bool)

	    with col1:
	        # Text
	        num_paragraphs = len(labels)
	        num_references = int(is_reference.sum())

	        st.markdown(f"""<div style="text-align: justify;"> The document contains a
	total of <span style="color: red;">{num_paragraphs}</span> paragraphs.
	We identified <span style="color: red;">{num_references}</span>
	references to groups in vulnerable situations.</div>
	<br>
	In the chart on the right you can see how often each group has been referenced.
	For a more detailed view in the text, see the paragraphs and
	their respective labels in the table below.""", unsafe_allow_html=True)

	    with col2:
	        ### Bar chart

	        # Count how often each label appears across all paragraphs.
	        group_counts = Counter(
	            label for row_labels in labels for label in row_labels)

	        # One row per known label; a Counter yields 0 for labels that never
	        # occurred, so the chart gets a real zero instead of a missing value.
	        df_label_count = pd.DataFrame(list(label_dict.items()),
	                                      columns=['Label ID', 'Label'])
	        df_label_count['Count'] = [group_counts[label]
	                                   for label in df_label_count['Label']]

	        # Exclude the "Other" group
	        df_bar_chart = df_label_count[df_label_count['Label'] != 'Other']

	        fig = px.bar(df_bar_chart,
	                     x='Label',
	                     y='Count',
	                     title='How many references have been found for each group?',
	                     labels={'Count': 'Frequency'})

	        # Show plot
	        st.plotly_chart(fig, use_container_width=True)

	    ### Table
	    st.write(df_vul[is_reference])