import glob, os, sys; |
sys.path.append('../utils') |
import seaborn as sns |
import matplotlib.pyplot as plt |
import numpy as np |
import pandas as pd |
import streamlit as st |
from st_aggrid import AgGrid |
from utils.target_classifier import load_targetClassifier, target_classification |
import logging |
logger = logging.getLogger(__name__) |
from utils.config import get_classifier_params |
from io import BytesIO |
import xlsxwriter |
import plotly.express as px |
from pandas.api.types import ( |
is_categorical_dtype, |
is_datetime64_any_dtype, |
is_numeric_dtype, |
is_object_dtype, |
is_list_like) |
# Identifier of this classifier: passed to get_classifier_params() below and
# used by app() to build the '<identifier>_classifier' session-state key.
classifier_identifier = 'target'
# Configuration for this classifier; this module reads its 'model_name' and
# 'threshold' entries.
params = get_classifier_params(classifier_identifier)
# Empty mapping; not referenced anywhere in the visible part of this file.
_lab_dict = {
}
# Columns removed from the fallback 'Target' sheet (built straight from the raw frame).
_TARGET_SHEET_DROP = ['Target Label', 'Netzero Score', 'GHG Score', 'Action Label',
                      'Action Score', 'Policies_Plans Label', 'Indicator Label',
                      'Policies_Plans Score', 'Conditional Score']
# Columns removed from the fallback 'Action' sheet (built straight from the raw frame).
_ACTION_SHEET_DROP = ['Target Label', 'Target Score', 'Netzero Score',
                      'Netzero Label', 'GHG Label',
                      'GHG Score', 'Action Label', 'Policies_Plans Label',
                      'Policies_Plans Score', 'Conditional Score']


def _hits_for_sheet(df, session_key, label_col, score_col, drop_cols):
    """Return the rows to export on one category sheet.

    Prefers the table stored in ``st.session_state[session_key]``; otherwise
    derives the rows from ``df`` using ``label_col``.
    """
    if session_key in st.session_state:
        hits = st.session_state[session_key]
        if 'keep' in hits.columns:
            # Curated table: export only rows still flagged 'keep', in their
            # current order, without the helper column.
            kept = hits[hits['keep'] == True].reset_index(drop=True)
            return kept.drop(columns=['keep'])
        ranked = hits.sort_values(by=[score_col], ascending=False)
        return ranked.reset_index(drop=True)

    # NOTE(review): target_display() selects rows with `== 'TARGET'`, while this
    # fallback compares the label column with True. Left unchanged - confirm
    # which encoding the label columns use at this point.
    hits = df[df[label_col] == True].drop(columns=drop_cols)
    return hits.sort_values(by=[score_col], ascending=False).reset_index(drop=True)


def to_excel(df):
    """Serialise the classification results into an in-memory .xlsx workbook.

    The workbook has three sheets: 'rawdata' (``df`` as given), 'Target' and
    'Action'. The category sheets come from ``st.session_state['target_hits']``
    / ``['action_hits']`` when present, otherwise they are derived from ``df``.

    Args:
        df (pd.DataFrame): Full classified dataframe.

    Returns:
        bytes: Content of the generated Excel file.
    """
    output = BytesIO()
    # The context manager finalises the workbook on exit; ExcelWriter.save()
    # was removed in pandas 2.0.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='rawdata')

        target_hits = _hits_for_sheet(df, 'target_hits', 'Target Label',
                                      'Target Score', _TARGET_SHEET_DROP)
        target_hits.to_excel(writer, index=False, sheet_name='Target')

        action_hits = _hits_for_sheet(df, 'action_hits', 'Action Label',
                                      'Action Score', _ACTION_SHEET_DROP)
        action_hits.to_excel(writer, index=False, sheet_name='Action')

    return output.getvalue()
def app():
    """Run the target classifier on the frame stored under ``key0``.

    When ``st.session_state`` has no ``key0`` entry nothing happens. Otherwise
    the classifier is loaded and stored in session state under
    '<classifier_identifier>_classifier', the frame is passed to
    ``target_classification`` and the result is written to
    ``st.session_state.key1``.
    """
    with st.container():
        if 'key0' not in st.session_state:
            return

        paragraphs = st.session_state.key0

        model = load_targetClassifier(classifier_name=params['model_name'])
        state_key = '{}_classifier'.format(classifier_identifier)
        st.session_state[state_key] = model

        # Hint for large inputs; computed here but not displayed by this function.
        warning_msg = (": This might take sometime, please sit back and relax."
                       if len(paragraphs) > 100 else "")

        classified = target_classification(haystack_doc=paragraphs,
                                           threshold=params['threshold'])
        st.session_state.key1 = classified
def filter_for_tracs(df):
    """Keep only rows tagged with Transport, Energy or Economy-wide sectors.

    Rows whose 'Sector Label' contains none of these sectors are dropped; for
    the remaining rows 'Sector Label' is reduced to the matching sectors.
    The input frame is left untouched (no temporary column is added to it).

    Args:
        df (pd.DataFrame): Frame with a 'Sector Label' column holding one
            collection of sector names per row.

    Returns:
        pd.DataFrame: New frame with a fresh 0..n-1 index and the same columns.
    """
    sector_list = ['Transport', 'Energy', 'Economy-wide']

    # Explicit bool dtype keeps the mask a valid indexer even for an empty frame.
    has_sector = pd.Series(
        [any(sector in labels for sector in sector_list)
         for labels in df['Sector Label']],
        index=df.index,
        dtype=bool,
    )

    kept = df[has_sector].reset_index(drop=True)
    kept['Sector Label'] = kept['Sector Label'].apply(
        lambda labels: [label for label in labels if label in sector_list])
    return kept
def target_display():
    """Render the target summary, the filterable hits table and the download button.

    Does nothing unless ``st.session_state`` contains ``key1`` (the classified
    frame written by ``app``). Also reads ``st.session_state['filename']``.
    """
    if 'key1' in st.session_state:
        df = st.session_state.key1
        # Caption with the file's base name and the number of rows in the frame.
        st.caption(""" **{}** is splitted into **{}** paragraphs/text chunks."""\
                   .format(os.path.basename(st.session_state['filename']),
                           len(df)))
        # Rows labelled as targets.
        hits = df[df['Target Label'] == 'TARGET'].reset_index(drop=True)
        # Only used below as an "at least one hit" check.
        range_val = min(5,len(hits))
        if range_val !=0:
            # Summary counters shown in two columns.
            count_target = sum(hits['Target Label'] == 'TARGET')
            count_netzero = sum(hits['Netzero Label'] == 'NETZERO TARGET')
            count_ghg = sum(hits['GHG Label'] == 'GHG')
            count_transport = sum([True if 'Transport' in x else False
                                   for x in hits['Sector Label']])
            c1, c2 = st.columns([1,1])
            with c1:
                st.write('**Target Paragraphs**: `{}`'.format(count_target))
                st.write('**NetZero Related Paragraphs**: `{}`'.format(count_netzero))
            with c2:
                st.write('**GHG Target Related Paragraphs**: `{}`'.format(count_ghg))
                st.write('**Transport Related Paragraphs**: `{}`'.format(count_transport))
            # Remove columns that are not shown in the target table, then rank
            # by descending target score.
            hits.drop(columns=['Target Label','Netzero Score','GHG Score','Action Label',
                               'Action Score','Policies_Plans Label','Indicator Label',
                               'Policies_Plans Score','Conditional Score'],inplace=True)
            hits = hits.sort_values(by=['Target Score'], ascending=False)
            hits = hits.reset_index(drop=True)
            st.write('----------------')
            st.caption("Filter table to select rows to keep for Target category")
            # Restrict to the Transport / Energy / Economy-wide sectors.
            hits = filter_for_tracs(hits)
            # Categorical dtype makes filter_dataframe offer a multiselect for
            # these label columns.
            convert_type = {'Netzero Label': 'category',
                            'Conditional Label':'category',
                            'GHG Label':'category',
                            }
            hits = hits.astype(convert_type)
            # Stores the (optionally filtered) table in st.session_state['target_hits'].
            filter_dataframe(hits)
            # NOTE(review): the download button is assumed to sit inside the
            # "has hits" branch - confirm against the original indentation.
            with st.sidebar:
                st.write('-------------')
                # The workbook is built from the full classified frame, not just the hits.
                df_xlsx = to_excel(df)
                # Download name: base name of the processed file with an .xlsx extension.
                st.download_button(label='📥 Download Result',
                                   data=df_xlsx ,
                                   file_name= os.path.splitext(os.path.basename(st.session_state['filename']))[0]+'.xlsx')
# (sector name, IKI code prefix) pairs, in the order the codes are emitted.
_IKI_SECTOR_PREFIXES = (
    ('Transport', 'T_Transport'),
    ('Economy-wide', 'T_Economy'),
    ('Energy', 'T_Energy'),
)


def _iki_netzero_code(netzero_label, conditional_label):
    """Return the IKI net-zero code for one row, or None when none applies."""
    if netzero_label == 'NETZERO TARGET':
        if conditional_label == 'UNCONDITIONAL':
            return 'T_NETZERO'
        if conditional_label == 'CONDITIONAL':
            return 'T_NETZERO_C'
    return None


def _iki_target_codes(sectors, conditional_label):
    """Return the IKI sector-target codes for one row (possibly empty)."""
    if conditional_label == 'UNCONDITIONAL':
        suffix = 'Unc'
    elif conditional_label == 'CONDITIONAL':
        suffix = 'C'
    else:
        return []
    return [f'{prefix}_{suffix}'
            for sector, prefix in _IKI_SECTOR_PREFIXES if sector in sectors]


def filter_dataframe(df: pd.DataFrame) -> None:
    """
    Adds a UI on top of a dataframe to let viewers filter columns, then shows
    the result and stores it in ``st.session_state['target_hits']``.

    When the "Add filters" checkbox is off, ``df`` is stored unchanged. When it
    is on, the selected column filters are applied, the 'IKI_Netzero',
    'IKI_Target' and 'keep' columns are added, and a fixed column subset is
    displayed and stored.

    Args:
        df (pd.DataFrame): Original dataframe

    Returns:
        None: the result is delivered through ``st.session_state['target_hits']``.
    """
    modify = st.checkbox("Add filters")
    if not modify:
        st.session_state['target_hits'] = df
        return

    modification_container = st.container()
    with modification_container:
        cols = sorted(set(df.columns) - {'page', 'Extracted Text'})
        to_filter_columns = st.multiselect("Filter dataframe on", cols)
        for column in to_filter_columns:
            left, right = st.columns((1, 20))
            left.write("↳")
            if isinstance(df[column].dtype, pd.CategoricalDtype):
                user_cat_input = right.multiselect(
                    f"Values for {column}",
                    df[column].unique(),
                    default=list(df[column].unique()),
                )
                df = df[df[column].isin(user_cat_input)]
            elif is_numeric_dtype(df[column]):
                _min = float(df[column].min())
                _max = float(df[column].max())
                # NOTE(review): step is 0 when every value is identical -
                # confirm st.slider accepts that.
                step = (_max - _min) / 100
                user_num_input = right.slider(
                    f"Values for {column}",
                    _min,
                    _max,
                    (_min, _max),
                    step=step,
                )
                df = df[df[column].between(*user_num_input)]
            elif not df.empty and isinstance(df[column].iloc[0], list):
                # Positional access: earlier filters may have removed index
                # label 0, so a label lookup could raise KeyError.
                list_vals = sorted({x for lst in df[column] for x in lst}, key=str)
                user_multi_input = right.multiselect(
                    f"Values for {column}",
                    list_vals,
                    default=list_vals,
                )
                matches = pd.Series(
                    [any(i in x for i in user_multi_input) for x in df[column]],
                    index=df.index,
                    dtype=bool,
                )
                df = df[matches]
            else:
                user_text_input = right.text_input(
                    f"Substring or regex in {column}",
                )
                if user_text_input:
                    # Case-insensitive match; previously only the column was
                    # lower-cased, so patterns with capitals never matched.
                    df = df[df[column].str.contains(
                        user_text_input, case=False, na=False)]

    df = df.reset_index(drop=True)

    # Built row by row instead of df.apply(axis=1), which yields a DataFrame
    # (not a Series) when the filters leave no rows.
    df['IKI_Netzero'] = pd.Series(
        [_iki_netzero_code(netzero, conditional)
         for netzero, conditional in zip(df['Netzero Label'], df['Conditional Label'])],
        index=df.index,
        dtype=object,
    )
    df['IKI_Target'] = pd.Series(
        [_iki_target_codes(sectors, conditional)
         for sectors, conditional in zip(df['Sector Label'], df['Conditional Label'])],
        index=df.index,
        dtype=object,
    )
    # Every row starts flagged as kept; to_excel exports only rows whose flag is still True.
    df['keep'] = True
    df = df[['text','IKI_Netzero','IKI_Target','Target Score','Netzero Label','GHG Label',
             'Conditional Label','Sector Label','Adapt-Mitig Label','page','keep']]
    st.dataframe(df)
    st.session_state['target_hits'] = df
    return