|
|
|
import glob, os, sys; |
|
sys.path.append('../utils') |
|
|
|
|
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
import pandas as pd |
|
import streamlit as st |
|
from st_aggrid import AgGrid |
|
from utils.target_classifier import load_targetClassifier, target_classification |
|
import logging |
|
logger = logging.getLogger(__name__) |
|
from utils.config import get_classifier_params |
|
from io import BytesIO |
|
import xlsxwriter |
|
import plotly.express as px |
|
from pandas.api.types import ( |
|
is_categorical_dtype, |
|
is_datetime64_any_dtype, |
|
is_numeric_dtype, |
|
is_object_dtype, |
|
is_list_like) |
|
|
|
|
|
# Name under which this classifier's settings are looked up and under which
# the loaded model is stored in the Streamlit session state.
classifier_identifier = 'target'

# Settings for the classifier; this module reads the 'model_name' and
# 'threshold' entries.
params = get_classifier_params(classifier_identifier)

# Maps raw classifier labels to the labels used for display.
_lab_dict = {
    'NEGATIVE': 'NO TARGET INFO',
    'TARGET': 'TARGET',
}
|
|
|
|
|
def _export_hits(df, state_key, label_column, score_column, fallback_drop):
    """Select the rows written to one category sheet of the export workbook.

    Args:
        df: Full classification result, used only when nothing is stored in
            the session state under ``state_key``.
        state_key: Session-state key holding the (possibly user-filtered)
            hits for this category.
        label_column: Column used to pick hits from ``df`` in the fallback.
        score_column: Column the hits are sorted by (descending).
        fallback_drop: Columns removed from the fallback selection.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; neither ``df`` nor the
        frame stored in the session state is modified.
    """
    if state_key in st.session_state:
        hits = st.session_state[state_key]
        if 'keep' in hits.columns:
            # Export only the rows still flagged to keep; the helper column
            # itself is not written to the sheet.
            kept = hits[hits['keep'] == True]  # noqa: E712 (elementwise)
            return kept.reset_index(drop=True).drop(columns=['keep'])
        ranked = hits.sort_values(by=[score_column], ascending=False)
        return ranked.reset_index(drop=True)

    # NOTE(review): target_display() selects hits with
    # df['Target Label'] == 'TARGET'; if the label columns hold strings this
    # comparison with True matches no rows -- confirm the intended value.
    hits = df[df[label_column] == True]  # noqa: E712 (elementwise)
    # Non-inplace drop: `hits` is a slice of `df`, so an inplace drop would
    # trigger pandas' SettingWithCopy machinery.
    hits = hits.drop(columns=fallback_drop)
    ranked = hits.sort_values(by=[score_column], ascending=False)
    return ranked.reset_index(drop=True)


def to_excel(df):
    """Serialise the results into an in-memory Excel workbook.

    The workbook has three sheets: ``rawdata`` (``df`` as given), ``Target``
    and ``Action``. The latter two come from ``st.session_state`` entries
    ``'target_hits'`` / ``'action_hits'`` when present, otherwise they are
    derived from ``df``.

    Args:
        df: DataFrame with the classification output.

    Returns:
        bytes: Content of the ``.xlsx`` file.
    """
    output = BytesIO()
    # The context manager finalises the workbook on exit. ExcelWriter.save()
    # no longer exists in pandas >= 2.0, so it must not be called directly.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='rawdata')

        target_hits = _export_hits(
            df,
            state_key='target_hits',
            label_column='Target Label',
            score_column='Target Score',
            fallback_drop=['Target Label', 'Netzero Score', 'GHG Score',
                           'Action Label', 'Action Score',
                           'Policies_Plans Label', 'Indicator Label',
                           'Policies_Plans Score', 'Conditional Score'],
        )
        target_hits.to_excel(writer, index=False, sheet_name='Target')

        action_hits = _export_hits(
            df,
            state_key='action_hits',
            label_column='Action Label',
            score_column='Action Score',
            fallback_drop=['Target Label', 'Target Score', 'Netzero Score',
                           'Netzero Label', 'GHG Label', 'GHG Score',
                           'Action Label', 'Policies_Plans Label',
                           'Policies_Plans Score', 'Conditional Score'],
        )
        action_hits.to_excel(writer, index=False, sheet_name='Action')

    return output.getvalue()
|
|
|
def app():
    """Run the target classifier on the paragraphs held in the session state.

    Does nothing unless ``st.session_state`` has a ``key0`` entry. Otherwise
    the classifier named by ``params['model_name']`` is loaded and stored in
    the session state, ``target_classification`` is called on the ``key0``
    data with ``params['threshold']``, and its result is stored as ``key1``.
    """
    with st.container():
        if 'key0' not in st.session_state:
            return

        paragraphs = st.session_state.key0

        # The loaded model is not passed to target_classification() below;
        # presumably that function picks it up from the session state
        # -- TODO confirm.
        model = load_targetClassifier(classifier_name=params['model_name'])
        st.session_state[f'{classifier_identifier}_classifier'] = model

        # Built for large inputs but not displayed anywhere in this function.
        warning_msg = (
            ": This might take sometime, please sit back and relax."
            if len(paragraphs) > 100
            else ""
        )

        st.session_state.key1 = target_classification(
            haystack_doc=paragraphs,
            threshold=params['threshold'],
        )
|
|
|
def filter_for_tracs(df):
    """Keep only rows tagged with a Transport, Energy or Economy-wide sector.

    Rows whose ``'Sector Label'`` value contains none of the three sectors
    are dropped; in the remaining rows ``'Sector Label'`` is reduced to the
    matching sectors, in their original order.

    Args:
        df: DataFrame with a ``'Sector Label'`` column whose cells are
            collections of sector names.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. The input frame is left
        untouched (no temporary helper column is added to it).
    """
    sector_list = ['Transport', 'Energy', 'Economy-wide']

    # Boolean mask instead of a temporary column, so the caller's frame is
    # never mutated. astype(bool) keeps the mask usable for empty input.
    relevant = df['Sector Label'].map(
        lambda labels: any(sector in labels for sector in sector_list)
    ).astype(bool)

    # reset_index returns a new frame, so the assignment below cannot write
    # back into `df`.
    filtered = df[relevant].reset_index(drop=True)
    filtered['Sector Label'] = filtered['Sector Label'].map(
        lambda labels: [label for label in labels if label in sector_list]
    )
    return filtered
|
|
|
def target_display():
    """Render the target-classification summary, filter UI and download.

    Does nothing unless ``st.session_state`` has a ``key1`` entry. Shows how
    many chunks the document was split into; when rows labelled ``'TARGET'``
    exist it shows summary counts, hands the sector-filtered hits to
    ``filter_dataframe`` and adds a download button to the sidebar.
    """
    if 'key1' not in st.session_state:
        return

    results = st.session_state.key1
    st.caption(
        """ **{}** is splitted into **{}** paragraphs/text chunks.""".format(
            os.path.basename(st.session_state['filename']),
            len(results),
        )
    )

    hits = results[results['Target Label'] == 'TARGET'].reset_index(drop=True)
    if min(5, len(hits)) == 0:
        return

    # Summary statistics over the target paragraphs.
    count_target = sum(hits['Target Label'] == 'TARGET')
    count_netzero = sum(hits['Netzero Label'] == 'NETZERO TARGET')
    count_ghg = sum(hits['GHG Label'] == 'GHG')
    count_transport = sum(
        [('Transport' in sectors) for sectors in hits['Sector Label']]
    )

    left_col, right_col = st.columns([1, 1])
    with left_col:
        st.write(f'**Target Paragraphs**: `{count_target}`')
        st.write(f'**NetZero Related Paragraphs**: `{count_netzero}`')
    with right_col:
        st.write(f'**GHG Target Related Paragraphs**: `{count_ghg}`')
        st.write(f'**Transport Related Paragraphs**: `{count_transport}`')

    # Drop the columns that are not shown for targets, then rank by score.
    hits = hits.drop(
        columns=['Target Label', 'Netzero Score', 'GHG Score', 'Action Label',
                 'Action Score', 'Policies_Plans Label', 'Indicator Label',
                 'Policies_Plans Score', 'Conditional Score']
    )
    hits = hits.sort_values(by=['Target Score'], ascending=False)
    hits = hits.reset_index(drop=True)

    st.write('----------------')

    st.caption("Filter table to select rows to keep for Target category")
    hits = filter_for_tracs(hits)
    # Category dtype makes these columns use the multiselect filter in
    # filter_dataframe().
    hits = hits.astype({
        'Netzero Label': 'category',
        'Conditional Label': 'category',
        'GHG Label': 'category',
    })
    filter_dataframe(hits)

    # NOTE(review): the original indentation was lost; this sidebar section
    # is assumed to run only when target hits exist -- confirm against the
    # upstream file.
    with st.sidebar:
        st.write('-------------')
        df_xlsx = to_excel(results)
        st.download_button(
            label='📥 Download Result',
            data=df_xlsx,
            file_name=os.path.splitext(
                os.path.basename(st.session_state['filename'])
            )[0] + '.xlsx',
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _iki_netzero_tag(netzero_label, conditional_label):
    """Return the IKI net-zero tag for one row, or None if none applies."""
    if netzero_label != 'NETZERO TARGET':
        return None
    if conditional_label == 'UNCONDITIONAL':
        return 'T_NETZERO'
    if conditional_label == 'CONDITIONAL':
        return 'T_NETZERO_C'
    return None


def _iki_target_tags(sectors, conditional_label):
    """Return the IKI sector target tags for one row.

    One tag is produced per sector found in ``sectors`` (Transport,
    Economy-wide, Energy, in that order), suffixed ``_Unc`` or ``_C``
    depending on ``conditional_label``. Any other label yields no tags.
    """
    if conditional_label == 'UNCONDITIONAL':
        suffix = 'Unc'
    elif conditional_label == 'CONDITIONAL':
        suffix = 'C'
    else:
        return []
    sector_prefixes = (
        ('Transport', 'T_Transport'),
        ('Economy-wide', 'T_Economy'),
        # Energy/UNCONDITIONAL used to be tagged 'T_Economy_Unc' by mistake.
        ('Energy', 'T_Energy'),
    )
    return [f'{prefix}_{suffix}'
            for sector, prefix in sector_prefixes if sector in sectors]


def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a UI on top of a dataframe to let viewers filter columns

    When the "Add filters" checkbox is off, ``df`` is stored unchanged in
    ``st.session_state['target_hits']``. Otherwise the user-selected filters
    are applied, the ``IKI_Netzero``, ``IKI_Target`` and ``keep`` columns are
    added, the table is displayed, and the result is stored under the same
    session-state key. The input frame itself is never modified.

    Args:
        df (pd.DataFrame): Original dataframe

    Returns:
        pd.DataFrame: The dataframe stored in the session state
    """
    if not st.checkbox("Add filters"):
        st.session_state['target_hits'] = df
        return df

    with st.container():
        cols = sorted(set(df.columns) - {'page', 'Extracted Text'})
        to_filter_columns = st.multiselect("Filter dataframe on", cols)

        for column in to_filter_columns:
            left, right = st.columns((1, 20))
            left.write("↳")
            values = df[column]

            if isinstance(values.dtype, pd.CategoricalDtype):
                user_cat_input = right.multiselect(
                    f"Values for {column}",
                    values.unique(),
                    default=list(values.unique()),
                )
                df = df[values.isin(user_cat_input)]

            elif is_numeric_dtype(values):
                _min = float(values.min())
                _max = float(values.max())
                # A slider needs a non-empty range: identical bounds would
                # give step == 0, and an empty column yields NaN bounds.
                if _min < _max:
                    user_num_input = right.slider(
                        f"Values for {column}",
                        _min,
                        _max,
                        (_min, _max),
                        step=(_max - _min) / 100,
                    )
                    df = df[values.between(*user_num_input)]
                else:
                    right.caption(f"No value range to filter in {column}")

            # Positional access: after earlier filters the index label 0 may
            # no longer exist, so a label lookup would raise KeyError.
            elif not values.empty and isinstance(values.iloc[0], list):
                list_vals = sorted(
                    {item for cell in values.tolist() for item in cell},
                    key=str,
                )
                user_multi_input = right.multiselect(
                    f"Values for {column}",
                    list_vals,
                    default=list_vals,
                )
                # Boolean mask instead of a temporary 'check' column, so the
                # caller's frame is not mutated.
                selected = values.map(
                    lambda cell: any(choice in cell
                                     for choice in user_multi_input)
                ).astype(bool)
                df = df[selected]

            else:
                user_text_input = right.text_input(
                    f"Substring or regex in {column}",
                )
                if user_text_input:
                    # Case-insensitive on both sides; missing values never
                    # match.
                    df = df[values.str.contains(user_text_input,
                                                case=False, na=False)]

    df = df.reset_index(drop=True)

    # Built row by row rather than with DataFrame.apply(axis=1), which does
    # not return a column-like result for an empty frame.
    df['IKI_Netzero'] = pd.Series(
        [_iki_netzero_tag(netzero, conditional)
         for netzero, conditional in zip(df['Netzero Label'],
                                         df['Conditional Label'])],
        index=df.index,
        dtype=object,
    )
    df['IKI_Target'] = pd.Series(
        [_iki_target_tags(sectors, conditional)
         for sectors, conditional in zip(df['Sector Label'],
                                         df['Conditional Label'])],
        index=df.index,
        dtype=object,
    )
    # Every displayed row starts out flagged for export.
    df['keep'] = True

    df = df[['text', 'IKI_Netzero', 'IKI_Target', 'Target Score',
             'Netzero Label', 'GHG Label', 'Conditional Label',
             'Sector Label', 'Adapt-Mitig Label', 'page', 'keep']]
    st.dataframe(df)

    st.session_state['target_hits'] = df
    return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|