Spaces:
Sleeping
Sleeping
File size: 8,848 Bytes
d41bb77 511da63 21ee22a d41bb77 511da63 d41bb77 511da63 d41bb77 511da63 d41bb77 511da63 d41bb77 511da63 d41bb77 511da63 d41bb77 511da63 d41bb77 511da63 d41bb77 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import pandas as pd
import streamlit as st
import numpy as np
import plotly.express as px
from yaml import safe_load
import user_evaluation_variables
# Evaluation table currently loaded for comparison; initialise_page() fills it
# once the user presses one of the database-loading buttons.
databaseDF = None

from pathlib import Path

# Local data directory, created at import time so later writes cannot fail on
# a missing folder.
EVAL_DATABASE_DIR = Path("data")
EVAL_DATABASE_DIR.mkdir(parents=True, exist_ok=True)
# GEN_EVAL_DATABASE_PATH = EVAL_DATABASE_DIR / f"general_eval_database.yaml"
# TASK_EVAL_DATABASE_PATH = EVAL_DATABASE_DIR / f"task_oriented_eval_database.yaml"

# YAML databases holding every recorded evaluation, one file per evaluation
# type. The task-oriented path previously pointed at the general database
# (copy-paste slip), so task-oriented look-ups read the wrong file.
GEN_EVAL_DATABASE_PATH = 'user_data/general_eval_database.yaml'
TASK_EVAL_DATABASE_PATH = 'user_data/task_oriented_eval_database.yaml'
def get_evaluation_id(evalType, debugging):
    """Return the next unused evaluation ID for the requested database.

    Every stored ID is split on ``'_'`` and the part after the first
    underscore is read as an integer. The new ID is that maximum plus one,
    zero-padded to the width of the first stored ID's numeric part, and
    prefixed with ``'G_'`` for ``evalType == 'general'`` or ``'T_'`` for any
    other value.

    Args:
        evalType: ``'general'`` selects the general database; anything else
            selects the task-oriented one.
        debugging: When truthy, the intermediate values are written to the
            Streamlit page.

    Returns:
        The new evaluation ID as a string.
    """
    isGeneral = evalType == 'general'
    databasePath = GEN_EVAL_DATABASE_PATH if isGeneral else TASK_EVAL_DATABASE_PATH

    # Load every user's evaluations (community view) so the ID is globally unique.
    df = add_user_evalID_columns_to_df(None, databasePath, False)
    existingIDs = list(df['Eval. ID'])

    evalColumn = [int(evalID.split('_')[1]) for evalID in existingIDs]
    nextNumber = max(evalColumn) + 1

    # Keep the same zero-padding width as the IDs already stored.
    padWidth = len(existingIDs[0].split('_')[1])
    prefix = 'G_' if isGeneral else 'T_'
    newEvalID = prefix + str(nextNumber).zfill(padWidth)

    if debugging:
        st.write(df['Eval. ID'])
        st.write(evalColumn)
        st.write("current last EVAL ID:", df['Eval. ID'].iloc[-1])
        st.write("NEW EVAL ID:", newEvalID)
    return newEvalID
def dataframe_with_selections(df):
    """Render ``df`` with a leading checkbox column and return the ticked rows.

    A copy of ``df`` gets a temporary ``"Select"`` column (initially ``True``
    for every row) and is shown through ``st.data_editor`` with all original
    columns disabled, so only the checkboxes can be changed.

    Args:
        df: DataFrame to display.

    Returns:
        The rows whose checkbox is still ticked, without the ``"Select"``
        column.
    """
    selectable = df.copy()
    selectable.insert(0, "Select", True)

    checkboxColumn = st.column_config.CheckboxColumn(required=True)
    edited = st.data_editor(
        selectable,
        hide_index=True,
        column_config={"Select": checkboxColumn},
        disabled=df.columns,
    )

    # The helper column doubles as the boolean row mask, then is discarded.
    ticked = edited[edited["Select"]]
    return ticked.drop(columns="Select")
def add_user_evalID_columns_to_df(df, evalDataPath, personalFLAG):
    """Load an evaluation database and flatten it into one DataFrame.

    The YAML file is read as ``evaluations -> username -> <user> ->
    <eval id> -> {field: value}``. Each evaluation becomes one row, with
    leading ``"User"`` and ``"Eval. ID"`` columns identifying it.

    Each user's rows are labelled before the frames are combined. The earlier
    approach concatenated first and then patched missing labels by scanning
    the whole frame for ``np.nan`` by identity, which was quadratic and relied
    on pandas always handing back the very same NaN object.

    Args:
        df: Existing DataFrame to extend, or ``None`` to start from scratch.
        evalDataPath: Path of the YAML database file.
        personalFLAG: When truthy, keep only the rows whose ``"User"`` equals
            ``user_evaluation_variables.USERNAME`` and show a Streamlit
            warning if nothing is left.

    Returns:
        The combined DataFrame, or ``None`` when ``df`` was ``None`` and the
        database lists no users.
    """
    with open(evalDataPath, 'r', encoding='utf-8') as f:
        yamlData = safe_load(f)

    frames = [] if df is None else [df]
    for user, evaluations in yamlData['evaluations']['username'].items():
        userDF = pd.DataFrame(evaluations).T
        userDF.insert(0, "Eval. ID", list(evaluations.keys()), True)
        userDF.insert(0, "User", [user] * len(evaluations), True)
        frames.append(userDF)

    if not frames:
        df = None
    elif len(frames) == 1:
        # Nothing to merge; keep the frame (and its index) untouched.
        df = frames[0]
    else:
        df = pd.concat(frames, ignore_index=True)

    if personalFLAG:
        if df is not None:
            df.drop(df[df['User'] != user_evaluation_variables.USERNAME].index, inplace=True)
        # An empty database used to crash on the drop above; treat it like
        # "no personal evaluations" instead.
        if df is None or len(df) == 0:
            st.warning("It looks like you haven't conducted any evaluations! Run some evaluations and refresh this page."
                       "If the problem persists, please contact support. ", icon="⚠️")
    return df
def initialise_page(tab):
    """Draw the model-comparison page inside ``tab``.

    Two side-by-side forms (general bias and task-oriented bias) each offer a
    "personal" and a "community" button. Pressing one loads the matching
    database into the module-level ``databaseDF``, restricted to the columns
    shown for that evaluation type. Once a table is loaded, it is shown with
    row checkboxes, a normalisation toggle and a compare button that plots
    the selected rows.

    Args:
        tab: Streamlit container (used as a context manager) to draw into.
    """
    global databaseDF

    leadingColumns = ["User", "Eval. ID", "Model", "Model Type", "Resolution", "No. Samples",
                      "Inference Steps"]
    trailingColumns = ["Dist. Bias", "Hallucination", "Gen. Miss Rate", "Run Time", "Date", "Time"]
    generalColumns = leadingColumns + ["Objects", "Actions", "Occupations"] + trailingColumns
    taskColumns = leadingColumns + ["Target"] + trailingColumns

    with tab:
        c1, c2 = st.columns(2)
        # One entry per form: container, heading, form key, database, columns.
        panels = (
            (c1, "\U0001F30E General Bias", "gen_bias_database_loading_form",
             GEN_EVAL_DATABASE_PATH, generalColumns),
            (c2, "\U0001F3AF Task-Oriented Bias", "task_oriented_database_loading_form",
             TASK_EVAL_DATABASE_PATH, taskColumns),
        )
        for container, heading, formKey, databasePath, shownColumns in panels:
            with container:
                st.subheader(heading)
                with st.form(formKey, clear_on_submit=False):
                    personalPressed = st.form_submit_button("Personal Evaluations")
                    communityPressed = st.form_submit_button("TBYB Community Evaluations")
                    if personalPressed:
                        # Clear first so a failed load does not leave a stale table behind.
                        databaseDF = None
                        databaseDF = add_user_evalID_columns_to_df(None, databasePath, True)[shownColumns]
                    if communityPressed:
                        databaseDF = None
                        databaseDF = add_user_evalID_columns_to_df(None, databasePath, False)[shownColumns]

        if databaseDF is not None:
            selection = dataframe_with_selections(databaseDF)
            normalised = st.toggle('Normalize Data (better for direct comparisons)')
            if st.button("Compare Selected Models"):
                plot_comparison_graphs(tab, selection, normalised)
def normalise_data(rawValues, metric):
    """Min-max normalise ``rawValues`` into the range [0, 1].

    For metrics ``'HJ'`` and ``'MG'`` the smallest value maps to 0 and the
    largest to 1. For every other metric the scale is inverted, so the
    smallest value maps to 1 and the largest to 0. When all values are equal,
    every entry becomes ``1``.

    Args:
        rawValues: Iterable of values convertible with ``float``.
        metric: Metric code selecting the direction of the scale.

    Returns:
        A list of normalised values, in input order. Empty input gives ``[]``.
    """
    values = list(map(float, rawValues))
    if not values:
        return []

    # The bounds are loop-invariant; computing them once avoids rescanning the
    # whole list for every element.
    lowest = min(values)
    spread = max(values) - lowest
    if spread == 0:
        return [1 for _ in values]

    if metric in ['HJ', 'MG']:
        return [(x - lowest) / spread for x in values]
    return [1 - ((x - lowest) / spread) for x in values]
def plot_comparison_graphs(tab, data, normalise):
    """Plot the three bias metrics of the selected evaluations inside ``tab``.

    One bar chart per metric (``"Dist. Bias"``, ``"Hallucination"``,
    ``"Gen. Miss Rate"``) is drawn against ``"Eval. ID"``. With ``normalise``
    set, each metric is first rescaled through ``normalise_data`` and an
    extra 3D scatter plot of the three metrics is added.

    Normalisation is applied to a copy, so the caller's DataFrame keeps its
    raw values.

    Args:
        tab: Streamlit container (used as a context manager) to draw into.
        data: DataFrame holding ``"Eval. ID"`` and the three metric columns.
        normalise: Whether to normalise the metrics before plotting.
    """
    # (column, metric code for normalise_data, bar colour, axis label)
    metricPanels = (
        ('Dist. Bias', 'BD', '#59DC23', 'Distribution Bias'),
        ('Hallucination', 'HJ', '#2359DC', 'Jaccard Hallucination'),
        ('Gen. Miss Rate', 'MG', '#DC2359', 'Generative Miss Rate'),
    )

    if normalise:
        data = data.copy()
        for column, metricCode, _, _ in metricPanels:
            data[column] = normalise_data(data[column], metricCode)

    with tab:
        st.write("Selected evaluations for comparison:")
        st.write(data)

        for column, _, barColor, label in metricPanels:
            figure = px.bar(
                x=data['Eval. ID'],
                y=data[column],
                color_discrete_sequence=[barColor] * len(data),
            ).update_layout(
                xaxis_title='Evaluation ID',
                yaxis_title=label,
                title=label + ' Comparison',
            )
            st.plotly_chart(figure, theme="streamlit", use_container_width=True)

        if normalise:
            Full3DFig = px.scatter_3d(data, x='Dist. Bias', y='Hallucination', z='Gen. Miss Rate',
                                      width=800, height=800, color='Eval. ID',
                                      title='3D Text-to-Image Model Bias Comparison')
            st.plotly_chart(Full3DFig, theme="streamlit", use_container_width=True)
|