Spaces:
Sleeping
Sleeping
import streamlit as st | |
st.set_page_config(layout="wide") | |
import streamlit_authenticator as stauth | |
from uuid import uuid4 | |
import model_comparison as MCOMP | |
import model_loading as MLOAD | |
import model_inferencing as MINFER | |
import user_evaluation_variables | |
from pathlib import Path | |
import tab_manager | |
import yaml | |
from yaml.loader import SafeLoader | |
from PIL import Image | |
from huggingface_hub import CommitScheduler | |
# ---------------------------------------------------------------------------
# Module-level state shared by the page-building helpers defined below.
# ---------------------------------------------------------------------------
# Authenticator handle; starts empty and is assigned inside user_login_create().
AUTHENTICATOR = None
# Logo image loaded once here and reused by the sidebar and the page banner.
TBYB_LOGO = Image.open('./assets/TBYB_logo_light.png')
# Login flag; user_login_create() sets it to True after a successful login.
USER_LOGGED_IN = False
# NOTE(review): this string value is never read -- USER_DATABASE_PATH is
# rebound to a Path a few lines below, before any function runs. Confirm
# whether this YAML file was meant to stay the database that gets loaded.
USER_DATABASE_PATH = './data/user_database.yaml'
# Local folder handed to the CommitScheduler below (created if missing).
USER_DATABASE_DIR = Path("user_database")
USER_DATABASE_DIR.mkdir(parents=True, exist_ok=True)
# A fresh uuid4-based file name is generated each time this line executes.
# NOTE(review): the suffix is ".json" but the helpers in this file write the
# user database with yaml.dump -- confirm the intended format.
USER_DATABASE_PATH = USER_DATABASE_DIR / f"tbyb-users-{uuid4()}.json"
# Scheduler that pushes USER_DATABASE_DIR to the "data" folder of a dataset
# repo. NOTE(review): `every` is presumably the commit interval in minutes
# (per the huggingface_hub documentation) -- confirm 2 is intended.
USER_DATABASE_UPDATE_SCHEDULER = CommitScheduler(
    repo_id="try-before-you-bias-data",
    repo_type="dataset",
    folder_path=USER_DATABASE_DIR,
    path_in_repo="data",
    every=2,
)
def create_new_user(authenticator, users):
    """Show the registration form and persist the user database.

    Args:
        authenticator: Object exposing ``register_user``; in this file it is
            the ``stauth.Authenticate`` instance built by ``user_login_create``.
        users: The user-database mapping that is dumped back to disk as YAML.

    Any exception raised by the registration widget is shown with ``st.error``
    instead of propagating. The database file is rewritten on every call,
    whether or not a registration happened.
    """
    try:
        registered = authenticator.register_user('Register user', preauthorization=False)
        if registered:
            st.success('User registered successfully')
    except Exception as err:
        st.error(err)

    # Write while holding the scheduler's lock so the file is not being
    # modified while the scheduler is working with the folder.
    with USER_DATABASE_UPDATE_SCHEDULER.lock, USER_DATABASE_PATH.open('w') as db_file:
        yaml.dump(users, db_file, default_flow_style=False)
def forgot_password(authenticator, users):
    """Show the "forgot password" form and persist the user database.

    Args:
        authenticator: Object exposing ``forgot_password``; in this file it is
            the ``stauth.Authenticate`` instance built by ``user_login_create``.
        users: The user-database mapping that is dumped back to disk as YAML.

    Any exception raised by the widget is shown with ``st.error`` instead of
    propagating. The database file is rewritten on every call.
    """
    try:
        # Only the username is used here; the e-mail address and the generated
        # password are returned by the widget but not delivered anywhere yet.
        username_of_forgotten_password, _email_of_forgotten_password, _new_random_password = (
            authenticator.forgot_password('Forgot password')
        )
        if username_of_forgotten_password:
            st.success('New password to be sent securely')
            # Random password should be transferred to user securely
    except Exception as e:
        st.error(e)

    # USER_DATABASE_PATH lives in the folder watched by the commit scheduler,
    # so take the scheduler's lock while writing (as create_new_user does) to
    # avoid a background commit picking up a half-written file.
    with USER_DATABASE_UPDATE_SCHEDULER.lock:
        with open(USER_DATABASE_PATH, 'w') as file:
            yaml.dump(users, file, default_flow_style=False)
def update_account_details(authenticator, users):
    """Show the "update user details" form for a logged-in user and save.

    Args:
        authenticator: Object exposing ``update_user_details``; in this file
            it is the ``stauth.Authenticate`` instance built by
            ``user_login_create``.
        users: The user-database mapping that is dumped back to disk as YAML.

    The form is only rendered when ``st.session_state["authentication_status"]``
    is truthy. Exceptions from the widget are shown with ``st.error``.
    """
    if st.session_state["authentication_status"]:
        try:
            if authenticator.update_user_details(st.session_state["username"], 'Update user details'):
                st.success('Entries updated successfully')
        except Exception as e:
            st.error(e)

    # USER_DATABASE_PATH lives in the folder watched by the commit scheduler,
    # so take the scheduler's lock while writing (as create_new_user does) to
    # avoid a background commit picking up a half-written file.
    with USER_DATABASE_UPDATE_SCHEDULER.lock:
        with open(USER_DATABASE_PATH, 'w') as file:
            yaml.dump(users, file, default_flow_style=False)
def reset_password(authenticator, users):
    """Show the "reset password" form for a logged-in user and save.

    Args:
        authenticator: Object exposing ``reset_password``; in this file it is
            the ``stauth.Authenticate`` instance built by ``user_login_create``.
        users: The user-database mapping that is dumped back to disk as YAML.

    The form is only rendered when ``st.session_state["authentication_status"]``
    is truthy. Exceptions from the widget are shown with ``st.error``.
    """
    if st.session_state["authentication_status"]:
        try:
            if authenticator.reset_password(st.session_state["username"], 'Reset password'):
                st.success('Password modified successfully')
        except Exception as e:
            st.error(e)

    # USER_DATABASE_PATH lives in the folder watched by the commit scheduler,
    # so take the scheduler's lock while writing (as create_new_user does) to
    # avoid a background commit picking up a half-written file.
    with USER_DATABASE_UPDATE_SCHEDULER.lock:
        with open(USER_DATABASE_PATH, 'w') as file:
            yaml.dump(users, file, default_flow_style=False)
def user_login_create(userDataFile=None):
    """Build the sidebar login / register / account-details UI.

    Loads the YAML user database, creates the authenticator (stored in the
    module-level ``AUTHENTICATOR``), renders the three sidebar tabs and
    records a successful login in ``USER_LOGGED_IN`` and
    ``user_evaluation_variables.USERNAME``.

    Args:
        userDataFile: Optional path of the YAML user database to load. When
            omitted, ``USER_DATABASE_PATH`` is used if that file exists,
            otherwise ``./data/user_database.yaml``.

    Returns:
        The module-level ``USER_LOGGED_IN`` flag (set to True here when the
        login widget reports an authenticated user).
    """
    global AUTHENTICATOR
    global USER_LOGGED_IN

    if userDataFile is None:
        # The previous body read a name that was never defined anywhere in
        # this file (NameError on every call). Resolve a concrete file instead.
        # NOTE(review): the fallback is the YAML path this module first binds
        # to USER_DATABASE_PATH -- confirm that is the intended seed database.
        userDataFile = USER_DATABASE_PATH
        if not Path(userDataFile).is_file():
            userDataFile = './data/user_database.yaml'

    with open(userDataFile) as file:
        users = yaml.load(file, Loader=SafeLoader)

    AUTHENTICATOR = stauth.Authenticate(
        users['credentials'],
        users['cookie']['name'],
        users['cookie']['key'],
        users['cookie']['expiry_days'],
        users['preauthorized']
    )

    with st.sidebar:
        st.image(TBYB_LOGO, width=70)
        loginTab, registerTab, detailsTab = st.tabs(["Log in", "Register", "Account details"])

        with loginTab:
            name, authentication_status, username = AUTHENTICATOR.login('Login', 'main')
            if authentication_status:
                AUTHENTICATOR.logout('Logout', 'main')
                st.write(f'Welcome *{name}*')
                user_evaluation_variables.USERNAME = username
                USER_LOGGED_IN = True
            elif authentication_status is False:
                # Credentials were submitted and rejected.
                st.error('Username/password is incorrect')
                forgot_password(AUTHENTICATOR, users)
            elif authentication_status is None:
                # Nothing has been submitted yet.
                st.warning('Please enter your username and password')
                forgot_password(AUTHENTICATOR, users)

        if not authentication_status:
            # Registration is only offered while nobody is logged in.
            with registerTab:
                create_new_user(AUTHENTICATOR, users)
        else:
            with detailsTab:
                st.write('**Username:** ', username)
                st.write('**Name:** ', name)
                st.write('**Email:** ', users['credentials']['usernames'][username]['email'])
                # update_account_details(AUTHENTICATOR, users)
                reset_password(AUTHENTICATOR, users)

    return USER_LOGGED_IN
def setup_page_banner():
    """Render the page banner: centred logo, title and tagline.

    The logo is placed in the middle one of nine equal-width columns; the
    other eight columns are left empty.
    """
    columns = st.columns(9)
    with columns[4]:
        st.image(TBYB_LOGO, use_column_width=True)
    st.title('Try Before You Bias (TBYB)')
    st.write('*A Quantitative T2I Bias Evaluation Tool*')
def setup_how_to():
    """Render the collapsible "How to Use" guide with its example images.

    The guide is written in three text chunks, with one example image after
    each of the first two chunks.
    """
    login_and_setup_steps = "".join([
        "1. Login to your TBYB Account using the bar on the right\n",
        "2. Navigate to the '\U0001F527 Setup' tab and input the ID of the HuggingFace \U0001F917 T2I model you want to evaluate\n",
    ])
    test_prompt_step = "3. Test your chosen model by generating an image using an input prompt e.g.: 'A corgi with some cool sunglasses'\n"
    remaining_steps = "".join([
        "4. Navigate to the '\U0001F30E General Eval.' or '\U0001F3AF Task-Oriented Eval.' tabs ",
        " to evaluate your model once it has been loaded\n",
        "5. Once you have generated some evaluation images, head over to the '\U0001F4C1 Generated Images' tab to have a look at them\n",
        "6. To check out your evaluations or all of the TBYB Community evaluations, head over to the '\U0001F4CA Model Comparison' tab\n",
        "7. For more information about the evaluation process, see our paper at --PAPER HYPERLINK-- or navigate to the ",
        " '\U0001F4F0 Additional Information' tab for a TL;DR.\n",
        "8. For any questions or to report any bugs/issues. Please contact jordan.vice@uwa.edu.au.\n",
    ])

    how_to = st.expander("How to Use")
    how_to.write(login_and_setup_steps)
    how_to.image(Image.open('./assets/HF_MODEL_ID_EXAMPLE.png'))
    how_to.write(test_prompt_step)
    how_to.image(Image.open('./assets/lykon_corgi.png'))
    how_to.write(remaining_steps)
def setup_additional_information_tab(tab):
    """Fill ``tab`` with the static "Additional Information" write-up.

    Args:
        tab: A Streamlit container (one of the objects returned by
            ``st.tabs``) used as a context manager; everything below is
            rendered inside it.

    The content is four numbered sections: the paper reference, a brief
    summary with a flowchart, a three-column explanation of the metrics
    (one expander per metric), the tool's constraints, and the misuse
    disclaimer.
    """
    with tab:
        st.header("1. Quantifying Bias in Text-to-Image (T2I) Generative Models")
        st.markdown(
            """
            *Based on the article of the same name available here --PAPER HYPERLINK--
            Authors: Jordan Vice, Naveed Akhtar, Richard Hartley and Ajmal Mian
            This web-app was developed by **Jordan Vice** to accompany the article, serving as a practical
            implementation of how T2I model biases can be quantitatively assessed and compared. Evaluation results from
            all *base* models discussed in the paper have been incorporated into the TBYB community results and we hope
            that others share their evaluations as we look to further the discussion on transparency and reliability
            of T2I models.
            """)

        st.header('2. A (very) Brief Summary')
        st.image(Image.open('./assets/TBYB_flowchart.png'))
        st.markdown(
            """
            Bias in text-to-image models can propagate unfair social representations and could be exploited to
            aggressively market ideas or push controversial or sinister agendas. Existing T2I model bias evaluation
            methods focused on social biases. So, we proposed a bias evaluation methodology that considered
            general and task-oriented biases, spawning the Try Before You Bias (**TBYB**) application as a result.
            """
        )
        st.markdown(
            """
            We proposed three novel metrics to quantify T2I model biases:
            1. Distribution Bias - $B_D$
            2. Jaccard Hallucination - $H_J$
            3. Generative Miss Rate - $M_G$
            Open the appropriate drop-down menu to understand the logic and inspiration behind metric.
            """
        )

        # One expander per metric, laid out side by side.
        c1, c2, c3 = st.columns(3)
        with c1:
            with st.expander("Distribution Bias - $B_D$"):
                # The trapezoid term pairs each value with its successor
                # (n_i and n_{i+1}); the subscript previously read "i=1".
                st.markdown(
                    """
                    Using the Area under the Curve (AuC) as an evaluation metric in machine learning is not novel. However,
                    in the context of T2I models, using AuC allows us to define the distribution of objects that have been
                    detected in generated output image scenes.
                    So, everytime an object is detected in a scene, we update a dictionary (which is available for
                    download after running an evaluation). After evaluating a full set of images, you can use this
                    information to determine what objects appear more frequently than others.
                    After all images are evaluated, we sort the objects in descending order and normalize the data. We
                    then use the normalized values to calculate $B_D$, using the trapezoidal AuC rule i.e.:
                    $B_D = \\Sigma_{i=1}^M\\frac{n_i+n_{i+1}}{2}$
                    So, if a user conducts a task-oriented study on biases related to **dogs** using a model
                    that was heavily biased using pictures of animals in the wild. You might find that after running
                    evaluations, the most common objects detected were trees and grass - even if these objects weren't
                    specified in the prompt. This would result in a very low $B_D$ in comparison to a model that for
                    example was trained on images of dogs and animals in various different scenarios $\\rightarrow$
                    which would result in a *higher* $B_D$ in comparison.
                    """
                )
        with c2:
            with st.expander("Jaccard Hallucination - $H_J$"):
                # Every LaTeX backslash is written as "\\" so the literal
                # contains no invalid escape sequences; the rendered text is
                # the same as with the single-backslash spelling.
                st.markdown(
                    """
                    Hallucination is a very common phenomena that is discussed in relation to generative AI, particularly
                    in relation to some of the most popular large language models. Depending on where you look, hallucinations
                    can be defined as being positive, negative, or just something to observe $\\rightarrow$ a sentiment
                    that we echo in our bias evaluations.
                    Now, how does hallucination tie into bias? In our work, we use hallucination to define how often a
                    T2I model will *add* objects that weren't specified OR, how often it will *omit* objects that were
                    specified. This indicates that there could be an innate shift in bias in the model, causing it to
                    add or omit certain objects.
                    Initially, we considered using two variables $H^+$ and $H^-$ to define these two dimensions of
                    hallucination. Then, we considered the Jaccard similarity coefficient, which
                    measures the similarity *and* diversity of two sets of objects/samples - defining this as
                    Jaccard Hallucination - $H_J$.
                    Simply put, we define the set of objects detected in the input prompt and then detect the objects in
                    the corresponding output image. Then, we determine the intersect over union. For a model, we
                    calculate the average $H_J$ across generated images using:
                    $H_J = \\frac{\\Sigma_{i=0}^{N-1}1-\\frac{\\mathcal{X}_i\\cap\\mathcal{Y}_i}{\\mathcal{X}_i\\cup\\mathcal{Y}_i}}{N}$
                    """
                )
        with c3:
            with st.expander("Generative Miss Rate - $M_G$"):
                st.markdown(
                    """
                    Whenever fairness and trust are discussed in the context of machine learning and AI systems,
                    performance is always highlighted as a key metric - regardless of the downstream task. So, in terms
                    of evaluating bias, we thought that it would be important to see if there was a correlation
                    between bias and performance (as we predicted). And while the other metrics do evaluate biases
                    in terms of misalignment, they do not consider the relationship between bias and performance.
                    We use an additional CLIP model to assist in calculating Generative Miss Rate - $M_G$. Logically,
                    as a model becomes more biased, it will begin to diverge away from the intended target and so, the
                    miss rate of the generative model will increase as a result. This was a major consideration when
                    designing this metric.
                    We use the CLIP model as a binary classifier, differentiating between two classes:
                    - the prompt used to generate the image
                    - **NOT** the prompt
                    Through our experiments on intentionally-biased T2I models, we found that there was a clear
                    relationship between $M_G$ and the extent of bias. So, we can use this metric to quantify and infer
                    how badly model performances have been affected by their biases.
                    """
                )

        st.header('3. TBYB Constraints')
        st.markdown(
            """
            While we have attempted to design a comprehensive, automated bias evaluation tool. We must acknowledge that
            in its infancy, TBYB has some constraints:
            - We have not checked the validity of *every* single T2I model and model type on HuggingFace so we cannot
            promise that all T2I models will work - if you run into any issues that you think should be possible, feel
            free to reach out!
            - Currently, a model_index.json file is required to load models and use them with TBYB, we will look to
            address other models in future works
            - TBYB only works on T2I models hosted on HuggingFace, other model repositories are not currently supported
            - Adaptor models are not currently supported, we will look to add evaluation functionalities of these
            models in the future.
            - Download, generation, inference and evaluation times are all hardware dependent.
            Keep in mind that these constraints may be removed or added to any time.
            """)

        st.header('4. Misuse, Malicious Use, and Out-of-Scope Use')
        st.markdown(
            """
            Given this application is used for the assessment of T2I biases and relies on
            pre-trained models available on HuggingFace, we are not responsible for any content generated
            by public-facing models that have been used to generate images using this application.
            TBYB is proposed as an auxiliary tool to assess model biases and thus, if a chosen model is found to output
            insensitive, disturbing, distressing or offensive images that propagate harmful stereotypes or
            representations of marginalised groups, please address your concerns to the model providers.
            However, given the TBYB tool is designed for bias quantification and is driven by transparency, it would be
            beneficial to the TBYB community to share evaluations of biased T2I models!
            We share no association with HuggingFace \U0001F917, we only use their services as a model repository,
            given their growth in popularity in the computer science community recently.
            For further questions/queries or if you want to simply strike a conversation,
            please reach out to Jordan Vice at: jordan.vice@uwa.edu.au""")
# ---------------------------------------------------------------------------
# Page script: banner and how-to are always shown; the evaluation tabs are
# only built when user_login_create() reports a logged-in user.
# ---------------------------------------------------------------------------
setup_page_banner()
setup_how_to()
if user_login_create():
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["\U0001F527 Setup", "\U0001F30E General Eval.", "\U0001F3AF Task-Oriented Eval.",
                                                  "\U0001F4CA Model Comparison", "\U0001F4C1 Generated Images", "\U0001F4F0 Additional Information"])
    setup_additional_information_tab(tab6)

    # PLASTER THE LOGO EVERYWHERE
    # Placeholder content for the tabs that depend on a loaded model.
    tab2.subheader("General Bias Evaluation")
    tab2.write("Waiting for \U0001F527 Setup to be complete...")
    tab3.subheader("Task-Oriented Bias Evaluation")
    tab3.write("Waiting for \U0001F527 Setup to be complete...")
    tab4.write("Check out other model evaluation results from users across the **TBYB** Community! \U0001F30E ")
    tab4.write("You can also just compare your own model evaluations by clicking the '*Personal Evaluation*' buttons")
    MCOMP.initialise_page(tab4)
    tab5.subheader("Generated Images from General and Task-Oriented Bias Evaluations")
    tab5.write("Waiting for \U0001F527 Setup to be complete...")

    with tab1:
        # Form 1: the user names the HuggingFace model to evaluate.
        with st.form("model_definition_form", clear_on_submit=True):
            modelID = st.text_input('Input the HuggingFace \U0001F917 T2I model_id for the model you '
                                    'want to analyse e.g.: "runwayml/stable-diffusion-v1-5"')
            submitted1 = st.form_submit_button("Submit")
            if modelID:
                with st.spinner('Checking if ' + modelID + ' is valid and downloading it (if required)'):
                    # A None result is treated as "model not found" (see the else branch).
                    modelLoaded = MLOAD.check_if_model_exists(modelID)
                    if modelLoaded is not None:
                        # st.write("Located " + modelID + " model_index.json file")
                        st.write("Located " + modelID)
                        modelType = MLOAD.get_model_info(modelLoaded)
                        if modelType is not None:
                            st.write("Model is of Type: ", modelType)
                        if submitted1:
                            # The imported pipeline is stored on the MINFER module so
                            # the test-image form below can use it.
                            # NOTE(review): modelType may still be None here -- confirm
                            # MLOAD.import_model handles that.
                            MINFER.TargetModel = MLOAD.import_model(modelID, modelType)
                            if MINFER.TargetModel is not None:
                                st.write("Text-to-image pipeline looks like this:")
                                st.write(MINFER.TargetModel)
                                user_evaluation_variables.MODEL = modelID
                                user_evaluation_variables.MODEL_TYPE = modelType
                    else:
                        st.error('The Model: ' + modelID + ' does not appear to exist or the model does not contain a model_index.json file.'
                                 ' Please check that that HuggingFace repo ID is valid.'
                                 ' For more help, please see the "How to Use" Tab above.', icon="🚨")
        if modelID:
            # Form 2: optional smoke test -- generate one image from a free-text prompt.
            with st.form("example_image_gen_form", clear_on_submit=True):
                testPrompt = st.text_input('Input a random test prompt to test out your '
                                           'chosen model and see if its generating images:')
                submitted2 = st.form_submit_button("Submit")
                if testPrompt and submitted2:
                    with st.spinner("Generating an image with the prompt:\n"+testPrompt+"(This may take some time)"):
                        # NOTE(review): MINFER.TargetModel is not checked for None
                        # before this call -- confirm generate_test_image copes.
                        testImage = MINFER.generate_test_image(MINFER.TargetModel, testPrompt)
                    st.image(testImage, caption='Model: ' + modelID + ' Prompt: ' + testPrompt)
                    st.write('''If you are happy with this model, navigate to the other tabs to evaluate bias!
                    Otherwise, feel free to load up a different model and run it again''')
            # Hand the dependent tabs over to tab_manager once a pipeline is available
            # (presumably this replaces the "Waiting for Setup" placeholders -- verify).
            if MINFER.TargetModel is not None:
                tab_manager.completed_setup([tab2, tab3, tab4, tab5], modelID)
else:
    # Not logged in: clear the comparison data and both sets of evaluation
    # variables, then prompt the visitor to log in or register.
    MCOMP.databaseDF = None
    user_evaluation_variables.reset_variables('general')
    user_evaluation_variables.reset_variables('task-oriented')
    st.write('')
    st.warning('Log in or register your email to get started! ', icon="⚠️")