Spaces:
Sleeping
Sleeping
File size: 20,732 Bytes
0560487 14bd377 0560487 14bd377 0560487 14bd377 0560487 14bd377 0560487 14bd377 0560487 14bd377 0560487 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 |
import streamlit as st
st.set_page_config(layout="wide")
import streamlit_authenticator as stauth
import pandas as pd
import numpy as np
import uuid
import model_comparison as MCOMP
import model_loading as MLOAD
import model_inferencing as MINFER
import user_evaluation_variables
from pathlib import Path
import tab_manager
import yaml
from yaml.loader import SafeLoader
from PIL import Image
from huggingface_hub import CommitScheduler, HfApi, CommitOperationAdd
# Authenticator instance; assigned by user_login_create() when it builds the login UI.
AUTHENTICATOR = None
# Logo image reused by the sidebar and the page banner.
TBYB_LOGO = Image.open('./assets/TBYB_logo_light.png')
# Flipped to True by user_login_create() when a login succeeds.
USER_LOGGED_IN = False
# Fixed-name user database; written by forgot_password(), update_account_details()
# and reset_password().
USER_DATABASE_PATH = './data/user_database.yaml'
# User database file whose name carries a random UUID generated when this module runs;
# read by user_login_create() and written by create_new_user().
# NOTE(review): nothing visible in this file creates this file before
# user_login_create() opens it for reading — confirm it is seeded elsewhere.
userDataFile = Path("data/") / f"user_database_{uuid.uuid4()}.yaml"
# Folder handed to the scheduler below (the parent of userDataFile, i.e. "data").
userDataFolder = userDataFile.parent
# Scheduler that uploads userDataFolder to the "data" path of the dataset repo.
# NOTE(review): `every` is documented by huggingface_hub as a number of minutes —
# confirm 5 is the intended interval.
USER_DATABASE_UPDATE_SCHEDULER = CommitScheduler(
    repo_id="JVice/try-before-you-bias",
    repo_type="dataset",
    folder_path=userDataFolder,
    path_in_repo="data",
    every=5,
)
def create_new_user(authenticator, users):
    """Show the registration form and write the user database to disk.

    Args:
        authenticator: Object whose ``register_user`` method is called with the
            form title ``'Register user'`` and ``preauthorization=False``.
        users: Data serialised as YAML into ``userDataFile``.

    Any exception raised while registering is displayed with ``st.error``
    instead of propagating. The YAML dump happens on every call, whether or
    not a registration took place.
    """
    try:
        registered = authenticator.register_user('Register user', preauthorization=False)
        if registered:
            st.success('User registered successfully')
    except Exception as err:
        st.error(err)

    # The scheduler lock is held for the duration of the write; presumably this
    # keeps the background upload from picking up a half-written file.
    with USER_DATABASE_UPDATE_SCHEDULER.lock, open(userDataFile, 'w') as db_file:
        yaml.dump(users, db_file, default_flow_style=False)
    # HfApi().create_commit(
    #     repo_id="JVice/try-before-you-bias",
    #     operations=[CommitOperationAdd(path_in_repo="data/user_database.yaml",
    #                                    path_or_fileobj="~/repo/data/user_database.yaml")],
    #     commit_message="Updating ",
    # )
def forgot_password(authenticator, users):
    """Show the "Forgot password" form and write the user database to disk.

    Args:
        authenticator: Object whose ``forgot_password`` method is called with
            the form title ``'Forgot password'`` and returns a
            ``(username, email, new_password)`` triple.
        users: Data serialised as YAML into ``USER_DATABASE_PATH``.

    Any exception raised by the authenticator call is displayed with
    ``st.error`` instead of propagating. The YAML dump happens on every call.
    """
    try:
        # Only the username is inspected; the e-mail and generated password are
        # not used by this function.
        username, _email, _new_password = authenticator.forgot_password('Forgot password')
        if username:
            st.success('New password to be sent securely')
            # Random password should be transferred to user securely
    except Exception as e:
        st.error(e)

    # USER_DATABASE_PATH lives in the folder watched by the commit scheduler, so
    # take the scheduler lock while writing, as create_new_user() does.
    # NOTE(review): user_login_create() loads the database from userDataFile,
    # whereas this writes USER_DATABASE_PATH — confirm which file is meant to be
    # authoritative.
    with USER_DATABASE_UPDATE_SCHEDULER.lock:
        with open(USER_DATABASE_PATH, 'w') as file:
            yaml.dump(users, file, default_flow_style=False)
def update_account_details(authenticator, users):
    """Show the "Update user details" form for the logged-in user and save.

    Does nothing unless ``st.session_state["authentication_status"]`` is
    truthy. The only call site visible in this file is commented out.

    Args:
        authenticator: Object whose ``update_user_details`` method is called
            with the session's username and the form title
            ``'Update user details'``.
        users: Data serialised as YAML into ``USER_DATABASE_PATH``.

    Any exception raised by the authenticator call is displayed with
    ``st.error`` instead of propagating.
    """
    if st.session_state["authentication_status"]:
        try:
            if authenticator.update_user_details(st.session_state["username"], 'Update user details'):
                st.success('Entries updated successfully')
        except Exception as e:
            st.error(e)

        # USER_DATABASE_PATH lives in the folder watched by the commit
        # scheduler, so take the scheduler lock while writing, as
        # create_new_user() does.
        # NOTE(review): user_login_create() loads the database from
        # userDataFile, whereas this writes USER_DATABASE_PATH — confirm which
        # file is meant to be authoritative.
        with USER_DATABASE_UPDATE_SCHEDULER.lock:
            with open(USER_DATABASE_PATH, 'w') as file:
                yaml.dump(users, file, default_flow_style=False)
def reset_password(authenticator, users):
    """Show the "Reset password" form for the logged-in user and save.

    Does nothing unless ``st.session_state["authentication_status"]`` is
    truthy.

    Args:
        authenticator: Object whose ``reset_password`` method is called with
            the session's username and the form title ``'Reset password'``.
        users: Data serialised as YAML into ``USER_DATABASE_PATH``.

    Any exception raised by the authenticator call is displayed with
    ``st.error`` instead of propagating.
    """
    if st.session_state["authentication_status"]:
        try:
            if authenticator.reset_password(st.session_state["username"], 'Reset password'):
                st.success('Password modified successfully')
        except Exception as e:
            st.error(e)

        # USER_DATABASE_PATH lives in the folder watched by the commit
        # scheduler, so take the scheduler lock while writing, as
        # create_new_user() does.
        # NOTE(review): user_login_create() loads the database from
        # userDataFile, whereas this writes USER_DATABASE_PATH — confirm which
        # file is meant to be authoritative.
        with USER_DATABASE_UPDATE_SCHEDULER.lock:
            with open(USER_DATABASE_PATH, 'w') as file:
                yaml.dump(users, file, default_flow_style=False)
def user_login_create():
    """Build the sidebar login / register / account UI and report login state.

    Loads the user database from ``userDataFile``, creates the module-level
    ``AUTHENTICATOR`` from it, and renders three sidebar tabs:

    * "Log in": the login form, plus the forgot-password form while the user
      is not logged in.
    * "Register": the registration form, only while not logged in.
    * "Account details": username, name, e-mail and the reset-password form,
      only while logged in.

    On a successful login the username is stored in
    ``user_evaluation_variables.USERNAME`` and the module-level
    ``USER_LOGGED_IN`` flag is set to True. This function never sets the flag
    back to False.

    Returns:
        The current value of ``USER_LOGGED_IN``.
    """
    global AUTHENTICATOR
    global USER_LOGGED_IN

    with open(userDataFile) as file:
        users = yaml.load(file, Loader=SafeLoader)

    AUTHENTICATOR = stauth.Authenticate(
        users['credentials'],
        users['cookie']['name'],
        users['cookie']['key'],
        users['cookie']['expiry_days'],
        users['preauthorized']
    )

    with st.sidebar:
        st.image(TBYB_LOGO, width=70)
        loginTab, registerTab, detailsTab = st.tabs(["Log in", "Register", "Account details"])

        with loginTab:
            name, authentication_status, username = AUTHENTICATOR.login('Login', 'main')
            if authentication_status:
                AUTHENTICATOR.logout('Logout', 'main')
                st.write(f'Welcome *{name}*')
                user_evaluation_variables.USERNAME = username
                USER_LOGGED_IN = True
            elif authentication_status is False:
                # A login was attempted and rejected.
                st.error('Username/password is incorrect')
                forgot_password(AUTHENTICATOR, users)
            elif authentication_status is None:
                # No login has been attempted yet.
                st.warning('Please enter your username and password')
                forgot_password(AUTHENTICATOR, users)

        if not authentication_status:
            with registerTab:
                create_new_user(AUTHENTICATOR, users)
        else:
            with detailsTab:
                st.write('**Username:** ', username)
                st.write('**Name:** ', name)
                st.write('**Email:** ', users['credentials']['usernames'][username]['email'])
                # update_account_details(AUTHENTICATOR, users)
                reset_password(AUTHENTICATOR, users)

    return USER_LOGGED_IN
def setup_page_banner():
    """Render the page banner: the logo, the title and the tagline.

    The logo is placed in the fifth of nine columns; the other eight columns
    are left empty.
    """
    columns = st.columns(9)
    with columns[4]:
        st.image(TBYB_LOGO, use_column_width=True)

    st.title('Try Before You Bias (TBYB)')
    st.write('*A Quantitative T2I Bias Evaluation Tool*')
def setup_how_to():
    """Render the collapsible "How to Use" guide.

    The guide is three runs of numbered steps; the first two runs are each
    followed by an illustrative image loaded from ``./assets``.
    """
    guide = st.expander("How to Use")

    login_and_setup = (
        "1. Login to your TBYB Account using the bar on the right\n"
        "2. Navigate to the '\U0001F527 Setup' tab and input the ID of the HuggingFace \U0001F917 T2I model you want to evaluate\n"
    )
    test_generation = (
        "3. Test your chosen model by generating an image using an input prompt e.g.: 'A corgi with some cool sunglasses'\n"
    )
    evaluation_and_help = (
        "4. Navigate to the '\U0001F30E General Eval.' or '\U0001F3AF Task-Oriented Eval.' tabs "
        " to evaluate your model once it has been loaded\n"
        "5. Once you have generated some evaluation images, head over to the '\U0001F4C1 Generated Images' tab to have a look at them\n"
        "6. To check out your evaluations or all of the TBYB Community evaluations, head over to the '\U0001F4CA Model Comparison' tab\n"
        "7. For more information about the evaluation process, see our paper at --PAPER HYPERLINK-- or navigate to the "
        " '\U0001F4F0 Additional Information' tab for a TL;DR.\n"
        "8. For any questions or to report any bugs/issues. Please contact jordan.vice@uwa.edu.au.\n"
    )

    # Each entry is (steps text, image shown right after it or None).
    sections = (
        (login_and_setup, './assets/HF_MODEL_ID_EXAMPLE.png'),
        (test_generation, './assets/lykon_corgi.png'),
        (evaluation_and_help, None),
    )
    for steps_text, image_path in sections:
        guide.write(steps_text)
        if image_path is not None:
            guide.image(Image.open(image_path))
def setup_additional_information_tab(tab):
    """Fill the "Additional Information" tab with the static write-up.

    Renders, inside ``tab``: the paper attribution, a brief summary with the
    flowchart image, one expander per metric (Distribution Bias, Jaccard
    Hallucination, Generative Miss Rate), the known constraints, and the
    misuse disclaimer.

    Args:
        tab: Container used as a context manager (``with tab:``); the caller
            in this file passes one of the objects returned by ``st.tabs``.

    The Markdown bodies are kept flush-left inside the triple-quoted strings so
    that the text handed to ``st.markdown`` carries no leading indentation.
    LaTeX backslashes are doubled because these are ordinary (non-raw) string
    literals.
    """
    with tab:
        st.header("1. Quantifying Bias in Text-to-Image (T2I) Generative Models")
        st.markdown(
            """
*Based on the article of the same name available here --PAPER HYPERLINK--
Authors: Jordan Vice, Naveed Akhtar, Richard Hartley and Ajmal Mian
This web-app was developed by **Jordan Vice** to accompany the article, serving as a practical
implementation of how T2I model biases can be quantitatively assessed and compared. Evaluation results from
all *base* models discussed in the paper have been incorporated into the TBYB community results and we hope
that others share their evaluations as we look to further the discussion on transparency and reliability
of T2I models.
""")

        st.header('2. A (very) Brief Summary')
        st.image(Image.open('./assets/TBYB_flowchart.png'))
        st.markdown(
            """
Bias in text-to-image models can propagate unfair social representations and could be exploited to
aggressively market ideas or push controversial or sinister agendas. Existing T2I model bias evaluation
methods focused on social biases. So, we proposed a bias evaluation methodology that considered
general and task-oriented biases, spawning the Try Before You Bias (**TBYB**) application as a result.
"""
        )
        st.markdown(
            """
We proposed three novel metrics to quantify T2I model biases:
1. Distribution Bias - $B_D$
2. Jaccard Hallucination - $H_J$
3. Generative Miss Rate - $M_G$
Open the appropriate drop-down menu to understand the logic and inspiration behind metric.
"""
        )

        c1, c2, c3 = st.columns(3)
        with c1:
            with st.expander("Distribution Bias - $B_D$"):
                # The trapezoidal rule averages each value with its successor,
                # hence n_{i+1} in the formula below.
                st.markdown(
                    """
Using the Area under the Curve (AuC) as an evaluation metric in machine learning is not novel. However,
in the context of T2I models, using AuC allows us to define the distribution of objects that have been
detected in generated output image scenes.
So, everytime an object is detected in a scene, we update a dictionary (which is available for
download after running an evaluation). After evaluating a full set of images, you can use this
information to determine what objects appear more frequently than others.
After all images are evaluated, we sort the objects in descending order and normalize the data. We
then use the normalized values to calculate $B_D$, using the trapezoidal AuC rule i.e.:
$B_D = \\Sigma_{i=1}^M\\frac{n_i+n_{i+1}}{2}$
So, if a user conducts a task-oriented study on biases related to **dogs** using a model
that was heavily biased using pictures of animals in the wild. You might find that after running
evaluations, the most common objects detected were trees and grass - even if these objects weren't
specified in the prompt. This would result in a very low $B_D$ in comparison to a model that for
example was trained on images of dogs and animals in various different scenarios $\\rightarrow$
which would result in a *higher* $B_D$ in comparison.
"""
                )
        with c2:
            with st.expander("Jaccard Hallucination - $H_J$"):
                st.markdown(
                    """
Hallucination is a very common phenomena that is discussed in relation to generative AI, particularly
in relation to some of the most popular large language models. Depending on where you look, hallucinations
can be defined as being positive, negative, or just something to observe $\\rightarrow$ a sentiment
that we echo in our bias evaluations.
Now, how does hallucination tie into bias? In our work, we use hallucination to define how often a
T2I model will *add* objects that weren't specified OR, how often it will *omit* objects that were
specified. This indicates that there could be an innate shift in bias in the model, causing it to
add or omit certain objects.
Initially, we considered using two variables $H^+$ and $H^-$ to define these two dimensions of
hallucination. Then, we considered the Jaccard similarity coefficient, which
measures the similarity *and* diversity of two sets of objects/samples - defining this as
Jaccard Hallucination - $H_J$.
Simply put, we define the set of objects detected in the input prompt and then detect the objects in
the corresponding output image. Then, we determine the intersect over union. For a model, we
calculate the average $H_J$ across generated images using:
$H_J = \\frac{\\Sigma_{i=0}^{N-1}1-\\frac{\\mathcal{X}_i\\cap\\mathcal{Y}_i}{\\mathcal{X}_i\\cup\\mathcal{Y}_i}}{N}$
"""
                )
        with c3:
            with st.expander("Generative Miss Rate - $M_G$"):
                st.markdown(
                    """
Whenever fairness and trust are discussed in the context of machine learning and AI systems,
performance is always highlighted as a key metric - regardless of the downstream task. So, in terms
of evaluating bias, we thought that it would be important to see if there was a correlation
between bias and performance (as we predicted). And while the other metrics do evaluate biases
in terms of misalignment, they do not consider the relationship between bias and performance.
We use an additional CLIP model to assist in calculating Generative Miss Rate - $M_G$. Logically,
as a model becomes more biased, it will begin to diverge away from the intended target and so, the
miss rate of the generative model will increase as a result. This was a major consideration when
designing this metric.
We use the CLIP model as a binary classifier, differentiating between two classes:
- the prompt used to generate the image
- **NOT** the prompt
Through our experiments on intentionally-biased T2I models, we found that there was a clear
relationship between $M_G$ and the extent of bias. So, we can use this metric to quantify and infer
how badly model performances have been affected by their biases.
"""
                )

        st.header('3. TBYB Constraints')
        st.markdown(
            """
While we have attempted to design a comprehensive, automated bias evaluation tool. We must acknowledge that
in its infancy, TBYB has some constraints:
- We have not checked the validity of *every* single T2I model and model type on HuggingFace so we cannot
promise that all T2I models will work - if you run into any issues that you think should be possible, feel
free to reach out!
- Currently, a model_index.json file is required to load models and use them with TBYB, we will look to
address other models in future works
- TBYB only works on T2I models hosted on HuggingFace, other model repositories are not currently supported
- Adaptor models are not currently supported, we will look to add evaluation functionalities of these
models in the future.
- Download, generation, inference and evaluation times are all hardware dependent.
Keep in mind that these constraints may be removed or added to any time.
""")

        st.header('4. Misuse, Malicious Use, and Out-of-Scope Use')
        st.markdown(
            """
Given this application is used for the assessment of T2I biases and relies on
pre-trained models available on HuggingFace, we are not responsible for any content generated
by public-facing models that have been used to generate images using this application.
TBYB is proposed as an auxiliary tool to assess model biases and thus, if a chosen model is found to output
insensitive, disturbing, distressing or offensive images that propagate harmful stereotypes or
representations of marginalised groups, please address your concerns to the model providers.
However, given the TBYB tool is designed for bias quantification and is driven by transparency, it would be
beneficial to the TBYB community to share evaluations of biased T2I models!
We share no association with HuggingFace \U0001F917, we only use their services as a model repository,
given their growth in popularity in the computer science community recently.
For further questions/queries or if you want to simply strike a conversation,
please reach out to Jordan Vice at: jordan.vice@uwa.edu.au""")
# Page chrome that is shown regardless of login state.
setup_page_banner()
setup_how_to()

# user_login_create() renders the sidebar auth UI and returns the logged-in flag;
# the evaluation tabs are only built for a logged-in user.
if user_login_create():
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["\U0001F527 Setup", "\U0001F30E General Eval.", "\U0001F3AF Task-Oriented Eval.",
                                                  "\U0001F4CA Model Comparison", "\U0001F4C1 Generated Images", "\U0001F4F0 Additional Information"])
    setup_additional_information_tab(tab6)

    # PLASTER THE LOGO EVERYWHERE
    # Placeholder headings/text for the evaluation and image tabs.
    tab2.subheader("General Bias Evaluation")
    tab2.write("Waiting for \U0001F527 Setup to be complete...")
    tab3.subheader("Task-Oriented Bias Evaluation")
    tab3.write("Waiting for \U0001F527 Setup to be complete...")
    tab4.write("Check out other model evaluation results from users across the **TBYB** Community! \U0001F30E ")
    tab4.write("You can also just compare your own model evaluations by clicking the '*Personal Evaluation*' buttons")
    MCOMP.initialise_page(tab4)
    tab5.subheader("Generated Images from General and Task-Oriented Bias Evaluations")
    tab5.write("Waiting for \U0001F527 Setup to be complete...")

    with tab1:
        # Form 1: the user names a model; it is checked, described and (on submit) imported.
        with st.form("model_definition_form", clear_on_submit=True):
            modelID = st.text_input('Input the HuggingFace \U0001F917 T2I model_id for the model you '
                                    'want to analyse e.g.: "runwayml/stable-diffusion-v1-5"')
            submitted1 = st.form_submit_button("Submit")
            if modelID:
                with st.spinner('Checking if ' + modelID + ' is valid and downloading it (if required)'):
                    # A None result is treated as "model not found" (see the else branch).
                    modelLoaded = MLOAD.check_if_model_exists(modelID)
                    if modelLoaded is not None:
                        # st.write("Located " + modelID + " model_index.json file")
                        st.write("Located " + modelID)
                        modelType = MLOAD.get_model_info(modelLoaded)
                        if modelType is not None:
                            st.write("Model is of Type: ", modelType)
                            if submitted1:
                                # The imported pipeline is stored on the MINFER module for use below.
                                MINFER.TargetModel = MLOAD.import_model(modelID, modelType)
                                if MINFER.TargetModel is not None:
                                    st.write("Text-to-image pipeline looks like this:")
                                    st.write(MINFER.TargetModel)
                                    # Record which model the evaluations will refer to.
                                    user_evaluation_variables.MODEL = modelID
                                    user_evaluation_variables.MODEL_TYPE = modelType
                    else:
                        st.error('The Model: ' + modelID + ' does not appear to exist or the model does not contain a model_index.json file.'
                                 ' Please check that that HuggingFace repo ID is valid.'
                                 ' For more help, please see the "How to Use" Tab above.', icon="🚨")
        # Form 2: only offered once a model ID has been entered; generates one test image.
        if modelID:
            with st.form("example_image_gen_form", clear_on_submit=True):
                testPrompt = st.text_input('Input a random test prompt to test out your '
                                           'chosen model and see if its generating images:')
                submitted2 = st.form_submit_button("Submit")
                if testPrompt and submitted2:
                    with st.spinner("Generating an image with the prompt:\n"+testPrompt+"(This may take some time)"):
                        # NOTE(review): MINFER.TargetModel is passed without a None check here —
                        # confirm generate_test_image copes with a model that failed to import.
                        testImage = MINFER.generate_test_image(MINFER.TargetModel, testPrompt)
                        st.image(testImage, caption='Model: ' + modelID + ' Prompt: ' + testPrompt)
                        st.write('''If you are happy with this model, navigate to the other tabs to evaluate bias!
Otherwise, feel free to load up a different model and run it again''')
        # Once a pipeline is available, pass the remaining tabs and the model ID to tab_manager
        # (presumably to replace the "Waiting for Setup" placeholders — see tab_manager).
        if MINFER.TargetModel is not None:
            tab_manager.completed_setup([tab2, tab3, tab4, tab5], modelID)
else:
    # Not logged in: clear the comparison data and reset both evaluation variable sets,
    # then prompt the visitor to log in.
    MCOMP.databaseDF = None
    user_evaluation_variables.reset_variables('general')
    user_evaluation_variables.reset_variables('task-oriented')
    st.write('')
    st.warning('Log in or register your email to get started! ', icon="⚠️")