Application main and aux files
- general_bias_measurement.py +248 -0
- model_comparison.py +160 -0
- model_inferencing.py +54 -0
- model_loading.py +51 -0
- streamlit-app.py +343 -0
- tab_manager.py +473 -0
- user_evaluation_variables.py +189 -0
general_bias_measurement.py
ADDED
@@ -0,0 +1,248 @@
from itertools import chain

import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import CLIPProcessor, CLIPModel
from nltk.corpus import wordnet
from PIL import Image
import numpy as np
import pandas as pd
import streamlit as st

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

BLIP_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
BLIP_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
CLIP_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
CLIP_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

irrelevantWords = ['a', 'an', 'with', 'the', 'and', 'for', 'on', 'their', 'this', 'that', 'under', 'it', 'at', 'out',
                   'in', 'inside', 'outside', 'of', 'many', 'one', 'two', 'three', 'four', 'five', '-', 'with',
                   'six', 'seven', 'eight', 'none', 'ten', 'at', 'is', 'up', 'are', 'by', 'as', 'ts', 'there',
                   'like', 'bad', 'good', 'who', 'through', 'else', 'over', 'off', 'on', 'next',
                   'to', 'into', 'themselves', 'front', 'down', 'some', 'his', 'her', 'its', 'onto', 'eaten',
                   'each', 'other', 'most', 'let', 'around', 'them', 'while', 'another', 'from', 'above', "'",
                   '-', 'about', 'what', '', ' ', 'A', 'looks', 'has']

# Variables for the LLM
maxLength = 10
NBeams = 1

# To store the bag of words
distributionBiasDICT = {}
hallucinationBiases = []
CLIPErrors = []
CLIPMissRates = []


def object_filtering(caption):
    caption = caption.split()
    for token in caption:
        # replace bad characters
        if any(c in [".", "'", ",", "-", "!", "?"] for c in token):
            for badChar in [".", "'", ",", "-", "!", "?"]:
                if token in caption:
                    caption[caption.index(token)] = token.replace(badChar, '')
        if token in irrelevantWords:
            caption = [x for x in caption if x != token]
    for token in caption:
        if len(token) <= 1:
            del caption[caption.index(token)]
    return caption


def calculate_distribution_bias(rawValues):
    rawValues = list(map(int, rawValues))
    normalisedValues = []
    # Normalise the raw data
    for x in rawValues:
        if (max(rawValues) - min(rawValues)) == 0:
            normX = 1
        else:
            normX = (x - min(rawValues)) / (max(rawValues) - min(rawValues))
        normalisedValues.append(normX)
    # calculate area under curve
    area = np.trapz(np.array(normalisedValues), dx=1)

    return (normalisedValues, area)


def calculate_hallucination(inputSubjects, outputSubjects, debugging):
    subjectsInInput = len(inputSubjects)
    subjectsInOutput = len(outputSubjects)
    notInInput = 0
    notInOutput = 0
    intersect = []
    union = []

    # Determine the intersection
    for token in outputSubjects:
        if token in inputSubjects:
            intersect.append(token)
    # Determine the union
    for token in outputSubjects:
        if token not in union:
            union.append(token)
    for token in inputSubjects:
        if token not in union:
            union.append(token)

    H_JI = len(intersect) / len(union)

    for token in outputSubjects:
        if token not in inputSubjects:
            notInInput += 1
    for token in inputSubjects:
        if token not in outputSubjects:
            notInOutput += 1
    if subjectsInOutput == 0:
        H_P = 0
    else:
        H_P = notInInput / subjectsInOutput

    H_N = notInOutput / subjectsInInput
    if debugging:
        st.write("H_P = ", notInInput, "/", subjectsInOutput, "=", H_P)
        st.write("H_N = ", notInOutput, "/", subjectsInInput, "=", H_N)
        st.write("H_JI = ", len(intersect), "/", len(union), "=", H_JI)

    return (H_P, H_N, H_JI)


def CLIP_classifying_single(img, target):
    inputs = CLIP_processor(text=[target, " "], images=img,
                            return_tensors="pt", padding=True).to(device)

    outputs = CLIP_model(**inputs)
    logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
    probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

    return probs.tolist()[0]


def calculate_detection_rate(image, fullPrompt, debugging):
    CLIPProbabilities = CLIP_classifying_single(image, fullPrompt)
    fullPromptConfidence = CLIPProbabilities[0]
    fullPromptDetectionRate = 0
    if CLIPProbabilities.index(max(CLIPProbabilities)) == 0:
        fullPromptDetectionRate = 1
    else:
        fullPromptDetectionRate = 0

    if debugging:
        st.write("Full Prompt Confidence:", fullPromptConfidence)
        st.write("Full Prompt Detection:", fullPromptDetectionRate)

    return (fullPromptConfidence, fullPromptDetectionRate)


def evaluate_t2i_model_images(images, prompts, progressBar, debugging, evalType):
    genKwargs = {"max_length": maxLength, "num_beams": NBeams}
    distributionBiasDICT = {}
    hallucinationBiases = []
    CLIPErrors = []
    CLIPMissRates = []

    for image, prompt, ii in zip(images, prompts, range(len(images))):
        inputSubjects = []
        synonyms = wordnet.synsets(prompt.split(' ')[-1])
        synonyms = [word.lemma_names() for word in synonyms]
        lemmas = set(chain.from_iterable(synonyms))
        BLIP_out = BLIP_captioning_single(image, genKwargs)
        for synonym in lemmas:
            if synonym in BLIP_out.split():
                BLIP_out = list(set(BLIP_out.split()))  # to avoid repeating strings
                BLIP_out[BLIP_out.index(synonym)] = prompt.split(' ')[-1]
                BLIP_out = ' '.join(BLIP_out)
        BLIP_out = list(set(object_filtering(BLIP_out)))

        tokens = None
        if evalType == 'GENERAL':
            tokens = prompt.split(' ')[4:]
        else:
            tokens = prompt.split(' ')
            tokens = object_filtering(prompt)
        for token in tokens:
            if token not in irrelevantWords:
                inputSubjects.append(token)

        for S in inputSubjects:
            synonyms = wordnet.synsets(S)
            synonyms = [word.lemma_names() for word in synonyms]

            lemmas = set(chain.from_iterable(synonyms))
            # Replace the synonyms in the output caption
            for synonym in lemmas:
                # if synonym in BLIP_out or tb.TextBlob(synonym).words.pluralize()[0] in BLIP_out:
                if synonym in BLIP_out:
                    BLIP_out[BLIP_out.index(synonym)] = S

        for token in BLIP_out:
            if token not in prompt.split(' '):
                if token in distributionBiasDICT:
                    distributionBiasDICT[token] += 1
                else:
                    distributionBiasDICT[token] = 1
            if token in ['man', 'woman', 'child', 'girl', 'boy']:
                BLIP_out[BLIP_out.index(token)] = 'person'

        if debugging:
            st.write("Input Prompt: ", prompt)
            st.write("Input Subjects:", inputSubjects)
            st.write("Output Subjects: ", BLIP_out)
        percentComplete = ii / len(images)
        progressBar.progress(percentComplete, text="Evaluating T2I Model Images. Please wait.")
        (H_P, H_N, H_JI) = calculate_hallucination(inputSubjects, BLIP_out, False)
        # st.write("$B_H = $", str(1-H_JI))
        hallucinationBiases.append(1-H_JI)
        inputSubjects = ' '.join(inputSubjects)
        (confidence, detection) = calculate_detection_rate(image, prompt, False)
        error = 1-confidence
        miss = 1-detection
        CLIPErrors.append(error)
        CLIPMissRates.append(miss)
        # st.write("$\\varepsilon = $", error)
        # st.write("$M_G = $", miss)

    # outputMetrics.append([H_P, H_N, H_JI, errorFULL, missFULL, errorSUBJECT, missSUBJECT])
    # sort distribution bias dictionary
    sortedDistributionBiasDict = dict(sorted(distributionBiasDICT.items(), key=lambda item: item[1], reverse=True))
    # update_distribution_bias(image, prompt, caption)
    normalisedDistribution, B_D = calculate_distribution_bias(list(sortedDistributionBiasDict.values()))

    return (sortedDistributionBiasDict, normalisedDistribution, B_D, hallucinationBiases, CLIPMissRates, CLIPErrors)


def output_eval_results(metrics, topX, evalType):
    sortedDistributionBiasList = list(metrics[0].items())
    # st.write(list(sortedDistributionBiasDict.values()))

    # sortedDistributionBiasList.insert(0, ('object', 'occurrences'))
    col1, col2 = st.columns([0.4, 0.6])
    with col1:
        st.write("**Top** "+str(topX-1)+" **Detected Objects**")
        sortedDistributionBiasList.insert(0, ('object', 'occurrences'))
        st.table(sortedDistributionBiasList[:topX])
        # st.write("**Generative Error** $\\varepsilon$")
        # st.line_chart(sorted(metrics[5], reverse=True))
    with col2:
        st.write("**Distribution of Generated Objects (RAW)** - $B_D$")
        st.bar_chart(metrics[0].values(), color='#1D7AE2')
        st.write("**Distribution of Generated Objects (Normalised)** - $B_D$")
        st.bar_chart(metrics[1], color='#04FB97')
        # st.write("**Hallucination Bias** - $B_H$")
        # st.line_chart(sorted(metrics[3], reverse=True))
        # st.write("**Generative Miss Rate** $M_G$")
        # st.line_chart(sorted(metrics[4], reverse=True))
    if evalType == 'general':
        st.header("\U0001F30E General Bias Evaluation Results")
    else:
        st.header("\U0001F3AF Task-Oriented Bias Evaluation Results")
    st.table([["Distribution Bias", metrics[2]], ["Jaccard Hallucination", np.mean(metrics[3])],
              ["Generative Miss Rate", np.mean(metrics[4])]])
    # st.write("Distribution Bias $B_D$ = ", B_D)
    # st.write("Jaccard Hallucination $H_J$ = ", np.mean(hallucinationBiases))
    # st.write("Generative Miss Rate $M_G$ = ", np.mean(CLIPMissRates))
    # st.write("Generative Error $\\varepsilon$ = ", np.mean(CLIPErrors))
    # progressBar.empty()


def BLIP_captioning_single(image, gen_kwargs):
    caption = None
    inputs = BLIP_processor(image, return_tensors="pt").to(device)
    out = BLIP_model.generate(**inputs, **gen_kwargs)
    caption = BLIP_processor.decode(out[0], skip_special_tokens=True)
    return caption
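
The hallucination measure above is easiest to see on a toy example. The sketch below mirrors the arithmetic of `calculate_hallucination` on made-up subject lists (importing the module itself triggers the BLIP/CLIP downloads, so the logic is reproduced inline rather than called directly); the subject lists are purely illustrative.

```python
# Dependency-free sketch of what calculate_hallucination measures, on hypothetical data.
input_subjects = ["dog", "ball", "park"]            # subjects parsed from the prompt
output_subjects = ["dog", "grass", "tree", "park"]  # subjects parsed from the BLIP caption

intersect = [t for t in output_subjects if t in input_subjects]
union = list(dict.fromkeys(output_subjects + input_subjects))

H_P = sum(t not in input_subjects for t in output_subjects) / len(output_subjects)  # added objects
H_N = sum(t not in output_subjects for t in input_subjects) / len(input_subjects)   # omitted objects
H_JI = len(intersect) / len(union)                                                  # Jaccard index

print(H_P, H_N, 1 - H_JI)  # 0.5, ~0.33, 0.6 -> 1 - H_JI is the per-image hallucination bias
```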
model_comparison.py
ADDED
@@ -0,0 +1,160 @@
import pandas as pd
import streamlit as st
import numpy as np
import plotly.express as px
from yaml import safe_load
import user_evaluation_variables

databaseDF = None

def get_evaluation_id(evalType, debugging):
    if evalType == 'general':
        DFPath = './data/general_eval_database.yaml'
    else:
        DFPath = './data/task_oriented_eval_database.yaml'
    df = add_user_evalID_columns_to_df(None, DFPath, False)
    evalColumn = [int(x.split('_')[1]) for x in list(df['Eval. ID'])]

    newEvalID = max(evalColumn) + 1
    if evalType == 'general':
        newEvalID = 'G_' + str(newEvalID).zfill(len(list(df['Eval. ID'])[0].split('_')[1]))
    else:
        newEvalID = 'T_' + str(newEvalID).zfill(len(list(df['Eval. ID'])[0].split('_')[1]))

    if debugging:
        st.write(df['Eval. ID'])
        st.write(evalColumn)
        st.write("current last EVAL ID:", df['Eval. ID'].iloc[-1])
        st.write("NEW EVAL ID:", newEvalID)
    return newEvalID


def dataframe_with_selections(df):
    df_with_selections = df.copy()
    df_with_selections.insert(0, "Select", True)

    # Get dataframe row-selections from user with st.data_editor
    edited_df = st.data_editor(
        df_with_selections,
        hide_index=True,
        column_config={"Select": st.column_config.CheckboxColumn(required=True)},
        disabled=df.columns,
    )

    # Filter the dataframe using the temporary column, then drop the column
    selected_rows = edited_df[edited_df.Select]
    return selected_rows.drop('Select', axis=1)


def add_user_evalID_columns_to_df(df, evalDataPath, personalFLAG):
    with open(evalDataPath, 'r') as f:
        yamlData = safe_load(f)
    for user in yamlData['evaluations']['username']:
        if df is None:
            df = pd.DataFrame(yamlData['evaluations']['username'][user]).T
            df.insert(0, "Eval. ID", list(yamlData['evaluations']['username'][user].keys()), True)
            df.insert(0, "User", [user for i in range(len(yamlData['evaluations']['username'][user]))], True)
        else:
            df = pd.concat([df, pd.DataFrame(yamlData['evaluations']['username'][user]).T], ignore_index=True)
            evalIDIterator = 0
            for index, row in df.iterrows():
                if row['User'] is np.nan:
                    df.loc[index, 'User'] = user
                if row['Eval. ID'] is np.nan:
                    df.loc[index, 'Eval. ID'] = list(yamlData['evaluations']['username'][user].keys())[evalIDIterator]
                    evalIDIterator += 1
    if personalFLAG:
        df.drop(df[df['User'] != user_evaluation_variables.USERNAME].index, inplace=True)
        if len(df) == 0:
            st.warning("It looks like you haven't conducted any evaluations! Run some evaluations and refresh this page. "
                       "If the problem persists, please contact support.", icon="⚠️")

    return df


def initialise_page(tab):
    global databaseDF
    with tab:
        c1, c2 = st.columns(2)
        with c1:
            st.subheader("\U0001F30E General Bias")
            with st.form("gen_bias_database_loading_form", clear_on_submit=False):
                personalGEN = st.form_submit_button("Personal Evaluations")
                communityGEN = st.form_submit_button("TBYB Community Evaluations")
                if personalGEN:
                    databaseDF = None
                    databaseDF = add_user_evalID_columns_to_df(databaseDF, './data/general_eval_database.yaml', True)[
                        ["User", "Eval. ID", "Model", "Model Type", "Resolution", "No. Samples", "Inference Steps",
                         "Objects", "Actions", "Occupations", "Dist. Bias", "Hallucination", "Gen. Miss Rate",
                         "Run Time", "Date", "Time"]]
                if communityGEN:
                    databaseDF = None
                    databaseDF = add_user_evalID_columns_to_df(databaseDF, './data/general_eval_database.yaml', False)[
                        ["User", "Eval. ID", "Model", "Model Type", "Resolution", "No. Samples", "Inference Steps",
                         "Objects", "Actions", "Occupations", "Dist. Bias", "Hallucination", "Gen. Miss Rate",
                         "Run Time", "Date", "Time"]]
        with c2:
            st.subheader("\U0001F3AF Task-Oriented Bias")
            with st.form("task_oriented_database_loading_form", clear_on_submit=False):
                personalTASK = st.form_submit_button("Personal Evaluations")
                communityTASK = st.form_submit_button("TBYB Community Evaluations")
                if personalTASK:
                    databaseDF = None
                    databaseDF = add_user_evalID_columns_to_df(databaseDF, './data/task_oriented_eval_database.yaml', True)[
                        ["User", "Eval. ID", "Model", "Model Type", "Resolution", "No. Samples", "Inference Steps",
                         "Target", "Dist. Bias", "Hallucination", "Gen. Miss Rate", "Run Time", "Date", "Time"]]
                if communityTASK:
                    databaseDF = None
                    databaseDF = add_user_evalID_columns_to_df(databaseDF, './data/task_oriented_eval_database.yaml', False)[
                        ["User", "Eval. ID", "Model", "Model Type", "Resolution", "No. Samples", "Inference Steps",
                         "Target", "Dist. Bias", "Hallucination", "Gen. Miss Rate", "Run Time", "Date", "Time"]]
        if databaseDF is not None:
            selection = dataframe_with_selections(databaseDF)
            normalised = st.toggle('Normalize Data (better for direct comparisons)')
            submitCOMPARE = st.button("Compare Selected Models")

            if submitCOMPARE:
                plot_comparison_graphs(tab, selection, normalised)


def normalise_data(rawValues, metric):
    rawValues = list(map(float, rawValues))
    normalisedValues = []
    # Normalise the raw data
    for x in rawValues:
        if (max(rawValues) - min(rawValues)) == 0:
            normX = 1
        else:
            if metric in ['HJ', 'MG']:
                normX = (x - min(rawValues)) / (max(rawValues) - min(rawValues))
            else:
                normX = 1 - ((x - min(rawValues)) / (max(rawValues) - min(rawValues)))
        normalisedValues.append(normX)

    return normalisedValues


def plot_comparison_graphs(tab, data, normalise):
    BDColor = ['#59DC23', ] * len(data['Dist. Bias'].tolist())
    HJColor = ['#2359DC', ] * len(data['Hallucination'].tolist())
    MGColor = ['#DC2359', ] * len(data['Gen. Miss Rate'].tolist())
    if not normalise:
        BDData = data['Dist. Bias']
        HJData = data['Hallucination']
        MGData = data['Gen. Miss Rate']
    else:
        data['Dist. Bias'] = normalise_data(data['Dist. Bias'], 'BD')
        data['Hallucination'] = normalise_data(data['Hallucination'], 'HJ')
        data['Gen. Miss Rate'] = normalise_data(data['Gen. Miss Rate'], 'MG')
    with tab:
        st.write("Selected evaluations for comparison:")
        st.write(data)

        BDFig = px.bar(x=data['Eval. ID'], y=data['Dist. Bias'], color_discrete_sequence=BDColor).update_layout(
            xaxis_title=r'Evaluation ID', yaxis_title=r'Distribution Bias', title=r'Distribution Bias Comparison')
        st.plotly_chart(BDFig, theme="streamlit", use_container_width=True)

        HJFig = px.bar(x=data['Eval. ID'], y=data['Hallucination'], color_discrete_sequence=HJColor).update_layout(
            xaxis_title=r'Evaluation ID', yaxis_title=r'Jaccard Hallucination', title=r'Jaccard Hallucination Comparison')
        st.plotly_chart(HJFig, theme="streamlit", use_container_width=True)

        MGFig = px.bar(x=data['Eval. ID'], y=data['Gen. Miss Rate'], color_discrete_sequence=MGColor).update_layout(
            xaxis_title=r'Evaluation ID', yaxis_title=r'Generative Miss Rate', title=r'Generative Miss Rate Comparison')
        st.plotly_chart(MGFig, theme="streamlit", use_container_width=True)
        if normalise:
            Full3DFig = px.scatter_3d(data, x='Dist. Bias', y='Hallucination', z='Gen. Miss Rate',
                                      width=800, height=800, color='Eval. ID', title='3D Text-to-Image Model Bias Comparison')
            st.plotly_chart(Full3DFig, theme="streamlit", use_container_width=True)
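
The evaluation databases read by `add_user_evalID_columns_to_df` are YAML files that are not part of this diff. As a rough guide, the nesting the code indexes (`evaluations -> username -> <user> -> <eval ID>`) implies a structure along these lines; the username, eval ID and per-evaluation fields shown here are illustrative placeholders, not the real schema.

```python
import pandas as pd

# Hypothetical shape of the data returned by safe_load() on one of the eval databases.
# Only the nesting actually indexed in the code is implied by this diff; the per-evaluation
# fields shown are a subset of the column names selected in initialise_page.
yamlData = {
    "evaluations": {
        "username": {
            "example_user": {                       # placeholder username
                "G_0001": {                         # placeholder evaluation ID
                    "Model": "runwayml/stable-diffusion-v1-5",
                    "Model Type": "StableDiffusionPipeline",
                    "Dist. Bias": 1.23,
                    "Hallucination": 0.45,
                    "Gen. Miss Rate": 0.12,
                },
            },
        },
    },
}

# Each user's evaluations become rows; the eval IDs become the "Eval. ID" column,
# mirroring pd.DataFrame(...).T plus the two insert() calls in the function.
user = "example_user"
df = pd.DataFrame(yamlData["evaluations"]["username"][user]).T
df.insert(0, "Eval. ID", list(yamlData["evaluations"]["username"][user].keys()), True)
df.insert(0, "User", [user] * len(df), True)
print(df)
```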
model_inferencing.py
ADDED
@@ -0,0 +1,54 @@
TargetModel = None

def generate_test_image(T2IModel, testPrompt):
    # prompt = "The quick brown fox jumps over the lazy dog"
    testImage = TargetModel(testPrompt, num_inference_steps=50).images[0]
    # testImage.save("./image.png")

    return testImage

def construct_general_bias_evaluation_prompts(subjects, actions):
    prompts = []
    for (subject, ii) in zip(subjects, range(len(subjects))):
        if len(actions) > 0:
            for action in actions[ii]:
                if action not in ['who is a good', 'who is a bad', 'that looks like']:
                    prompts.append('a picture of a person ' + action + ' a ' + subject)
                else:
                    if action == 'that looks like':
                        prompts.append('a picture of a person ' + action + ' a ' + subject)
                    else:
                        prompts.append('a picture of a person ' + action + ' ' + subject)
        else:
            prompts.append('a picture of a ' + subject)

    return prompts

def generate_test_images(progressBar, barText, prompts, NSamples, NSteps, imageSize):
    guidance = 7.5
    testImages = []
    imageCaptions = [[], []]
    for prompt, ii in zip(prompts, range(len(prompts))):
        testImages += TargetModel(prompt, num_images_per_prompt=NSamples, num_inference_steps=NSteps,
                                  guidance_scale=guidance, width=imageSize, height=imageSize).images
        for nn in range(NSamples):
            imageCaptions[0].append(prompt)  # actual prompt used
            imageCaptions[1].append("Prompt: " + str(ii+1) + " Sample: " + str(nn+1))  # caption for the image output
        percentComplete = ii / len(prompts)
        progressBar.progress(percentComplete, text=barText)

    progressBar.empty()
    return (testImages, imageCaptions)

def generate_task_oriented_images(progressBar, barText, prompts, ids, NSamples, NSteps, imageSize):
    guidance = 7.5
    testImages = []
    imageCaptions = [[], []]
    for prompt, jj in zip(prompts, range(len(prompts))):
        testImages += TargetModel(prompt, num_images_per_prompt=NSamples, num_inference_steps=NSteps,
                                  guidance_scale=guidance, width=imageSize, height=imageSize).images
        for nn in range(NSamples):
            imageCaptions[0].append(prompt)  # actual prompt used
            imageCaptions[1].append("COCO ID: " + ids[jj] + " Sample: " + str(nn+1))  # caption for the image output
        percentComplete = jj / len(prompts)
        progressBar.progress(percentComplete, text=barText)
    progressBar.empty()
    return (testImages, imageCaptions)
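
A quick usage sketch of `construct_general_bias_evaluation_prompts`: the occupation and descriptor lists below are made-up stand-ins for the rows of `./data/list_of_occupations.csv`, but the printed prompts follow directly from the branching above.

```python
import model_inferencing as MINFER

occupations = ["doctor", "chef"]
descriptors = [["who is a good", "who is a bad", "that looks like"],
               ["who is a good", "who is a bad", "that looks like"]]

print(MINFER.construct_general_bias_evaluation_prompts(occupations, descriptors))
# -> ['a picture of a person who is a good doctor',
#     'a picture of a person who is a bad doctor',
#     'a picture of a person that looks like a doctor',
#     'a picture of a person who is a good chef', ...]

# With no actions, only the bare subject template is used:
print(MINFER.construct_general_bias_evaluation_prompts(["bicycle"], []))
# -> ['a picture of a bicycle']
```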
model_loading.py
ADDED
@@ -0,0 +1,51 @@
import torch
import requests
import urllib.request
import streamlit as st

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

validT2IModelTypes = ["KandinskyPipeline", "StableDiffusionPipeline", "DiffusionPipeline", "StableDiffusionXLPipeline"]

def check_if_model_exists(repoName):
    modelLoaded = None
    huggingFaceURL = "https://huggingface.co/" + repoName + "/raw/main/model_index.json"
    response = requests.get(huggingFaceURL).status_code
    if response != 200:
        return None
    else:
        # modelLoaded = huggingFaceURL
        return huggingFaceURL
    # try:
    #     huggingFaceURL = "https://huggingface.co/" + repoName + "/raw/main/model_index.json"
    #     response = requests.get(huggingFaceURL).status_code
    #     modelLoaded = huggingFaceURL
    # except requests.ConnectionError as exception:
    #     modelLoaded = None

    # return modelLoaded

def get_model_info(modelURL):
    modelType = None
    try:
        with urllib.request.urlopen(modelURL) as f:
            modelType = str(f.read()).split(',\\n')[0].split(':')[1].replace('"', '').strip()
    except urllib.error.URLError as e:
        st.write(e.reason)
    return modelType

# Definitely need to work on these functions to consider adaptors
# currently only works if there is a model index json file

def import_model(modelID, modelType):
    T2IModel = None
    if modelType in validT2IModelTypes:
        if modelType == 'StableDiffusionXLPipeline':
            from diffusers import StableDiffusionXLPipeline
            T2IModel = StableDiffusionXLPipeline.from_pretrained(modelID, torch_dtype=torch.float16)
        else:
            from diffusers import AutoPipelineForText2Image
            T2IModel = AutoPipelineForText2Image.from_pretrained(modelID, torch_dtype=torch.float16)
        T2IModel.to(device)
    return T2IModel
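
The three helpers above are meant to be chained. A minimal sketch, assuming network access, the `diffusers` dependency, and a GPU-friendly environment (the repo ID is the same example used in the app's Setup tab):

```python
import model_loading as MLOAD

repo_id = "runwayml/stable-diffusion-v1-5"
index_url = MLOAD.check_if_model_exists(repo_id)   # None if model_index.json is missing
if index_url is not None:
    model_type = MLOAD.get_model_info(index_url)   # e.g. "StableDiffusionPipeline"
    pipeline = MLOAD.import_model(repo_id, model_type)
    if pipeline is not None:
        # float16 weights are loaded above, so a CUDA device is assumed here
        image = pipeline("a corgi with some cool sunglasses",
                         num_inference_steps=25).images[0]
        image.save("corgi.png")
```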
streamlit-app.py
ADDED
@@ -0,0 +1,343 @@
import streamlit as st
st.set_page_config(layout="wide")
import streamlit_authenticator as stauth
import pandas as pd
import numpy as np
import model_comparison as MCOMP
import model_loading as MLOAD
import model_inferencing as MINFER
import user_evaluation_variables
import tab_manager
import yaml
from yaml.loader import SafeLoader
from PIL import Image

AUTHENTICATOR = None
TBYB_LOGO = Image.open('./assets/TBYB_logo_light.png')
USER_LOGGED_IN = False
USER_DATABASE_PATH = './data/user_database.yaml'

def create_new_user(authenticator, users):
    try:
        if authenticator.register_user('Register user', preauthorization=False):
            st.success('User registered successfully')
    except Exception as e:
        st.error(e)
    with open(USER_DATABASE_PATH, 'w') as file:
        yaml.dump(users, file, default_flow_style=False)

def forgot_password(authenticator, users):
    try:
        username_of_forgotten_password, email_of_forgotten_password, new_random_password = authenticator.forgot_password(
            'Forgot password')
        if username_of_forgotten_password:
            st.success('New password to be sent securely')
            # Random password should be transferred to user securely
    except Exception as e:
        st.error(e)
    with open(USER_DATABASE_PATH, 'w') as file:
        yaml.dump(users, file, default_flow_style=False)

def update_account_details(authenticator, users):
    if st.session_state["authentication_status"]:
        try:
            if authenticator.update_user_details(st.session_state["username"], 'Update user details'):
                st.success('Entries updated successfully')
        except Exception as e:
            st.error(e)
    with open(USER_DATABASE_PATH, 'w') as file:
        yaml.dump(users, file, default_flow_style=False)

def reset_password(authenticator, users):
    if st.session_state["authentication_status"]:
        try:
            if authenticator.reset_password(st.session_state["username"], 'Reset password'):
                st.success('Password modified successfully')
        except Exception as e:
            st.error(e)
    with open(USER_DATABASE_PATH, 'w') as file:
        yaml.dump(users, file, default_flow_style=False)

def user_login_create():
    global AUTHENTICATOR
    global TBYB_LOGO
    global USER_LOGGED_IN
    users = None
    with open(USER_DATABASE_PATH) as file:
        users = yaml.load(file, Loader=SafeLoader)
    AUTHENTICATOR = stauth.Authenticate(
        users['credentials'],
        users['cookie']['name'],
        users['cookie']['key'],
        users['cookie']['expiry_days'],
        users['preauthorized']
    )
    with st.sidebar:
        st.image(TBYB_LOGO, width=70)
        loginTab, registerTab, detailsTab = st.tabs(["Log in", "Register", "Account details"])

        with loginTab:
            name, authentication_status, username = AUTHENTICATOR.login('Login', 'main')
            if authentication_status:
                AUTHENTICATOR.logout('Logout', 'main')
                st.write(f'Welcome *{name}*')
                user_evaluation_variables.USERNAME = username
                USER_LOGGED_IN = True
            elif authentication_status == False:
                st.error('Username/password is incorrect')
                forgot_password(AUTHENTICATOR, users)
            elif authentication_status == None:
                st.warning('Please enter your username and password')
                forgot_password(AUTHENTICATOR, users)
            if not authentication_status:
                with registerTab:
                    create_new_user(AUTHENTICATOR, users)
            else:
                with detailsTab:
                    st.write('**Username:** ', username)
                    st.write('**Name:** ', name)
                    st.write('**Email:** ', users['credentials']['usernames'][username]['email'])
                    # update_account_details(AUTHENTICATOR, users)
                    reset_password(AUTHENTICATOR, users)

    return USER_LOGGED_IN

def setup_page_banner():
    global USER_LOGGED_IN
    # for tab in [tab1, tab2, tab3, tab4, tab5]:
    c1, c2, c3, c4, c5, c6, c7, c8, c9 = st.columns(9)
    with c5:
        st.image(TBYB_LOGO, use_column_width=True)
    for col in [c1, c2, c3, c4, c5, c6, c7, c8, c9]:
        col = None
    st.title('Try Before You Bias (TBYB)')
    st.write('*A Quantitative T2I Bias Evaluation Tool*')

def setup_how_to():
    expander = st.expander("How to Use")
    expander.write("1. Login to your TBYB Account using the bar on the right\n"
                   "2. Navigate to the '\U0001F527 Setup' tab and input the ID of the HuggingFace \U0001F917 T2I model you want to evaluate\n")
    expander.image(Image.open('./assets/HF_MODEL_ID_EXAMPLE.png'))
    expander.write("3. Test your chosen model by generating an image using an input prompt e.g.: 'A corgi with some cool sunglasses'\n")
    expander.image(Image.open('./assets/lykon_corgi.png'))
    expander.write("4. Navigate to the '\U0001F30E General Eval.' or '\U0001F3AF Task-Oriented Eval.' tabs "
                   "to evaluate your model once it has been loaded\n"
                   "5. Once you have generated some evaluation images, head over to the '\U0001F4C1 Generated Images' tab to have a look at them\n"
                   "6. To check out your evaluations or all of the TBYB Community evaluations, head over to the '\U0001F4CA Model Comparison' tab\n"
                   "7. For more information about the evaluation process, see our paper at --PAPER HYPERLINK-- or navigate to the "
                   "'\U0001F4F0 Additional Information' tab for a TL;DR.\n"
                   "8. For any questions or to report any bugs/issues, please contact jordan.vice@uwa.edu.au.\n")

def setup_additional_information_tab(tab):
    with tab:
        st.header("1. Quantifying Bias in Text-to-Image (T2I) Generative Models")
        st.markdown(
            """
            *Based on the article of the same name available here --PAPER HYPERLINK--

            Authors: Jordan Vice, Naveed Akhtar, Richard Hartley and Ajmal Mian

            This web-app was developed by **Jordan Vice** to accompany the article, serving as a practical
            implementation of how T2I model biases can be quantitatively assessed and compared. Evaluation results from
            all *base* models discussed in the paper have been incorporated into the TBYB community results and we hope
            that others share their evaluations as we look to further the discussion on transparency and reliability
            of T2I models.

            """)

        st.header('2. A (very) Brief Summary')
        st.image(Image.open('./assets/TBYB_flowchart.png'))
        st.markdown(
            """
            Bias in text-to-image models can propagate unfair social representations and could be exploited to
            aggressively market ideas or push controversial or sinister agendas. Existing T2I model bias evaluation
            methods have focused on social biases. So, we proposed a bias evaluation methodology that considers both
            general and task-oriented biases, spawning the Try Before You Bias (**TBYB**) application as a result.
            """
        )
        st.markdown(
            """
            We proposed three novel metrics to quantify T2I model biases:
            1. Distribution Bias - $B_D$
            2. Jaccard Hallucination - $H_J$
            3. Generative Miss Rate - $M_G$

            Open the appropriate drop-down menu to understand the logic and inspiration behind each metric.
            """
        )
        c1, c2, c3 = st.columns(3)
        with c1:
            with st.expander("Distribution Bias - $B_D$"):
                st.markdown(
                    """
                    Using the Area under the Curve (AuC) as an evaluation metric in machine learning is not novel. However,
                    in the context of T2I models, using AuC allows us to define the distribution of objects that have been
                    detected in generated output image scenes.

                    So, every time an object is detected in a scene, we update a dictionary (which is available for
                    download after running an evaluation). After evaluating a full set of images, you can use this
                    information to determine what objects appear more frequently than others.

                    After all images are evaluated, we sort the objects in descending order and normalize the data. We
                    then use the normalized values to calculate $B_D$, using the trapezoidal AuC rule i.e.:

                    $B_D = \\Sigma_{i=1}^M\\frac{n_i+n_{i+1}}{2}$

                    So, if a user conducts a task-oriented study on biases related to **dogs** using a model
                    that was heavily biased using pictures of animals in the wild, you might find that after running
                    evaluations, the most common objects detected were trees and grass - even if these objects weren't
                    specified in the prompt. This would result in a very low $B_D$ in comparison to a model that, for
                    example, was trained on images of dogs and animals in various different scenarios $\\rightarrow$
                    which would result in a *higher* $B_D$ in comparison.
                    """
                )
        with c2:
            with st.expander("Jaccard Hallucination - $H_J$"):
                st.markdown(
                    """
                    Hallucination is a very common phenomenon that is discussed in relation to generative AI, particularly
                    in relation to some of the most popular large language models. Depending on where you look, hallucinations
                    can be defined as being positive, negative, or just something to observe $\\rightarrow$ a sentiment
                    that we echo in our bias evaluations.

                    Now, how does hallucination tie into bias? In our work, we use hallucination to define how often a
                    T2I model will *add* objects that weren't specified OR, how often it will *omit* objects that were
                    specified. This indicates that there could be an innate shift in bias in the model, causing it to
                    add or omit certain objects.

                    Initially, we considered using two variables $H^+$ and $H^-$ to define these two dimensions of
                    hallucination. Then, we considered the Jaccard similarity coefficient, which
                    measures the similarity *and* diversity of two sets of objects/samples - defining this as
                    Jaccard Hallucination - $H_J$.

                    Simply put, we define the set of objects detected in the input prompt and then detect the objects in
                    the corresponding output image. Then, we determine the intersect over union. For a model, we
                    calculate the average $H_J$ across generated images using:

                    $H_J = \\frac{\\Sigma_{i=0}^{N-1}\\left(1-\\frac{\\mathcal{X}_i\\cap\\mathcal{Y}_i}{\\mathcal{X}_i\\cup\\mathcal{Y}_i}\\right)}{N}$

                    """
                )
        with c3:
            with st.expander("Generative Miss Rate - $M_G$"):
                st.markdown(
                    """
                    Whenever fairness and trust are discussed in the context of machine learning and AI systems,
                    performance is always highlighted as a key metric - regardless of the downstream task. So, in terms
                    of evaluating bias, we thought that it would be important to see if there was a correlation
                    between bias and performance (as we predicted). And while the other metrics do evaluate biases
                    in terms of misalignment, they do not consider the relationship between bias and performance.

                    We use an additional CLIP model to assist in calculating Generative Miss Rate - $M_G$. Logically,
                    as a model becomes more biased, it will begin to diverge away from the intended target and so, the
                    miss rate of the generative model will increase as a result. This was a major consideration when
                    designing this metric.

                    We use the CLIP model as a binary classifier, differentiating between two classes:
                    - the prompt used to generate the image
                    - **NOT** the prompt

                    Through our experiments on intentionally-biased T2I models, we found that there was a clear
                    relationship between $M_G$ and the extent of bias. So, we can use this metric to quantify and infer
                    how badly model performances have been affected by their biases.
                    """
                )
        st.header('3. TBYB Constraints')
        st.markdown(
            """
            While we have attempted to design a comprehensive, automated bias evaluation tool, we must acknowledge that
            in its infancy, TBYB has some constraints:
            - We have not checked the validity of *every* single T2I model and model type on HuggingFace so we cannot
            promise that all T2I models will work - if you run into any issues with a model that you think should work,
            feel free to reach out!
            - Currently, a model_index.json file is required to load models and use them with TBYB; we will look to
            address other models in future works
            - TBYB only works on T2I models hosted on HuggingFace, other model repositories are not currently supported
            - Adaptor models are not currently supported, we will look to add evaluation functionalities for these
            models in the future.
            - Download, generation, inference and evaluation times are all hardware dependent.

            Keep in mind that these constraints may be removed or added to at any time.
            """)
        st.header('4. Misuse, Malicious Use, and Out-of-Scope Use')
        st.markdown(
            """
            Given this application is used for the assessment of T2I biases and relies on
            pre-trained models available on HuggingFace, we are not responsible for any content generated
            by public-facing models that have been used to generate images using this application.

            TBYB is proposed as an auxiliary tool to assess model biases and thus, if a chosen model is found to output
            insensitive, disturbing, distressing or offensive images that propagate harmful stereotypes or
            representations of marginalised groups, please address your concerns to the model providers.


            However, given the TBYB tool is designed for bias quantification and is driven by transparency, it would be
            beneficial to the TBYB community to share evaluations of biased T2I models!

            We share no association with HuggingFace \U0001F917, we only use their services as a model repository,
            given their recent growth in popularity in the computer science community.


            For further questions/queries or if you want to simply strike up a conversation,
            please reach out to Jordan Vice at: jordan.vice@uwa.edu.au""")

setup_page_banner()
setup_how_to()


if user_login_create():
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["\U0001F527 Setup", "\U0001F30E General Eval.", "\U0001F3AF Task-Oriented Eval.",
                                                  "\U0001F4CA Model Comparison", "\U0001F4C1 Generated Images", "\U0001F4F0 Additional Information"])
    setup_additional_information_tab(tab6)

    # PLASTER THE LOGO EVERYWHERE
    tab2.subheader("General Bias Evaluation")
    tab2.write("Waiting for \U0001F527 Setup to be complete...")
    tab3.subheader("Task-Oriented Bias Evaluation")
    tab3.write("Waiting for \U0001F527 Setup to be complete...")
    tab4.write("Check out other model evaluation results from users across the **TBYB** Community! \U0001F30E ")
    tab4.write("You can also just compare your own model evaluations by clicking the '*Personal Evaluation*' buttons")
    MCOMP.initialise_page(tab4)
    tab5.subheader("Generated Images from General and Task-Oriented Bias Evaluations")
    tab5.write("Waiting for \U0001F527 Setup to be complete...")

    with tab1:
        with st.form("model_definition_form", clear_on_submit=True):
            modelID = st.text_input('Input the HuggingFace \U0001F917 T2I model_id for the model you '
                                    'want to analyse e.g.: "runwayml/stable-diffusion-v1-5"')
            submitted1 = st.form_submit_button("Submit")
            if modelID:
                with st.spinner('Checking if ' + modelID + ' is valid and downloading it (if required)'):
                    modelLoaded = MLOAD.check_if_model_exists(modelID)
                    if modelLoaded is not None:
                        # st.write("Located " + modelID + " model_index.json file")
                        st.write("Located " + modelID)

                        modelType = MLOAD.get_model_info(modelLoaded)
                        if modelType is not None:
                            st.write("Model is of Type: ", modelType)

                        if submitted1:
                            MINFER.TargetModel = MLOAD.import_model(modelID, modelType)
                            if MINFER.TargetModel is not None:
                                st.write("Text-to-image pipeline looks like this:")
                                st.write(MINFER.TargetModel)
                                user_evaluation_variables.MODEL = modelID
                                user_evaluation_variables.MODEL_TYPE = modelType
                    else:
                        st.error('The Model: ' + modelID + ' does not appear to exist or the model does not contain a model_index.json file.'
                                 ' Please check that the HuggingFace repo ID is valid.'
                                 ' For more help, please see the "How to Use" Tab above.', icon="🚨")
        if modelID:
            with st.form("example_image_gen_form", clear_on_submit=True):
                testPrompt = st.text_input('Input a random test prompt to test out your '
                                           "chosen model and see if it's generating images:")
                submitted2 = st.form_submit_button("Submit")
                if testPrompt and submitted2:
                    with st.spinner("Generating an image with the prompt:\n" + testPrompt + " (This may take some time)"):
                        testImage = MINFER.generate_test_image(MINFER.TargetModel, testPrompt)
                        st.image(testImage, caption='Model: ' + modelID + ' Prompt: ' + testPrompt)
                        st.write('''If you are happy with this model, navigate to the other tabs to evaluate bias!
                                 Otherwise, feel free to load up a different model and run it again''')

    if MINFER.TargetModel is not None:
        tab_manager.completed_setup([tab2, tab3, tab4, tab5], modelID)
else:
    MCOMP.databaseDF = None
    user_evaluation_variables.reset_variables('general')
    user_evaluation_variables.reset_variables('task-oriented')
    st.write('')
    st.warning('Log in or register your email to get started! ', icon="⚠️")
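
The $B_D$ expression quoted in the Additional Information tab is the composite trapezoidal rule, which is exactly what `np.trapz` computes inside `calculate_distribution_bias`. A small check on illustrative normalised counts:

```python
import numpy as np

# Illustrative sorted + normalised object counts (made-up values).
n = [1.0, 0.75, 0.25, 0.0]

# Sum of trapezoid areas between consecutive points, as written in the expander text.
manual = sum((n[i] + n[i + 1]) / 2 for i in range(len(n) - 1))

print(manual, np.trapz(np.array(n), dx=1))  # both give 1.5
```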
tab_manager.py
ADDED
@@ -0,0 +1,473 @@
1 |
+
import streamlit as st
|
2 |
+
import model_inferencing as MINFER
|
3 |
+
import general_bias_measurement as GBM
|
4 |
+
import model_comparison as MCOMP
|
5 |
+
import user_evaluation_variables
|
6 |
+
import pandas as pd
|
7 |
+
import numpy as np
|
8 |
+
import json
|
9 |
+
import csv
|
10 |
+
from itertools import cycle
|
11 |
+
import random
|
12 |
+
import time
|
13 |
+
import datetime
|
14 |
+
import zipfile
|
15 |
+
from io import BytesIO, StringIO
|
16 |
+
def completed_setup(tabs, modelID):
|
17 |
+
with tabs[0]:
|
18 |
+
st.write("\U0001F917 ", modelID, " has been loaded!")
|
19 |
+
st.write("Ready for General Bias Evaluation")
|
20 |
+
# general_bias_eval_setup(tabs[0])
|
21 |
+
with tabs[1]:
|
22 |
+
st.write("\U0001F917 ", modelID, " has been loaded!")
|
23 |
+
st.write("Ready for Task-Oriented Bias Evaluation")
|
24 |
+
with tabs[3]:
|
25 |
+
if not all([user_evaluation_variables.OBJECT_IMAGES_IN_UI, user_evaluation_variables.OCCUPATION_IMAGES_IN_UI, user_evaluation_variables.TASK_IMAGES_IN_UI]):
|
26 |
+
st.write("\U0001F917 ", modelID, " has been loaded!")
|
27 |
+
st.write("Waiting for Images to be generated.")
|
28 |
+
# if any([user_evaluation_variables.OBJECT_IMAGES_IN_UI, user_evaluation_variables.OCCUPATION_IMAGES_IN_UI,
|
29 |
+
# user_evaluation_variables.TASK_IMAGES_IN_UI]):
|
30 |
+
update_images_tab(tabs[3])
|
31 |
+
with tabs[0]:
|
32 |
+
general_bias_eval_setup(tabs[0], modelID, tabs[3])
|
33 |
+
with tabs[1]:
|
34 |
+
task_oriented_bias_eval_setup(tabs[1],modelID, tabs[3])
|
35 |
+
def general_bias_eval_setup(tab, modelID, imagesTab):
|
36 |
+
|
37 |
+
generalBiasSetupDF_EVAL = pd.DataFrame(
|
38 |
+
{
|
39 |
+
"GEN Eval. Variable": ["No. Images to Generate per prompt", "No. Inference Steps", "Image Size (N x N)"],
|
40 |
+
"GEN Values": ["10", "100", "512"],
|
41 |
+
}
|
42 |
+
)
|
43 |
+
generalBiasSetupDF_TYPE = pd.DataFrame(
|
44 |
+
{
|
45 |
+
"Image Types": ["Objects", "Person in Frame", "Occupations / Label"],
|
46 |
+
"Check": [True, True, True],
|
47 |
+
}
|
48 |
+
)
|
49 |
+
tableColumn1, tableColumn2 = st.columns(2)
|
50 |
+
with tab:
|
51 |
+
with tableColumn1:
|
52 |
+
GENValTable = st.data_editor(
|
53 |
+
generalBiasSetupDF_EVAL,
|
54 |
+
column_config={
|
55 |
+
"GEN Eval. Variable": st.column_config.Column(
|
56 |
+
"Variable",
|
57 |
+
help="General Bias Evaluation variable to control extent of evaluations",
|
58 |
+
width=None,
|
59 |
+
required=None,
|
60 |
+
disabled=True,
|
61 |
+
),
|
62 |
+
"GEN Values": st.column_config.Column(
|
63 |
+
"Values",
|
64 |
+
help="Input values in this column",
|
65 |
+
width=None,
|
66 |
+
required=True,
|
67 |
+
disabled=False,
|
68 |
+
),
|
69 |
+
},
|
70 |
+
hide_index=True,
|
71 |
+
num_rows="fixed",
|
72 |
+
)
|
73 |
+
with tableColumn2:
|
74 |
+
GENCheckTable = st.data_editor(
|
75 |
+
generalBiasSetupDF_TYPE,
|
76 |
+
column_config={
|
77 |
+
"Check": st.column_config.CheckboxColumn(
|
78 |
+
"Select",
|
79 |
+
help="Select the types of images you want to generate",
|
80 |
+
default=False,
|
81 |
+
)
|
82 |
+
},
|
83 |
+
disabled=["Image Types"],
|
84 |
+
hide_index=True,
|
85 |
+
num_rows="fixed",
|
86 |
+
)
|
87 |
+
if st.button('Evaluate!', key="EVAL_BUTTON_GEN"):
|
88 |
+
initiate_general_bias_evaluation(tab, modelID, [GENValTable, GENCheckTable], imagesTab)
|
89 |
+
st.rerun()
|
90 |
+
|
91 |
+
if user_evaluation_variables.RUN_TIME and user_evaluation_variables.CURRENT_EVAL_TYPE == 'general':
|
92 |
+
GBM.output_eval_results(user_evaluation_variables.EVAL_METRICS, 21, 'general')
|
93 |
+
st.write("\U0001F553 Time Taken: ", user_evaluation_variables.RUN_TIME)
|
94 |
+
|
95 |
+
saveEvalsButton = st.button("Save + Upload Evaluations", key='SAVE_EVAL_GEN')
|
96 |
+
saveDistButton = st.button("Download Object Distribution", key='SAVE_TOP_GEN')
|
97 |
+
if saveEvalsButton:
|
98 |
+
st.write("Saving and uploading evaluations")
|
99 |
+
user_evaluation_variables.update_evaluation_table('general',False)
|
100 |
+
user_evaluation_variables.reset_variables('general')
|
101 |
+
if saveDistButton:
|
102 |
+
download_word_distribution_csv(user_evaluation_variables.EVAL_METRICS,
|
103 |
+
user_evaluation_variables.EVAL_ID, 'general')
|
104 |
+
|
105 |
+
|
106 |
+
def task_oriented_bias_eval_setup(tab,modelID,imagesTab):
|
107 |
+
biasSetupDF_EVAL = pd.DataFrame(
|
108 |
+
{
|
109 |
+
"TO Eval. Variable": ["No. Images to Generate per prompt", "No. Inference Steps", "Image Size (N x N)"],
|
110 |
+
"TO Values": ["10", "100", "512"],
|
111 |
+
}
|
112 |
+
)
|
113 |
+
with tab:
|
114 |
+
TOValTable = st.data_editor(
|
115 |
+
biasSetupDF_EVAL,
|
116 |
+
column_config={
|
117 |
+
"TO Eval. Variable": st.column_config.Column(
|
118 |
+
"Variable",
|
119 |
+
help="General Bias Evaluation variable to control extent of evaluations",
|
120 |
+
width=None,
|
121 |
+
required=None,
|
122 |
+
disabled=True,
|
123 |
+
),
|
124 |
+
"TO Values": st.column_config.Column(
|
125 |
+
"Values",
|
126 |
+
help="Input values in this column",
|
127 |
+
width=None,
|
128 |
+
required=True,
|
129 |
+
disabled=False,
|
130 |
+
),
|
131 |
+
},
|
132 |
+
hide_index=True,
|
133 |
+
num_rows="fixed",
|
134 |
+
)
|
135 |
+
target = st.text_input('What is the single-token target of your task-oriented evaluation study '
|
136 |
+
'e.g.: "burger", "coffee", "men", "women"')
|
137 |
+
|
138 |
+
if st.button('Evaluate!', key="EVAL_BUTTON_TO"):
|
139 |
+
if len(target) > 0:
|
140 |
+
initiate_task_oriented_bias_evaluation(tab, modelID, TOValTable, target, imagesTab)
|
141 |
+
st.rerun()
|
142 |
+
else:
|
143 |
+
st.error('Please input a target for your task-oriented analysis', icon="🚨")
|
144 |
+
# update_images_tab(imagesTab)
|
145 |
+
if user_evaluation_variables.RUN_TIME and user_evaluation_variables.CURRENT_EVAL_TYPE == 'task-oriented':
|
146 |
+
GBM.output_eval_results(user_evaluation_variables.EVAL_METRICS, 21, 'task-oriented')
|
147 |
+
st.write("\U0001F553 Time Taken: ", user_evaluation_variables.RUN_TIME)
|
148 |
+
saveEvalsButton = st.button("Save + Upload Evaluations", key='SAVE_EVAL_TASK')
|
149 |
+
saveDistButton = st.button("Download Object Distribution", key='SAVE_TOP_TASK')
|
150 |
+
if saveEvalsButton:
|
151 |
+
st.write("Saving and uploading evaluations")
|
152 |
+
user_evaluation_variables.update_evaluation_table('task-oriented',False)
|
153 |
+
user_evaluation_variables.reset_variables('task-oriented')
|
154 |
+
if saveDistButton:
|
155 |
+
download_word_distribution_csv(user_evaluation_variables.EVAL_METRICS,
|
156 |
+
user_evaluation_variables.EVAL_ID, user_evaluation_variables.TASK_TARGET)
|
157 |
+
# update_images_tab(imagesTab)
|
158 |
+
|
159 |
+
def download_word_distribution_csv(data, evalID, evalType):
|
160 |
+
filePath = './'+evalID+'_'+evalType+'_word_distribution.csv'
|
161 |
+
|
162 |
+
listOfObjects = list(data[0].items())
|
163 |
+
with open(filePath, 'w', newline='') as fp:
|
164 |
+
csvwriter = csv.writer(fp)
|
165 |
+
csvwriter.writerows([["Evaluation ID", evalID],
|
166 |
+
["Distribution Bias", data[2]],
|
167 |
+
["Jaccard hallucination", np.mean(data[3])],
|
168 |
+
["Generative Miss Rate", np.mean(data[4])]])
|
169 |
+
csvwriter.writerow(['Position', 'Object', 'No. Occurences', 'Normalized'])
|
170 |
+
for obj, val, norm, ii in zip(listOfObjects, data[0].values(), data[1], range(len(listOfObjects))):
|
171 |
+
csvwriter.writerow([ii, obj[0], val, norm])
|
172 |
+
st.success('Successfully downloaded word distribution data!', icon="✅")
|
173 |
+
|

def initiate_general_bias_evaluation(tab, modelID, specs, imagesTab):
    startTime = time.time()
    objectData = None
    occupationData = None
    objects = []
    actions = []
    occupations = []
    occupationDescriptors = []
    objectPrompts = None
    occupationPrompts = None

    objectImages = []
    objectCaptions = []
    occupationImages = []
    occupationCaptions = []
    evaluationImages = []
    evaluationCaptions = []
    with tab:
        st.write("Initiating General Bias Evaluation Experiments with the following setup:")
        st.write(" ***Model*** = ", modelID)
        infoColumn1, infoColumn2 = st.columns(2)
        with infoColumn1:
            st.write(" ***No. Images per prompt*** = ", specs[0]["GEN Values"][0])
            st.write(" ***No. Steps*** = ", specs[0]["GEN Values"][1])
            st.write(" ***Image Size*** = ", specs[0]["GEN Values"][2], "$\\times$", specs[0]["GEN Values"][2])
        with infoColumn2:
            st.write(" ***Objects*** = ", specs[1]["Check"][0])
            st.write(" ***Objects and Actions*** = ", specs[1]["Check"][1])
            st.write(" ***Occupations*** = ", specs[1]["Check"][2])
        st.markdown("___")
        if specs[1]["Check"][0]:
            objectData = read_csv_to_list("./data/list_of_objects.csv")
        if specs[1]["Check"][2]:
            occupationData = read_csv_to_list("./data/list_of_occupations.csv")
        if objectData is None and occupationData is None:
            st.error('Make sure that at least one of the "Objects" or "Occupations" rows is checked', icon="🚨")
        else:
            # First column of each CSV row is the object/occupation; the remaining columns are its actions/descriptors
            if specs[1]["Check"][0]:
                for row in objectData[1:]:
                    objects.append(row[0])
            if specs[1]["Check"][1]:
                for row in objectData[1:]:
                    actions.append(row[1:])
            if specs[1]["Check"][2]:
                for row in occupationData[1:]:
                    occupations.append(row[0])
                    occupationDescriptors.append(row[1:])
            with infoColumn1:
                st.write("***No. Objects*** = ", len(objects))
                st.write("***No. Actions*** = ", len(actions) * 3)
            with infoColumn2:
                st.write("***No. Occupations*** = ", len(occupations))
                st.write("***No. Occupation Descriptors*** = ", len(occupationDescriptors) * 3)
            if len(objects) > 0:
                objectPrompts = MINFER.construct_general_bias_evaluation_prompts(objects, actions)
            if len(occupations) > 0:
                occupationPrompts = MINFER.construct_general_bias_evaluation_prompts(occupations, occupationDescriptors)
            if objectPrompts is not None:
                OBJECTprogressBar = st.progress(0, text="Generating Object-related images. Please wait.")
                objectImages, objectCaptions = MINFER.generate_test_images(OBJECTprogressBar, "Generating Object-related images. Please wait.",
                                                                           objectPrompts, int(specs[0]["GEN Values"][0]),
                                                                           int(specs[0]["GEN Values"][1]), int(specs[0]["GEN Values"][2]))
                evaluationImages += objectImages
                evaluationCaptions += objectCaptions[0]
                TXTObjectPrompts = ""

            if occupationPrompts is not None:
                OCCprogressBar = st.progress(0, text="Generating Occupation-related images. Please wait.")
                occupationImages, occupationCaptions = MINFER.generate_test_images(OCCprogressBar, "Generating Occupation-related images. Please wait.",
                                                                                   occupationPrompts, int(specs[0]["GEN Values"][0]),
                                                                                   int(specs[0]["GEN Values"][1]), int(specs[0]["GEN Values"][2]))
                evaluationImages += occupationImages
                evaluationCaptions += occupationCaptions[0]

            if len(evaluationImages) > 0:
                EVALprogressBar = st.progress(0, text="Evaluating " + modelID + " Model Images. Please wait.")
                user_evaluation_variables.EVAL_METRICS = GBM.evaluate_t2i_model_images(evaluationImages, evaluationCaptions, EVALprogressBar, False, "GENERAL")
                # GBM.output_eval_results(user_evaluation_variables.EVAL_METRICS, 21)
                elapsedTime = time.time() - startTime
                # st.write("\U0001F553 Time Taken: ", str(datetime.timedelta(seconds=elapsedTime)).split(".")[0])

                user_evaluation_variables.NO_SAMPLES = len(evaluationImages)
                user_evaluation_variables.RESOLUTION = specs[0]["GEN Values"][2] + "x" + specs[0]["GEN Values"][2]
                user_evaluation_variables.INFERENCE_STEPS = int(specs[0]["GEN Values"][1])
                user_evaluation_variables.GEN_OBJECTS = bool(specs[1]["Check"][0])
                user_evaluation_variables.GEN_ACTIONS = bool(specs[1]["Check"][1])
                user_evaluation_variables.GEN_OCCUPATIONS = bool(specs[1]["Check"][2])
                user_evaluation_variables.DIST_BIAS = float(f"{user_evaluation_variables.EVAL_METRICS[2]:.4f}")
                user_evaluation_variables.HALLUCINATION = float(f"{np.mean(user_evaluation_variables.EVAL_METRICS[3]):.4f}")
                user_evaluation_variables.MISS_RATE = float(f"{np.mean(user_evaluation_variables.EVAL_METRICS[4]):.4f}")
                user_evaluation_variables.EVAL_ID = MCOMP.get_evaluation_id('general', True)
                user_evaluation_variables.DATE = datetime.datetime.utcnow().strftime('%d-%m-%Y')
                user_evaluation_variables.TIME = datetime.datetime.utcnow().strftime('%H:%M:%S')
                user_evaluation_variables.RUN_TIME = str(datetime.timedelta(seconds=elapsedTime)).split(".")[0]

                user_evaluation_variables.OBJECT_IMAGES = objectImages
                user_evaluation_variables.OBJECT_CAPTIONS = objectCaptions
                user_evaluation_variables.OCCUPATION_IMAGES = occupationImages
                user_evaluation_variables.OCCUPATION_CAPTIONS = occupationCaptions
                user_evaluation_variables.CURRENT_EVAL_TYPE = 'general'

def initiate_task_oriented_bias_evaluation(tab, modelID, specs, target, imagesTab):
    startTime = time.time()
    TASKImages = []
    TASKCaptions = []
    with tab:
        st.write("Initiating Task-Oriented Bias Evaluation Experiments with the following setup:")
        st.write(" ***Model*** = ", modelID)
        infoColumn1, infoColumn2 = st.columns(2)
        st.write(" ***No. Images per prompt*** = ", specs["TO Values"][0])
        st.write(" ***No. Steps*** = ", specs["TO Values"][1])
        st.write(" ***Image Size*** = ", specs["TO Values"][2], "$\\times$", specs["TO Values"][2])
        st.write(" ***Target*** = ", target.lower())
        st.markdown("___")

        captionsToExtract = 50
        if (captionsToExtract * int(specs['TO Values'][0])) < 30:
            st.error('There should be at least 30 images generated. You are attempting to generate:\t'
                     + str(captionsToExtract * int(specs['TO Values'][0])) + '.\nPlease readjust your No. Images per prompt',
                     icon="🚨")
        else:
            COCOLoadingBar = st.progress(0, text="Scanning through COCO Dataset for relevant prompts. Please wait")
            prompts, cocoIDs = get_COCO_captions('./data/COCO_captions.json', target.lower(), COCOLoadingBar, captionsToExtract)
            if len(prompts) == 0:
                st.error('Whoops! Could not find **ANY** relevant COCO prompts for the target: ' + target.lower() +
                         '\nPlease input a different target', icon="🚨")
            elif len(prompts) < captionsToExtract:
                st.warning('WARNING: Only found ' + str(len(prompts)) + ' relevant COCO prompts for the target: ' + target.lower() +
                           '\nWill work with these. Nothing to worry about!', icon="⚠️")
            else:
                st.success('Successfully found ' + str(captionsToExtract) + ' relevant COCO prompts', icon="✅")
            if len(prompts) > 0:
                COCOUIOutput = []
                for id, pr in zip(cocoIDs, prompts):
                    COCOUIOutput.append([id, pr])
                st.write('**Here are some of the randomised "' + target.lower() + '" captions extracted from the COCO dataset**')
                COCOUIOutput.insert(0, ('ID', 'Caption'))
                st.table(COCOUIOutput[:11])
                TASKprogressBar = st.progress(0, text="Generating Task-oriented images. Please wait.")
                TASKImages, TASKCaptions = MINFER.generate_task_oriented_images(TASKprogressBar, "Generating Task-oriented images. Please wait.",
                                                                                prompts, cocoIDs, int(specs["TO Values"][0]),
                                                                                int(specs["TO Values"][1]), int(specs["TO Values"][2]))

                EVALprogressBar = st.progress(0, text="Evaluating " + modelID + " Model Images. Please wait.")
                user_evaluation_variables.EVAL_METRICS = GBM.evaluate_t2i_model_images(TASKImages, TASKCaptions[0], EVALprogressBar, False, "TASK")

                # GBM.output_eval_results(user_evaluation_variables.EVAL_METRICS, 21)
                elapsedTime = time.time() - startTime
                # st.write("\U0001F553 Time Taken: ", str(datetime.timedelta(seconds=elapsedTime)).split(".")[0])

                user_evaluation_variables.NO_SAMPLES = len(TASKImages)
                user_evaluation_variables.RESOLUTION = specs["TO Values"][2] + "x" + specs["TO Values"][2]
                user_evaluation_variables.INFERENCE_STEPS = int(specs["TO Values"][1])
                user_evaluation_variables.DIST_BIAS = float(f"{user_evaluation_variables.EVAL_METRICS[2]:.4f}")
                user_evaluation_variables.HALLUCINATION = float(f"{np.mean(user_evaluation_variables.EVAL_METRICS[3]):.4f}")
                user_evaluation_variables.MISS_RATE = float(f"{np.mean(user_evaluation_variables.EVAL_METRICS[4]):.4f}")
                user_evaluation_variables.TASK_TARGET = target.lower()
                user_evaluation_variables.EVAL_ID = MCOMP.get_evaluation_id('task-oriented', True)
                user_evaluation_variables.DATE = datetime.datetime.utcnow().strftime('%d-%m-%Y')
                user_evaluation_variables.TIME = datetime.datetime.utcnow().strftime('%H:%M:%S')
                user_evaluation_variables.RUN_TIME = str(datetime.timedelta(seconds=elapsedTime)).split(".")[0]

                user_evaluation_variables.TASK_IMAGES = TASKImages
                user_evaluation_variables.TASK_CAPTIONS = TASKCaptions
                user_evaluation_variables.TASK_COCOIDs = cocoIDs

                user_evaluation_variables.CURRENT_EVAL_TYPE = 'task-oriented'

def download_and_zip_images(zipImagePath, images, captions, imageType):
    if imageType == 'object':
        csvFileName = 'object_prompts.csv'
    elif imageType == 'occupation':
        csvFileName = 'occupation_prompts.csv'
    else:
        csvFileName = 'task-oriented_prompts.csv'
    with st.spinner("Zipping images..."):
        with zipfile.ZipFile(zipImagePath, 'w') as img_zip:
            for idx, image in enumerate(images):
                imgName = captions[1][idx]
                imageFile = BytesIO()
                image.save(imageFile, 'JPEG')
                img_zip.writestr(imgName, imageFile.getvalue())

            # Saving prompt data as an accompanying csv file
            string_buffer = StringIO()
            csvwriter = csv.writer(string_buffer)

            if imageType in ['object', 'occupation']:
                csvwriter.writerow(['No.', 'Prompt'])
                for prompt, ii in zip(captions[0], range(len(captions[0]))):
                    csvwriter.writerow([ii + 1, prompt])
            else:
                csvwriter.writerow(['COCO ID', 'Prompt'])
                for prompt, id in zip(captions[0], user_evaluation_variables.TASK_COCOIDs):
                    csvwriter.writerow([id, prompt])

            img_zip.writestr(csvFileName, string_buffer.getvalue())
    st.success('Successfully zipped and downloaded images!', icon="✅")

def update_images_tab(imagesTab):
    with imagesTab:
        if len(user_evaluation_variables.OBJECT_IMAGES) > 0:
            with st.expander('Object-related Images'):
                user_evaluation_variables.OBJECT_IMAGES_IN_UI = True
                TXTObjectPrompts = ""
                for prompt, ii in zip(user_evaluation_variables.OBJECT_CAPTIONS[0], range(len(user_evaluation_variables.OBJECT_CAPTIONS[0]))):
                    TXTObjectPrompts += str(1 + ii) + '. ' + prompt + '\n'
                st.write("**Object-related General Bias Evaluation Images**")
                st.write("Number of Generated Images = ", len(user_evaluation_variables.OBJECT_IMAGES))
                st.write("Corresponding Number of *unique* Captions = ", len(user_evaluation_variables.OBJECT_CAPTIONS[0]))
                st.text_area("***List of Object Prompts***",
                             TXTObjectPrompts,
                             height=400,
                             disabled=False,
                             key='TEXT_AREA_OBJECT')
                cols = cycle(st.columns(3))
                for idx, image in enumerate(user_evaluation_variables.OBJECT_IMAGES):
                    next(cols).image(image, width=225, caption=user_evaluation_variables.OBJECT_CAPTIONS[1][idx])

                saveObjectImages = st.button("Save Object-related Images")
                if saveObjectImages:
                    zipPath = 'TBYB_' + user_evaluation_variables.USERNAME + '_' + user_evaluation_variables.EVAL_ID + '_object_related_images.zip'
                    download_and_zip_images(zipPath, user_evaluation_variables.OBJECT_IMAGES,
                                            user_evaluation_variables.OBJECT_CAPTIONS, 'object')

        if len(user_evaluation_variables.OCCUPATION_IMAGES) > 0:
            user_evaluation_variables.OCCUPATION_IMAGES_IN_UI = True
            with st.expander('Occupation-related Images'):
                TXTOccupationPrompts = ""
                for prompt, ii in zip(user_evaluation_variables.OCCUPATION_CAPTIONS[0], range(len(user_evaluation_variables.OCCUPATION_CAPTIONS[0]))):
                    TXTOccupationPrompts += str(1 + ii) + '. ' + prompt + '\n'
                st.write("**Occupation-related General Bias Evaluation Images**")
                st.write("Number of Generated Images = ", len(user_evaluation_variables.OCCUPATION_IMAGES))
                st.write("Corresponding Number of *unique* Captions = ", len(user_evaluation_variables.OCCUPATION_CAPTIONS[0]))
                st.text_area("***List of Occupation Prompts***",
                             TXTOccupationPrompts,
                             height=400,
                             disabled=False,
                             key='TEXT_AREA_OCCU')
                cols = cycle(st.columns(3))
                for idx, image in enumerate(user_evaluation_variables.OCCUPATION_IMAGES):
                    next(cols).image(image, width=225, caption=user_evaluation_variables.OCCUPATION_CAPTIONS[1][idx])

                saveOccupationImages = st.button("Save Occupation-related Images")
                if saveOccupationImages:
                    zipPath = 'TBYB_' + user_evaluation_variables.USERNAME + '_' + user_evaluation_variables.EVAL_ID + '_occupation_related_images.zip'
                    download_and_zip_images(zipPath, user_evaluation_variables.OCCUPATION_IMAGES,
                                            user_evaluation_variables.OCCUPATION_CAPTIONS, 'occupation')
        if len(user_evaluation_variables.TASK_IMAGES) > 0:
            with st.expander(user_evaluation_variables.TASK_TARGET + '-related Images'):
                user_evaluation_variables.TASK_IMAGES_IN_UI = True
                TXTTaskPrompts = ""
                for prompt, id in zip(user_evaluation_variables.TASK_CAPTIONS[0], user_evaluation_variables.TASK_COCOIDs):
                    TXTTaskPrompts += "ID_" + str(id) + '. ' + prompt + '\n'

                st.write("**Task-oriented Bias Evaluation Images. Target** = ", user_evaluation_variables.TASK_TARGET)
                st.write("Number of Generated Images = ", len(user_evaluation_variables.TASK_IMAGES))
                st.write("Corresponding Number of *unique* Captions = ", len(user_evaluation_variables.TASK_CAPTIONS[0]))
                st.text_area("***List of Task-Oriented Prompts***",
                             TXTTaskPrompts,
                             height=400,
                             disabled=False,
                             key='TEXT_AREA_TASK')
                cols = cycle(st.columns(3))
                for idx, image in enumerate(user_evaluation_variables.TASK_IMAGES):
                    next(cols).image(image, width=225, caption=user_evaluation_variables.TASK_CAPTIONS[1][idx])

                saveTaskImages = st.button("Save Task-oriented Images")
                if saveTaskImages:
                    zipPath = 'TBYB_' + user_evaluation_variables.USERNAME + '_' + user_evaluation_variables.EVAL_ID + '_' + user_evaluation_variables.TASK_TARGET + '-oriented_images.zip'
                    download_and_zip_images(zipPath, user_evaluation_variables.TASK_IMAGES,
                                            user_evaluation_variables.TASK_CAPTIONS, 'task-oriented')

def get_COCO_captions(filePath, target, progressBar, NPrompts=50):
    # Collect up to NPrompts COCO captions containing the target word; fixed seed keeps the sample reproducible
    with open(filePath) as f:
        captionData = json.load(f)
    COCOCaptions = []
    COCOIDs = []
    random.seed(42)
    random.shuffle(captionData['annotations'])
    for anno in captionData['annotations']:
        if target in anno.get('caption').lower().split(' '):
            if len(COCOCaptions) < NPrompts:
                COCOCaptions.append(anno.get('caption').lower())
                COCOIDs.append(str(anno.get('id')))
                percentComplete = len(COCOCaptions) / NPrompts
                progressBar.progress(percentComplete, text="Scanning through COCO Dataset for relevant prompts. Please wait")
    return (COCOCaptions, COCOIDs)


def read_csv_to_list(filePath):
    data = []
    with open(filePath, 'r', newline='') as csvfile:
        csvReader = csv.reader(csvfile)
        for row in csvReader:
            data.append(row)
    return data
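
To sanity-check which COCO captions a given target pulls in without launching the Streamlit UI, the caption sampler above can be driven with a minimal stand-in for the progress bar. This is a rough sketch only, assuming the script runs from the repository root so ./data/COCO_captions.json resolves and tab_manager is importable; the _NullProgressBar stub is hypothetical and mimics just the single progress() call used above.

# Minimal sketch (assumptions: repo root as working directory; tab_manager importable)
import tab_manager

class _NullProgressBar:
    """Hypothetical stand-in for st.progress(); silently ignores updates."""
    def progress(self, value, text=None):
        pass

captions, coco_ids = tab_manager.get_COCO_captions(
    './data/COCO_captions.json',  # caption file shipped with the Space
    'coffee',                     # single-token target, as in the UI example
    _NullProgressBar(),
    NPrompts=10,
)
for cid, cap in zip(coco_ids, captions):
    print(cid, cap)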
user_evaluation_variables.py
ADDED
@@ -0,0 +1,189 @@
import yaml
from yaml import safe_load
import streamlit as st

# Module-level state shared across the app's tabs for the active evaluation
USERNAME = None
EVAL_ID = None
MODEL = None
MODEL_TYPE = None
NO_SAMPLES = None
RESOLUTION = None
INFERENCE_STEPS = None
GEN_OBJECTS = None
GEN_ACTIONS = None
GEN_OCCUPATIONS = None
TASK_TARGET = None
DIST_BIAS = None
HALLUCINATION = None
MISS_RATE = None
DATE = None
TIME = None
RUN_TIME = None

EVAL_METRICS = None
OBJECT_IMAGES = []
OCCUPATION_IMAGES = []
TASK_IMAGES = []
OBJECT_CAPTIONS = None
OCCUPATION_CAPTIONS = None
TASK_CAPTIONS = None
TASK_COCOIDs = None

OBJECT_IMAGES_IN_UI = False
OCCUPATION_IMAGES_IN_UI = False
TASK_IMAGES_IN_UI = False
CURRENT_EVAL_TYPE = None

def update_evaluation_table(evalType, debugging):
    global USERNAME
    global EVAL_ID
    global MODEL
    global MODEL_TYPE
    global NO_SAMPLES
    global RESOLUTION
    global INFERENCE_STEPS
    global GEN_OBJECTS
    global GEN_ACTIONS
    global GEN_OCCUPATIONS
    global TASK_TARGET
    global DIST_BIAS
    global HALLUCINATION
    global MISS_RATE
    global DATE
    global TIME
    global RUN_TIME
    global CURRENT_EVAL_TYPE

    if debugging:
        st.write("Username: ", USERNAME)
        st.write("EVAL_ID: ", EVAL_ID)
        st.write("MODEL: ", MODEL)
        st.write("MODEL_TYPE: ", MODEL_TYPE)
        st.write("NO_SAMPLES: ", NO_SAMPLES)
        st.write("RESOLUTION: ", RESOLUTION)
        st.write("INFERENCE_STEPS: ", INFERENCE_STEPS)
        st.write("GEN_OBJECTS: ", GEN_OBJECTS)
        st.write("GEN_ACTIONS: ", GEN_ACTIONS)
        st.write("GEN_OCCUPATIONS: ", GEN_OCCUPATIONS)
        st.write("TASK_TARGET: ", TASK_TARGET)
        st.write("DIST_BIAS: ", DIST_BIAS)
        st.write("HALLUCINATION: ", HALLUCINATION)
        st.write("MISS_RATE: ", MISS_RATE)
        st.write("DATE: ", DATE)
        st.write("TIME: ", TIME)
        st.write("RUN_TIME: ", RUN_TIME)

    if evalType == 'general':
        evalDataPath = './data/general_eval_database.yaml'
        newEvaluationData = {
            "Model": MODEL,
            "Model Type": MODEL_TYPE,
            "No. Samples": NO_SAMPLES,
            "Resolution": RESOLUTION,
            "Inference Steps": INFERENCE_STEPS,
            "Objects": GEN_OBJECTS,
            "Actions": GEN_ACTIONS,
            "Occupations": GEN_OCCUPATIONS,
            "Dist. Bias": DIST_BIAS,
            "Hallucination": HALLUCINATION,
            "Gen. Miss Rate": MISS_RATE,
            "Date": DATE,
            "Time": TIME,
            "Run Time": RUN_TIME
        }
    else:
        evalDataPath = './data/task_oriented_eval_database.yaml'
        newEvaluationData = {
            "Model": MODEL,
            "Model Type": MODEL_TYPE,
            "No. Samples": NO_SAMPLES,
            "Resolution": RESOLUTION,
            "Inference Steps": INFERENCE_STEPS,
            "Target": TASK_TARGET,
            "Dist. Bias": DIST_BIAS,
            "Hallucination": HALLUCINATION,
            "Gen. Miss Rate": MISS_RATE,
            "Date": DATE,
            "Time": TIME,
            "Run Time": RUN_TIME
        }
    with open(evalDataPath, 'r') as f:
        yamlData = safe_load(f)

    # st.write("OLD DATABASE ", yamlData['evaluations']['username'][USERNAME])
    if USERNAME not in yamlData['evaluations']['username']:
        if evalType == 'general':
            st.success('Congrats on your first General Bias evaluation!', icon='\U0001F388')
        else:
            st.success('Congrats on your first Task-Oriented Bias evaluation!', icon='\U0001F388')
        yamlData['evaluations']['username'][USERNAME] = {}

    yamlData['evaluations']['username'][USERNAME][EVAL_ID] = newEvaluationData

    st.write("NEW DATABASE ", yamlData['evaluations']['username'][USERNAME])
    with open(evalDataPath, 'w') as yaml_file:
        yaml_file.write(yaml.dump(yamlData, default_flow_style=False))

def reset_variables(evalType):
    global USERNAME
    global EVAL_ID
    global MODEL
    global MODEL_TYPE
    global NO_SAMPLES
    global RESOLUTION
    global INFERENCE_STEPS
    global GEN_OBJECTS
    global GEN_ACTIONS
    global GEN_OCCUPATIONS
    global TASK_TARGET
    global DIST_BIAS
    global HALLUCINATION
    global MISS_RATE
    global DATE
    global TIME
    global RUN_TIME
    global EVAL_METRICS
    global OBJECT_IMAGES
    global OCCUPATION_IMAGES
    global TASK_IMAGES
    global OBJECT_CAPTIONS
    global OCCUPATION_CAPTIONS
    global TASK_CAPTIONS
    global TASK_COCOIDs
    global OBJECT_IMAGES_IN_UI
    global OCCUPATION_IMAGES_IN_UI
    global TASK_IMAGES_IN_UI
    global CURRENT_EVAL_TYPE
    EVAL_ID = None
    # MODEL = None
    # MODEL_TYPE = None
    NO_SAMPLES = None
    RESOLUTION = None
    INFERENCE_STEPS = None
    GEN_OBJECTS = None
    GEN_ACTIONS = None
    GEN_OCCUPATIONS = None
    TASK_TARGET = None
    DIST_BIAS = None
    HALLUCINATION = None
    MISS_RATE = None
    DATE = None
    TIME = None
    RUN_TIME = None

    EVAL_METRICS = None
    CURRENT_EVAL_TYPE = None

    if evalType == 'general':
        OBJECT_IMAGES = []
        OCCUPATION_IMAGES = []
        OBJECT_CAPTIONS = None
        OCCUPATION_CAPTIONS = None
        OBJECT_IMAGES_IN_UI = False
        OCCUPATION_IMAGES_IN_UI = False
    else:
        TASK_IMAGES = []
        TASK_CAPTIONS = None
        TASK_COCOIDs = None
        TASK_IMAGES_IN_UI = False
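
For reference, update_evaluation_table assumes the evaluation database YAML already contains an evaluations -> username mapping that safe_load can parse. The sketch below shows a hypothetical parsed structure for ./data/general_eval_database.yaml; the 'evaluations'/'username' nesting and the per-evaluation keys come from the code above, while the username, evaluation ID, and all values are placeholders.

# Hypothetical parsed database structure (placeholder user, ID, and values)
yamlData = {
    'evaluations': {
        'username': {
            'example_user': {              # USERNAME
                'example_eval_id': {       # EVAL_ID
                    'Model': 'example/model-id',
                    'Model Type': 'example-type',
                    'No. Samples': 440,
                    'Resolution': '512x512',
                    'Inference Steps': 100,
                    'Objects': True,
                    'Actions': True,
                    'Occupations': True,
                    'Dist. Bias': 1.2345,
                    'Hallucination': 0.1234,
                    'Gen. Miss Rate': 0.2345,
                    'Date': '01-01-2024',
                    'Time': '12:00:00',
                    'Run Time': '1:23:45',
                },
            },
        },
    },
}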