File size: 19,646 Bytes
a8af1a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
{
  "metadata": {
    "Name": "Model B",
    "Provider": "AI Innovations",
    "Version": "3.0",
    "Release Date": "2023-11-30",
    "Type": "Multimodal AI",
    "Modalities": ["Text-to-Text", "Text-to-Image", "Image-to-Text"]
  },
  "scores": {
    "Bias, Stereotypes, and Representational Harms Evaluation": {
      "Comprehensive Evaluation Methodology": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Evaluations at various stages (data collection, preprocessing, model architecture, training, deployment)",
          "Both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods",
          "Multi-level analysis (e.g., word, sentence, document levels for text; pixel, object, scene levels for images)"
        ]
      },
      "Inclusive Protected Class Consideration": {
        "status": "Yes",
        "source": "3P",
        "applicable_evaluations": [
          "Evaluation of non-standard protected classes (e.g., socioeconomic status, education level, regional differences)",
          "Consideration of intersectionality and how identity aspects interact"
        ]
      },
      "Cultural and Linguistic Diversity": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Tests of model performance and biases across languages and cultures",
          "Analysis of the impact of different languages/scripts on image generation (for text-to-image models)",
          "Consideration of how protected categories may shift in meaning across regions"
        ]
      },
      "Stereotype and Harmful Association Detection": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Detection of stereotypical word associations in text models or visual representations in image models",
          "Sentiment analysis and toxicity measurements, especially regarding specific groups"
        ]
      },
      "Performance Disparities Assessment": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Detailed breakdowns of performance metrics (accuracy, precision, recall) for various subgroups",
          "Performance analysis for disadvantaged subgroups",
          "Intersectionality considerations in performance analysis"
        ]
      },
      "Bias Mitigation and Impact Analysis": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Documentation of bias mitigation strategies",
          "Analyses of how model updates or mitigations affect bias metrics"
        ]
      },
      "Transparency and Limitations Disclosure": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Clear statements on the capabilities and limitations of evaluation methods",
          "Acknowledgment of potential biases from the evaluation tools/processes",
          "Detailed explanations of bias-related metrics, including assumptions or limitations"
        ]
      },
      "Ongoing Evaluation Commitment": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Plans for continual bias assessment as the model is updated or deployed in new contexts",
          "Strategies for incorporating new findings/methodologies in evaluation",
          "Commitments to transparency and regular reporting on bias-related issues"
        ]
      }
    },
    "Cultural Values and Sensitive Content Evaluation": {
      "Hate Speech and Toxicity Evaluation": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Assessments of harmful text generation",
          "Evaluations of toxicity, hurtfulness, or offensiveness",
          "Examination of invasive bodily commentary or rejections of identity"
        ]
      },
      "Cultural Value Representation": {
        "status": "Yes",
        "source": "3P",
        "applicable_evaluations": [
          "Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)",
          "Assessments of ethical scenarios and political value representation",
          "Evaluations of geopolitical statements and regional representation"
        ]
      },
      "Diverse Cultural Context": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Assessments that don't equate nationality with cultural context",
          "Representation of differing cultural values within countries",
          "Inclusion of marginalized communities' perspectives"
        ]
      },
      "Sensitive Content Identification": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Recognition of topics that vary by culture and viewpoint",
          "Assessment of content related to egregious violence",
          "Evaluation of adult sexual content identification"
        ]
      },
      "Impact of Generated Content": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Assessment of potential harm to targeted viewers",
          "Evaluation of content's potential to normalize harmful ideas",
          "Analysis of possible contributions to online radicalization"
        ]
      },
      "Multidimensional Cultural Analysis": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Evaluations at word, sentence, and document levels for text",
          "Analysis at pixel, object, and scene levels for images",
          "Multi-level analysis of cultural representation"
        ]
      }
    },
    "Disparate Performance": {
      "Subpopulation Performance Analysis": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
          "Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios",
          "Worst-case subgroup performance analysis"
        ]
      },
      "Cross-lingual and Dialect Evaluation": {
        "status": "Yes",
        "source": "3P",
        "applicable_evaluations": [
          "Cross-lingual prompting on standard benchmarks",
          "Examination of performance across dialects",
          "Analysis of hallucination disparity across languages"
        ]
      },
      "Image Generation Quality Assessment": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Examination of generation quality across various concepts",
          "Accuracy of cultural representation in generated images",
          "Assessment of realism across different concepts"
        ]
      },
      "Data Duplication and Bias Analysis": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Analysis of the effect of retaining duplicate examples in the training dataset",
          "Evaluation of model bias towards generating certain phrases or concepts"
        ]
      },
      "Dataset Disparities Evaluation": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Assessment of dataset skew with fewer examples from some subpopulations",
          "Evaluation of feature inconsistencies across subpopulations",
          "Analysis of geographic biases in data collection"
        ]
      },
      "Evaluation of Systemic Issues": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Assessment of disparities due to dataset collection methods",
          "Evaluation of the impact of varying levels of internet access on data representation",
          "Analysis of content filters' effects on data availability"
        ]
      },
      "Long-tail Data Distribution Analysis": {
        "status": "Yes",
        "source": "3P",
        "applicable_evaluations": [
          "Assessment of model performance on rare or uncommon data points",
          "Evaluation of the trade-off between fitting long tails and unintentional memorization"
        ]
      }
    },
    "Environmental Costs and Carbon Emissions Evaluation": {
      "Energy Consumption Measurement": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Measurement of energy used in training, testing, and deploying the system",
          "Evaluation of compute power consumption",
          "Assessment of energy resources used by large-scale systems"
        ]
      },
      "Carbon Footprint Quantification": {
        "status": "Yes",
        "source": "3P",
        "applicable_evaluations": [
          "Use of tools like CodeCarbon or Carbontracker",
          "Measurement of carbon emissions for training and inference",
          "Conversion of energy consumption to carbon emissions"
        ]
      },
      "Hardware Resource Evaluation": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Assessment of CPU, GPU, and TPU usage",
          "Measurement of FLOPS (Floating Point Operations)",
          "Evaluation of package power draw and GPU performance state"
        ]
      },
      "Comprehensive Environmental Impact Assessment": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Use of Life Cycle Assessment (LCA) methodologies",
          "Consideration of supply chains and manufacturing impacts",
          "Evaluation of immediate impacts of applying ML"
        ]
      },
      "Transparency in Environmental Reporting": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Disclosure of uncertainty around measured variables",
          "Reporting of marginal costs (e.g., added parameters' contribution to energy consumption)",
          "Transparency about equipment manufacturers and data/hosting centers"
        ]
      },
      "Comprehensive Environmental Impact Metrics": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Discussion of different approaches to measuring environmental impact",
          "Use of diverse measurements beyond energy consumption",
          "Consideration of various factors including lifecycle assessment"
        ]
      }
    },
    "Privacy and Data Protection Evaluation": {
      "Data Minimization and Consent Practices": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Implementation of data minimization practices",
          "Use of opt-in data collection methods",
          "Assessment of active consent for collecting, processing, and sharing data"
        ]
      },
      "Memorization and Data Leakage Evaluation": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Examination of the maximum amount of discoverable information given training data",
          "Evaluation of extractable information without training data access",
          "Analysis of out-of-distribution data revelation"
        ]
      },
      "Personal Information Revelation Assessment": {
        "status": "Yes",
        "source": "3P",
        "applicable_evaluations": [
          "Direct prompting tests to reveal Personally Identifiable Information (PII)",
          "Use of tools like ProPILE to audit PII revelation likelihood",
          "Evaluation of the system's ability to infer personal attributes"
        ]
      },
      "Image and Audio Privacy Evaluation": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Assessment of training data memorization in image generation",
          "Use of adversarial Membership Inference Attacks for images",
          "Evaluation of the proportion of generated images with high similarity to training data"
        ]
      },
      "Intellectual Property and Copyright Evaluation": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Assessment of the system's ability to generate copyrighted content",
          "Evaluation of intellectual property concerns in generated content",
          "Analysis of the system's handling of highly sensitive documents"
        ]
      },
      "Retroactive Privacy Protection": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Assessment of the system's capability to retroactively retrain in accordance with privacy policies",
          "Evaluation of processes for removing specific data points upon request",
          "Analysis of the system's adaptability to changing privacy regulations"
        ]
      },
      "Third-party Hosting Privacy Evaluation": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Assessment of potential leakage of private input data in generations",
          "Evaluation of system prompt privacy, especially for prompts containing proprietary information",
          "Analysis of the system's handling of sensitive database records during in-context learning"
        ]
      },
      "Generative AI-Specific Privacy Measures": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Assessment of the applicability of data sanitization techniques to generative models",
          "Evaluation of differential privacy approaches in the context of generative AI",
          "Analysis of novel privacy protection methods designed specifically for generative models"
        ]
      }
    },
    "Financial Costs Evaluation": {
      "Comprehensive Cost Evaluation": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Estimation of infrastructure and hardware costs",
          "Calculation of labor hours from researchers, developers, and crowd workers",
          "Tracking of compute costs using low-cost or standard pricing per instance-hour"
        ]
      },
      "Storage and Training Cost Analysis": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Assessment of storage costs for both datasets and resulting models",
          "Consideration of in-house vs. cloud storage options",
          "Evaluation of training costs based on in-house GPUs or per-hour-priced instances"
        ]
      },
      "Hosting and Inference Cost Evaluation": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Evaluation of low-latency serving costs",
          "Assessment of inference costs based on token usage",
          "Consideration of factors such as initial prompt length and requested token response length"
        ]
      },
      "Modality-Specific Cost Analysis": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Assessment of costs related to pixel density and frame usage for image and video",
          "Evaluation of preprocessing costs for audio (e.g., spectrogram generation)",
          "Consideration of model architecture in cost calculations"
        ]
      },
      "Long-term Cost Considerations": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Assessment of pre- and post-deployment costs",
          "Consideration of human labor and hidden costs",
          "Tracking of changes in costs and economy of components over time"
        ]
      },
      "API Cost Evaluation": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Assessment of token-usage based pricing",
          "Evaluation of cost variations based on initial prompt length and requested token response length",
          "Analysis of cost differences across model versions"
        ]
      },
      "Comprehensive Cost Tracking": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Assessment of costs related to broader infrastructure or organizational changes",
          "Evaluation of long-term maintenance and update costs",
          "Analysis of costs associated with complementary technologies or processes"
        ]
      }
    },
    "Data and Content Moderation Labor Evaluation": {
      "Crowdwork Standards Compliance": {
        "status": "Yes",
        "source": "3P",
        "applicable_evaluations": [
          "Assessment of compliance with Criteria for Fairer Microwork",
          "Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines",
          "Comparison with Oxford Internet Institute's Fairwork Principles"
        ]
      },
      "Crowdworker Demographics and Compensation": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Documentation of crowd workers' demographics",
          "Transparency in reporting instructions given to crowdworkers",
          "Assessment of how crowdworkers were evaluated and compensated"
        ]
      },
      "Psychological Support and Content Exposure": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Documentation of immediate trauma support availability",
          "Assessment of long-term professional psychological support provision",
          "Evaluation of practices for controlling exposure to traumatic material"
        ]
      },
      "Transparency in Crowdwork Documentation": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Use of transparent reporting frameworks",
          "Documentation of crowdwork's role in shaping AI system output",
          "Evaluation of the accessibility of crowdwork information"
        ]
      },
      "Crowdwork Stages and Types": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Assessment of crowdwork in data gathering, curation, cleaning, and labeling",
          "Evaluation of crowdwork during model development and interim evaluations",
          "Examination of post-deployment crowdwork for output evaluation and correction"
        ]
      },
      "Evaluation of Labor Protection and Regulations": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Assessment of compliance with relevant labor law interventions by jurisdiction",
          "Evaluation of worker classification and associated protections",
          "Analysis of fair work practices and compensation structures"
        ]
      },
      "Outsourcing Impact Evaluation": {
        "status": "Yes",
        "source": "3P",
        "applicable_evaluations": [
          "Assessment of communication barriers created by outsourcing",
          "Evaluation of differences in working conditions between in-house and outsourced labor",
          "Analysis of transparency in reporting structures for outsourced work"
        ]
      },
      "Impact of Precarious Employment": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Assessment of job security and its impact on worker feedback",
          "Evaluation of anonymous reporting systems for substandard working conditions",
          "Analysis of power dynamics between crowdworkers and employers"
        ]
      }
    }
  }
}