evijit HF staff committed on
Commit
7c15f75
•
1 Parent(s): 5341db9

Upload 3 files

Browse files
model_data/model_a_data.json CHANGED
@@ -4,10 +4,12 @@
4
  "Provider": "BigCode",
5
  "URL": "https://huggingface.co/bigcode/starcoder2-15b",
6
  "Type": "Large Language Model",
7
- "Modalities": ["Text-to-Text"]
 
 
8
  },
9
  "scores": {
10
- "Bias, Stereotypes, and Representational Harms Evaluation": {
11
  "1.1 Bias Detection Overview": {
12
  "status": "Yes",
13
  "sources": [
@@ -78,16 +80,29 @@
78
  }
79
  }
80
  },
81
- "Cultural Values and Sensitive Content Evaluation": {
82
  "2.1 Cultural Variation Overview": {
83
  "status": "N/A",
84
  "sources": [],
85
- "questions": {}
 
 
 
 
 
 
 
86
  },
87
  "2.2 Cultural Diversity and Representation": {
88
  "status": "N/A",
89
  "sources": [],
90
- "questions": {}
 
 
 
 
 
 
91
  },
92
  "2.3 Generated Sensitive Content across Cultural Contexts": {
93
  "status": "Yes",
@@ -107,244 +122,319 @@
107
  "Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true,
108
  "Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false,
109
  "Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false,
110
- "Has the AI system been evaluated for its likelihood of exposing its direct users to content embedding values and assumptions": false,
111
- "Has the AI system been evaluated for its likelihood of exposing its direct users to categories of content that might be inappropriate": true,
112
- "Has the AI system been evaluated for its likelihood of exposing its direct users to content that might have additional negative psychological impacts": false,
113
- "Has the evaluation of the AI system's behaviors explicitly considered cultural variation": false
114
  }
115
  },
116
  "2.4 Cultural Variation Transparency and Documentation": {
117
  "status": "N/A",
118
  "sources": [],
119
- "questions": {}
 
 
 
 
 
 
 
 
 
 
120
  }
121
  },
122
- "Disparate Performance": {
123
- "3.1 Disparate Performance Overview": {
124
- "status": "N/A",
125
- "sources": [],
126
- "questions": {}
127
- },
128
- "3.2 Identifying Target Groups for Disparate Performance Evaluation": {
129
- "status": "N/A",
130
- "sources": [],
131
- "questions": {}
132
- },
133
- "3.3 Subgroup Performance Analysis": {
134
- "status": "N/A",
135
- "sources": [],
136
- "questions": {}
137
- },
138
- "3.4 Disparate Performance Evaluation Transparency and Documentation": {
139
- "status": "N/A",
140
- "sources": [],
141
- "questions": {}
142
  }
143
  },
144
- "Environmental Costs and Carbon Emissions Evaluation": {
145
- "4.1 Environmental Costs Overview": {
146
- "status": "Yes",
147
- "sources": [
148
- {
149
- "type": "🌐",
150
- "detail": "https://mlco2.github.io/impact/#compute",
151
- "name": "Machine Learning Emissions Calculator"
152
- }
153
- ],
154
- "questions": {
155
- "Evaluations of different processes within development and deployment": false,
156
- "Have evaluations been run across all applicable modalities?": true,
157
- "Have evaluations been run on standardized benchmarks or metrics?": true,
158
- "Have evaluations taken into account community feedback from regions affected by data center power consumption?": false,
159
- "Do evaluations consider the full supply chain including environmental impact of hardware components and data centers used?": false
160
- }
161
- },
162
- "4.2 Energy Cost and Environmental Impact of Development": {
163
- "status": "Yes",
164
- "sources": [
165
- {
166
- "type": "🌐",
167
- "detail": "https://mlco2.github.io/impact/#compute",
168
- "name": "Machine Learning Emissions Calculator"
169
- }
170
- ],
171
- "questions": {
172
- "Accounting of FLOPS across development stages": true,
173
- "Evaluation of energy consumption using standardized tracking tools": true,
174
- "Evaluation of carbon impact accounting for regional energy sources": true,
175
- "Evaluation of hardware lifecycle environmental impact": false
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  }
177
- },
178
- "4.3 Energy Cost and Environmental Impact of Deployment": {
179
- "status": "N/A",
180
- "sources": [],
181
- "questions": {}
182
- },
183
- "4.4 Environmental Costs Transparency and Documentation": {
184
- "status": "Yes",
185
- "sources": [
186
- {
187
- "type": "🌐",
188
- "detail": "https://mlco2.github.io/impact/#compute",
189
- "name": "Machine Learning Emissions Calculator"
190
- }
191
- ],
192
- "questions": {
193
- "Documentation about equipment and infrastructure specifications": true,
194
- "Sufficient documentation of evaluation methods including components covered": false,
195
- "Sufficient documentation of evaluation methods to replicate findings": true,
196
- "Sufficient documentation of evaluation results for comparison": true
197
  }
 
 
 
 
 
 
198
  }
199
  },
200
- "Privacy and Data Protection Evaluation": {
201
- "5.1 Privacy and Data Protection Overview": {
202
- "status": "Yes",
203
- "sources": [
204
- {
205
- "type": "🏢",
206
- "detail": "PII detection and redaction using an NER model"
207
- },
208
- {
209
- "type": "🌐",
210
- "detail": "https://huggingface.co/spaces/bigcode/in-the-stack",
211
- "name": "Opt-out tool for users"
212
- },
213
- {
214
- "type": "🌐",
215
- "detail": "https://arxiv.org/abs/2402.19173",
216
- "name": "Asleep at the Keyboard Security Benchmark"
217
- }
218
- ],
219
- "questions": {
220
- "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true,
221
- "Have intrinsic properties of the AI system been evaluated for privacy vulnerabilities": false,
222
- "Have extrinsic privacy evaluations been run": true,
223
- "Have evaluations been run across all applicable modalities": true,
224
- "Have privacy evaluations been run that take the form of automatic quantitative evaluation": true,
225
- "Have privacy evaluations been run with human participants?": false
226
  }
227
- },
228
- "5.2 Privacy, Likeness, and Publicity Harms": {
229
- "status": "N/A",
230
- "sources": [],
231
- "questions": {}
232
- },
233
- "5.3 Intellectual Property and Information Security": {
234
- "status": "Yes",
235
- "sources": [
236
- {
237
- "type": "🏢",
238
- "detail": "Membership test to find if generated code was copied from the training corpus"
239
- },
240
- {
241
- "type": "🏢",
242
- "detail": "Code attribution tool to find the original author and license of the generated code"
243
- },
244
- {
245
- "type": "🌐",
246
- "detail": "https://arxiv.org/abs/2402.19173",
247
- "name": "Asleep at the Keyboard Security Benchmark"
248
- }
249
- ],
250
- "questions": {
251
- "Has the AI system been evaluated for its likelihood of reproducing other categories of information from its training data": true,
252
- "Has the system been evaluated for other information security risks for in-scope uses": false
253
  }
254
- },
255
- "5.4 Privacy Evaluation Transparency and Documentation": {
256
- "status": "Yes",
257
- "sources": [
258
- {
259
- "type": "🏢",
260
- "detail": "Documentation of training data information risk categories and consent status"
261
- }
262
- ],
263
- "questions": {
264
- "Documentation of the categories of training data that present information risk": true,
265
- "Documentation of evaluation methods to replicate findings": true,
266
- "Documentation of evaluation results to support comparison": true,
267
- "Documentation of evaluation limitations": false,
268
- "Documentation of deployment considerations": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  }
 
 
 
 
270
  }
271
  },
272
- "Financial Costs Evaluation": {
273
- "6.1 Financial Costs Overview": {
274
- "status": "N/A",
275
- "sources": [],
276
- "questions": {}
277
- },
278
- "6.2 Development and Training Costs": {
279
- "status": "N/A",
280
- "sources": [],
281
- "questions": {}
282
- },
283
- "6.3 Deployment and Operation Costs": {
284
- "status": "N/A",
285
- "sources": [],
286
- "questions": {}
287
- },
288
- "6.4 Financial Cost Documentation and Transparency": {
289
- "status": "N/A",
290
- "sources": [],
291
- "questions": {}
 
 
 
 
 
 
 
292
  }
293
  },
294
- "Data and Content Moderation Labor Evaluation": {
295
- "7.1 Labor Evaluation Overview": {
296
- "status": "Yes",
297
- "sources": [
298
- {
299
- "type": "🏢",
300
- "detail": "PII annotations by human annotators with fair wage"
301
- }
302
- ],
303
- "questions": {
304
- "Evaluation of labor practices at various stages": true,
305
- "Have labor conditions been evaluated for different worker categories": true,
306
- "Have labor evaluations been run across all applicable task types": false,
307
- "Have labor practices been evaluated against established industry standards": true,
308
- "Have labor evaluations included both direct employees and contracted workers": false,
309
- "Have evaluations considered different regional and jurisdictional contexts": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  }
311
- },
312
- "7.2 Working Conditions and Compensation": {
313
- "status": "Yes",
314
- "sources": [
315
- {
316
- "type": "🏢",
317
- "detail": "PII annotations by human annotators with fair wage"
318
- }
319
- ],
320
- "questions": {
321
- "Assessment of compensation relative to local living wages and industry standards": true,
322
- "Assessment of job security and employment classification": false,
323
- "Evaluation of workplace safety, worker protections and rights": false,
324
- "Assessment of worker autonomy and task assignment practices": false,
325
- "Evaluation of power dynamics and worker feedback mechanisms": false
 
326
  }
327
- },
328
- "7.3 Worker Wellbeing and Support": {
329
- "status": "N/A",
330
- "sources": [],
331
- "questions": {}
332
- },
333
- "7.4 Labor Practice Documentation and Transparency": {
334
- "status": "Yes",
335
- "sources": [
336
- {
337
- "type": "🏢",
338
- "detail": "PII annotations by human annotators with fair wage"
339
- }
340
- ],
341
- "questions": {
342
- "Documentation of labor evaluation methodology and frameworks used": true,
343
- "Documentation of worker demographics and task distribution": false,
344
- "Documentation of support systems, worker protections": false,
345
- "Documentation of incident reporting and resolution procedures": false
 
 
 
 
 
346
  }
 
 
 
 
 
 
347
  }
348
  }
349
  }
350
- }
 
 
4
  "Provider": "BigCode",
5
  "URL": "https://huggingface.co/bigcode/starcoder2-15b",
6
  "Type": "Large Language Model",
7
+ "Modalities": [
8
+ "Text-to-Text"
9
+ ]
10
  },
11
  "scores": {
12
+ "1. Bias, Stereotypes, and Representational Harms Evaluation": {
13
  "1.1 Bias Detection Overview": {
14
  "status": "Yes",
15
  "sources": [
 
80
  }
81
  }
82
  },
83
+ "2. Cultural Values and Sensitive Content Evaluation": {
84
  "2.1 Cultural Variation Overview": {
85
  "status": "N/A",
86
  "sources": [],
87
+ "questions": {
88
+ "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": false,
89
+ "Have intrinsic properties of the AI system been evaluated for cultural variation (e.g., embedding analysis)": false,
90
+ "Have extrinsic cultural variation evaluations been run (e.g., downstream task performance)": false,
91
+ "Have evaluations been run across all applicable modalities": false,
92
+ "Have cultural variation evaluations been run that take the form of automatic quantitative evaluation": false,
93
+ "Have cultural variation evaluations been run with human participants?": false
94
+ }
95
  },
96
  "2.2 Cultural Diversity and Representation": {
97
  "status": "N/A",
98
  "sources": [],
99
+ "questions": {
100
+ "Use of evaluation methods developed in the cultural contexts in scope": false,
101
+ "Respect of indigenous sovereignty, protected rights, and cultural norms in AI system-generated content": false,
102
+ "Evaluation of cultural variation across geographic dimensions": false,
103
+ "Evaluation of cultural variation representing communities' perspectives within geographical contexts": false,
104
+ "Analysis of how cultural context affects AI system performance": false
105
+ }
106
  },
107
  "2.3 Generated Sensitive Content across Cultural Contexts": {
108
  "status": "Yes",
 
122
  "Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true,
123
  "Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false,
124
  "Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false,
125
+ "Has the AI system been evaluated for its likelihood of exposing its direct users to content embedding values and assumptions not reflective of their cultural context": false,
126
+ "Has the AI system been evaluated for its likelihood of exposing its direct users to inappropriate content for their use context": true,
127
+ "Has the AI system been evaluated for its likelihood of exposing its direct users to content with negative psychological impacts": false,
128
+ "Has the evaluation of the AI system's behaviors explicitly considered cultural variation in their definition": false
129
  }
130
  },
131
  "2.4 Cultural Variation Transparency and Documentation": {
132
  "status": "N/A",
133
  "sources": [],
134
+ "questions": {
135
+ "Documentation of cultural contexts considered during development": false,
136
+ "Documentation of the range of cultural contexts covered by evaluations": false,
137
+ "Sufficient documentation of evaluation method to understand the scope of the findings": false,
138
+ "Construct validity, documentation of strengths, weaknesses, and assumptions": false,
139
+ "Domain shift between evaluation development and AI system development settings": false,
140
+ "Sufficient documentation of evaluation methods to replicate findings": false,
141
+ "Sufficient documentation of evaluation results to support comparison": false,
142
+ "Documentation of psychological impact on evaluators reviewing harmful content": false,
143
+ "Documentation of measures to protect evaluator well-being": false
144
+ }
145
  }
146
  },
147
+ "3. Disparate Performance": {
148
+ "3.1 Disparate Performance Overview": {
149
+ "status": "N/A",
150
+ "sources": [],
151
+ "questions": {
152
+ "Have development choices and intrinsic properties of the AI system been evaluated for their contribution to disparate performance?": false,
153
+ "Have extrinsic disparate performance evaluations been run": false,
154
+ "Have evaluations been run across all applicable modalities": false,
155
+ "Have disparate performance evaluations been run that take the form of automatic quantitative evaluation": false,
156
+ "Have disparate performance evaluations been run with human participants": false
 
 
 
 
 
 
 
 
 
 
157
  }
158
  },
159
+ "3.2 Identifying Target Groups for Disparate Performance Evaluation": {
160
+ "status": "N/A",
161
+ "sources": [],
162
+ "questions": {
163
+ "Identification of mandated target group based on legal nondiscrimination frameworks": false,
164
+ "Identification of further target groups that are likely to be harmed by disparate performance": false,
165
+ "Assessment of systemic barriers in dataset collection methods for different groups": false,
166
+ "Consideration of historical disparities in the task in which the AI system is deployed": false,
167
+ "Identification of both implicit and explicit markers for the target groups": false
168
+ }
169
+ },
170
+ "3.3 Subgroup Performance Analysis": {
171
+ "status": "N/A",
172
+ "sources": [],
173
+ "questions": {
174
+ "Non-aggregated evaluation results across subpopulations, including feature importance and consistency analysis": false,
175
+ "Metrics to measure performance in decision-making tasks": false,
176
+ "Metrics to measure disparate performance in other tasks including generative tasks": false,
177
+ "Worst-case subgroup performance analysis, including performance on rare or underrepresented cases": false,
178
+ "Intersectional analysis examining performance across combinations of subgroup": false,
179
+ "Do evaluations of disparate performance account for implicit social group markers": false
180
+ }
181
+ },
182
+ "3.4 Disparate Performance Evaluation Transparency and Documentation": {
183
+ "status": "N/A",
184
+ "sources": [],
185
+ "questions": {
186
+ "Sufficient documentation of evaluation method to understand the scope of the findings": false,
187
+ "Documentation of strengths, weaknesses, and assumptions about the context": false,
188
+ "Documentation of domain shift between evaluation and deployment settings": false,
189
+ "Sufficient documentation of evaluation methods to replicate findings": false,
190
+ "Sufficient documentation of evaluation results to support comparison": false,
191
+ "Documentation of disparate performance mitigation measures": false,
192
+ "Documentation of disparate performance monitoring approaches": false
193
+ }
194
+ }
195
+ },
196
+ "4. Environmental Costs and Carbon Emissions Evaluation": {
197
+ "4.1 Environmental Costs Overview": {
198
+ "status": "Yes",
199
+ "sources": [
200
+ {
201
+ "type": "🌐",
202
+ "detail": "https://mlco2.github.io/impact/#compute",
203
+ "name": "Machine Learning Emissions Calculator"
204
  }
205
+ ],
206
+ "questions": {
207
+ "Evaluations of different processes within development and deployment": false,
208
+ "Have evaluations been run across all applicable modalities?": true,
209
+ "Have evaluations been run on standardized benchmarks or metrics?": true,
210
+ "Have evaluations taken into account community feedback from regions affected by data center power consumption?": false,
211
+ "Do evaluations consider the full supply chain including environmental impact of hardware components and data centers used?": false
212
+ }
213
+ },
214
+ "4.2 Energy Cost and Environmental Impact of Development": {
215
+ "status": "Yes",
216
+ "sources": [
217
+ {
218
+ "type": "🌐",
219
+ "detail": "https://mlco2.github.io/impact/#compute",
220
+ "name": "Machine Learning Emissions Calculator"
 
 
 
 
221
  }
222
+ ],
223
+ "questions": {
224
+ "Accounting of FLOPS across development stages": true,
225
+ "Evaluation of energy consumption using standardized tracking tools": true,
226
+ "Evaluation of carbon impact accounting for regional energy sources": true,
227
+ "Evaluation of hardware lifecycle environmental impact": false
228
  }
229
  },
230
+ "4.3 Energy Cost and Environmental Impact of Deployment": {
231
+ "status": "N/A",
232
+ "sources": [],
233
+ "questions": {
234
+ "Evaluation of inference FLOPS for the system": false,
235
+ "Evaluation of inference energy consumption on most common deployment setting": false,
236
+ "Evaluation of inference energy consumption on multiple deployment settings": false,
237
+ "Evaluation of task-specific energy consumption variations": false,
238
+ "Evaluation of carbon impact for deployment infrastructure": false,
239
+ "Evaluation of hardware lifecycle environmental impact for deployment": false
240
+ }
241
+ },
242
+ "4.4 Environmental Costs Transparency and Documentation": {
243
+ "status": "Yes",
244
+ "sources": [
245
+ {
246
+ "type": "🌐",
247
+ "detail": "https://mlco2.github.io/impact/#compute",
248
+ "name": "Machine Learning Emissions Calculator"
 
 
 
 
 
 
 
249
  }
250
+ ],
251
+ "questions": {
252
+ "Documentation about equipment and infrastructure specifications": true,
253
+ "Sufficient documentation of evaluation methods including components covered": false,
254
+ "Sufficient documentation of evaluation methods to replicate findings": true,
255
+ "Sufficient documentation of evaluation results for comparison": true
256
+ }
257
+ }
258
+ },
259
+ "5. Privacy and Data Protection Evaluation": {
260
+ "5.1 Privacy and Data Protection Overview": {
261
+ "status": "Yes",
262
+ "sources": [
263
+ {
264
+ "type": "🏢",
265
+ "detail": "PII detection and redaction using an NER model"
266
+ },
267
+ {
268
+ "type": "🌐",
269
+ "detail": "https://huggingface.co/spaces/bigcode/in-the-stack",
270
+ "name": "Opt-out tool for users"
271
+ },
272
+ {
273
+ "type": "🌐",
274
+ "detail": "https://arxiv.org/abs/2402.19173",
275
+ "name": "Asleep at the Keyboard Security Benchmark"
276
  }
277
+ ],
278
+ "questions": {
279
+ "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true,
280
+ "Have intrinsic properties of the AI system been evaluated for privacy vulnerabilities": false,
281
+ "Have extrinsic privacy evaluations been run": true,
282
+ "Have evaluations been run across all applicable modalities": true,
283
+ "Have privacy evaluations been run that take the form of automatic quantitative evaluation": true,
284
+ "Have privacy evaluations been run with human participants?": false
285
+ }
286
+ },
287
+ "5.2 Privacy, Likeness, and Publicity Harms": {
288
+ "status": "N/A",
289
+ "sources": [],
290
+ "questions": {
291
+ "Has the AI system been evaluated for its likelihood of revealing personal information from its training data?": false,
292
+ "Has the AI system been evaluated for its likelihood of facilitating generation of content impersonating an individual?": false,
293
+ "Has the AI system been evaluated for its likelihood of providing made up or confabulated personal information about individuals?": false
294
+ }
295
+ },
296
+ "5.3 Intellectual Property and Information Security": {
297
+ "status": "Yes",
298
+ "sources": [
299
+ {
300
+ "type": "🏢",
301
+ "detail": "Membership test to find if generated code was copied from the training corpus"
302
+ },
303
+ {
304
+ "type": "🏢",
305
+ "detail": "Code attribution tool to find the original author and license of the generated code"
306
+ },
307
+ {
308
+ "type": "🌐",
309
+ "detail": "https://arxiv.org/abs/2402.19173",
310
+ "name": "Asleep at the Keyboard Security Benchmark"
311
  }
312
+ ],
313
+ "questions": {
314
+ "Has the AI system been evaluated for its likelihood of reproducing other categories of information from its training data": true,
315
+ "Has the system been evaluated for other information security risks for in-scope uses": false
316
  }
317
  },
318
+ "5.4 Privacy Evaluation Transparency and Documentation": {
319
+ "status": "Yes",
320
+ "sources": [
321
+ {
322
+ "type": "🏢",
323
+ "detail": "Documentation of training data information risk categories and consent status"
324
+ }
325
+ ],
326
+ "questions": {
327
+ "Documentation of the categories of training data that present information risk": true,
328
+ "Documentation of evaluation methods to replicate findings": true,
329
+ "Documentation of evaluation results to support comparison": true,
330
+ "Documentation of evaluation limitations": false,
331
+ "Documentation of deployment considerations": false
332
+ }
333
+ }
334
+ },
335
+ "6. Financial Costs Evaluation": {
336
+ "6.1 Financial Costs Overview": {
337
+ "status": "N/A",
338
+ "sources": [],
339
+ "questions": {
340
+ "Evaluation of costs at various stages": false,
341
+ "Have costs been evaluated for different system components": false,
342
+ "Have cost evaluations been run across all applicable modalities": false,
343
+ "Have cost evaluations included both direct and indirect expenses": false,
344
+ "Have cost projections been validated against actual expenses": false
345
  }
346
  },
347
+ "6.2 Development and Training Costs": {
348
+ "status": "N/A",
349
+ "sources": [],
350
+ "questions": {
351
+ "Assessment of research and development labor costs": false,
352
+ "Evaluation of data collection and preprocessing costs": false,
353
+ "Assessment of training infrastructure costs": false,
354
+ "Assessment of costs associated with different training approaches": false,
355
+ "Evaluation of model architecture and size impact on costs": false
356
+ }
357
+ },
358
+ "6.3 Deployment and Operation Costs": {
359
+ "status": "N/A",
360
+ "sources": [],
361
+ "questions": {
362
+ "Assessment of inference and serving costs": false,
363
+ "Evaluation of storage and hosting expenses": false,
364
+ "Assessment of scaling costs based on usage patterns": false,
365
+ "Evaluation of costs specific to different deployment contexts": false,
366
+ "Assessment of costs for model updates or fine-tuning by end users": false
367
+ }
368
+ },
369
+ "6.4 Financial Cost Documentation and Transparency": {
370
+ "status": "N/A",
371
+ "sources": [],
372
+ "questions": {
373
+ "Sufficient documentation of cost evaluation methodology and assumptions": false,
374
+ "Sufficient documentation of cost breakdowns and metrics": false,
375
+ "Documentation of cost variations across different usage scenarios": false,
376
+ "Documentation of long-term cost projections and risk factors": false
377
+ }
378
+ }
379
+ },
380
+ "7. Data and Content Moderation Labor Evaluation": {
381
+ "7.1 Labor Evaluation Overview": {
382
+ "status": "Yes",
383
+ "sources": [
384
+ {
385
+ "type": "🏢",
386
+ "detail": "PII annotations by human annotators with fair wage"
387
  }
388
+ ],
389
+ "questions": {
390
+ "Evaluation of labor practices at various stages": true,
391
+ "Have labor conditions been evaluated for different worker categories": true,
392
+ "Have labor evaluations been run across all applicable task types": false,
393
+ "Have labor practices been evaluated against established industry standards": true,
394
+ "Have labor evaluations included both direct employees and contracted workers": false,
395
+ "Have evaluations considered different regional and jurisdictional contexts": true
396
+ }
397
+ },
398
+ "7.2 Working Conditions and Compensation": {
399
+ "status": "Yes",
400
+ "sources": [
401
+ {
402
+ "type": "🏢",
403
+ "detail": "PII annotations by human annotators with fair wage"
404
  }
405
+ ],
406
+ "questions": {
407
+ "Assessment of compensation relative to local living wages and industry standards": true,
408
+ "Assessment of job security and employment classification": false,
409
+ "Evaluation of workplace safety, worker protections and rights": false,
410
+ "Assessment of worker autonomy and task assignment practices": false,
411
+ "Evaluation of power dynamics and worker feedback mechanisms": false
412
+ }
413
+ },
414
+ "7.3 Worker Wellbeing and Support": {
415
+ "status": "N/A",
416
+ "sources": [],
417
+ "questions": {
418
+ "Assessment of psychological support systems, trauma resources, and other long-term mental health monitoring": false,
419
+ "Evaluation of training and preparation for difficult content": false,
420
+ "Evaluation of cultural and linguistic support for diverse workforces": false
421
+ }
422
+ },
423
+ "7.4 Labor Practice Documentation and Transparency": {
424
+ "status": "Yes",
425
+ "sources": [
426
+ {
427
+ "type": "🏢",
428
+ "detail": "PII annotations by human annotators with fair wage"
429
  }
430
+ ],
431
+ "questions": {
432
+ "Documentation of labor evaluation methodology and frameworks used": true,
433
+ "Documentation of worker demographics and task distribution": false,
434
+ "Documentation of support systems, worker protections": false,
435
+ "Documentation of incident reporting and resolution procedures": false
436
  }
437
  }
438
  }
439
+ }
440
+ }
model_data/model_b_data.json CHANGED
@@ -1,471 +1,440 @@
1
  {
2
  "metadata": {
3
  "Name": "Model B",
4
- "Provider": "AI Innovations",
5
- "Version": "3.0",
6
- "Release Date": "2023-11-30",
7
- "Type": "Multimodal AI",
8
- "Modalities": ["Text-to-Text", "Text-to-Image", "Image-to-Text"]
 
9
  },
10
  "scores": {
11
- "Bias, Stereotypes, and Representational Harms Evaluation": {
12
- "Comprehensive Evaluation Methodology": {
13
- "status": "Yes",
14
- "source": "Both",
15
- "applicable_evaluations": [
16
- "Evaluations at various stages (data collection, preprocessing, model architecture, training, deployment)",
17
- "Both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods",
18
- "Multi-level analysis (e.g., word, sentence, document levels for text; pixel, object, scene levels for images)"
19
- ]
20
- },
21
- "Inclusive Protected Class Consideration": {
22
- "status": "Yes",
23
- "source": "3P",
24
- "applicable_evaluations": [
25
- "Evaluation of non-standard protected classes (e.g., socioeconomic status, education level, regional differences)",
26
- "Consideration of intersectionality and how identity aspects interact"
27
- ]
28
- },
29
- "Cultural and Linguistic Diversity": {
30
- "status": "Yes",
31
- "source": "Both",
32
- "applicable_evaluations": [
33
- "Tests of model performance and biases across languages and cultures",
34
- "Analysis of the impact of different languages/scripts on image generation (for text-to-image models)",
35
- "Consideration of how protected categories may shift in meaning across regions"
36
- ]
37
- },
38
- "Stereotype and Harmful Association Detection": {
39
- "status": "Yes",
40
- "source": "1P",
41
- "applicable_evaluations": [
42
- "Detection of stereotypical word associations in text models or visual representations in image models",
43
- "Sentiment analysis and toxicity measurements, especially regarding specific groups"
44
- ]
45
- },
46
- "Performance Disparities Assessment": {
47
  "status": "No",
48
- "source": null,
49
- "applicable_evaluations": [
50
- "Detailed breakdowns of performance metrics (accuracy, precision, recall) for various subgroups",
51
- "Performance analysis for disadvantaged subgroups",
52
- "Intersectionality considerations in performance analysis"
53
- ]
54
- },
55
- "Bias Mitigation and Impact Analysis": {
56
- "status": "Yes",
57
- "source": "1P",
58
- "applicable_evaluations": [
59
- "Documentation of bias mitigation strategies",
60
- "Analyses of how model updates or mitigations affect bias metrics"
61
- ]
62
- },
63
- "Transparency and Limitations Disclosure": {
64
- "status": "Yes",
65
- "source": "Both",
66
- "applicable_evaluations": [
67
- "Clear statements on the capabilities and limitations of evaluation methods",
68
- "Acknowledgment of potential biases from the evaluation tools/processes",
69
- "Detailed explanations of bias-related metrics, including assumptions or limitations"
70
- ]
71
- },
72
- "Ongoing Evaluation Commitment": {
73
- "status": "No",
74
- "source": null,
75
- "applicable_evaluations": [
76
- "Plans for continual bias assessment as the model is updated or deployed in new contexts",
77
- "Strategies for incorporating new findings/methodologies in evaluation",
78
- "Commitments to transparency and regular reporting on bias-related issues"
79
- ]
 
 
 
 
 
 
 
 
 
 
 
80
  }
81
  },
82
- "Cultural Values and Sensitive Content Evaluation": {
83
- "Hate Speech and Toxicity Evaluation": {
84
- "status": "Yes",
85
- "source": "Both",
86
- "applicable_evaluations": [
87
- "Assessments of harmful text generation",
88
- "Evaluations of toxicity, hurtfulness, or offensiveness",
89
- "Examination of invasive bodily commentary or rejections of identity"
90
- ]
91
- },
92
- "Cultural Value Representation": {
93
- "status": "Yes",
94
- "source": "3P",
95
- "applicable_evaluations": [
96
- "Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)",
97
- "Assessments of ethical scenarios and political value representation",
98
- "Evaluations of geopolitical statements and regional representation"
99
- ]
100
- },
101
- "Diverse Cultural Context": {
102
- "status": "No",
103
- "source": null,
104
- "applicable_evaluations": [
105
- "Assessments that don't equate nationality with cultural context",
106
- "Representation of differing cultural values within countries",
107
- "Inclusion of marginalized communities' perspectives"
108
- ]
109
- },
110
- "Sensitive Content Identification": {
111
- "status": "Yes",
112
- "source": "1P",
113
- "applicable_evaluations": [
114
- "Recognition of topics that vary by culture and viewpoint",
115
- "Assessment of content related to egregious violence",
116
- "Evaluation of adult sexual content identification"
117
- ]
118
- },
119
- "Impact of Generated Content": {
120
- "status": "No",
121
- "source": null,
122
- "applicable_evaluations": [
123
- "Assessment of potential harm to targeted viewers",
124
- "Evaluation of content's potential to normalize harmful ideas",
125
- "Analysis of possible contributions to online radicalization"
126
- ]
127
- },
128
- "Multidimensional Cultural Analysis": {
129
- "status": "Yes",
130
- "source": "Both",
131
- "applicable_evaluations": [
132
- "Evaluations at word, sentence, and document levels for text",
133
- "Analysis at pixel, object, and scene levels for images",
134
- "Multi-level analysis of cultural representation"
135
- ]
 
 
 
 
 
 
 
 
136
  }
137
  },
138
- "Disparate Performance": {
139
- "Subpopulation Performance Analysis": {
140
- "status": "Yes",
141
- "source": "Both",
142
- "applicable_evaluations": [
143
- "Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
144
- "Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios",
145
- "Worst-case subgroup performance analysis"
146
- ]
147
- },
148
- "Cross-lingual and Dialect Evaluation": {
149
- "status": "Yes",
150
- "source": "3P",
151
- "applicable_evaluations": [
152
- "Cross-lingual prompting on standard benchmarks",
153
- "Examination of performance across dialects",
154
- "Analysis of hallucination disparity across languages"
155
- ]
156
- },
157
- "Image Generation Quality Assessment": {
158
- "status": "Yes",
159
- "source": "1P",
160
- "applicable_evaluations": [
161
- "Examination of generation quality across various concepts",
162
- "Accuracy of cultural representation in generated images",
163
- "Assessment of realism across different concepts"
164
- ]
165
- },
166
- "Data Duplication and Bias Analysis": {
167
- "status": "No",
168
- "source": null,
169
- "applicable_evaluations": [
170
- "Analysis of the effect of retaining duplicate examples in the training dataset",
171
- "Evaluation of model bias towards generating certain phrases or concepts"
172
- ]
173
- },
174
- "Dataset Disparities Evaluation": {
175
- "status": "Yes",
176
- "source": "1P",
177
- "applicable_evaluations": [
178
- "Assessment of dataset skew with fewer examples from some subpopulations",
179
- "Evaluation of feature inconsistencies across subpopulations",
180
- "Analysis of geographic biases in data collection"
181
- ]
182
- },
183
- "Evaluation of Systemic Issues": {
184
- "status": "No",
185
- "source": null,
186
- "applicable_evaluations": [
187
- "Assessment of disparities due to dataset collection methods",
188
- "Evaluation of the impact of varying levels of internet access on data representation",
189
- "Analysis of content filters' effects on data availability"
190
- ]
191
- },
192
- "Long-tail Data Distribution Analysis": {
193
- "status": "Yes",
194
- "source": "3P",
195
- "applicable_evaluations": [
196
- "Assessment of model performance on rare or uncommon data points",
197
- "Evaluation of the trade-off between fitting long tails and unintentional memorization"
198
- ]
199
  }
200
  },
201
- "Environmental Costs and Carbon Emissions Evaluation": {
202
- "Energy Consumption Measurement": {
203
- "status": "Yes",
204
- "source": "1P",
205
- "applicable_evaluations": [
206
- "Measurement of energy used in training, testing, and deploying the system",
207
- "Evaluation of compute power consumption",
208
- "Assessment of energy resources used by large-scale systems"
209
- ]
210
- },
211
- "Carbon Footprint Quantification": {
212
- "status": "Yes",
213
- "source": "3P",
214
- "applicable_evaluations": [
215
- "Use of tools like CodeCarbon or Carbontracker",
216
- "Measurement of carbon emissions for training and inference",
217
- "Conversion of energy consumption to carbon emissions"
218
- ]
219
- },
220
- "Hardware Resource Evaluation": {
221
- "status": "Yes",
222
- "source": "1P",
223
- "applicable_evaluations": [
224
- "Assessment of CPU, GPU, and TPU usage",
225
- "Measurement of FLOPS (Floating Point Operations)",
226
- "Evaluation of package power draw and GPU performance state"
227
- ]
228
- },
229
- "Comprehensive Environmental Impact Assessment": {
230
- "status": "No",
231
- "source": null,
232
- "applicable_evaluations": [
233
- "Use of Life Cycle Assessment (LCA) methodologies",
234
- "Consideration of supply chains and manufacturing impacts",
235
- "Evaluation of immediate impacts of applying ML"
236
- ]
237
- },
238
- "Transparency in Environmental Reporting": {
239
- "status": "Yes",
240
- "source": "Both",
241
- "applicable_evaluations": [
242
- "Disclosure of uncertainty around measured variables",
243
- "Reporting of marginal costs (e.g., added parameters' contribution to energy consumption)",
244
- "Transparency about equipment manufacturers and data/hosting centers"
245
- ]
246
- },
247
- "Comprehensive Environmental Impact Metrics": {
248
- "status": "No",
249
- "source": null,
250
- "applicable_evaluations": [
251
- "Discussion of different approaches to measuring environmental impact",
252
- "Use of diverse measurements beyond energy consumption",
253
- "Consideration of various factors including lifecycle assessment"
254
- ]
255
  }
256
  },
257
- "Privacy and Data Protection Evaluation": {
258
- "Data Minimization and Consent Practices": {
259
- "status": "Yes",
260
- "source": "Both",
261
- "applicable_evaluations": [
262
- "Implementation of data minimization practices",
263
- "Use of opt-in data collection methods",
264
- "Assessment of active consent for collecting, processing, and sharing data"
265
- ]
266
- },
267
- "Memorization and Data Leakage Evaluation": {
268
- "status": "Yes",
269
- "source": "1P",
270
- "applicable_evaluations": [
271
- "Examination of the maximum amount of discoverable information given training data",
272
- "Evaluation of extractable information without training data access",
273
- "Analysis of out-of-distribution data revelation"
274
- ]
275
- },
276
- "Personal Information Revelation Assessment": {
277
- "status": "Yes",
278
- "source": "3P",
279
- "applicable_evaluations": [
280
- "Direct prompting tests to reveal Personally Identifiable Information (PII)",
281
- "Use of tools like ProPILE to audit PII revelation likelihood",
282
- "Evaluation of the system's ability to infer personal attributes"
283
- ]
284
- },
285
- "Image and Audio Privacy Evaluation": {
286
- "status": "Yes",
287
- "source": "1P",
288
- "applicable_evaluations": [
289
- "Assessment of training data memorization in image generation",
290
- "Use of adversarial Membership Inference Attacks for images",
291
- "Evaluation of the proportion of generated images with high similarity to training data"
292
- ]
293
- },
294
- "Intellectual Property and Copyright Evaluation": {
295
- "status": "No",
296
- "source": null,
297
- "applicable_evaluations": [
298
- "Assessment of the system's ability to generate copyrighted content",
299
- "Evaluation of intellectual property concerns in generated content",
300
- "Analysis of the system's handling of highly sensitive documents"
301
- ]
302
- },
303
- "Retroactive Privacy Protection": {
304
- "status": "No",
305
- "source": null,
306
- "applicable_evaluations": [
307
- "Assessment of the system's capability to retroactively retrain in accordance with privacy policies",
308
- "Evaluation of processes for removing specific data points upon request",
309
- "Analysis of the system's adaptability to changing privacy regulations"
310
- ]
311
- },
312
- "Third-party Hosting Privacy Evaluation": {
313
- "status": "Yes",
314
- "source": "Both",
315
- "applicable_evaluations": [
316
- "Assessment of potential leakage of private input data in generations",
317
- "Evaluation of system prompt privacy, especially for prompts containing proprietary information",
318
- "Analysis of the system's handling of sensitive database records in context learning"
319
- ]
320
- },
321
- "Generative AI-Specific Privacy Measures": {
322
- "status": "Yes",
323
- "source": "1P",
324
- "applicable_evaluations": [
325
- "Assessment of the applicability of data sanitization techniques to generative models",
326
- "Evaluation of differential privacy approaches in the context of generative AI",
327
- "Analysis of novel privacy protection methods designed specifically for generative models"
328
- ]
329
  }
330
  },
331
- "Financial Costs Evaluation": {
332
- "Comprehensive Cost Evaluation": {
333
- "status": "Yes",
334
- "source": "1P",
335
- "applicable_evaluations": [
336
- "Estimation of infrastructure and hardware costs",
337
- "Calculation of labor hours from researchers, developers, and crowd workers",
338
- "Tracking of compute costs using low-cost or standard pricing per instance-hour"
339
- ]
340
- },
341
- "Storage and Training Cost Analysis": {
342
- "status": "Yes",
343
- "source": "1P",
344
- "applicable_evaluations": [
345
- "Assessment of storage costs for both datasets and resulting models",
346
- "Consideration of in-house vs. cloud storage options",
347
- "Evaluation of training costs based on in-house GPUs or per-hour-priced instances"
348
- ]
349
- },
350
- "Hosting and Inference Cost Evaluation": {
351
- "status": "Yes",
352
- "source": "Both",
353
- "applicable_evaluations": [
354
- "Evaluation of low-latency serving costs",
355
- "Assessment of inference costs based on token usage",
356
- "Consideration of factors such as initial prompt length and requested token response length"
357
- ]
358
- },
359
- "Modality-Specific Cost Analysis": {
360
- "status": "Yes",
361
- "source": "1P",
362
- "applicable_evaluations": [
363
- "Assessment of costs related to pixel density and frame usage for image and video",
364
- "Evaluation of preprocessing costs for audio (e.g., spectrogram generation)",
365
- "Consideration of model architecture in cost calculations"
366
- ]
367
- },
368
- "Long-term Cost Considerations": {
369
- "status": "No",
370
- "source": null,
371
- "applicable_evaluations": [
372
- "Assessment of pre- and post-deployment costs",
373
- "Consideration of human labor and hidden costs",
374
- "Tracking of changes in costs and economy of components over time"
375
- ]
376
- },
377
- "API Cost Evaluation": {
378
- "status": "Yes",
379
- "source": "1P",
380
- "applicable_evaluations": [
381
- "Assessment of token-usage based pricing",
382
- "Evaluation of cost variations based on initial prompt length and requested token response length",
383
- "Analysis of cost differences across model versions"
384
- ]
385
- },
386
- "Comprehensive Cost Tracking": {
387
- "status": "No",
388
- "source": null,
389
- "applicable_evaluations": [
390
- "Assessment of costs related to broader infrastructure or organizational changes",
391
- "Evaluation of long-term maintenance and update costs",
392
- "Analysis of costs associated with complementary technologies or processes"
393
- ]
394
  }
395
  },
396
- "Data and Content Moderation Labor Evaluation": {
397
- "Crowdwork Standards Compliance": {
398
- "status": "Yes",
399
- "source": "3P",
400
- "applicable_evaluations": [
401
- "Assessment of compliance with Criteria for Fairer Microwork",
402
- "Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines",
403
- "Comparison with Oxford Internet Institute's Fairwork Principles"
404
- ]
405
- },
406
- "Crowdworker Demographics and Compensation": {
407
- "status": "Yes",
408
- "source": "Both",
409
- "applicable_evaluations": [
410
- "Documentation of crowd workers' demographics",
411
- "Transparency in reporting instructions given to crowdworkers",
412
- "Assessment of how crowdworkers were evaluated and compensated"
413
- ]
414
- },
415
- "Psychological Support and Content Exposure": {
416
- "status": "No",
417
- "source": null,
418
- "applicable_evaluations": [
419
- "Documentation of immediate trauma support availability",
420
- "Assessment of long-term professional psychological support provision",
421
- "Evaluation of practices for controlling exposure to traumatic material"
422
- ]
423
- },
424
- "Transparency in Crowdwork Documentation": {
425
- "status": "Yes",
426
- "source": "1P",
427
- "applicable_evaluations": [
428
- "Use of transparent reporting frameworks",
429
- "Documentation of crowdwork's role in shaping AI system output",
430
- "Evaluation of the accessibility of crowdwork information"
431
- ]
432
- },
433
- "Crowdwork Stages and Types": {
434
- "status": "Yes",
435
- "source": "Both",
436
- "applicable_evaluations": [
437
- "Assessment of crowdwork in data gathering, curation, cleaning, and labeling",
438
- "Evaluation of crowdwork during model development and interim evaluations",
439
- "Examination of post-deployment crowdwork for output evaluation and correction"
440
- ]
441
- },
442
- "Evaluation of Labor Protection and Regulations": {
443
- "status": "No",
444
- "source": null,
445
- "applicable_evaluations": [
446
- "Assessment of compliance with relevant labor law interventions by jurisdiction",
447
- "Evaluation of worker classification and associated protections",
448
- "Analysis of fair work practices and compensation structures"
449
- ]
450
- },
451
- "Outsourcing Impact Evaluation": {
452
- "status": "Yes",
453
- "source": "3P",
454
- "applicable_evaluations": [
455
- "Assessment of communication barriers created by outsourcing",
456
- "Evaluation of differences in working conditions between in-house and outsourced labor",
457
- "Analysis of transparency in reporting structures for outsourced work"
458
- ]
459
- },
460
- "Impact of Precarious Employment": {
461
- "status": "No",
462
- "source": null,
463
- "applicable_evaluations": [
464
- "Assessment of job security and its impact on worker feedback",
465
- "Evaluation of anonymous reporting systems for substandard working conditions",
466
- "Analysis of power dynamics between crowdworkers and employers"
467
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  }
469
  }
470
  }
471
- }
 
 
1
  {
2
  "metadata": {
3
  "Name": "Model B",
4
+ "Provider": "BigCode",
5
+ "URL": "https://huggingface.co/bigcode/starcoder2-15b",
6
+ "Type": "Large Language Model",
7
+ "Modalities": [
8
+ "Text-to-Text"
9
+ ]
10
  },
11
  "scores": {
12
+ "1. Bias, Stereotypes, and Representational Harms Evaluation": {
13
+ "1.1 Bias Detection Overview": {
14
+ "status": "Yes",
15
+ "sources": [
16
+ {
17
+ "type": "🌐",
18
+ "detail": "https://arxiv.org/abs/2402.19173",
19
+ "name": "BOLD - Bias in Open-ended Language Generation Dataset"
20
+ },
21
+ {
22
+ "type": "🌐",
23
+ "detail": "https://arxiv.org/abs/2402.19173",
24
+ "name": "WinoBias"
25
+ }
26
+ ],
27
+ "questions": {
28
+ "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": false,
29
+ "Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)": false,
30
+ "Have extrinsic bias evaluations been run (e.g., downstream task performance)": true,
31
+ "Have evaluations been run across all applicable modalities": true,
32
+ "Have bias evaluations been run that take the form of automatic quantitative evaluation": true,
33
+ "Have bias evaluations been run with human participants?": false
34
+ }
35
+ },
36
+ "1.2 Protected Classes and Intersectional Measures": {
 
 
 
 
 
 
 
 
 
 
 
37
  "status": "No",
38
+ "sources": [],
39
+ "questions": {
40
+ "Do evaluations cover all applicable legal protected categories for in-scope uses of the system?": false,
41
+ "Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics": false,
42
+ "Evaluation of how different aspects of identity interact and compound in AI system behavior": false,
43
+ "Evaluation of AI system biases for legal protected categories and additional relevant subgroups": false
44
+ }
45
+ },
46
+ "1.3 Measurement of Stereotypes and Harmful Associations": {
47
+ "status": "Yes",
48
+ "sources": [
49
+ {
50
+ "type": "🌐",
51
+ "detail": "https://arxiv.org/abs/2402.19173",
52
+ "name": "HONEST - Hurtful Sentence Completion in English Language Models"
53
+ },
54
+ {
55
+ "type": "🌐",
56
+ "detail": "https://arxiv.org/abs/2402.19173",
57
+ "name": "RealToxicityPrompts"
58
+ }
59
+ ],
60
+ "questions": {
61
+ "Measurement of known stereotypes in AI system outputs": true,
62
+ "Measurement of other negative associations and assumptions regarding specific groups": true,
63
+ "Measurement of stereotypes and negative associations across in-scope contexts": false
64
+ }
65
+ },
66
+ "1.4 Bias Evaluation Transparency and Documentation": {
67
+ "status": "Yes",
68
+ "sources": [
69
+ {
70
+ "type": "🌐",
71
+ "detail": "https://arxiv.org/abs/2402.19173",
72
+ "name": "Evaluation Documentation"
73
+ }
74
+ ],
75
+ "questions": {
76
+ "Sufficient documentation of evaluation methods (including code and datasets) to replicate findings": true,
77
+ "Sufficient documentation of evaluation results (including intermediary statistics) to support comparison to other AI systems": true,
78
+ "Documentation of bias mitigation measures, including their secondary impacts": false,
79
+ "Documentation of bias monitoring approaches post-release/deployment if applicable": false
80
+ }
81
  }
82
  },
83
+ "2. Cultural Values and Sensitive Content Evaluation": {
84
+ "2.1 Cultural Variation Overview": {
85
+ "status": "N/A",
86
+ "sources": [],
87
+ "questions": {
88
+ "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": false,
89
+ "Have intrinsic properties of the AI system been evaluated for cultural variation(e.g., embedding analysis)": false,
90
+ "Have extrinsic cultural variation evaluations been run (e.g., downstream task performance)": false,
91
+ "Have evaluations been run across all applicable modalities": false,
92
+ "Have cultural variation evaluations been run that take the form of automatic quantitative evaluation": false,
93
+ "Have cultural variation evaluations been run with human participants?": false
94
+ }
95
+ },
96
+ "2.2 Cultural Diversity and Representation": {
97
+ "status": "N/A",
98
+ "sources": [],
99
+ "questions": {
100
+ "Use of evaluation methods developed in the cultural contexts in scope": false,
101
+ "Respect of indigenous sovereignty, protected rights, and cultural norms in AI system-generated content": false,
102
+ "Evaluation of cultural variation across geographic dimensions": false,
103
+ "Evaluation of cultural variation representing communities' perspectives within geographical contexts": false,
104
+ "Analysis of how cultural context affects AI system performance": false
105
+ }
106
+ },
107
+ "2.3 Generated Sensitive Content across Cultural Contexts": {
108
+ "status": "Yes",
109
+ "sources": [
110
+ {
111
+ "type": "🌐",
112
+ "detail": "https://arxiv.org/abs/2402.19173",
113
+ "name": "HONEST - Hurtful Sentence Completion in English Language Models"
114
+ },
115
+ {
116
+ "type": "🌐",
117
+ "detail": "https://arxiv.org/abs/2402.19173",
118
+ "name": "RealToxicityPrompts"
119
+ }
120
+ ],
121
+ "questions": {
122
+ "Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true,
123
+ "Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false,
124
+ "Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false,
125
+ "Has the AI system been evaluated for its likelihood of exposing its direct users to content embedding values and assumptions not reflective of their cultural context": false,
126
+ "Has the AI system been evaluated for its likelihood of exposing its direct users to inappropriate content for their use context": true,
127
+ "Has the AI system been evaluated for its likelihood of exposing its direct users to content with negative psychological impacts": false,
128
+ "Has the evaluation of the AI system's behaviors explicitly considered cultural variation in their definition": false
129
+ }
130
+ },
131
+ "2.4 Cultural Variation Transparency and Documentation": {
132
+ "status": "N/A",
133
+ "sources": [],
134
+ "questions": {
135
+ "Documentation of cultural contexts considered during development": false,
136
+ "Documentation of the range of cultural contexts covered by evaluations": false,
137
+ "Sufficient documentation of evaluation method to understand the scope of the findings": false,
138
+ "Construct validity, documentation of strengths, weaknesses, and assumptions": false,
139
+ "Domain shift between evaluation development and AI system development settings": false,
140
+ "Sufficient documentation of evaluation methods to replicate findings": false,
141
+ "Sufficient documentation of evaluation results to support comparison": false,
142
+ "Document of psychological impact on evaluators reviewing harmful content": false,
143
+ "Documentation of measures to protect evaluator well-being": false
144
+ }
145
  }
146
  },
147
+ "3. Disparate Performance": {
148
+ "3.1 Disparate Performance Overview": {
149
+ "status": "N/A",
150
+ "sources": [],
151
+ "questions": {
152
+ "Have development choices and intrinsic properties of the AI system been evaluated for their contribution to disparate performance?": false,
153
+ "Have extrinsic disparate performance evaluations been run": false,
154
+ "Have evaluations been run across all applicable modalities": false,
155
+ "Have disparate performance evaluations been run that take the form of automatic quantitative evaluation": false,
156
+ "Have disparate performance evaluations been run with human participants": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  }
158
  },
159
+ "3.2 Identifying Target Groups for Disparate Performance Evaluation": {
160
+ "status": "N/A",
161
+ "sources": [],
162
+ "questions": {
163
+ "Identification of mandated target group based on legal nondiscrimination frameworks": false,
164
+ "Identification of further target groups that are likely to be harmed by disparate performance": false,
165
+ "Assessment of systemic barriers in dataset collection methods for different groups": false,
166
+ "Consideration of historical disparities in the task in which the AI system is deployed": false,
167
+ "Identification of both implicit and explicit markers for the target groups": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  }
169
  },
170
+ "3.3 Subgroup Performance Analysis": {
171
+ "status": "N/A",
172
+ "sources": [],
173
+ "questions": {
174
+ "Non-aggregated evaluation results across subpopulations, including feature importance and consistency analysis": false,
175
+ "Metrics to measure performance in decision-making tasks": false,
176
+ "Metrics to measure disparate performance in other tasks including generative tasks": false,
177
+ "Worst-case subgroup performance analysis, including performance on rare or underrepresented cases": false,
178
+ "Intersectional analysis examining performance across combinations of subgroup": false,
179
+ "Do evaluations of disparate performance account for implicit social group markers": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  }
181
  },
182
+ "3.4 Disparate Performance Evaluation Transparency and Documentation": {
183
+ "status": "N/A",
184
+ "sources": [],
185
+ "questions": {
186
+ "Sufficient documentation of evaluation method to understand the scope of the findings": false,
187
+ "Documentation of strengths, weaknesses, and assumptions about the context": false,
188
+ "Documentation of domain shift between evaluation and deployment settings": false,
189
+ "Sufficient documentation of evaluation methods to replicate findings": false,
190
+ "Sufficient documentation of evaluation results to support comparison": false,
191
+ "Documentation of disparate performance mitigation measures": false,
192
+ "Documentation of disparate performance monitoring approaches": false
193
+ }
194
+ }
195
+ },
196
+ "4. Environmental Costs and Carbon Emissions Evaluation": {
197
+ "4.1 Environmental Costs Overview": {
198
+ "status": "Yes",
199
+ "sources": [
200
+ {
201
+ "type": "🌐",
202
+ "detail": "https://mlco2.github.io/impact/#compute",
203
+ "name": "Machine Learning Emissions Calculator"
204
+ }
205
+ ],
206
+ "questions": {
207
+ "Evaluations of different processes within development and deployment": false,
208
+ "Have evaluations been run across all applicable modalities?": true,
209
+ "Have evaluations been run on standardized benchmarks or metrics?": true,
210
+ "Have evaluations taken into account community feedback from regions affected by data center power consumption?": false,
211
+ "Do evaluations consider the full supply chain including environmental impact of hardware components and data centers used?": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  }
213
  },
214
+ "4.2 Energy Cost and Environmental Impact of Development": {
215
+ "status": "Yes",
216
+ "sources": [
217
+ {
218
+ "type": "🌐",
219
+ "detail": "https://mlco2.github.io/impact/#compute",
220
+ "name": "Machine Learning Emissions Calculator"
221
+ }
222
+ ],
223
+ "questions": {
224
+ "Accounting of FLOPS across development stages": true,
225
+ "Evaluation of energy consumption using standardized tracking tools": true,
226
+ "Evaluation of carbon impact accounting for regional energy sources": true,
227
+ "Evaluation of hardware lifecycle environmental impact": false
228
+ }
229
+ },
230
+ "4.3 Energy Cost and Environmental Impact of Deployment": {
231
+ "status": "N/A",
232
+ "sources": [],
233
+ "questions": {
234
+ "Evaluation of inference FLOPS for the system": false,
235
+ "Evaluation of inference energy consumption on most common deployment setting": false,
236
+ "Evaluation of inference energy consumption on multiple deployment settings": false,
237
+ "Evaluation of task-specific energy consumption variations": false,
238
+ "Evaluation of carbon impact for deployment infrastructure": false,
239
+ "Evaluation of hardware lifecycle environmental impact for deployment": false
240
+ }
241
+ },
242
+ "4.4 Environmental Costs Transparency and Documentation": {
243
+ "status": "Yes",
244
+ "sources": [
245
+ {
246
+ "type": "🌐",
247
+ "detail": "https://mlco2.github.io/impact/#compute",
248
+ "name": "Machine Learning Emissions Calculator"
249
+ }
250
+ ],
251
+ "questions": {
252
+ "Documentation about equipment and infrastructure specifications": true,
253
+ "Sufficient documentation of evaluation methods including components covered": false,
254
+ "Sufficient documentation of evaluation methods to replicate findings": true,
255
+ "Sufficient documentation of evaluation results for comparison": true
256
+ }
257
+ }
258
+ },
259
+ "5. Privacy and Data Protection Evaluation": {
260
+ "5.1 Privacy and Data Protection Overview": {
261
+ "status": "Yes",
262
+ "sources": [
263
+ {
264
+ "type": "🏒",
265
+ "detail": "PII detection and redaction using an NER model"
266
+ },
267
+ {
268
+ "type": "🌐",
269
+ "detail": "https://huggingface.co/spaces/bigcode/in-the-stack",
270
+ "name": "Opt-out tool for users"
271
+ },
272
+ {
273
+ "type": "🌐",
274
+ "detail": "https://arxiv.org/abs/2402.19173",
275
+ "name": "Asleep at the Keyboard Security Benchmark"
276
+ }
277
+ ],
278
+ "questions": {
279
+ "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true,
280
+ "Have intrinsic properties of the AI system been evaluated for privacy vulnerabilities": false,
281
+ "Have extrinsic privacy evaluations been run": true,
282
+ "Have evaluations been run across all applicable modalities": true,
283
+ "Have privacy evaluations been run that take the form of automatic quantitative evaluation": true,
284
+ "Have privacy evaluations been run with human participants?": false
285
+ }
286
+ },
287
+ "5.2 Privacy, Likeness, and Publicity Harms": {
288
+ "status": "N/A",
289
+ "sources": [],
290
+ "questions": {
291
+ "Has the AI system been evaluated for its likelihood of revealing personal information from its training data?": false,
292
+ "Has the AI system been evaluated for its likelihood of facilitating generation of content impersonating an individual?": false,
293
+ "Has the AI system been evaluated for its likelihood of providing made up or confabulated personal information about individuals?": false
294
+ }
295
+ },
296
+ "5.3 Intellectual Property and Information Security": {
297
+ "status": "Yes",
298
+ "sources": [
299
+ {
300
+ "type": "🏒",
301
+ "detail": "Membership test to find if generated code was copied from the training corpus"
302
+ },
303
+ {
304
+ "type": "🏒",
305
+ "detail": "Code attribution tool to find the original author and license of the generated code"
306
+ },
307
+ {
308
+ "type": "🌐",
309
+ "detail": "https://arxiv.org/abs/2402.19173",
310
+ "name": "Asleep at the Keyboard Security Benchmark"
311
+ }
312
+ ],
313
+ "questions": {
314
+ "Has the AI system been evaluated for its likelihood of reproducing other categories of information from its training data": true,
315
+ "Has the system been evaluated for other information security risks for in-scope uses": false
316
+ }
317
+ },
318
+ "5.4 Privacy Evaluation Transparency and Documentation": {
319
+ "status": "Yes",
320
+ "sources": [
321
+ {
322
+ "type": "🏒",
323
+ "detail": "Documentation of training data information risk categories and consent status"
324
+ }
325
+ ],
326
+ "questions": {
327
+ "Documentation of the categories of training data that present information risk": true,
328
+ "Documentation of evaluation methods to replicate findings": true,
329
+ "Documentation of evaluation results to support comparison": true,
330
+ "Documentation of evaluation limitations": false,
331
+ "Documentation of deployment considerations": false
332
+ }
333
+ }
334
+ },
335
+ "6. Financial Costs Evaluation": {
336
+ "6.1 Financial Costs Overview": {
337
+ "status": "N/A",
338
+ "sources": [],
339
+ "questions": {
340
+ "Evaluation of costs at various stages": false,
341
+ "Have costs been evaluated for different system components": false,
342
+ "Have cost evaluations been run across all applicable modalities": false,
343
+ "Have cost evaluations included both direct and indirect expenses": false,
344
+ "Have cost projections been validated against actual expenses": false
345
+ }
346
+ },
347
+ "6.2 Development and Training Costs": {
348
+ "status": "N/A",
349
+ "sources": [],
350
+ "questions": {
351
+ "Assessment of research and development labor costs": false,
352
+ "Evaluation of data collection and preprocessing costs": false,
353
+ "Assessment of training infrastructure costs": false,
354
+ "Assessment of costs associated with different training approaches": false,
355
+ "Evaluation of model architecture and size impact on costs": false
356
+ }
357
+ },
358
+ "6.3 Deployment and Operation Costs": {
359
+ "status": "N/A",
360
+ "sources": [],
361
+ "questions": {
362
+ "Assessment of inference and serving costs": false,
363
+ "Evaluation of storage and hosting expenses": false,
364
+ "Assessment of scaling costs based on usage patterns": false,
365
+ "Evaluation of costs specific to different deployment contexts": false,
366
+ "Assessment of costs for model updates or fine-tuning by end users": false
367
+ }
368
+ },
369
+ "6.4 Financial Cost Documentation and Transparency": {
370
+ "status": "N/A",
371
+ "sources": [],
372
+ "questions": {
373
+ "Sufficient documentation of cost evaluation methodology and assumptions": false,
374
+ "Sufficient documentation of cost breakdowns and metrics": false,
375
+ "Documentation of cost variations across different usage scenarios": false,
376
+ "Documentation of long-term cost projections and risk factors": false
377
+ }
378
+ }
379
+ },
380
+ "7. Data and Content Moderation Labor Evaluation": {
381
+ "7.1 Labor Evaluation Overview": {
382
+ "status": "Yes",
383
+ "sources": [
384
+ {
385
+ "type": "🏒",
386
+ "detail": "PII annotations by human annotators with fair wage"
387
+ }
388
+ ],
389
+ "questions": {
390
+ "Evaluation of labor practices at various stages": true,
391
+ "Have labor conditions been evaluated for different worker categories": true,
392
+ "Have labor evaluations been run across all applicable task types": false,
393
+ "Have labor practices been evaluated against established industry standards": true,
394
+ "Have labor evaluations included both direct employees and contracted workers": false,
395
+ "Have evaluations considered different regional and jurisdictional contexts": true
396
+ }
397
+ },
398
+ "7.2 Working Conditions and Compensation": {
399
+ "status": "Yes",
400
+ "sources": [
401
+ {
402
+ "type": "🏒",
403
+ "detail": "PII annotations by human annotators with fair wage"
404
+ }
405
+ ],
406
+ "questions": {
407
+ "Assessment of compensation relative to local living wages and industry standards": true,
408
+ "Assessment of job security and employment classification": false,
409
+ "Evaluation of workplace safety, worker protections and rights": false,
410
+ "Assessment of worker autonomy and task assignment practices": false,
411
+ "Evaluation of power dynamics and worker feedback mechanisms": false
412
+ }
413
+ },
414
+ "7.3 Worker Wellbeing and Support": {
415
+ "status": "N/A",
416
+ "sources": [],
417
+ "questions": {
418
+ "Assessment of psychological support systems, trauma resources, and other long-term mental health monitoring": false,
419
+ "Evaluation of training and preparation for difficult content": false,
420
+ "Evaluation of cultural and linguistic support for diverse workforces": false
421
+ }
422
+ },
423
+ "7.4 Labor Practice Documentation and Transparency": {
424
+ "status": "Yes",
425
+ "sources": [
426
+ {
427
+ "type": "🏒",
428
+ "detail": "PII annotations by human annotators with fair wage"
429
+ }
430
+ ],
431
+ "questions": {
432
+ "Documentation of labor evaluation methodology and frameworks used": true,
433
+ "Documentation of worker demographics and task distribution": false,
434
+ "Documentation of support systems, worker protections": false,
435
+ "Documentation of incident reporting and resolution procedures": false
436
  }
437
  }
438
  }
439
+ }
440
+ }
model_data/model_c_data.json CHANGED
@@ -1,417 +1,440 @@
1
  {
2
  "metadata": {
3
  "Name": "Model C",
4
- "Provider": "TechStart",
5
- "Version": "1.0",
6
- "Release Date": "2023-12-15",
7
- "Type": "Specialized NLP Model",
8
- "Modalities": ["Text-to-Text"]
 
9
  },
10
  "scores": {
11
- "Bias, Stereotypes, and Representational Harms Evaluation": {
12
- "Comprehensive Evaluation Methodology": {
13
- "status": "No",
14
- "source": null,
15
- "applicable_evaluations": [
16
- "Evaluations at various stages (data collection, preprocessing, model architecture, training, deployment)",
17
- "Both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods"
18
- ]
19
- },
20
- "Inclusive Protected Class Consideration": {
21
- "status": "No",
22
- "source": null,
23
- "applicable_evaluations": [
24
- "Evaluation of non-standard protected classes (e.g., socioeconomic status, education level, regional differences)",
25
- "Consideration of intersectionality and how identity aspects interact"
26
- ]
27
- },
28
- "Cultural and Linguistic Diversity": {
29
- "status": "No",
30
- "source": null,
31
- "applicable_evaluations": [
32
- "Tests of model performance and biases across languages and cultures",
33
- "Consideration of how protected categories may shift in meaning across regions"
34
- ]
35
- },
36
- "Stereotype and Harmful Association Detection": {
37
- "status": "No",
38
- "source": null,
39
- "applicable_evaluations": [
40
- "Detection of stereotypical word associations in text models",
41
- "Sentiment analysis and toxicity measurements, especially regarding specific groups"
42
- ]
43
- },
44
- "Performance Disparities Assessment": {
45
- "status": "No",
46
- "source": null,
47
- "applicable_evaluations": [
48
- "Detailed breakdowns of performance metrics (accuracy, precision, recall) for various subgroups",
49
- "Performance analysis for disadvantaged subgroups"
50
- ]
51
- },
52
- "Bias Mitigation and Impact Analysis": {
53
- "status": "No",
54
- "source": null,
55
- "applicable_evaluations": [
56
- "Documentation of bias mitigation strategies",
57
- "Analyses of how model updates or mitigations affect bias metrics"
58
- ]
59
- },
60
- "Transparency and Limitations Disclosure": {
61
- "status": "No",
62
- "source": null,
63
- "applicable_evaluations": [
64
- "Clear statements on the capabilities and limitations of evaluation methods",
65
- "Acknowledgment of potential biases from the evaluation tools/processes"
66
- ]
67
- },
68
- "Ongoing Evaluation Commitment": {
69
- "status": "No",
70
- "source": null,
71
- "applicable_evaluations": [
72
- "Plans for continual bias assessment as the model is updated or deployed in new contexts",
73
- "Commitments to transparency and regular reporting on bias-related issues"
74
- ]
 
 
 
 
 
75
  }
76
  },
77
- "Cultural Values and Sensitive Content Evaluation": {
78
- "Hate Speech and Toxicity Evaluation": {
79
- "status": "No",
80
- "source": null,
81
- "applicable_evaluations": [
82
- "Assessments of harmful text generation",
83
- "Evaluations of toxicity, hurtfulness, or offensiveness"
84
- ]
85
- },
86
- "Cultural Value Representation": {
87
- "status": "No",
88
- "source": null,
89
- "applicable_evaluations": [
90
- "Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)",
91
- "Assessments of ethical scenarios and political value representation"
92
- ]
93
- },
94
- "Diverse Cultural Context": {
95
- "status": "No",
96
- "source": null,
97
- "applicable_evaluations": [
98
- "Assessments that don't equate nationality with cultural context",
99
- "Representation of differing cultural values within countries"
100
- ]
101
- },
102
- "Sensitive Content Identification": {
103
- "status": "No",
104
- "source": null,
105
- "applicable_evaluations": [
106
- "Recognition of topics that vary by culture and viewpoint",
107
- "Evaluation of adult sexual content identification"
108
- ]
109
- },
110
- "Impact of Generated Content": {
111
- "status": "No",
112
- "source": null,
113
- "applicable_evaluations": [
114
- "Assessment of potential harm to targeted viewers",
115
- "Evaluation of content's potential to normalize harmful ideas"
116
- ]
117
- },
118
- "Multidimensional Cultural Analysis": {
119
- "status": "No",
120
- "source": null,
121
- "applicable_evaluations": [
122
- "Evaluations at word, sentence, and document levels for text",
123
- "Multi-level analysis of cultural representation"
124
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  }
126
  },
127
- "Disparate Performance": {
128
- "Subpopulation Performance Analysis": {
129
- "status": "No",
130
- "source": null,
131
- "applicable_evaluations": [
132
- "Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
133
- "Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios"
134
- ]
135
- },
136
- "Cross-lingual and Dialect Evaluation": {
137
- "status": "No",
138
- "source": null,
139
- "applicable_evaluations": [
140
- "Cross-lingual prompting on standard benchmarks",
141
- "Examination of performance across dialects"
142
- ]
143
- },
144
- "Image Generation Quality Assessment": {
145
- "status": "N/A",
146
- "source": null,
147
- "applicable_evaluations": []
148
- },
149
- "Data Duplication and Bias Analysis": {
150
- "status": "No",
151
- "source": null,
152
- "applicable_evaluations": [
153
- "Analysis of the effect of retaining duplicate examples in the training dataset",
154
- "Evaluation of model bias towards generating certain phrases or concepts"
155
- ]
156
- },
157
- "Dataset Disparities Evaluation": {
158
- "status": "No",
159
- "source": null,
160
- "applicable_evaluations": [
161
- "Assessment of dataset skew with fewer examples from some subpopulations",
162
- "Evaluation of feature inconsistencies across subpopulations"
163
- ]
164
- },
165
- "Evaluation of Systemic Issues": {
166
- "status": "No",
167
- "source": null,
168
- "applicable_evaluations": [
169
- "Assessment of disparities due to dataset collection methods",
170
- "Evaluation of the impact of varying levels of internet access on data representation"
171
- ]
172
- },
173
- "Long-tail Data Distribution Analysis": {
174
- "status": "No",
175
- "source": null,
176
- "applicable_evaluations": [
177
- "Assessment of model performance on rare or uncommon data points",
178
- "Evaluation of the trade-off between fitting long tails and unintentional memorization"
179
- ]
180
  }
181
  },
182
- "Environmental Costs and Carbon Emissions Evaluation": {
183
- "Energy Consumption Measurement": {
184
- "status": "No",
185
- "source": null,
186
- "applicable_evaluations": [
187
- "Measurement of energy used in training, testing, and deploying the system",
188
- "Evaluation of compute power consumption"
189
- ]
190
- },
191
- "Carbon Footprint Quantification": {
192
- "status": "No",
193
- "source": null,
194
- "applicable_evaluations": [
195
- "Use of tools like CodeCarbon or Carbontracker",
196
- "Measurement of carbon emissions for training and inference"
197
- ]
198
- },
199
- "Hardware Resource Evaluation": {
200
- "status": "No",
201
- "source": null,
202
- "applicable_evaluations": [
203
- "Assessment of CPU, GPU, and TPU usage",
204
- "Measurement of FLOPS (Floating Point Operations)"
205
- ]
206
- },
207
- "Comprehensive Environmental Impact Assessment": {
208
- "status": "No",
209
- "source": null,
210
- "applicable_evaluations": [
211
- "Use of Life Cycle Assessment (LCA) methodologies",
212
- "Evaluation of immediate impacts of applying ML"
213
- ]
214
- },
215
- "Transparency in Environmental Reporting": {
216
- "status": "No",
217
- "source": null,
218
- "applicable_evaluations": [
219
- "Disclosure of uncertainty around measured variables",
220
- "Reporting of marginal costs (e.g., added parameters' contribution to energy consumption)"
221
- ]
222
- },
223
- "Comprehensive Environmental Impact Metrics": {
224
- "status": "No",
225
- "source": null,
226
- "applicable_evaluations": [
227
- "Discussion of different approaches to measuring environmental impact",
228
- "Use of diverse measurements beyond energy consumption"
229
- ]
230
  }
231
  },
232
- "Privacy and Data Protection Evaluation": {
233
- "Data Minimization and Consent Practices": {
234
- "status": "No",
235
- "source": null,
236
- "applicable_evaluations": [
237
- "Implementation of data minimization practices",
238
- "Use of opt-in data collection methods"
239
- ]
240
- },
241
- "Memorization and Data Leakage Evaluation": {
242
- "status": "No",
243
- "source": null,
244
- "applicable_evaluations": [
245
- "Examination of the maximum amount of discoverable information given training data",
246
- "Evaluation of extractable information without training data access"
247
- ]
248
- },
249
- "Personal Information Revelation Assessment": {
250
- "status": "No",
251
- "source": null,
252
- "applicable_evaluations": [
253
- "Direct prompting tests to reveal Personally Identifiable Information (PII)",
254
- "Evaluation of the system's ability to infer personal attributes"
255
- ]
256
- },
257
- "Image and Audio Privacy Evaluation": {
258
- "status": "N/A",
259
- "source": null,
260
- "applicable_evaluations": []
261
- },
262
- "Intellectual Property and Copyright Evaluation": {
263
- "status": "No",
264
- "source": null,
265
- "applicable_evaluations": [
266
- "Assessment of the system's ability to generate copyrighted content",
267
- "Evaluation of intellectual property concerns in generated content"
268
- ]
269
- },
270
- "Retroactive Privacy Protection": {
271
- "status": "No",
272
- "source": null,
273
- "applicable_evaluations": [
274
- "Assessment of the system's capability to retroactively retrain in accordance with privacy policies",
275
- "Evaluation of processes for removing specific data points upon request"
276
- ]
277
- },
278
- "Third-party Hosting Privacy Evaluation": {
279
- "status": "No",
280
- "source": null,
281
- "applicable_evaluations": [
282
- "Assessment of potential leakage of private input data in generations",
283
- "Evaluation of system prompt privacy, especially for prompts containing proprietary information"
284
- ]
285
- },
286
- "Generative AI-Specific Privacy Measures": {
287
- "status": "No",
288
- "source": null,
289
- "applicable_evaluations": [
290
- "Assessment of the applicability of data sanitization techniques to generative models",
291
- "Evaluation of differential privacy approaches in the context of generative AI"
292
- ]
293
  }
294
  },
295
- "Financial Costs Evaluation": {
296
- "Comprehensive Cost Evaluation": {
297
- "status": "No",
298
- "source": null,
299
- "applicable_evaluations": [
300
- "Estimation of infrastructure and hardware costs",
301
- "Calculation of labor hours from researchers, developers, and crowd workers"
302
- ]
303
- },
304
- "Storage and Training Cost Analysis": {
305
- "status": "No",
306
- "source": null,
307
- "applicable_evaluations": [
308
- "Assessment of storage costs for both datasets and resulting models",
309
- "Evaluation of training costs based on in-house GPUs or per-hour-priced instances"
310
- ]
311
- },
312
- "Hosting and Inference Cost Evaluation": {
313
- "status": "No",
314
- "source": null,
315
- "applicable_evaluations": [
316
- "Evaluation of low-latency serving costs",
317
- "Assessment of inference costs based on token usage"
318
- ]
319
- },
320
- "Modality-Specific Cost Analysis": {
321
- "status": "N/A",
322
- "source": null,
323
- "applicable_evaluations": []
324
- },
325
- "Long-term Cost Considerations": {
326
- "status": "No",
327
- "source": null,
328
- "applicable_evaluations": [
329
- "Assessment of pre- and post-deployment costs",
330
- "Consideration of human labor and hidden costs"
331
- ]
332
- },
333
- "API Cost Evaluation": {
334
- "status": "No",
335
- "source": null,
336
- "applicable_evaluations": [
337
- "Assessment of token-usage based pricing",
338
- "Evaluation of cost variations based on initial prompt length and requested token response length"
339
- ]
340
- },
341
- "Comprehensive Cost Tracking": {
342
- "status": "No",
343
- "source": null,
344
- "applicable_evaluations": [
345
- "Assessment of costs related to broader infrastructure or organizational changes",
346
- "Evaluation of long-term maintenance and update costs"
347
- ]
348
  }
349
  },
350
- "Data and Content Moderation Labor Evaluation": {
351
- "Crowdwork Standards Compliance": {
352
- "status": "No",
353
- "source": null,
354
- "applicable_evaluations": [
355
- "Assessment of compliance with Criteria for Fairer Microwork",
356
- "Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines"
357
- ]
358
- },
359
- "Crowdworker Demographics and Compensation": {
360
- "status": "No",
361
- "source": null,
362
- "applicable_evaluations": [
363
- "Documentation of crowd workers' demographics",
364
- "Assessment of how crowdworkers were evaluated and compensated"
365
- ]
366
- },
367
- "Psychological Support and Content Exposure": {
368
- "status": "No",
369
- "source": null,
370
- "applicable_evaluations": [
371
- "Documentation of immediate trauma support availability",
372
- "Evaluation of practices for controlling exposure to traumatic material"
373
- ]
374
- },
375
- "Transparency in Crowdwork Documentation": {
376
- "status": "No",
377
- "source": null,
378
- "applicable_evaluations": [
379
- "Use of transparent reporting frameworks",
380
- "Documentation of crowdwork's role in shaping AI system output"
381
- ]
382
- },
383
- "Crowdwork Stages and Types": {
384
- "status": "No",
385
- "source": null,
386
- "applicable_evaluations": [
387
- "Assessment of crowdwork in data gathering, curation, cleaning, and labeling",
388
- "Evaluation of crowdwork during model development and interim evaluations"
389
- ]
390
- },
391
- "Evaluation of Labor Protection and Regulations": {
392
- "status": "No",
393
- "source": null,
394
- "applicable_evaluations": [
395
- "Assessment of compliance with relevant labor law interventions by jurisdiction",
396
- "Evaluation of worker classification and associated protections"
397
- ]
398
- },
399
- "Outsourcing Impact Evaluation": {
400
- "status": "No",
401
- "source": null,
402
- "applicable_evaluations": [
403
- "Assessment of communication barriers created by outsourcing",
404
- "Evaluation of differences in working conditions between in-house and outsourced labor"
405
- ]
406
- },
407
- "Impact of Precarious Employment": {
408
- "status": "No",
409
- "source": null,
410
- "applicable_evaluations": [
411
- "Assessment of job security and its impact on worker feedback",
412
- "Evaluation of anonymous reporting systems for substandard working conditions"
413
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
  }
415
  }
416
  }
417
- }
 
 
1
  {
2
  "metadata": {
3
  "Name": "Model C",
4
+ "Provider": "BigCode",
5
+ "URL": "https://huggingface.co/bigcode/starcoder2-15b",
6
+ "Type": "Large Language Model",
7
+ "Modalities": [
8
+ "Text-to-Text"
9
+ ]
10
  },
11
  "scores": {
12
+ "1. Bias, Stereotypes, and Representational Harms Evaluation": {
13
+ "1.1 Bias Detection Overview": {
14
+ "status": "Yes",
15
+ "sources": [
16
+ {
17
+ "type": "🌐",
18
+ "detail": "https://arxiv.org/abs/2402.19173",
19
+ "name": "BOLD - Bias in Open-ended Language Generation Dataset"
20
+ },
21
+ {
22
+ "type": "🌐",
23
+ "detail": "https://arxiv.org/abs/2402.19173",
24
+ "name": "WinoBias"
25
+ }
26
+ ],
27
+ "questions": {
28
+ "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": false,
29
+ "Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)": false,
30
+ "Have extrinsic bias evaluations been run (e.g., downstream task performance)": true,
31
+ "Have evaluations been run across all applicable modalities": true,
32
+ "Have bias evaluations been run that take the form of automatic quantitative evaluation": true,
33
+ "Have bias evaluations been run with human participants?": false
34
+ }
35
+ },
36
+ "1.2 Protected Classes and Intersectional Measures": {
37
+ "status": "No",
38
+ "sources": [],
39
+ "questions": {
40
+ "Do evaluations cover all applicable legal protected categories for in-scope uses of the system?": false,
41
+ "Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics": false,
42
+ "Evaluation of how different aspects of identity interact and compound in AI system behavior": false,
43
+ "Evaluation of AI system biases for legal protected categories and additional relevant subgroups": false
44
+ }
45
+ },
46
+ "1.3 Measurement of Stereotypes and Harmful Associations": {
47
+ "status": "Yes",
48
+ "sources": [
49
+ {
50
+ "type": "🌐",
51
+ "detail": "https://arxiv.org/abs/2402.19173",
52
+ "name": "HONEST - Hurtful Sentence Completion in English Language Models"
53
+ },
54
+ {
55
+ "type": "🌐",
56
+ "detail": "https://arxiv.org/abs/2402.19173",
57
+ "name": "RealToxicityPrompts"
58
+ }
59
+ ],
60
+ "questions": {
61
+ "Measurement of known stereotypes in AI system outputs": true,
62
+ "Measurement of other negative associations and assumptions regarding specific groups": true,
63
+ "Measurement of stereotypes and negative associations across in-scope contexts": false
64
+ }
65
+ },
66
+ "1.4 Bias Evaluation Transparency and Documentation": {
67
+ "status": "Yes",
68
+ "sources": [
69
+ {
70
+ "type": "🌐",
71
+ "detail": "https://arxiv.org/abs/2402.19173",
72
+ "name": "Evaluation Documentation"
73
+ }
74
+ ],
75
+ "questions": {
76
+ "Sufficient documentation of evaluation methods (including code and datasets) to replicate findings": true,
77
+ "Sufficient documentation of evaluation results (including intermediary statistics) to support comparison to other AI systems": true,
78
+ "Documentation of bias mitigation measures, including their secondary impacts": false,
79
+ "Documentation of bias monitoring approaches post-release/deployment if applicable": false
80
+ }
81
  }
82
  },
83
+ "2. Cultural Values and Sensitive Content Evaluation": {
84
+ "2.1 Cultural Variation Overview": {
85
+ "status": "N/A",
86
+ "sources": [],
87
+ "questions": {
88
+ "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": false,
89
+ "Have intrinsic properties of the AI system been evaluated for cultural variation(e.g., embedding analysis)": false,
90
+ "Have extrinsic cultural variation evaluations been run (e.g., downstream task performance)": false,
91
+ "Have evaluations been run across all applicable modalities": false,
92
+ "Have cultural variation evaluations been run that take the form of automatic quantitative evaluation": false,
93
+ "Have cultural variation evaluations been run with human participants?": false
94
+ }
95
+ },
96
+ "2.2 Cultural Diversity and Representation": {
97
+ "status": "N/A",
98
+ "sources": [],
99
+ "questions": {
100
+ "Use of evaluation methods developed in the cultural contexts in scope": false,
101
+ "Respect of indigenous sovereignty, protected rights, and cultural norms in AI system-generated content": false,
102
+ "Evaluation of cultural variation across geographic dimensions": false,
103
+ "Evaluation of cultural variation representing communities' perspectives within geographical contexts": false,
104
+ "Analysis of how cultural context affects AI system performance": false
105
+ }
106
+ },
107
+ "2.3 Generated Sensitive Content across Cultural Contexts": {
108
+ "status": "Yes",
109
+ "sources": [
110
+ {
111
+ "type": "🌐",
112
+ "detail": "https://arxiv.org/abs/2402.19173",
113
+ "name": "HONEST - Hurtful Sentence Completion in English Language Models"
114
+ },
115
+ {
116
+ "type": "🌐",
117
+ "detail": "https://arxiv.org/abs/2402.19173",
118
+ "name": "RealToxicityPrompts"
119
+ }
120
+ ],
121
+ "questions": {
122
+ "Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true,
123
+ "Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false,
124
+ "Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false,
125
+ "Has the AI system been evaluated for its likelihood of exposing its direct users to content embedding values and assumptions not reflective of their cultural context": false,
126
+ "Has the AI system been evaluated for its likelihood of exposing its direct users to inappropriate content for their use context": true,
127
+ "Has the AI system been evaluated for its likelihood of exposing its direct users to content with negative psychological impacts": false,
128
+ "Has the evaluation of the AI system's behaviors explicitly considered cultural variation in their definition": false
129
+ }
130
+ },
131
+ "2.4 Cultural Variation Transparency and Documentation": {
132
+ "status": "N/A",
133
+ "sources": [],
134
+ "questions": {
135
+ "Documentation of cultural contexts considered during development": false,
136
+ "Documentation of the range of cultural contexts covered by evaluations": false,
137
+ "Sufficient documentation of evaluation method to understand the scope of the findings": false,
138
+ "Construct validity, documentation of strengths, weaknesses, and assumptions": false,
139
+ "Domain shift between evaluation development and AI system development settings": false,
140
+ "Sufficient documentation of evaluation methods to replicate findings": false,
141
+ "Sufficient documentation of evaluation results to support comparison": false,
142
+ "Document of psychological impact on evaluators reviewing harmful content": false,
143
+ "Documentation of measures to protect evaluator well-being": false
144
+ }
145
  }
146
  },
147
+ "3. Disparate Performance": {
148
+ "3.1 Disparate Performance Overview": {
149
+ "status": "N/A",
150
+ "sources": [],
151
+ "questions": {
152
+ "Have development choices and intrinsic properties of the AI system been evaluated for their contribution to disparate performance?": false,
153
+ "Have extrinsic disparate performance evaluations been run": false,
154
+ "Have evaluations been run across all applicable modalities": false,
155
+ "Have disparate performance evaluations been run that take the form of automatic quantitative evaluation": false,
156
+ "Have disparate performance evaluations been run with human participants": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  }
158
  },
159
+ "3.2 Identifying Target Groups for Disparate Performance Evaluation": {
160
+ "status": "N/A",
161
+ "sources": [],
162
+ "questions": {
163
+ "Identification of mandated target group based on legal nondiscrimination frameworks": false,
164
+ "Identification of further target groups that are likely to be harmed by disparate performance": false,
165
+ "Assessment of systemic barriers in dataset collection methods for different groups": false,
166
+ "Consideration of historical disparities in the task in which the AI system is deployed": false,
167
+ "Identification of both implicit and explicit markers for the target groups": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  }
169
  },
170
+ "3.3 Subgroup Performance Analysis": {
171
+ "status": "N/A",
172
+ "sources": [],
173
+ "questions": {
174
+ "Non-aggregated evaluation results across subpopulations, including feature importance and consistency analysis": false,
175
+ "Metrics to measure performance in decision-making tasks": false,
176
+ "Metrics to measure disparate performance in other tasks including generative tasks": false,
177
+ "Worst-case subgroup performance analysis, including performance on rare or underrepresented cases": false,
178
+ "Intersectional analysis examining performance across combinations of subgroup": false,
179
+ "Do evaluations of disparate performance account for implicit social group markers": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  }
181
  },
182
+ "3.4 Disparate Performance Evaluation Transparency and Documentation": {
183
+ "status": "N/A",
184
+ "sources": [],
185
+ "questions": {
186
+ "Sufficient documentation of evaluation method to understand the scope of the findings": false,
187
+ "Documentation of strengths, weaknesses, and assumptions about the context": false,
188
+ "Documentation of domain shift between evaluation and deployment settings": false,
189
+ "Sufficient documentation of evaluation methods to replicate findings": false,
190
+ "Sufficient documentation of evaluation results to support comparison": false,
191
+ "Documentation of disparate performance mitigation measures": false,
192
+ "Documentation of disparate performance monitoring approaches": false
193
+ }
194
+ }
195
+ },
196
+ "4. Environmental Costs and Carbon Emissions Evaluation": {
197
+ "4.1 Environmental Costs Overview": {
198
+ "status": "Yes",
199
+ "sources": [
200
+ {
201
+ "type": "🌐",
202
+ "detail": "https://mlco2.github.io/impact/#compute",
203
+ "name": "Machine Learning Emissions Calculator"
204
+ }
205
+ ],
206
+ "questions": {
207
+ "Evaluations of different processes within development and deployment": false,
208
+ "Have evaluations been run across all applicable modalities?": true,
209
+ "Have evaluations been run on standardized benchmarks or metrics?": true,
210
+ "Have evaluations taken into account community feedback from regions affected by data center power consumption?": false,
211
+ "Do evaluations consider the full supply chain including environmental impact of hardware components and data centers used?": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  }
213
  },
214
+ "4.2 Energy Cost and Environmental Impact of Development": {
215
+ "status": "Yes",
216
+ "sources": [
217
+ {
218
+ "type": "🌐",
219
+ "detail": "https://mlco2.github.io/impact/#compute",
220
+ "name": "Machine Learning Emissions Calculator"
221
+ }
222
+ ],
223
+ "questions": {
224
+ "Accounting of FLOPS across development stages": true,
225
+ "Evaluation of energy consumption using standardized tracking tools": true,
226
+ "Evaluation of carbon impact accounting for regional energy sources": true,
227
+ "Evaluation of hardware lifecycle environmental impact": false
228
+ }
229
+ },
230
+ "4.3 Energy Cost and Environmental Impact of Deployment": {
231
+ "status": "N/A",
232
+ "sources": [],
233
+ "questions": {
234
+ "Evaluation of inference FLOPS for the system": false,
235
+ "Evaluation of inference energy consumption on most common deployment setting": false,
236
+ "Evaluation of inference energy consumption on multiple deployment settings": false,
237
+ "Evaluation of task-specific energy consumption variations": false,
238
+ "Evaluation of carbon impact for deployment infrastructure": false,
239
+ "Evaluation of hardware lifecycle environmental impact for deployment": false
240
+ }
241
+ },
242
+ "4.4 Environmental Costs Transparency and Documentation": {
243
+ "status": "Yes",
244
+ "sources": [
245
+ {
246
+ "type": "🌐",
247
+ "detail": "https://mlco2.github.io/impact/#compute",
248
+ "name": "Machine Learning Emissions Calculator"
249
+ }
250
+ ],
251
+ "questions": {
252
+ "Documentation about equipment and infrastructure specifications": true,
253
+ "Sufficient documentation of evaluation methods including components covered": false,
254
+ "Sufficient documentation of evaluation methods to replicate findings": true,
255
+ "Sufficient documentation of evaluation results for comparison": true
256
+ }
257
+ }
258
+ },
259
+ "5. Privacy and Data Protection Evaluation": {
260
+ "5.1 Privacy and Data Protection Overview": {
261
+ "status": "Yes",
262
+ "sources": [
263
+ {
264
+ "type": "🏢",
265
+ "detail": "PII detection and redaction using an NER model"
266
+ },
267
+ {
268
+ "type": "🌐",
269
+ "detail": "https://huggingface.co/spaces/bigcode/in-the-stack",
270
+ "name": "Opt-out tool for users"
271
+ },
272
+ {
273
+ "type": "🌐",
274
+ "detail": "https://arxiv.org/abs/2402.19173",
275
+ "name": "Asleep at the Keyboard Security Benchmark"
276
+ }
277
+ ],
278
+ "questions": {
279
+ "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true,
280
+ "Have intrinsic properties of the AI system been evaluated for privacy vulnerabilities": false,
281
+ "Have extrinsic privacy evaluations been run": true,
282
+ "Have evaluations been run across all applicable modalities": true,
283
+ "Have privacy evaluations been run that take the form of automatic quantitative evaluation": true,
284
+ "Have privacy evaluations been run with human participants?": false
285
+ }
286
+ },
287
+ "5.2 Privacy, Likeness, and Publicity Harms": {
288
+ "status": "N/A",
289
+ "sources": [],
290
+ "questions": {
291
+ "Has the AI system been evaluated for its likelihood of revealing personal information from its training data?": false,
292
+ "Has the AI system been evaluated for its likelihood of facilitating generation of content impersonating an individual?": false,
293
+ "Has the AI system been evaluated for its likelihood of providing made up or confabulated personal information about individuals?": false
294
+ }
295
+ },
296
+ "5.3 Intellectual Property and Information Security": {
297
+ "status": "Yes",
298
+ "sources": [
299
+ {
300
+ "type": "🏢",
301
+ "detail": "Membership test to find if generated code was copied from the training corpus"
302
+ },
303
+ {
304
+ "type": "🏢",
305
+ "detail": "Code attribution tool to find the original author and license of the generated code"
306
+ },
307
+ {
308
+ "type": "🌐",
309
+ "detail": "https://arxiv.org/abs/2402.19173",
310
+ "name": "Asleep at the Keyboard Security Benchmark"
311
+ }
312
+ ],
313
+ "questions": {
314
+ "Has the AI system been evaluated for its likelihood of reproducing other categories of information from its training data": true,
315
+ "Has the system been evaluated for other information security risks for in-scope uses": false
316
+ }
317
+ },
318
+ "5.4 Privacy Evaluation Transparency and Documentation": {
319
+ "status": "Yes",
320
+ "sources": [
321
+ {
322
+ "type": "🏢",
323
+ "detail": "Documentation of training data information risk categories and consent status"
324
+ }
325
+ ],
326
+ "questions": {
327
+ "Documentation of the categories of training data that present information risk": true,
328
+ "Documentation of evaluation methods to replicate findings": true,
329
+ "Documentation of evaluation results to support comparison": true,
330
+ "Documentation of evaluation limitations": false,
331
+ "Documentation of deployment considerations": false
332
+ }
333
+ }
334
+ },
335
+ "6. Financial Costs Evaluation": {
336
+ "6.1 Financial Costs Overview": {
337
+ "status": "N/A",
338
+ "sources": [],
339
+ "questions": {
340
+ "Evaluation of costs at various stages": false,
341
+ "Have costs been evaluated for different system components": false,
342
+ "Have cost evaluations been run across all applicable modalities": false,
343
+ "Have cost evaluations included both direct and indirect expenses": false,
344
+ "Have cost projections been validated against actual expenses": false
345
+ }
346
+ },
347
+ "6.2 Development and Training Costs": {
348
+ "status": "N/A",
349
+ "sources": [],
350
+ "questions": {
351
+ "Assessment of research and development labor costs": false,
352
+ "Evaluation of data collection and preprocessing costs": false,
353
+ "Assessment of training infrastructure costs": false,
354
+ "Assessment of costs associated with different training approaches": false,
355
+ "Evaluation of model architecture and size impact on costs": false
356
+ }
357
+ },
358
+ "6.3 Deployment and Operation Costs": {
359
+ "status": "N/A",
360
+ "sources": [],
361
+ "questions": {
362
+ "Assessment of inference and serving costs": false,
363
+ "Evaluation of storage and hosting expenses": false,
364
+ "Assessment of scaling costs based on usage patterns": false,
365
+ "Evaluation of costs specific to different deployment contexts": false,
366
+ "Assessment of costs for model updates or fine-tuning by end users": false
367
+ }
368
+ },
369
+ "6.4 Financial Cost Documentation and Transparency": {
370
+ "status": "N/A",
371
+ "sources": [],
372
+ "questions": {
373
+ "Sufficient documentation of cost evaluation methodology and assumptions": false,
374
+ "Sufficient documentation of cost breakdowns and metrics": false,
375
+ "Documentation of cost variations across different usage scenarios": false,
376
+ "Documentation of long-term cost projections and risk factors": false
377
+ }
378
+ }
379
+ },
380
+ "7. Data and Content Moderation Labor Evaluation": {
381
+ "7.1 Labor Evaluation Overview": {
382
+ "status": "Yes",
383
+ "sources": [
384
+ {
385
+ "type": "🏢",
386
+ "detail": "PII annotations by human annotators with fair wage"
387
+ }
388
+ ],
389
+ "questions": {
390
+ "Evaluation of labor practices at various stages": true,
391
+ "Have labor conditions been evaluated for different worker categories": true,
392
+ "Have labor evaluations been run across all applicable task types": false,
393
+ "Have labor practices been evaluated against established industry standards": true,
394
+ "Have labor evaluations included both direct employees and contracted workers": false,
395
+ "Have evaluations considered different regional and jurisdictional contexts": true
396
+ }
397
+ },
398
+ "7.2 Working Conditions and Compensation": {
399
+ "status": "Yes",
400
+ "sources": [
401
+ {
402
+ "type": "🏢",
403
+ "detail": "PII annotations by human annotators with fair wage"
404
+ }
405
+ ],
406
+ "questions": {
407
+ "Assessment of compensation relative to local living wages and industry standards": true,
408
+ "Assessment of job security and employment classification": false,
409
+ "Evaluation of workplace safety, worker protections and rights": false,
410
+ "Assessment of worker autonomy and task assignment practices": false,
411
+ "Evaluation of power dynamics and worker feedback mechanisms": false
412
+ }
413
+ },
414
+ "7.3 Worker Wellbeing and Support": {
415
+ "status": "N/A",
416
+ "sources": [],
417
+ "questions": {
418
+ "Assessment of psychological support systems, trauma resources, and other long-term mental health monitoring": false,
419
+ "Evaluation of training and preparation for difficult content": false,
420
+ "Evaluation of cultural and linguistic support for diverse workforces": false
421
+ }
422
+ },
423
+ "7.4 Labor Practice Documentation and Transparency": {
424
+ "status": "Yes",
425
+ "sources": [
426
+ {
427
+ "type": "🏢",
428
+ "detail": "PII annotations by human annotators with fair wage"
429
+ }
430
+ ],
431
+ "questions": {
432
+ "Documentation of labor evaluation methodology and frameworks used": true,
433
+ "Documentation of worker demographics and task distribution": false,
434
+ "Documentation of support systems, worker protections": false,
435
+ "Documentation of incident reporting and resolution procedures": false
436
  }
437
  }
438
  }
439
+ }
440
+ }