evijit HF staff committed on
Commit
51c883c
•
1 Parent(s): 3064620

Delete model_data/model_c_data.json

Browse files
Files changed (1) hide show
  1. model_data/model_c_data.json +0 -440
model_data/model_c_data.json DELETED
@@ -1,440 +0,0 @@
1
- {
2
- "metadata": {
3
- "Name": "Model C",
4
- "Provider": "BigCode",
5
- "URL": "https://huggingface.co/bigcode/starcoder2-15b",
6
- "Type": "Large Language Model",
7
- "Modalities": [
8
- "Text-to-Text"
9
- ]
10
- },
11
- "scores": {
12
- "1. Bias, Stereotypes, and Representational Harms Evaluation": {
13
- "1.1 Bias Detection Overview": {
14
- "status": "Yes",
15
- "sources": [
16
- {
17
-                     "type": "🌐",
18
- "detail": "https://arxiv.org/abs/2402.19173",
19
- "name": "BOLD - Bias in Open-ended Language Generation Dataset"
20
- },
21
- {
22
-                     "type": "🌐",
23
- "detail": "https://arxiv.org/abs/2402.19173",
24
- "name": "WinoBias"
25
- }
26
- ],
27
- "questions": {
28
- "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": false,
29
- "Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)": false,
30
- "Have extrinsic bias evaluations been run (e.g., downstream task performance)": true,
31
- "Have evaluations been run across all applicable modalities": true,
32
- "Have bias evaluations been run that take the form of automatic quantitative evaluation": true,
33
- "Have bias evaluations been run with human participants?": false
34
- }
35
- },
36
- "1.2 Protected Classes and Intersectional Measures": {
37
- "status": "No",
38
- "sources": [],
39
- "questions": {
40
- "Do evaluations cover all applicable legal protected categories for in-scope uses of the system?": false,
41
- "Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics": false,
42
- "Evaluation of how different aspects of identity interact and compound in AI system behavior": false,
43
- "Evaluation of AI system biases for legal protected categories and additional relevant subgroups": false
44
- }
45
- },
46
- "1.3 Measurement of Stereotypes and Harmful Associations": {
47
- "status": "Yes",
48
- "sources": [
49
- {
50
-                     "type": "🌐",
51
- "detail": "https://arxiv.org/abs/2402.19173",
52
- "name": "HONEST - Hurtful Sentence Completion in English Language Models"
53
- },
54
- {
55
-                     "type": "🌐",
56
- "detail": "https://arxiv.org/abs/2402.19173",
57
- "name": "RealToxicityPrompts"
58
- }
59
- ],
60
- "questions": {
61
- "Measurement of known stereotypes in AI system outputs": true,
62
- "Measurement of other negative associations and assumptions regarding specific groups": true,
63
- "Measurement of stereotypes and negative associations across in-scope contexts": false
64
- }
65
- },
66
- "1.4 Bias Evaluation Transparency and Documentation": {
67
- "status": "Yes",
68
- "sources": [
69
- {
70
-                     "type": "🌐",
71
- "detail": "https://arxiv.org/abs/2402.19173",
72
- "name": "Evaluation Documentation"
73
- }
74
- ],
75
- "questions": {
76
- "Sufficient documentation of evaluation methods (including code and datasets) to replicate findings": true,
77
- "Sufficient documentation of evaluation results (including intermediary statistics) to support comparison to other AI systems": true,
78
- "Documentation of bias mitigation measures, including their secondary impacts": false,
79
- "Documentation of bias monitoring approaches post-release/deployment if applicable": false
80
- }
81
- }
82
- },
83
- "2. Cultural Values and Sensitive Content Evaluation": {
84
- "2.1 Cultural Variation Overview": {
85
- "status": "N/A",
86
- "sources": [],
87
- "questions": {
88
- "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": false,
89
- "Have intrinsic properties of the AI system been evaluated for cultural variation(e.g., embedding analysis)": false,
90
- "Have extrinsic cultural variation evaluations been run (e.g., downstream task performance)": false,
91
- "Have evaluations been run across all applicable modalities": false,
92
- "Have cultural variation evaluations been run that take the form of automatic quantitative evaluation": false,
93
- "Have cultural variation evaluations been run with human participants?": false
94
- }
95
- },
96
- "2.2 Cultural Diversity and Representation": {
97
- "status": "N/A",
98
- "sources": [],
99
- "questions": {
100
- "Use of evaluation methods developed in the cultural contexts in scope": false,
101
- "Respect of indigenous sovereignty, protected rights, and cultural norms in AI system-generated content": false,
102
- "Evaluation of cultural variation across geographic dimensions": false,
103
- "Evaluation of cultural variation representing communities' perspectives within geographical contexts": false,
104
- "Analysis of how cultural context affects AI system performance": false
105
- }
106
- },
107
- "2.3 Generated Sensitive Content across Cultural Contexts": {
108
- "status": "Yes",
109
- "sources": [
110
- {
111
-                     "type": "🌐",
112
- "detail": "https://arxiv.org/abs/2402.19173",
113
- "name": "HONEST - Hurtful Sentence Completion in English Language Models"
114
- },
115
- {
116
-                     "type": "🌐",
117
- "detail": "https://arxiv.org/abs/2402.19173",
118
- "name": "RealToxicityPrompts"
119
- }
120
- ],
121
- "questions": {
122
- "Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true,
123
- "Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false,
124
- "Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false,
125
- "Has the AI system been evaluated for its likelihood of exposing its direct users to content embedding values and assumptions not reflective of their cultural context": false,
126
- "Has the AI system been evaluated for its likelihood of exposing its direct users to inappropriate content for their use context": true,
127
- "Has the AI system been evaluated for its likelihood of exposing its direct users to content with negative psychological impacts": false,
128
- "Has the evaluation of the AI system's behaviors explicitly considered cultural variation in their definition": false
129
- }
130
- },
131
- "2.4 Cultural Variation Transparency and Documentation": {
132
- "status": "N/A",
133
- "sources": [],
134
- "questions": {
135
- "Documentation of cultural contexts considered during development": false,
136
- "Documentation of the range of cultural contexts covered by evaluations": false,
137
- "Sufficient documentation of evaluation method to understand the scope of the findings": false,
138
- "Construct validity, documentation of strengths, weaknesses, and assumptions": false,
139
- "Domain shift between evaluation development and AI system development settings": false,
140
- "Sufficient documentation of evaluation methods to replicate findings": false,
141
- "Sufficient documentation of evaluation results to support comparison": false,
142
- "Document of psychological impact on evaluators reviewing harmful content": false,
143
- "Documentation of measures to protect evaluator well-being": false
144
- }
145
- }
146
- },
147
- "3. Disparate Performance Evaluation": {
148
- "3.1 Disparate Performance Overview": {
149
- "status": "N/A",
150
- "sources": [],
151
- "questions": {
152
- "Have development choices and intrinsic properties of the AI system been evaluated for their contribution to disparate performance?": false,
153
- "Have extrinsic disparate performance evaluations been run": false,
154
- "Have evaluations been run across all applicable modalities": false,
155
- "Have disparate performance evaluations been run that take the form of automatic quantitative evaluation": false,
156
- "Have disparate performance evaluations been run with human participants": false
157
- }
158
- },
159
- "3.2 Identifying Target Groups for Disparate Performance Evaluation": {
160
- "status": "N/A",
161
- "sources": [],
162
- "questions": {
163
- "Identification of mandated target group based on legal nondiscrimination frameworks": false,
164
- "Identification of further target groups that are likely to be harmed by disparate performance": false,
165
- "Assessment of systemic barriers in dataset collection methods for different groups": false,
166
- "Consideration of historical disparities in the task in which the AI system is deployed": false,
167
- "Identification of both implicit and explicit markers for the target groups": false
168
- }
169
- },
170
- "3.3 Subgroup Performance Analysis": {
171
- "status": "N/A",
172
- "sources": [],
173
- "questions": {
174
- "Non-aggregated evaluation results across subpopulations, including feature importance and consistency analysis": false,
175
- "Metrics to measure performance in decision-making tasks": false,
176
- "Metrics to measure disparate performance in other tasks including generative tasks": false,
177
- "Worst-case subgroup performance analysis, including performance on rare or underrepresented cases": false,
178
- "Intersectional analysis examining performance across combinations of subgroup": false,
179
- "Do evaluations of disparate performance account for implicit social group markers": false
180
- }
181
- },
182
- "3.4 Disparate Performance Evaluation Transparency and Documentation": {
183
- "status": "N/A",
184
- "sources": [],
185
- "questions": {
186
- "Sufficient documentation of evaluation method to understand the scope of the findings": false,
187
- "Documentation of strengths, weaknesses, and assumptions about the context": false,
188
- "Documentation of domain shift between evaluation and deployment settings": false,
189
- "Sufficient documentation of evaluation methods to replicate findings": false,
190
- "Sufficient documentation of evaluation results to support comparison": false,
191
- "Documentation of disparate performance mitigation measures": false,
192
- "Documentation of disparate performance monitoring approaches": false
193
- }
194
- }
195
- },
196
- "4. Environmental Costs and Carbon Emissions Evaluation": {
197
- "4.1 Environmental Costs Overview": {
198
- "status": "Yes",
199
- "sources": [
200
- {
201
-                     "type": "🌐",
202
- "detail": "https://mlco2.github.io/impact/#compute",
203
- "name": "Machine Learning Emissions Calculator"
204
- }
205
- ],
206
- "questions": {
207
- "Evaluations of different processes within development and deployment": false,
208
- "Have evaluations been run across all applicable modalities?": true,
209
- "Have evaluations been run on standardized benchmarks or metrics?": true,
210
- "Have evaluations taken into account community feedback from regions affected by data center power consumption?": false,
211
- "Do evaluations consider the full supply chain including environmental impact of hardware components and data centers used?": false
212
- }
213
- },
214
- "4.2 Energy Cost and Environmental Impact of Development": {
215
- "status": "Yes",
216
- "sources": [
217
- {
218
-                     "type": "🌐",
219
- "detail": "https://mlco2.github.io/impact/#compute",
220
- "name": "Machine Learning Emissions Calculator"
221
- }
222
- ],
223
- "questions": {
224
- "Accounting of FLOPS across development stages": true,
225
- "Evaluation of energy consumption using standardized tracking tools": true,
226
- "Evaluation of carbon impact accounting for regional energy sources": true,
227
- "Evaluation of hardware lifecycle environmental impact": false
228
- }
229
- },
230
- "4.3 Energy Cost and Environmental Impact of Deployment": {
231
- "status": "N/A",
232
- "sources": [],
233
- "questions": {
234
- "Evaluation of inference FLOPS for the system": false,
235
- "Evaluation of inference energy consumption on most common deployment setting": false,
236
- "Evaluation of inference energy consumption on multiple deployment settings": false,
237
- "Evaluation of task-specific energy consumption variations": false,
238
- "Evaluation of carbon impact for deployment infrastructure": false,
239
- "Evaluation of hardware lifecycle environmental impact for deployment": false
240
- }
241
- },
242
- "4.4 Environmental Costs Transparency and Documentation": {
243
- "status": "Yes",
244
- "sources": [
245
- {
246
-                     "type": "🌐",
247
- "detail": "https://mlco2.github.io/impact/#compute",
248
- "name": "Machine Learning Emissions Calculator"
249
- }
250
- ],
251
- "questions": {
252
- "Documentation about equipment and infrastructure specifications": true,
253
- "Sufficient documentation of evaluation methods including components covered": false,
254
- "Sufficient documentation of evaluation methods to replicate findings": true,
255
- "Sufficient documentation of evaluation results for comparison": true
256
- }
257
- }
258
- },
259
- "5. Privacy and Data Protection Evaluation": {
260
- "5.1 Privacy and Data Protection Overview": {
261
- "status": "Yes",
262
- "sources": [
263
- {
264
-                     "type": "🏢",
265
- "detail": "PII detection and redaction using an NER model"
266
- },
267
- {
268
-                     "type": "🌐",
269
- "detail": "https://huggingface.co/spaces/bigcode/in-the-stack",
270
- "name": "Opt-out tool for users"
271
- },
272
- {
273
-                     "type": "🌐",
274
- "detail": "https://arxiv.org/abs/2402.19173",
275
- "name": "Asleep at the Keyboard Security Benchmark"
276
- }
277
- ],
278
- "questions": {
279
- "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true,
280
- "Have intrinsic properties of the AI system been evaluated for privacy vulnerabilities": false,
281
- "Have extrinsic privacy evaluations been run": true,
282
- "Have evaluations been run across all applicable modalities": true,
283
- "Have privacy evaluations been run that take the form of automatic quantitative evaluation": true,
284
- "Have privacy evaluations been run with human participants?": false
285
- }
286
- },
287
- "5.2 Privacy, Likeness, and Publicity Harms": {
288
- "status": "N/A",
289
- "sources": [],
290
- "questions": {
291
- "Has the AI system been evaluated for its likelihood of revealing personal information from its training data?": false,
292
- "Has the AI system been evaluated for its likelihood of facilitating generation of content impersonating an individual?": false,
293
- "Has the AI system been evaluated for its likelihood of providing made up or confabulated personal information about individuals?": false
294
- }
295
- },
296
- "5.3 Intellectual Property and Information Security": {
297
- "status": "Yes",
298
- "sources": [
299
- {
300
-                     "type": "🏢",
301
- "detail": "Membership test to find if generated code was copied from the training corpus"
302
- },
303
- {
304
-                     "type": "🏢",
305
- "detail": "Code attribution tool to find the original author and license of the generated code"
306
- },
307
- {
308
-                     "type": "🌐",
309
- "detail": "https://arxiv.org/abs/2402.19173",
310
- "name": "Asleep at the Keyboard Security Benchmark"
311
- }
312
- ],
313
- "questions": {
314
- "Has the AI system been evaluated for its likelihood of reproducing other categories of information from its training data": true,
315
- "Has the system been evaluated for other information security risks for in-scope uses": false
316
- }
317
- },
318
- "5.4 Privacy Evaluation Transparency and Documentation": {
319
- "status": "Yes",
320
- "sources": [
321
- {
322
-                     "type": "🏢",
323
- "detail": "Documentation of training data information risk categories and consent status"
324
- }
325
- ],
326
- "questions": {
327
- "Documentation of the categories of training data that present information risk": true,
328
- "Documentation of evaluation methods to replicate findings": true,
329
- "Documentation of evaluation results to support comparison": true,
330
- "Documentation of evaluation limitations": false,
331
- "Documentation of deployment considerations": false
332
- }
333
- }
334
- },
335
- "6. Financial Costs Evaluation": {
336
- "6.1 Financial Costs Overview": {
337
- "status": "N/A",
338
- "sources": [],
339
- "questions": {
340
- "Evaluation of costs at various stages": false,
341
- "Have costs been evaluated for different system components": false,
342
- "Have cost evaluations been run across all applicable modalities": false,
343
- "Have cost evaluations included both direct and indirect expenses": false,
344
- "Have cost projections been validated against actual expenses": false
345
- }
346
- },
347
- "6.2 Development and Training Costs": {
348
- "status": "N/A",
349
- "sources": [],
350
- "questions": {
351
- "Assessment of research and development labor costs": false,
352
- "Evaluation of data collection and preprocessing costs": false,
353
- "Assessment of training infrastructure costs": false,
354
- "Assessment of costs associated with different training approaches": false,
355
- "Evaluation of model architecture and size impact on costs": false
356
- }
357
- },
358
- "6.3 Deployment and Operation Costs": {
359
- "status": "N/A",
360
- "sources": [],
361
- "questions": {
362
- "Assessment of inference and serving costs": false,
363
- "Evaluation of storage and hosting expenses": false,
364
- "Assessment of scaling costs based on usage patterns": false,
365
- "Evaluation of costs specific to different deployment contexts": false,
366
- "Assessment of costs for model updates or fine-tuning by end users": false
367
- }
368
- },
369
- "6.4 Financial Cost Documentation and Transparency": {
370
- "status": "N/A",
371
- "sources": [],
372
- "questions": {
373
- "Sufficient documentation of cost evaluation methodology and assumptions": false,
374
- "Sufficient documentation of cost breakdowns and metrics": false,
375
- "Documentation of cost variations across different usage scenarios": false,
376
- "Documentation of long-term cost projections and risk factors": false
377
- }
378
- }
379
- },
380
- "7. Data and Content Moderation Labor Evaluation": {
381
- "7.1 Labor Evaluation Overview": {
382
- "status": "Yes",
383
- "sources": [
384
- {
385
-                     "type": "🏢",
386
- "detail": "PII annotations by human annotators with fair wage"
387
- }
388
- ],
389
- "questions": {
390
- "Evaluation of labor practices at various stages": true,
391
- "Have labor conditions been evaluated for different worker categories": true,
392
- "Have labor evaluations been run across all applicable task types": false,
393
- "Have labor practices been evaluated against established industry standards": true,
394
- "Have labor evaluations included both direct employees and contracted workers": false,
395
- "Have evaluations considered different regional and jurisdictional contexts": true
396
- }
397
- },
398
- "7.2 Working Conditions and Compensation": {
399
- "status": "Yes",
400
- "sources": [
401
- {
402
-                     "type": "🏢",
403
- "detail": "PII annotations by human annotators with fair wage"
404
- }
405
- ],
406
- "questions": {
407
- "Assessment of compensation relative to local living wages and industry standards": true,
408
- "Assessment of job security and employment classification": false,
409
- "Evaluation of workplace safety, worker protections and rights": false,
410
- "Assessment of worker autonomy and task assignment practices": false,
411
- "Evaluation of power dynamics and worker feedback mechanisms": false
412
- }
413
- },
414
- "7.3 Worker Wellbeing and Support": {
415
- "status": "N/A",
416
- "sources": [],
417
- "questions": {
418
- "Assessment of psychological support systems, trauma resources, and other long-term mental health monitoring": false,
419
- "Evaluation of training and preparation for difficult content": false,
420
- "Evaluation of cultural and linguistic support for diverse workforces": false
421
- }
422
- },
423
- "7.4 Labor Practice Documentation and Transparency": {
424
- "status": "Yes",
425
- "sources": [
426
- {
427
-                     "type": "🏢",
428
- "detail": "PII annotations by human annotators with fair wage"
429
- }
430
- ],
431
- "questions": {
432
- "Documentation of labor evaluation methodology and frameworks used": true,
433
- "Documentation of worker demographics and task distribution": false,
434
- "Documentation of support systems, worker protections": false,
435
- "Documentation of incident reporting and resolution procedures": false
436
- }
437
- }
438
- }
439
- }
440
- }