File size: 15,685 Bytes
01a1e86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8721bb
01a1e86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8721bb
01a1e86
 
 
 
 
 
 
 
 
 
 
 
f8721bb
01a1e86
f8721bb
01a1e86
 
 
 
 
 
 
 
 
 
f8721bb
01a1e86
 
 
 
 
 
 
 
 
 
f8721bb
01a1e86
 
 
 
 
 
 
 
 
 
 
f8721bb
01a1e86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8721bb
01a1e86
 
 
 
 
 
 
 
 
 
 
f8721bb
01a1e86
 
 
 
 
 
 
 
 
 
f8721bb
01a1e86
 
 
 
 
 
 
 
f8721bb
01a1e86
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
{
  "metadata": {
    "Name": "Gemma 2",
    "Provider": "Google",
    "URL": "https://ai.google.dev/gemma/docs/model_card_2",
    "Type": "Large Language Model",
    "Modalities": [
      "Text-to-Text"
    ]
  },
  "scores": {
    "1. Bias, Stereotypes, and Representational Harms Evaluation": {
      "1.1 Bias Detection Overview": {
        "status": "Yes",
        "sources": [
          {
            "type": "🌐",
            "detail": "https://ai.google.dev/gemma/docs/model_card_2#data_preprocessing",
            "name": "Model Card - Data Preprocessing"
          },
          {
            "type": "🌐",
            "detail": "https://developers.googleblog.com/en/gemma-explained-new-in-gemma-2/",
            "name": "Developer Blog"
          },
          {
            "type": "🌐",
            "detail": "https://arxiv.org/html/2410.12864",
            "name": "Bias Analysis Paper"
          }
        ],
        "questions": {
          "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true,
          "Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)": true,
          "Have extrinsic bias evaluations been run (e.g., downstream task performance)": true,
          "Have evaluations been run across all applicable modalities": true,
          "Have bias evaluations been run that take the form of automatic quantitative evaluation": true,
          "Have bias evaluations been run with human participants?": true
        }
      },
      "1.2 Protected Classes and Intersectional Measures": {
        "status": "Yes",
        "sources": [
          {
            "type": "🌐",
            "detail": "https://ai.google.dev/gemma/docs/model_card_2#evaluation_results",
            "name": "Model Card - Evaluation Results"
          }
        ],
        "questions": {
          "Do evaluations cover all applicable legal protected categories for in-scope uses of the system?": true,
          "Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics": false,
          "Evaluation of how different aspects of identity interact and compound in AI system behavior": false,
          "Evaluation of AI system biases for legal protected categories and additional relevant subgroups": false
        }
      },
      "1.3 Measurement of Stereotypes and Harmful Associations": {
        "status": "Yes",
        "sources": [
          {
            "type": "🌐",
            "detail": "https://arxiv.org/abs/2009.11462",
            "name": "Stereotype Analysis"
          }
        ],
        "questions": {
          "Measurement of known stereotypes in AI system outputs": true,
          "Measurement of other negative associations and assumptions regarding specific groups": true,
          "Measurement of stereotypes and negative associations across in-scope contexts": false
        }
      },
      "1.4 Bias Evaluation Transparency and Documentation": {
        "status": "Yes",
        "sources": [
          {
            "type": "🌐",
            "detail": "https://arxiv.org/pdf/2403.13793",
            "name": "Evaluation Documentation"
          }
        ],
        "questions": {
          "Sufficient documentation of evaluation method to understand the scope of the findings": false,
          "Sufficient documentation of evaluation methods to replicate findings": true,
          "Sufficient documentation of evaluation results to support comparison": true,
          "Documentation of bias mitigation measures": false,
          "Documentation of bias monitoring approaches": false
        }
      }
    },
    "2. Cultural Values and Sensitive Content Evaluation": {
      "2.1 Cultural Variation Overview": {
        "status": "Yes",
        "sources": [
          {
            "type": "🌐",
            "detail": "https://aclanthology.org/2024.findings-emnlp.942.pdf",
            "name": "Cultural Variation Analysis"
          }
        ],
        "questions": {
          "Evaluations at various stages": false,
          "Have intrinsic properties been evaluated for cultural variation": false,
          "Have extrinsic cultural variation evaluations been run": true,
          "Have evaluations been run across all applicable modalities": true,
          "Have cultural variation evaluations been run that take the form of automatic quantitative evaluation": true,
          "Have cultural variation evaluations been run with human participants?": false
        }
      },
      "2.2 Cultural Diversity and Representation": {
        "status": "No",
        "sources": [],
        "questions": {
          "Use of evaluation methods developed in the cultural contexts in scope": false,
          "Respect of indigenous sovereignty, protected rights, and cultural norms": false,
          "Evaluation of cultural variation across geographic dimensions": false,
          "Evaluation of cultural variation representing communities' perspectives": false,
          "Analysis of how cultural context affects AI system performance": false
        }
      },
      "2.3 Generated Sensitive Content across Cultural Contexts": {
        "status": "Yes",
        "sources": [
          {
            "type": "🌐",
            "detail": "https://arxiv.org/html/2408.00118v1#S6",
            "name": "Content Safety Analysis"
          }
        ],
        "questions": {
          "Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true,
          "Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false,
          "Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false,
          "Has the AI system been evaluated for content embedding values not reflective of user cultural context": false,
          "Has the AI system been evaluated for exposing users to inappropriate content": false,
          "Has the AI system been evaluated for content with negative psychological impacts": true,
          "Has the evaluation explicitly addressed cultural variation": false
        }
      },
      "2.4 Cultural Variation Transparency and Documentation": {
        "status": "No",
        "sources": [],
        "questions": {
          "Documentation of cultural contexts considered during development": false,
          "Documentation of cultural contexts covered by evaluations": false,
          "Sufficient documentation of evaluation method": false,
          "Sufficient documentation of evaluation methods to replicate findings": false,
          "Sufficient documentation of evaluation results": false,
          "Documentation of psychological impact on evaluators": false,
          "Documentation of evaluator well-being measures": false
        }
      }
    },
    "3. Disparate Performance": {
      "3.1 Disparate Performance Overview": {
        "status": "No",
        "sources": [],
        "questions": {
          "Have development choices been evaluated for disparate performance contribution": false,
          "Have extrinsic disparate performance evaluations been run": false,
          "Have evaluations been run across all applicable modalities": false,
          "Have disparate performance evaluations been run quantitatively": false,
          "Have disparate performance evaluations been run with human participants": false
        }
      },
      "3.2 Identifying Target Groups": {
        "status": "No",
        "sources": [],
        "questions": {
          "Identification of mandated target groups": false,
          "Identification of additional potentially harmed groups": false,
          "Assessment of systemic barriers in data collection": false,
          "Consideration of historical disparities": false,
          "Identification of implicit and explicit markers": false
        }
      },
      "3.3 Subgroup Performance Analysis": {
        "status": "No",
        "sources": [],
        "questions": {
          "Non-aggregated evaluation results across subpopulations": false,
          "Metrics for decision-making tasks": false,
          "Metrics for other tasks including generative": false,
          "Worst-case subgroup performance analysis": false,
          "Intersectional analysis": false,
          "Evaluation of implicit social group markers": false
        }
      },
      "3.4 Transparency and Documentation": {
        "status": "No",
        "sources": [],
        "questions": {
          "Documentation of evaluation method scope": false,
          "Documentation of evaluation methods for replication": false,
          "Documentation of evaluation results for comparison": false,
          "Documentation of mitigation measures": false,
          "Documentation of monitoring approaches": false
        }
      }
    },
    "4. Environmental Costs and Carbon Emissions Evaluation": {
      "4.1 Environmental Costs Overview": {
        "status": "N/A",
        "sources": [],
        "questions": {
          "Evaluations of different processes": false,
          "Evaluations across modalities": false,
          "Evaluations on standardized benchmarks": false,
          "Community feedback consideration": false,
          "Full supply chain consideration": false
        }
      },
      "4.2 Development Impact": {
        "status": "N/A",
        "sources": [],
        "questions": {
          "FLOPS accounting": false,
          "Energy consumption evaluation": false,
          "Carbon impact evaluation": false,
          "Hardware lifecycle evaluation": false
        }
      },
      "4.3 Deployment Impact": {
        "status": "Yes",
        "sources": [
          {
            "type": "🌐",
            "detail": "https://cloud.google.com/blog/products/ai-machine-learning/performance-deepdive-of-gemma-on-google-cloud",
            "name": "Performance Analysis"
          }
        ],
        "questions": {
          "Evaluation of inference FLOPS": true,
          "Evaluation of common deployment energy consumption": false,
          "Evaluation across deployment settings": false,
          "Evaluation of task-specific variations": false,
          "Evaluation of deployment carbon impact": false,
          "Evaluation of deployment hardware lifecycle": false
        }
      },
      "4.4 Documentation": {
        "status": "N/A",
        "sources": [],
        "questions": {
          "Equipment and infrastructure documentation": false,
          "Evaluation methods documentation": false,
          "Results documentation": false,
          "Documentation for comparison": false
        }
      }
    },
    "5. Privacy and Data Protection Evaluation": {
      "5.1 Overview": {
        "status": "Yes",
        "sources": [
          {
            "type": "🌐",
            "detail": "https://arxiv.org/pdf/2408.00118",
            "name": "Privacy Evaluation"
          }
        ],
        "questions": {
          "Evaluations at various stages": true,
          "Intrinsic privacy vulnerability evaluation": false,
          "Extrinsic privacy evaluations": true,
          "Evaluations across modalities": false,
          "Quantitative privacy evaluations": true,
          "Human participant privacy evaluations": false
        }
      },
      "5.2 Privacy Harms": {
        "status": "Yes",
        "sources": [
          {
            "type": "🌐",
            "detail": "https://arxiv.org/pdf/2408.00118",
            "name": "Privacy Analysis"
          }
        ],
        "questions": {
          "Personal information revelation evaluation": true,
          "Content impersonation evaluation": true,
          "Personal information confabulation evaluation": true
        }
      },
      "5.3 IP and Security": {
        "status": "Yes",
        "sources": [
          {
            "type": "🌐",
            "detail": "https://www.cio.com/article/3567106/latticeflow-launches-first-comprehensive-evaluation-framework-for-compliance-with-the-eu-ai-act.html",
            "name": "Security Evaluation"
          }
        ],
        "questions": {
          "Training data reproduction evaluation": true,
          "Information security risk evaluation": false
        }
      },
      "5.4 Documentation": {
        "status": "Yes",
        "sources": [
          {
            "type": "🌐",
            "detail": "https://ai.google.dev/gemma/docs/model_card_2",
            "name": "Model Card Documentation"
          }
        ],
        "questions": {
          "Evaluation methods documentation": false,
          "Results documentation": false,
          "Limitations documentation": true,
          "Deployment considerations documentation": false,
          "Training data documentation": false
        }
      }
    },
    "6. Financial Costs Evaluation": {
      "6.1 Overview": {
        "status": "N/A",
        "sources": [],
        "questions": {
          "Cost evaluation across stages": false,
          "Component cost evaluation": false,
          "Modality cost evaluation": false,
          "Direct and indirect expense evaluation": false,
          "Cost projection validation": false
        }
      },
      "6.2 Development Costs": {
        "status": "N/A",
        "sources": [],
        "questions": {
          "R&D labor costs": false,
          "Data collection costs": false,
          "Infrastructure costs": false,
          "Training approach costs": false,
          "Architecture impact costs": false
        }
      },
      "6.3 Operation Costs": {
        "status": "N/A",
        "sources": [],
        "questions": {
          "Inference costs": false,
          "Storage costs": false,
          "Scaling costs": false,
          "Deployment context costs": false,
          "Update costs": false
        }
      },
      "6.4 Documentation": {
        "status": "N/A",
        "sources": [],
        "questions": {
          "Methodology documentation": false,
          "Cost breakdown documentation": false,
          "Usage scenario documentation": false,
          "Projection documentation": false
        }
      }
    },
    "7. Data and Content Moderation Labor Evaluation": {
      "7.1 Overview": {
        "status": "No",
        "sources": [],
        "questions": {
          "Labor practice evaluation": false,
          "Worker category evaluation": false,
          "Task type evaluation": false,
          "Industry standard evaluation": false,
          "Worker type evaluation": false,
          "Regional context evaluation": false
        }
      },
      "7.2 Working Conditions": {
        "status": "No",
        "sources": [],
        "questions": {
          "Compensation assessment": false,
          "Job security assessment": false,
          "Workplace safety evaluation": false,
          "Worker autonomy assessment": false,
          "Power dynamics evaluation": false
        }
      },
      "7.3 Worker Wellbeing": {
        "status": "No",
        "sources": [],
        "questions": {
          "Support system assessment": false,
          "Content preparation evaluation": false,
          "Cultural support evaluation": false
        }
      },
      "7.4 Documentation": {
        "status": "No",
        "sources": [],
        "questions": {
          "Methodology documentation": false,
          "Demographics documentation": false,
          "Support system documentation": false,
          "Incident reporting documentation": false
        }
      }
    }
  }
}