{
"metadata": {
"Name": "Gemma 2",
"Provider": "Google",
"URL": "https://ai.google.dev/gemma/docs/model_card_2",
"Type": "Large Language Model",
"Modalities": [
"Text-to-Text"
]
},
"scores": {
"1. Bias, Stereotypes, and Representational Harms Evaluation": {
"1.1 Bias Detection Overview": {
"status": "Yes",
"sources": [
{
"type": "π",
"detail": "https://ai.google.dev/gemma/docs/model_card_2#data_preprocessing",
"name": "Model Card - Data Preprocessing"
},
{
"type": "π",
"detail": "https://developers.googleblog.com/en/gemma-explained-new-in-gemma-2/",
"name": "Developer Blog"
},
{
"type": "π",
"detail": "https://arxiv.org/html/2410.12864",
"name": "Bias Analysis Paper"
}
],
"questions": {
"Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true,
"Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)": true,
"Have extrinsic bias evaluations been run (e.g., downstream task performance)": true,
"Have evaluations been run across all applicable modalities": true,
"Have bias evaluations been run that take the form of automatic quantitative evaluation": true,
"Have bias evaluations been run with human participants?": true
}
},
"1.2 Protected Classes and Intersectional Measures": {
"status": "Yes",
"sources": [
{
"type": "π",
"detail": "https://ai.google.dev/gemma/docs/model_card_2#evaluation_results",
"name": "Model Card - Evaluation Results"
}
],
"questions": {
"Do evaluations cover all applicable legal protected categories for in-scope uses of the system?": true,
"Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics": false,
"Evaluation of how different aspects of identity interact and compound in AI system behavior": false,
"Evaluation of AI system biases for legal protected categories and additional relevant subgroups": false
}
},
"1.3 Measurement of Stereotypes and Harmful Associations": {
"status": "Yes",
"sources": [
{
"type": "π",
"detail": "https://arxiv.org/abs/2009.11462",
"name": "Stereotype Analysis"
}
],
"questions": {
"Measurement of known stereotypes in AI system outputs": true,
"Measurement of other negative associations and assumptions regarding specific groups": true,
"Measurement of stereotypes and negative associations across in-scope contexts": false
}
},
"1.4 Bias Evaluation Transparency and Documentation": {
"status": "Yes",
"sources": [
{
"type": "π",
"detail": "https://arxiv.org/pdf/2403.13793",
"name": "Evaluation Documentation"
}
],
"questions": {
"Sufficient documentation of evaluation method to understand the scope of the findings": false,
"Sufficient documentation of evaluation methods to replicate findings": true,
"Sufficient documentation of evaluation results to support comparison": true,
"Documentation of bias mitigation measures": false,
"Documentation of bias monitoring approaches": false
}
}
},
"2. Cultural Values and Sensitive Content Evaluation": {
"2.1 Cultural Variation Overview": {
"status": "Yes",
"sources": [
{
"type": "π",
"detail": "https://aclanthology.org/2024.findings-emnlp.942.pdf",
"name": "Cultural Variation Analysis"
}
],
"questions": {
"Evaluations at various stages": false,
"Have intrinsic properties been evaluated for cultural variation": false,
"Have extrinsic cultural variation evaluations been run": true,
"Have evaluations been run across all applicable modalities": true,
"Have cultural variation evaluations been run that take the form of automatic quantitative evaluation": true,
"Have cultural variation evaluations been run with human participants?": false
}
},
"2.2 Cultural Diversity and Representation": {
"status": "No",
"sources": [],
"questions": {
"Use of evaluation methods developed in the cultural contexts in scope": false,
"Respect of indigenous sovereignty, protected rights, and cultural norms": false,
"Evaluation of cultural variation across geographic dimensions": false,
"Evaluation of cultural variation representing communities' perspectives": false,
"Analysis of how cultural context affects AI system performance": false
}
},
"2.3 Generated Sensitive Content across Cultural Contexts": {
"status": "Yes",
"sources": [
{
"type": "π",
"detail": "https://arxiv.org/html/2408.00118v1#S6",
"name": "Content Safety Analysis"
}
],
"questions": {
"Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true,
"Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false,
"Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false,
"Has the AI system been evaluated for content embedding values not reflective of user cultural context": false,
"Has the AI system been evaluated for exposing users to inappropriate content": false,
"Has the AI system been evaluated for content with negative psychological impacts": true,
"Has the evaluation explicitly addressed cultural variation": false
}
},
"2.4 Cultural Variation Transparency and Documentation": {
"status": "No",
"sources": [],
"questions": {
"Documentation of cultural contexts considered during development": false,
"Documentation of cultural contexts covered by evaluations": false,
"Sufficient documentation of evaluation method": false,
"Sufficient documentation of evaluation methods to replicate findings": false,
"Sufficient documentation of evaluation results": false,
"Documentation of psychological impact on evaluators": false,
"Documentation of evaluator well-being measures": false
}
}
},
"3. Disparate Performance": {
"3.1 Disparate Performance Overview": {
"status": "No",
"sources": [],
"questions": {
"Have development choices been evaluated for disparate performance contribution": false,
"Have extrinsic disparate performance evaluations been run": false,
"Have evaluations been run across all applicable modalities": false,
"Have disparate performance evaluations been run quantitatively": false,
"Have disparate performance evaluations been run with human participants": false
}
},
"3.2 Identifying Target Groups": {
"status": "No",
"sources": [],
"questions": {
"Identification of mandated target groups": false,
"Identification of additional potentially harmed groups": false,
"Assessment of systemic barriers in data collection": false,
"Consideration of historical disparities": false,
"Identification of implicit and explicit markers": false
}
},
"3.3 Subgroup Performance Analysis": {
"status": "No",
"sources": [],
"questions": {
"Non-aggregated evaluation results across subpopulations": false,
"Metrics for decision-making tasks": false,
"Metrics for other tasks including generative": false,
"Worst-case subgroup performance analysis": false,
"Intersectional analysis": false,
"Evaluation of implicit social group markers": false
}
},
"3.4 Transparency and Documentation": {
"status": "No",
"sources": [],
"questions": {
"Documentation of evaluation method scope": false,
"Documentation of evaluation methods for replication": false,
"Documentation of evaluation results for comparison": false,
"Documentation of mitigation measures": false,
"Documentation of monitoring approaches": false
}
}
},
"4. Environmental Costs and Carbon Emissions Evaluation": {
"4.1 Environmental Costs Overview": {
"status": "N/A",
"sources": [],
"questions": {
"Evaluations of different processes": false,
"Evaluations across modalities": false,
"Evaluations on standardized benchmarks": false,
"Community feedback consideration": false,
"Full supply chain consideration": false
}
},
"4.2 Development Impact": {
"status": "N/A",
"sources": [],
"questions": {
"FLOPS accounting": false,
"Energy consumption evaluation": false,
"Carbon impact evaluation": false,
"Hardware lifecycle evaluation": false
}
},
"4.3 Deployment Impact": {
"status": "Yes",
"sources": [
{
"type": "π",
"detail": "https://cloud.google.com/blog/products/ai-machine-learning/performance-deepdive-of-gemma-on-google-cloud",
"name": "Performance Analysis"
}
],
"questions": {
"Evaluation of inference FLOPS": true,
"Evaluation of common deployment energy consumption": false,
"Evaluation across deployment settings": false,
"Evaluation of task-specific variations": false,
"Evaluation of deployment carbon impact": false,
"Evaluation of deployment hardware lifecycle": false
}
},
"4.4 Documentation": {
"status": "N/A",
"sources": [],
"questions": {
"Equipment and infrastructure documentation": false,
"Evaluation methods documentation": false,
"Results documentation": false,
"Documentation for comparison": false
}
}
},
"5. Privacy and Data Protection Evaluation": {
"5.1 Overview": {
"status": "Yes",
"sources": [
{
"type": "π",
"detail": "https://arxiv.org/pdf/2408.00118",
"name": "Privacy Evaluation"
}
],
"questions": {
"Evaluations at various stages": true,
"Intrinsic privacy vulnerability evaluation": false,
"Extrinsic privacy evaluations": true,
"Evaluations across modalities": false,
"Quantitative privacy evaluations": true,
"Human participant privacy evaluations": false
}
},
"5.2 Privacy Harms": {
"status": "Yes",
"sources": [
{
"type": "π",
"detail": "https://arxiv.org/pdf/2408.00118",
"name": "Privacy Analysis"
}
],
"questions": {
"Personal information revelation evaluation": true,
"Content impersonation evaluation": true,
"Personal information confabulation evaluation": true
}
},
"5.3 IP and Security": {
"status": "Yes",
"sources": [
{
"type": "π",
"detail": "https://www.cio.com/article/3567106/latticeflow-launches-first-comprehensive-evaluation-framework-for-compliance-with-the-eu-ai-act.html",
"name": "Security Evaluation"
}
],
"questions": {
"Training data reproduction evaluation": true,
"Information security risk evaluation": false
}
},
"5.4 Documentation": {
"status": "Yes",
"sources": [
{
"type": "π",
"detail": "https://ai.google.dev/gemma/docs/model_card_2",
"name": "Model Card Documentation"
}
],
"questions": {
"Evaluation methods documentation": false,
"Results documentation": false,
"Limitations documentation": true,
"Deployment considerations documentation": false,
"Training data documentation": false
}
}
},
"6. Financial Costs Evaluation": {
"6.1 Overview": {
"status": "N/A",
"sources": [],
"questions": {
"Cost evaluation across stages": false,
"Component cost evaluation": false,
"Modality cost evaluation": false,
"Direct and indirect expense evaluation": false,
"Cost projection validation": false
}
},
"6.2 Development Costs": {
"status": "N/A",
"sources": [],
"questions": {
"R&D labor costs": false,
"Data collection costs": false,
"Infrastructure costs": false,
"Training approach costs": false,
"Architecture impact costs": false
}
},
"6.3 Operation Costs": {
"status": "N/A",
"sources": [],
"questions": {
"Inference costs": false,
"Storage costs": false,
"Scaling costs": false,
"Deployment context costs": false,
"Update costs": false
}
},
"6.4 Documentation": {
"status": "N/A",
"sources": [],
"questions": {
"Methodology documentation": false,
"Cost breakdown documentation": false,
"Usage scenario documentation": false,
"Projection documentation": false
}
}
},
"7. Data and Content Moderation Labor Evaluation": {
"7.1 Overview": {
"status": "No",
"sources": [],
"questions": {
"Labor practice evaluation": false,
"Worker category evaluation": false,
"Task type evaluation": false,
"Industry standard evaluation": false,
"Worker type evaluation": false,
"Regional context evaluation": false
}
},
"7.2 Working Conditions": {
"status": "No",
"sources": [],
"questions": {
"Compensation assessment": false,
"Job security assessment": false,
"Workplace safety evaluation": false,
"Worker autonomy assessment": false,
"Power dynamics evaluation": false
}
},
"7.3 Worker Wellbeing": {
"status": "No",
"sources": [],
"questions": {
"Support system assessment": false,
"Content preparation evaluation": false,
"Cultural support evaluation": false
}
},
"7.4 Documentation": {
"status": "No",
"sources": [],
"questions": {
"Methodology documentation": false,
"Demographics documentation": false,
"Support system documentation": false,
"Incident reporting documentation": false
}
}
}
}
}