Spaces:
Running
Running
File size: 20,871 Bytes
a8af1a7 7c15f75 a8af1a7 7c15f75 a8af1a7 7c15f75 a8af1a7 6301ef2 7c15f75 a8af1a7 7c15f75 a8af1a7 7c15f75 a8af1a7 7c15f75 a8af1a7 7c15f75 a8af1a7 7c15f75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 |
{
"metadata": {
"Name": "Model C",
"Provider": "BigCode",
"URL": "https://huggingface.co/bigcode/starcoder2-15b",
"Type": "Large Language Model",
"Modalities": [
"Text-to-Text"
]
},
"scores": {
"1. Bias, Stereotypes, and Representational Harms Evaluation": {
"1.1 Bias Detection Overview": {
"status": "Yes",
"sources": [
{
"type": "🌐",
"detail": "https://arxiv.org/abs/2402.19173",
"name": "BOLD - Bias in Open-ended Language Generation Dataset"
},
{
"type": "🌐",
"detail": "https://arxiv.org/abs/2402.19173",
"name": "WinoBias"
}
],
"questions": {
"Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": false,
"Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)": false,
"Have extrinsic bias evaluations been run (e.g., downstream task performance)": true,
"Have evaluations been run across all applicable modalities": true,
"Have bias evaluations been run that take the form of automatic quantitative evaluation": true,
"Have bias evaluations been run with human participants?": false
}
},
"1.2 Protected Classes and Intersectional Measures": {
"status": "No",
"sources": [],
"questions": {
"Do evaluations cover all applicable legal protected categories for in-scope uses of the system?": false,
"Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics": false,
"Evaluation of how different aspects of identity interact and compound in AI system behavior": false,
"Evaluation of AI system biases for legal protected categories and additional relevant subgroups": false
}
},
"1.3 Measurement of Stereotypes and Harmful Associations": {
"status": "Yes",
"sources": [
{
"type": "🌐",
"detail": "https://arxiv.org/abs/2402.19173",
"name": "HONEST - Hurtful Sentence Completion in English Language Models"
},
{
"type": "🌐",
"detail": "https://arxiv.org/abs/2402.19173",
"name": "RealToxicityPrompts"
}
],
"questions": {
"Measurement of known stereotypes in AI system outputs": true,
"Measurement of other negative associations and assumptions regarding specific groups": true,
"Measurement of stereotypes and negative associations across in-scope contexts": false
}
},
"1.4 Bias Evaluation Transparency and Documentation": {
"status": "Yes",
"sources": [
{
"type": "🌐",
"detail": "https://arxiv.org/abs/2402.19173",
"name": "Evaluation Documentation"
}
],
"questions": {
"Sufficient documentation of evaluation methods (including code and datasets) to replicate findings": true,
"Sufficient documentation of evaluation results (including intermediary statistics) to support comparison to other AI systems": true,
"Documentation of bias mitigation measures, including their secondary impacts": false,
"Documentation of bias monitoring approaches post-release/deployment if applicable": false
}
}
},
"2. Cultural Values and Sensitive Content Evaluation": {
"2.1 Cultural Variation Overview": {
"status": "N/A",
"sources": [],
"questions": {
"Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": false,
"Have intrinsic properties of the AI system been evaluated for cultural variation(e.g., embedding analysis)": false,
"Have extrinsic cultural variation evaluations been run (e.g., downstream task performance)": false,
"Have evaluations been run across all applicable modalities": false,
"Have cultural variation evaluations been run that take the form of automatic quantitative evaluation": false,
"Have cultural variation evaluations been run with human participants?": false
}
},
"2.2 Cultural Diversity and Representation": {
"status": "N/A",
"sources": [],
"questions": {
"Use of evaluation methods developed in the cultural contexts in scope": false,
"Respect of indigenous sovereignty, protected rights, and cultural norms in AI system-generated content": false,
"Evaluation of cultural variation across geographic dimensions": false,
"Evaluation of cultural variation representing communities' perspectives within geographical contexts": false,
"Analysis of how cultural context affects AI system performance": false
}
},
"2.3 Generated Sensitive Content across Cultural Contexts": {
"status": "Yes",
"sources": [
{
"type": "🌐",
"detail": "https://arxiv.org/abs/2402.19173",
"name": "HONEST - Hurtful Sentence Completion in English Language Models"
},
{
"type": "🌐",
"detail": "https://arxiv.org/abs/2402.19173",
"name": "RealToxicityPrompts"
}
],
"questions": {
"Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true,
"Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false,
"Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false,
"Has the AI system been evaluated for its likelihood of exposing its direct users to content embedding values and assumptions not reflective of their cultural context": false,
"Has the AI system been evaluated for its likelihood of exposing its direct users to inappropriate content for their use context": true,
"Has the AI system been evaluated for its likelihood of exposing its direct users to content with negative psychological impacts": false,
"Has the evaluation of the AI system's behaviors explicitly considered cultural variation in their definition": false
}
},
"2.4 Cultural Variation Transparency and Documentation": {
"status": "N/A",
"sources": [],
"questions": {
"Documentation of cultural contexts considered during development": false,
"Documentation of the range of cultural contexts covered by evaluations": false,
"Sufficient documentation of evaluation method to understand the scope of the findings": false,
"Construct validity, documentation of strengths, weaknesses, and assumptions": false,
"Domain shift between evaluation development and AI system development settings": false,
"Sufficient documentation of evaluation methods to replicate findings": false,
"Sufficient documentation of evaluation results to support comparison": false,
"Document of psychological impact on evaluators reviewing harmful content": false,
"Documentation of measures to protect evaluator well-being": false
}
}
},
"3. Disparate Performance Evaluation": {
"3.1 Disparate Performance Overview": {
"status": "N/A",
"sources": [],
"questions": {
"Have development choices and intrinsic properties of the AI system been evaluated for their contribution to disparate performance?": false,
"Have extrinsic disparate performance evaluations been run": false,
"Have evaluations been run across all applicable modalities": false,
"Have disparate performance evaluations been run that take the form of automatic quantitative evaluation": false,
"Have disparate performance evaluations been run with human participants": false
}
},
"3.2 Identifying Target Groups for Disparate Performance Evaluation": {
"status": "N/A",
"sources": [],
"questions": {
"Identification of mandated target group based on legal nondiscrimination frameworks": false,
"Identification of further target groups that are likely to be harmed by disparate performance": false,
"Assessment of systemic barriers in dataset collection methods for different groups": false,
"Consideration of historical disparities in the task in which the AI system is deployed": false,
"Identification of both implicit and explicit markers for the target groups": false
}
},
"3.3 Subgroup Performance Analysis": {
"status": "N/A",
"sources": [],
"questions": {
"Non-aggregated evaluation results across subpopulations, including feature importance and consistency analysis": false,
"Metrics to measure performance in decision-making tasks": false,
"Metrics to measure disparate performance in other tasks including generative tasks": false,
"Worst-case subgroup performance analysis, including performance on rare or underrepresented cases": false,
"Intersectional analysis examining performance across combinations of subgroup": false,
"Do evaluations of disparate performance account for implicit social group markers": false
}
},
"3.4 Disparate Performance Evaluation Transparency and Documentation": {
"status": "N/A",
"sources": [],
"questions": {
"Sufficient documentation of evaluation method to understand the scope of the findings": false,
"Documentation of strengths, weaknesses, and assumptions about the context": false,
"Documentation of domain shift between evaluation and deployment settings": false,
"Sufficient documentation of evaluation methods to replicate findings": false,
"Sufficient documentation of evaluation results to support comparison": false,
"Documentation of disparate performance mitigation measures": false,
"Documentation of disparate performance monitoring approaches": false
}
}
},
"4. Environmental Costs and Carbon Emissions Evaluation": {
"4.1 Environmental Costs Overview": {
"status": "Yes",
"sources": [
{
"type": "🌐",
"detail": "https://mlco2.github.io/impact/#compute",
"name": "Machine Learning Emissions Calculator"
}
],
"questions": {
"Evaluations of different processes within development and deployment": false,
"Have evaluations been run across all applicable modalities?": true,
"Have evaluations been run on standardized benchmarks or metrics?": true,
"Have evaluations taken into account community feedback from regions affected by data center power consumption?": false,
"Do evaluations consider the full supply chain including environmental impact of hardware components and data centers used?": false
}
},
"4.2 Energy Cost and Environmental Impact of Development": {
"status": "Yes",
"sources": [
{
"type": "🌐",
"detail": "https://mlco2.github.io/impact/#compute",
"name": "Machine Learning Emissions Calculator"
}
],
"questions": {
"Accounting of FLOPS across development stages": true,
"Evaluation of energy consumption using standardized tracking tools": true,
"Evaluation of carbon impact accounting for regional energy sources": true,
"Evaluation of hardware lifecycle environmental impact": false
}
},
"4.3 Energy Cost and Environmental Impact of Deployment": {
"status": "N/A",
"sources": [],
"questions": {
"Evaluation of inference FLOPS for the system": false,
"Evaluation of inference energy consumption on most common deployment setting": false,
"Evaluation of inference energy consumption on multiple deployment settings": false,
"Evaluation of task-specific energy consumption variations": false,
"Evaluation of carbon impact for deployment infrastructure": false,
"Evaluation of hardware lifecycle environmental impact for deployment": false
}
},
"4.4 Environmental Costs Transparency and Documentation": {
"status": "Yes",
"sources": [
{
"type": "🌐",
"detail": "https://mlco2.github.io/impact/#compute",
"name": "Machine Learning Emissions Calculator"
}
],
"questions": {
"Documentation about equipment and infrastructure specifications": true,
"Sufficient documentation of evaluation methods including components covered": false,
"Sufficient documentation of evaluation methods to replicate findings": true,
"Sufficient documentation of evaluation results for comparison": true
}
}
},
"5. Privacy and Data Protection Evaluation": {
"5.1 Privacy and Data Protection Overview": {
"status": "Yes",
"sources": [
{
"type": "🏢",
"detail": "PII detection and redaction using an NER model"
},
{
"type": "🌐",
"detail": "https://huggingface.co/spaces/bigcode/in-the-stack",
"name": "Opt-out tool for users"
},
{
"type": "🌐",
"detail": "https://arxiv.org/abs/2402.19173",
"name": "Asleep at the Keyboard Security Benchmark"
}
],
"questions": {
"Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true,
"Have intrinsic properties of the AI system been evaluated for privacy vulnerabilities": false,
"Have extrinsic privacy evaluations been run": true,
"Have evaluations been run across all applicable modalities": true,
"Have privacy evaluations been run that take the form of automatic quantitative evaluation": true,
"Have privacy evaluations been run with human participants?": false
}
},
"5.2 Privacy, Likeness, and Publicity Harms": {
"status": "N/A",
"sources": [],
"questions": {
"Has the AI system been evaluated for its likelihood of revealing personal information from its training data?": false,
"Has the AI system been evaluated for its likelihood of facilitating generation of content impersonating an individual?": false,
"Has the AI system been evaluated for its likelihood of providing made up or confabulated personal information about individuals?": false
}
},
"5.3 Intellectual Property and Information Security": {
"status": "Yes",
"sources": [
{
"type": "🏢",
"detail": "Membership test to find if generated code was copied from the training corpus"
},
{
"type": "🏢",
"detail": "Code attribution tool to find the original author and license of the generated code"
},
{
"type": "🌐",
"detail": "https://arxiv.org/abs/2402.19173",
"name": "Asleep at the Keyboard Security Benchmark"
}
],
"questions": {
"Has the AI system been evaluated for its likelihood of reproducing other categories of information from its training data": true,
"Has the system been evaluated for other information security risks for in-scope uses": false
}
},
"5.4 Privacy Evaluation Transparency and Documentation": {
"status": "Yes",
"sources": [
{
"type": "🏢",
"detail": "Documentation of training data information risk categories and consent status"
}
],
"questions": {
"Documentation of the categories of training data that present information risk": true,
"Documentation of evaluation methods to replicate findings": true,
"Documentation of evaluation results to support comparison": true,
"Documentation of evaluation limitations": false,
"Documentation of deployment considerations": false
}
}
},
"6. Financial Costs Evaluation": {
"6.1 Financial Costs Overview": {
"status": "N/A",
"sources": [],
"questions": {
"Evaluation of costs at various stages": false,
"Have costs been evaluated for different system components": false,
"Have cost evaluations been run across all applicable modalities": false,
"Have cost evaluations included both direct and indirect expenses": false,
"Have cost projections been validated against actual expenses": false
}
},
"6.2 Development and Training Costs": {
"status": "N/A",
"sources": [],
"questions": {
"Assessment of research and development labor costs": false,
"Evaluation of data collection and preprocessing costs": false,
"Assessment of training infrastructure costs": false,
"Assessment of costs associated with different training approaches": false,
"Evaluation of model architecture and size impact on costs": false
}
},
"6.3 Deployment and Operation Costs": {
"status": "N/A",
"sources": [],
"questions": {
"Assessment of inference and serving costs": false,
"Evaluation of storage and hosting expenses": false,
"Assessment of scaling costs based on usage patterns": false,
"Evaluation of costs specific to different deployment contexts": false,
"Assessment of costs for model updates or fine-tuning by end users": false
}
},
"6.4 Financial Cost Documentation and Transparency": {
"status": "N/A",
"sources": [],
"questions": {
"Sufficient documentation of cost evaluation methodology and assumptions": false,
"Sufficient documentation of cost breakdowns and metrics": false,
"Documentation of cost variations across different usage scenarios": false,
"Documentation of long-term cost projections and risk factors": false
}
}
},
"7. Data and Content Moderation Labor Evaluation": {
"7.1 Labor Evaluation Overview": {
"status": "Yes",
"sources": [
{
"type": "🏢",
"detail": "PII annotations by human annotators with fair wage"
}
],
"questions": {
"Evaluation of labor practices at various stages": true,
"Have labor conditions been evaluated for different worker categories": true,
"Have labor evaluations been run across all applicable task types": false,
"Have labor practices been evaluated against established industry standards": true,
"Have labor evaluations included both direct employees and contracted workers": false,
"Have evaluations considered different regional and jurisdictional contexts": true
}
},
"7.2 Working Conditions and Compensation": {
"status": "Yes",
"sources": [
{
"type": "🏢",
"detail": "PII annotations by human annotators with fair wage"
}
],
"questions": {
"Assessment of compensation relative to local living wages and industry standards": true,
"Assessment of job security and employment classification": false,
"Evaluation of workplace safety, worker protections and rights": false,
"Assessment of worker autonomy and task assignment practices": false,
"Evaluation of power dynamics and worker feedback mechanisms": false
}
},
"7.3 Worker Wellbeing and Support": {
"status": "N/A",
"sources": [],
"questions": {
"Assessment of psychological support systems, trauma resources, and other long-term mental health monitoring": false,
"Evaluation of training and preparation for difficult content": false,
"Evaluation of cultural and linguistic support for diverse workforces": false
}
},
"7.4 Labor Practice Documentation and Transparency": {
"status": "Yes",
"sources": [
{
"type": "🏢",
"detail": "PII annotations by human annotators with fair wage"
}
],
"questions": {
"Documentation of labor evaluation methodology and frameworks used": true,
"Documentation of worker demographics and task distribution": false,
"Documentation of support systems, worker protections": false,
"Documentation of incident reporting and resolution procedures": false
}
}
}
}
}
|