{
"metadata": {
"Name": "Model C",
"Provider": "TechStart",
"Version": "1.0",
"Release Date": "2023-12-15",
"Type": "Specialized NLP Model",
"Modalities": ["Text-to-Text"]
},
"scores": {
"Bias, Stereotypes, and Representational Harms Evaluation": {
"Comprehensive Evaluation Methodology": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Evaluations at various stages (data collection, preprocessing, model architecture, training, deployment)",
"Both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods"
]
},
"Inclusive Protected Class Consideration": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Evaluation of non-standard protected classes (e.g., socioeconomic status, education level, regional differences)",
"Consideration of intersectionality and how identity aspects interact"
]
},
"Cultural and Linguistic Diversity": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Tests of model performance and biases across languages and cultures",
"Consideration of how protected categories may shift in meaning across regions"
]
},
"Stereotype and Harmful Association Detection": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Detection of stereotypical word associations in text models",
"Sentiment analysis and toxicity measurements, especially regarding specific groups"
]
},
"Performance Disparities Assessment": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Detailed breakdowns of performance metrics (accuracy, precision, recall) for various subgroups",
"Performance analysis for disadvantaged subgroups"
]
},
"Bias Mitigation and Impact Analysis": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Documentation of bias mitigation strategies",
"Analyses of how model updates or mitigations affect bias metrics"
]
},
"Transparency and Limitations Disclosure": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Clear statements on the capabilities and limitations of evaluation methods",
"Acknowledgment of potential biases from the evaluation tools/processes"
]
},
"Ongoing Evaluation Commitment": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Plans for continual bias assessment as the model is updated or deployed in new contexts",
"Commitments to transparency and regular reporting on bias-related issues"
]
}
},
"Cultural Values and Sensitive Content Evaluation": {
"Hate Speech and Toxicity Evaluation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessments of harmful text generation",
"Evaluations of toxicity, hurtfulness, or offensiveness"
]
},
"Cultural Value Representation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)",
"Assessments of ethical scenarios and political value representation"
]
},
"Diverse Cultural Context": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessments that don't equate nationality with cultural context",
"Representation of differing cultural values within countries"
]
},
"Sensitive Content Identification": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Recognition of topics that vary by culture and viewpoint",
"Evaluation of adult sexual content identification"
]
},
"Impact of Generated Content": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of potential harm to targeted viewers",
"Evaluation of content's potential to normalize harmful ideas"
]
},
"Multidimensional Cultural Analysis": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Evaluations at word, sentence, and document levels for text",
"Multi-level analysis of cultural representation"
]
}
},
"Disparate Performance": {
"Subpopulation Performance Analysis": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
"Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios"
]
},
"Cross-lingual and Dialect Evaluation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Cross-lingual prompting on standard benchmarks",
"Examination of performance across dialects"
]
},
"Image Generation Quality Assessment": {
"status": "N/A",
"source": null,
"applicable_evaluations": []
},
"Data Duplication and Bias Analysis": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Analysis of the effect of retaining duplicate examples in the training dataset",
"Evaluation of model bias towards generating certain phrases or concepts"
]
},
"Dataset Disparities Evaluation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of dataset skew with fewer examples from some subpopulations",
"Evaluation of feature inconsistencies across subpopulations"
]
},
"Evaluation of Systemic Issues": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of disparities due to dataset collection methods",
"Evaluation of the impact of varying levels of internet access on data representation"
]
},
"Long-tail Data Distribution Analysis": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of model performance on rare or uncommon data points",
"Evaluation of the trade-off between fitting long tails and unintentional memorization"
]
}
},
"Environmental Costs and Carbon Emissions Evaluation": {
"Energy Consumption Measurement": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Measurement of energy used in training, testing, and deploying the system",
"Evaluation of compute power consumption"
]
},
"Carbon Footprint Quantification": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Use of tools like CodeCarbon or Carbontracker",
"Measurement of carbon emissions for training and inference"
]
},
"Hardware Resource Evaluation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of CPU, GPU, and TPU usage",
"Measurement of FLOPS (Floating Point Operations)"
]
},
"Comprehensive Environmental Impact Assessment": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Use of Life Cycle Assessment (LCA) methodologies",
"Evaluation of immediate impacts of applying ML"
]
},
"Transparency in Environmental Reporting": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Disclosure of uncertainty around measured variables",
"Reporting of marginal costs (e.g., added parameters' contribution to energy consumption)"
]
},
"Comprehensive Environmental Impact Metrics": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Discussion of different approaches to measuring environmental impact",
"Use of diverse measurements beyond energy consumption"
]
}
},
"Privacy and Data Protection Evaluation": {
"Data Minimization and Consent Practices": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Implementation of data minimization practices",
"Use of opt-in data collection methods"
]
},
"Memorization and Data Leakage Evaluation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Examination of the maximum amount of discoverable information given training data",
"Evaluation of extractable information without training data access"
]
},
"Personal Information Revelation Assessment": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Direct prompting tests to reveal Personally Identifiable Information (PII)",
"Evaluation of the system's ability to infer personal attributes"
]
},
"Image and Audio Privacy Evaluation": {
"status": "N/A",
"source": null,
"applicable_evaluations": []
},
"Intellectual Property and Copyright Evaluation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of the system's ability to generate copyrighted content",
"Evaluation of intellectual property concerns in generated content"
]
},
"Retroactive Privacy Protection": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of the system's capability to retroactively retrain in accordance with privacy policies",
"Evaluation of processes for removing specific data points upon request"
]
},
"Third-party Hosting Privacy Evaluation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of potential leakage of private input data in generations",
"Evaluation of system prompt privacy, especially for prompts containing proprietary information"
]
},
"Generative AI-Specific Privacy Measures": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of the applicability of data sanitization techniques to generative models",
"Evaluation of differential privacy approaches in the context of generative AI"
]
}
},
"Financial Costs Evaluation": {
"Comprehensive Cost Evaluation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Estimation of infrastructure and hardware costs",
"Calculation of labor hours from researchers, developers, and crowd workers"
]
},
"Storage and Training Cost Analysis": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of storage costs for both datasets and resulting models",
"Evaluation of training costs based on in-house GPUs or per-hour-priced instances"
]
},
"Hosting and Inference Cost Evaluation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Evaluation of low-latency serving costs",
"Assessment of inference costs based on token usage"
]
},
"Modality-Specific Cost Analysis": {
"status": "N/A",
"source": null,
"applicable_evaluations": []
},
"Long-term Cost Considerations": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of pre- and post-deployment costs",
"Consideration of human labor and hidden costs"
]
},
"API Cost Evaluation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of token-usage based pricing",
"Evaluation of cost variations based on initial prompt length and requested token response length"
]
},
"Comprehensive Cost Tracking": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of costs related to broader infrastructure or organizational changes",
"Evaluation of long-term maintenance and update costs"
]
}
},
"Data and Content Moderation Labor Evaluation": {
"Crowdwork Standards Compliance": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of compliance with Criteria for Fairer Microwork",
"Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines"
]
},
"Crowdworker Demographics and Compensation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Documentation of crowd workers' demographics",
"Assessment of how crowdworkers were evaluated and compensated"
]
},
"Psychological Support and Content Exposure": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Documentation of immediate trauma support availability",
"Evaluation of practices for controlling exposure to traumatic material"
]
},
"Transparency in Crowdwork Documentation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Use of transparent reporting frameworks",
"Documentation of crowdwork's role in shaping AI system output"
]
},
"Crowdwork Stages and Types": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of crowdwork in data gathering, curation, cleaning, and labeling",
"Evaluation of crowdwork during model development and interim evaluations"
]
},
"Evaluation of Labor Protection and Regulations": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of compliance with relevant labor law interventions by jurisdiction",
"Evaluation of worker classification and associated protections"
]
},
"Outsourcing Impact Evaluation": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of communication barriers created by outsourcing",
"Evaluation of differences in working conditions between in-house and outsourced labor"
]
},
"Impact of Precarious Employment": {
"status": "No",
"source": null,
"applicable_evaluations": [
"Assessment of job security and its impact on worker feedback",
"Evaluation of anonymous reporting systems for substandard working conditions"
]
}
}
}
}