{ "metadata": { "Name": "Model A", "Provider": "TechCorp", "Version": "2.1", "Release Date": "2023-09-15", "Type": "Large Language Model", "Modalities": ["Text-to-Text"] }, "scores": { "Bias, Stereotypes, and Representational Harms Evaluation": { "Comprehensive Evaluation Methodology": { "status": "Yes", "source": "Both", "applicable_evaluations": [ "Evaluations at various stages (data collection, preprocessing, model architecture, training, deployment)", "Both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods" ] }, "Inclusive Protected Class Consideration": { "status": "No", "source": null, "applicable_evaluations": [ "Evaluation of non-standard protected classes (e.g., socioeconomic status, education level, regional differences)", "Consideration of intersectionality and how identity aspects interact", "Assessment of potential harms to non-typical groups (e.g., by profession or hobbies)" ] }, "Cultural and Linguistic Diversity": { "status": "Yes", "source": "3P", "applicable_evaluations": [ "Tests of model performance and biases across languages and cultures", "Consideration of how protected categories may shift in meaning across regions" ] }, "Stereotype and Harmful Association Detection": { "status": "Yes", "source": "1P", "applicable_evaluations": [ "Detection of stereotypical word associations in text models", "Sentiment analysis and toxicity measurements, especially regarding specific groups" ] }, "Performance Disparities Assessment": { "status": "No", "source": null, "applicable_evaluations": [ "Detailed breakdowns of performance metrics (accuracy, precision, recall) for various subgroups", "Performance analysis for disadvantaged subgroups", "Intersectionality considerations in performance analysis" ] } }, "Cultural Values and Sensitive Content Evaluation": { "Hate Speech and Toxicity Evaluation": { "status": "Yes", "source": "Both", "applicable_evaluations": [ "Assessments of harmful text generation", "Evaluations of toxicity, hurtfulness, or offensiveness" ] }, "Cultural Value Representation": { "status": "No", "source": null, "applicable_evaluations": [ "Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)", "Inductive and participatory evaluations grounded in specific cultural contexts", "Assessments of ethical scenarios and political value representation" ] }, "Diverse Cultural Context": { "status": "Yes", "source": "3P", "applicable_evaluations": [ "Assessments that don't equate nationality with cultural context", "Representation of differing cultural values within countries" ] } }, "Disparate Performance": { "Subpopulation Performance Analysis": { "status": "Yes", "source": "1P", "applicable_evaluations": [ "Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations", "Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios" ] }, "Cross-lingual and Dialect Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Cross-lingual prompting on standard benchmarks", "Examination of performance across dialects", "Analysis of hallucination disparity across languages" ] }, "Image Generation Quality Assessment": { "status": "N/A", "source": null, "applicable_evaluations": [] } }, "Environmental Costs and Carbon Emissions Evaluation": { "Energy Consumption Measurement": { "status": "Yes", "source": "1P", "applicable_evaluations": [ "Measurement of energy used in training, testing, and deploying the system", "Evaluation of compute power consumption" ] }, "Carbon Footprint Quantification": { "status": "No", "source": null, "applicable_evaluations": [ "Use of tools like CodeCarbon or Carbontracker", "Measurement of carbon emissions for training and inference", "Conversion of energy consumption to carbon emissions" ] }, "Hardware Resource Evaluation": { "status": "Yes", "source": "1P", "applicable_evaluations": [ "Assessment of CPU, GPU, and TPU usage", "Measurement of FLOPS (Floating Point Operations)" ] } }, "Privacy and Data Protection Evaluation": { "Data Minimization and Consent Practices": { "status": "Yes", "source": "Both", "applicable_evaluations": [ "Implementation of data minimization practices", "Use of opt-in data collection methods", "Assessment of active consent for collecting, processing, and sharing data" ] }, "Memorization and Data Leakage Evaluation": { "status": "Yes", "source": "1P", "applicable_evaluations": [ "Examination of the maximum amount of discoverable information given training data", "Evaluation of extractable information without training data access" ] }, "Personal Information Revelation Assessment": { "status": "No", "source": null, "applicable_evaluations": [ "Direct prompting tests to reveal Personally Identifiable Information (PII)", "Use of tools like ProPILE to audit PII revelation likelihood", "Evaluation of the system's ability to infer personal attributes" ] } }, "Financial Costs Evaluation": { "Comprehensive Cost Evaluation": { "status": "Yes", "source": "1P", "applicable_evaluations": [ "Estimation of infrastructure and hardware costs", "Calculation of labor hours from researchers, developers, and crowd workers", "Tracking of compute costs using low-cost or standard pricing per instance-hour" ] }, "Storage and Training Cost Analysis": { "status": "Yes", "source": "1P", "applicable_evaluations": [ "Assessment of storage costs for both datasets and resulting models", "Consideration of in-house vs. cloud storage options", "Evaluation of training costs based on in-house GPUs or per-hour-priced instances" ] }, "Hosting and Inference Cost Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Evaluation of low-latency serving costs", "Assessment of inference costs based on token usage", "Consideration of factors such as initial prompt length and requested token response length" ] } }, "Data and Content Moderation Labor Evaluation": { "Crowdwork Standards Compliance": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of compliance with Criteria for Fairer Microwork", "Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines", "Comparison with Oxford Internet Institute's Fairwork Principles" ] }, "Crowdworker Demographics and Compensation": { "status": "Yes", "source": "3P", "applicable_evaluations": [ "Documentation of crowd workers' demographics", "Transparency in reporting instructions given to crowdworkers", "Assessment of how crowdworkers were evaluated and compensated" ] }, "Psychological Support and Content Exposure": { "status": "No", "source": null, "applicable_evaluations": [ "Documentation of immediate trauma support availability", "Assessment of long-term professional psychological support provision", "Evaluation of practices for controlling exposure to traumatic material" ] } } } }