{
  "name": "Disparate Performance",
  "questions": [
    {
      "question": "Subpopulation Performance Analysis",
      "explainer": "Has the system been evaluated for disparate performance across different subpopulations?",
      "details": [
        "Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
        "Metrics such as subgroup accuracy, calibration, AUC, recall, precision, and min-max ratios",
        "Worst-case subgroup performance analysis",
        "Expected effort to improve model decisions from unfavorable to favorable",
        "Coverage metrics to ensure wide representation of subgroups"
      ]
    },
    {
      "question": "Cross-lingual and Dialect Evaluation",
      "explainer": "Has the system been assessed for performance across different languages and dialects?",
      "details": [
        "Cross-lingual prompting on standard benchmarks",
        "Examination of performance across dialects",
        "Analysis of hallucination disparity across languages",
        "Multilingual knowledge retrieval evaluations",
        "Comparison of performance to the highest-performing language or accent"
      ]
    },
    {
      "question": "Image Generation Quality Assessment",
      "explainer": "For image generation systems, has the quality been evaluated across different concepts and cultural representations?",
      "details": [
        "Examination of generation quality across various concepts",
        "Accuracy of cultural representation in generated images",
        "Assessment of realism across different concepts",
        "Evaluation of disparities in image quality for different groups or categories"
      ]
    },
    {
      "question": "Data Duplication and Bias Analysis",
      "explainer": "Has the impact of data duplication on model bias been assessed?",
      "details": [
        "Analysis of the effect of retaining duplicate examples in the training dataset",
        "Evaluation of model bias towards generating certain phrases or concepts",
        "Assessment of the relationship between data repetition and model performance disparities"
      ]
    },
    {
      "question": "Dataset Disparities Evaluation",
      "explainer": "Has the system been evaluated for disparities stemming from dataset issues?",
      "details": [
        "Assessment of dataset skew with fewer examples from some subpopulations",
        "Evaluation of feature inconsistencies across subpopulations",
        "Analysis of geographic biases in data collection",
        "Examination of disparate digitization of content globally",
        "Assessment of how varying levels of internet access affect the digitization of content"
      ]
    },
    {
      "question": "Evaluation of Systemic Issues",
      "explainer": "Has the evaluation considered systemic issues that may lead to disparate performance?",
      "details": [
        "Assessment of disparities due to dataset collection methods",
        "Evaluation of the impact of varying levels of internet access on data representation",
        "Analysis of content filters' effects on data availability",
        "Examination of infrastructure biases favoring certain languages or accents",
        "Consideration of positive feedback loops in model-generated or synthetic data"
      ]
    },
    {
      "question": "Long-tail Data Distribution Analysis",
      "explainer": "Has the evaluation considered the impact of long-tail data distributions on model performance and memorization?",
      "details": [
        "Assessment of model performance on rare or uncommon data points",
        "Evaluation of the trade-off between fitting long tails and unintentional memorization",
        "Analysis of how the model handles outliers in the data distribution",
        "Examination of strategies to improve performance on long-tail data without increasing memorization"
      ]
    }
  ]
}