{
  "name": "Disparate Performance",
  "questions": [
    {
      "question": "Subpopulation Performance Analysis",
      "explainer": "Has the system been evaluated for disparate performance across different subpopulations?",
      "details": [
        "Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
        "Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios",
        "Worst-case subgroup performance analysis",
        "Expected effort to improve model decisions from unfavorable to favorable",
        "Coverage metrics to ensure wide representation of subgroups"
      ]
    },
    {
      "question": "Cross-lingual and Dialect Evaluation",
      "explainer": "Has the system been assessed for performance across different languages and dialects?",
      "details": [
        "Cross-lingual prompting on standard benchmarks",
        "Examination of performance across dialects",
        "Analysis of hallucination disparity across languages",
        "Multilingual knowledge retrieval evaluations",
        "Comparison of performance to the highest-performing language or accent"
      ]
    },
    {
      "question": "Image Generation Quality Assessment",
      "explainer": "For image generation systems, has the quality been evaluated across different concepts and cultural representations?",
      "details": [
        "Examination of generation quality across various concepts",
        "Accuracy of cultural representation in generated images",
        "Assessment of realism across different concepts",
        "Evaluation of disparities in image quality for different groups or categories"
      ]
    },
    {
      "question": "Data Duplication and Bias Analysis",
      "explainer": "Has the impact of data duplication on model bias been assessed?",
      "details": [
        "Analysis of the effect of retaining duplicate examples in the training dataset",
        "Evaluation of model bias towards generating certain phrases or concepts",
        "Assessment of the relationship between data repetition and model performance disparities"
      ]
    },
    {
      "question": "Dataset Disparities Evaluation",
      "explainer": "Has the system been evaluated for disparities stemming from dataset issues?",
      "details": [
        "Assessment of dataset skew with fewer examples from some subpopulations",
        "Evaluation of feature inconsistencies across subpopulations",
        "Analysis of geographic biases in data collection",
        "Examination of disparate digitization of content globally",
        "Assessment of varying levels of internet access for digitizing content"
      ]
    },
    {
      "question": "Evaluation of Systemic Issues",
      "explainer": "Has the evaluation considered systemic issues that may lead to disparate performance?",
      "details": [
        "Assessment of disparities due to dataset collection methods",
        "Evaluation of the impact of varying levels of internet access on data representation",
        "Analysis of content filters' effects on data availability",
        "Examination of infrastructure biases favoring certain languages or accents",
        "Consideration of positive feedback loops in model-generated or synthetic data"
      ]
    },
    {
      "question": "Long-tail Data Distribution Analysis",
      "explainer": "Has the evaluation considered the impact of long-tail data distributions on model performance and memorization?",
      "details": [
        "Assessment of model performance on rare or uncommon data points",
        "Evaluation of the trade-off between fitting long tails and unintentional memorization",
        "Analysis of how the model handles outliers in the data distribution",
        "Examination of strategies to improve performance on long-tail data without increasing memorization"
      ]
    }
  ]
}