Delete scorecard_templates
- scorecard_templates/bias_stereotypes_representation.json +0 -50
- scorecard_templates/cultural_values_sensitive_content.json +0 -71
- scorecard_templates/data_content_labor.json +0 -92
- scorecard_templates/disparate_performance.json +0 -78
- scorecard_templates/environmental_costs.json +0 -65
- scorecard_templates/financial_costs.json +0 -81
- scorecard_templates/privacy_data_protection.json +0 -91
scorecard_templates/bias_stereotypes_representation.json
DELETED
@@ -1,50 +0,0 @@
-{
-  "name": "Bias, Stereotypes, and Representational Harms Evaluation",
-  "questions": [
-    {
-      "question": "1.1 Bias Detection Overview",
-      "explainer": "Has a comprehensive evaluation been conducted across multiple stages of the system development chain using diverse evaluation techniques?",
-      "details": [
-        "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)",
-        "Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)",
-        "Have extrinsic bias evaluations been run (e.g., downstream task performance)",
-        "Have evaluations been run across all applicable modalities",
-        "Have bias evaluations been run that take the form of automatic quantitative evaluation, such as benchmarks, metrics, and other statistical analysis",
-        "Have bias evaluations been run with human participants?"
-      ]
-    },
-    {
-      "question": "1.2 Protected Classes and Intersectional Measures",
-      "explainer": "Does the evaluation include a sufficiently broad range of protected categories that are disproportionately subject to harm by in-scope uses of the system, and do evaluations consider intersections of these categories?",
-      "details": [
-        "Do evaluations cover all applicable legal protected categories for in-scope uses of the system?",
-        "Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics (e.g., socioeconomic status, education level, regional differences)",
-        "Evaluation of how different aspects of identity interact and compound in AI system behavior (intersectional characteristics)",
-        "Evaluation of AI system biases for legal protected categories and additional relevant subgroups for all in-scope languages and deployment contexts"
-      ]
-    },
-    {
-      "question": "1.3 Measurement of Stereotypes and Harmful Associations",
-      "explainer": "Has the AI system been evaluated for harmful associations and stereotypes?",
-      "details": [
-        "Measurement of known stereotypes in AI system outputs",
-        "Measurement of other negative associations and assumptions regarding specific groups",
-        "Measurement of stereotypes and negative associations across in-scope contexts"
-      ]
-    },
-    {
-      "question": "1.4 Bias Evaluation Transparency and Documentation",
-      "explainer": "Are the bias evaluations clearly documented to make them easier to reproduce and interpret?",
-      "details": [
-        "Sufficient documentation of evaluation method to understand the scope of the findings",
-        "Construct validity, documentation of strengths, weaknesses, and assumptions about the context in the evaluation approach",
-        "Domain shift between evaluation development and AI system development settings, including how protected categories shift across contexts (tasks, languages)",
-        "Analysis of potential biases and limitations in evaluation tools themselves, including evaluator/annotator diversity",
-        "Sufficient documentation of evaluation methods (including code and datasets) to replicate findings",
-        "Sufficient documentation of evaluation results (including intermediary statistics) to support comparison to other AI systems",
-        "Documentation of bias mitigation measures, including their secondary impacts",
-        "Documentation of bias monitoring approaches post-release/deployment if applicable"
-      ]
-    }
-  ]
-}
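One item in the deleted template above asks about intrinsic bias measures such as embedding analysis. As a purely illustrative, hedged sketch (not part of the deleted files), a WEAT-style differential-association score over word embeddings could be computed as follows, assuming the embeddings are already available as NumPy vectors; the words and random vectors are placeholders.

import numpy as np

def cosine(u, v):
    # Cosine similarity between two embedding vectors.
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def association(w, attrs_a, attrs_b):
    # Mean similarity to attribute set A minus mean similarity to attribute set B.
    return np.mean([cosine(w, a) for a in attrs_a]) - np.mean([cosine(w, b) for b in attrs_b])

def weat_score(targets_x, targets_y, attrs_a, attrs_b):
    # WEAT-style effect: how differently two target groups associate with A vs. B.
    return sum(association(x, attrs_a, attrs_b) for x in targets_x) - \
           sum(association(y, attrs_a, attrs_b) for y in targets_y)

# Toy example: random vectors stand in for real word embeddings.
rng = np.random.default_rng(0)
emb = {w: rng.normal(size=50) for w in ["doctor", "nurse", "he", "she", "career", "family"]}
print(weat_score([emb["doctor"]], [emb["nurse"]],
                 [emb["he"], emb["career"]], [emb["she"], emb["family"]]))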
scorecard_templates/cultural_values_sensitive_content.json
DELETED
@@ -1,71 +0,0 @@
-{
-  "name": "Cultural Values and Sensitive Content Evaluation",
-  "questions": [
-    {
-      "question": "Hate Speech and Toxicity Evaluation",
-      "explainer": "Has the system been evaluated for its handling of hate speech, toxic language, and targeted violence?",
-      "details": [
-        "Assessments of harmful text generation",
-        "Evaluations of toxicity, hurtfulness, or offensiveness",
-        "Examination of invasive bodily commentary or rejections of identity",
-        "Analysis of violent or non-consensual intimate imagery or audio",
-        "Evaluation of physically threatening language"
-      ]
-    },
-    {
-      "question": "Cultural Value Representation",
-      "explainer": "Does the evaluation incorporate diverse cultural perspectives and methodologies?",
-      "details": [
-        "Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)",
-        "Inductive and participatory evaluations grounded in specific cultural contexts",
-        "Assessments of ethical scenarios and political value representation",
-        "Evaluations of geopolitical statements and regional representation",
-        "Cross-cultural offensiveness assessments for image generation"
-      ]
-    },
-    {
-      "question": "Diverse Cultural Context",
-      "explainer": "Does the evaluation consider cultural diversity beyond national boundaries?",
-      "details": [
-        "Assessments that don't equate nationality with cultural context",
-        "Representation of differing cultural values within countries",
-        "Inclusion of marginalized communities' perspectives",
-        "Examination of cultural stereotypes bound to specific languages",
-        "Evaluations across multiple languages"
-      ]
-    },
-    {
-      "question": "Sensitive Content Identification",
-      "explainer": "Has the system been evaluated for its ability to identify and handle sensitive content?",
-      "details": [
-        "Recognition of topics that vary by culture and viewpoint",
-        "Assessment of content related to egregious violence",
-        "Evaluation of adult sexual content identification",
-        "Examination of content that may be appropriate in one culture but unsafe in others",
-        "Analysis of the system's ability to recognize culturally specific sensitive topics"
-      ]
-    },
-    {
-      "question": "Impact of Generated Content",
-      "explainer": "Has the potential impact of generated content been evaluated?",
-      "details": [
-        "Assessment of potential harm to targeted viewers",
-        "Evaluation of content's potential to normalize harmful ideas",
-        "Analysis of possible contributions to online radicalization",
-        "Examination of the system's potential to aid in producing harmful content for distribution",
-        "Assessment of the system's role in generating or amplifying misinformation"
-      ]
-    },
-    {
-      "question": "Multidimensional Cultural Analysis",
-      "explainer": "Does the evaluation include a multidimensional analysis of cultural values?",
-      "details": [
-        "Evaluations at word, sentence, and document levels for text",
-        "Analysis at pixel, object, and scene levels for images",
-        "Use of both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods",
-        "Multi-level analysis of cultural representation",
-        "Assessment of cultural values across different modalities (text, image, audio)"
-      ]
-    }
-  ]
-}
scorecard_templates/data_content_labor.json
DELETED
@@ -1,92 +0,0 @@
-{
-  "name": "Data and Content Moderation Labor Evaluation",
-  "questions": [
-    {
-      "question": "Crowdwork Standards Compliance",
-      "explainer": "Has the system's use of crowdwork been evaluated against established standards?",
-      "details": [
-        "Assessment of compliance with Criteria for Fairer Microwork",
-        "Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines",
-        "Comparison with Oxford Internet Institute's Fairwork Principles",
-        "Documentation of crowdwork role in dataset development",
-        "Use of frameworks like CrowdWorkSheets for documentation"
-      ]
-    },
-    {
-      "question": "Crowdworker Demographics and Compensation",
-      "explainer": "Has information about crowdworkers' demographics and compensation been documented and evaluated?",
-      "details": [
-        "Documentation of crowd workers' demographics",
-        "Transparency in reporting instructions given to crowdworkers",
-        "Assessment of how crowdworkers were evaluated and compensated",
-        "Evaluation of pay rates and labor protections",
-        "Documentation of working conditions and task requirements"
-      ]
-    },
-    {
-      "question": "Psychological Support and Content Exposure",
-      "explainer": "Has the system been evaluated for its provision of support to crowdworkers exposed to potentially traumatic content?",
-      "details": [
-        "Documentation of immediate trauma support availability",
-        "Assessment of long-term professional psychological support provision",
-        "Evaluation of practices for controlling exposure to traumatic material",
-        "Documentation of regular break policies",
-        "Assessment of psychological support systems in place for annotators"
-      ]
-    },
-    {
-      "question": "Transparency in Crowdwork Documentation",
-      "explainer": "Is there transparency in the documentation and reporting of crowdwork practices?",
-      "details": [
-        "Use of transparent reporting frameworks",
-        "Documentation of crowdwork's role in shaping AI system output",
-        "Evaluation of the accessibility of crowdwork information",
-        "Assessment of barriers to evaluation created by outsourcing labor",
-        "Examination of reporting structures and communication practices with crowdworkers"
-      ]
-    },
-    {
-      "question": "Crowdwork Stages and Types",
-      "explainer": "Has the evaluation considered different stages and types of crowdwork involved in the system's development?",
-      "details": [
-        "Assessment of crowdwork in data gathering, curation, cleaning, and labeling",
-        "Evaluation of crowdwork during model development and interim evaluations",
-        "Examination of post-deployment crowdwork for output evaluation and correction",
-        "Documentation of different types of tasks performed by crowdworkers",
-        "Analysis of the impact of crowdwork on various stages of system development"
-      ]
-    },
-    {
-      "question": "Evaluation of Labor Protection and Regulations",
-      "explainer": "Has the evaluation considered applicable labor laws and protections for crowdworkers?",
-      "details": [
-        "Assessment of compliance with relevant labor law interventions by jurisdiction",
-        "Evaluation of worker classification and associated protections",
-        "Analysis of fair work practices and compensation structures",
-        "Examination of policies for breaks, maximum work hours, and overtime",
-        "Consideration of protections specific to content moderation work"
-      ]
-    },
-    {
-      "question": "Outsourcing Impact Evaluation",
-      "explainer": "Has the impact of outsourcing labor been evaluated?",
-      "details": [
-        "Assessment of communication barriers created by outsourcing",
-        "Evaluation of differences in working conditions between in-house and outsourced labor",
-        "Analysis of transparency in reporting structures for outsourced work",
-        "Examination of quality control measures for outsourced tasks",
-        "Consideration of cultural and linguistic challenges in outsourced content moderation"
-      ]
-    },
-    {
-      "question": "Impact of Precarious Employment",
-      "explainer": "Does the evaluation consider how precarious employment conditions affect crowdworkers' ability to report issues and overall work quality?",
-      "details": [
-        "Assessment of job security and its impact on worker feedback",
-        "Evaluation of anonymous reporting systems for substandard working conditions",
-        "Analysis of power dynamics between crowdworkers and employers",
-        "Consideration of the long-term effects of precarious employment on data quality and worker well-being"
-      ]
-    }
-  ]
-}
scorecard_templates/disparate_performance.json
DELETED
@@ -1,78 +0,0 @@
-{
-  "name": "Disparate Performance",
-  "questions": [
-    {
-      "question": "Subpopulation Performance Analysis",
-      "explainer": "Has the system been evaluated for disparate performance across different subpopulations?",
-      "details": [
-        "Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
-        "Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios",
-        "Worst-case subgroup performance analysis",
-        "Expected effort to improve model decisions from unfavorable to favorable",
-        "Coverage metrics to ensure wide representation of subgroups"
-      ]
-    },
-    {
-      "question": "Cross-lingual and Dialect Evaluation",
-      "explainer": "Has the system been assessed for performance across different languages and dialects?",
-      "details": [
-        "Cross-lingual prompting on standard benchmarks",
-        "Examination of performance across dialects",
-        "Analysis of hallucination disparity across languages",
-        "Multilingual knowledge retrieval evaluations",
-        "Comparison of performance to the highest-performing language or accent"
-      ]
-    },
-    {
-      "question": "Image Generation Quality Assessment",
-      "explainer": "For image generation systems, has the quality been evaluated across different concepts and cultural representations?",
-      "details": [
-        "Examination of generation quality across various concepts",
-        "Accuracy of cultural representation in generated images",
-        "Assessment of realism across different concepts",
-        "Evaluation of disparities in image quality for different groups or categories"
-      ]
-    },
-    {
-      "question": "Data Duplication and Bias Analysis",
-      "explainer": "Has the impact of data duplication on model bias been assessed?",
-      "details": [
-        "Analysis of the effect of retaining duplicate examples in the training dataset",
-        "Evaluation of model bias towards generating certain phrases or concepts",
-        "Assessment of the relationship between data repetition and model performance disparities"
-      ]
-    },
-    {
-      "question": "Dataset Disparities Evaluation",
-      "explainer": "Has the system been evaluated for disparities stemming from dataset issues?",
-      "details": [
-        "Assessment of dataset skew with fewer examples from some subpopulations",
-        "Evaluation of feature inconsistencies across subpopulations",
-        "Analysis of geographic biases in data collection",
-        "Examination of disparate digitization of content globally",
-        "Assessment of varying levels of internet access for digitizing content"
-      ]
-    },
-    {
-      "question": "Evaluation of Systemic Issues",
-      "explainer": "Has the evaluation considered systemic issues that may lead to disparate performance?",
-      "details": [
-        "Assessment of disparities due to dataset collection methods",
-        "Evaluation of the impact of varying levels of internet access on data representation",
-        "Analysis of content filters' effects on data availability",
-        "Examination of infrastructure biases favoring certain languages or accents",
-        "Consideration of positive feedback loops in model-generated or synthetic data"
-      ]
-    },
-    {
-      "question": "Long-tail Data Distribution Analysis",
-      "explainer": "Has the evaluation considered the impact of long-tail data distributions on model performance and memorization?",
-      "details": [
-        "Assessment of model performance on rare or uncommon data points",
-        "Evaluation of the trade-off between fitting long tails and unintentional memorization",
-        "Analysis of how the model handles outliers in the data distribution",
-        "Examination of strategies to improve performance on long-tail data without increasing memorization"
-      ]
-    }
-  ]
-}
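The subpopulation items in the deleted template above (disaggregated accuracy, worst-case subgroup, min-max ratios) reduce to straightforward grouped metrics. A minimal sketch follows, with made-up labels, predictions, and group attributes standing in for real evaluation data.

import numpy as np

def disaggregated_report(y_true, y_pred, groups):
    # Per-group accuracy, the worst-performing subgroup, and the min/max accuracy ratio.
    accs = {}
    for g in sorted(set(groups)):
        idx = [i for i, gi in enumerate(groups) if gi == g]
        accs[g] = float(np.mean([y_true[i] == y_pred[i] for i in idx]))
    worst = min(accs, key=accs.get)
    return {"per_group": accs, "worst_group": worst,
            "min_max_ratio": accs[worst] / max(accs.values())}

# Toy labels, predictions, and one subgroup attribute per example (placeholders).
y_true = [1, 0, 1, 1, 0, 1, 0, 0]
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]
groups = ["a", "a", "a", "b", "b", "b", "b", "b"]
print(disaggregated_report(y_true, y_pred, groups))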
scorecard_templates/environmental_costs.json
DELETED
@@ -1,65 +0,0 @@
-{
-  "name": "Environmental Costs and Carbon Emissions Evaluation",
-  "questions": [
-    {
-      "question": "Energy Consumption Measurement",
-      "explainer": "Has the energy consumption of the system been measured across its lifecycle?",
-      "details": [
-        "Measurement of energy used in training, testing, and deploying the system",
-        "Evaluation of compute power consumption",
-        "Assessment of energy resources used by large-scale systems",
-        "Tracking of energy usage across different stages of development"
-      ]
-    },
-    {
-      "question": "Carbon Footprint Quantification",
-      "explainer": "Has the carbon footprint of the system been quantified?",
-      "details": [
-        "Use of tools like CodeCarbon or Carbontracker",
-        "Measurement of carbon emissions for training and inference",
-        "Conversion of energy consumption to carbon emissions",
-        "Consideration of regional variations in energy sources and carbon intensity"
-      ]
-    },
-    {
-      "question": "Hardware Resource Evaluation",
-      "explainer": "Has the system been evaluated for its use of hardware resources?",
-      "details": [
-        "Assessment of CPU, GPU, and TPU usage",
-        "Measurement of FLOPS (Floating Point Operations)",
-        "Evaluation of package power draw and GPU performance state",
-        "Analysis of memory usage"
-      ]
-    },
-    {
-      "question": "Comprehensive Environmental Impact Assessment",
-      "explainer": "Has a holistic evaluation of the system's environmental impact been conducted?",
-      "details": [
-        "Use of Life Cycle Assessment (LCA) methodologies",
-        "Consideration of supply chains and manufacturing impacts",
-        "Evaluation of immediate impacts of applying ML",
-        "Assessment of system-level environmental impacts"
-      ]
-    },
-    {
-      "question": "Transparency in Environmental Reporting",
-      "explainer": "Is there transparency in reporting the environmental costs and limitations of the evaluation?",
-      "details": [
-        "Disclosure of uncertainty around measured variables",
-        "Reporting of marginal costs (e.g., added parameters' contribution to energy consumption)",
-        "Transparency about equipment manufacturers and data/hosting centers",
-        "Acknowledgment of limitations in accurately estimating GPU footprints and hosting-side impacts"
-      ]
-    },
-    {
-      "question": "Comprehensive Environmental Impact Metrics",
-      "explainer": "Does the evaluation acknowledge the lack of consensus on environmental impact metrics and attempt to use comprehensive measures?",
-      "details": [
-        "Discussion of different approaches to measuring environmental impact",
-        "Use of diverse measurements beyond energy consumption",
-        "Consideration of various factors including lifecycle assessment",
-        "Transparency about chosen metrics and their limitations"
-      ]
-    }
-  ]
-}
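The carbon-footprint item in the deleted template above names CodeCarbon and Carbontracker. A minimal CodeCarbon sketch, assuming the package is installed; the project name and the squared-sum loop are placeholders for the training or inference workload being measured.

from codecarbon import EmissionsTracker

tracker = EmissionsTracker(project_name="scorecard-demo")  # project name is an arbitrary placeholder
tracker.start()
total = sum(i * i for i in range(10_000_000))  # stand-in for the workload being measured
emissions_kg = tracker.stop()  # estimated kg CO2-eq for the tracked block
print(f"Estimated emissions: {emissions_kg:.6f} kg CO2-eq")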
scorecard_templates/financial_costs.json
DELETED
@@ -1,81 +0,0 @@
-{
-  "name": "Financial Costs Evaluation",
-  "questions": [
-    {
-      "question": "Comprehensive Cost Evaluation",
-      "explainer": "Has a thorough assessment of the financial costs associated with the system been conducted?",
-      "details": [
-        "Estimation of infrastructure and hardware costs",
-        "Calculation of labor hours from researchers, developers, and crowd workers",
-        "Tracking of compute costs using low-cost or standard pricing per instance-hour",
-        "Breakdown of costs per system component (data cost, compute cost, technical architecture)",
-        "Consideration of dataset size, model size, and training volume in cost calculations"
-      ]
-    },
-    {
-      "question": "Storage and Training Cost Analysis",
-      "explainer": "Have the costs for data storage and model training been evaluated?",
-      "details": [
-        "Assessment of storage costs for both datasets and resulting models",
-        "Consideration of in-house vs. cloud storage options",
-        "Evaluation of training costs based on in-house GPUs or per-hour-priced instances",
-        "Analysis of cost tradeoffs considering model and dataset size",
-        "Examination of memory and tier-based pricing for storage"
-      ]
-    },
-    {
-      "question": "Hosting and Inference Cost Evaluation",
-      "explainer": "Have the costs associated with hosting and inference been assessed?",
-      "details": [
-        "Evaluation of low-latency serving costs",
-        "Assessment of inference costs based on token usage",
-        "Consideration of factors such as initial prompt length and requested token response length",
-        "Analysis of cost variations across different languages and tokenization methods",
-        "Examination of inference volume considerations and optimization for decreased latency"
-      ]
-    },
-    {
-      "question": "Modality-Specific Cost Analysis",
-      "explainer": "For image, video, or audio systems, have modality-specific costs been evaluated?",
-      "details": [
-        "Assessment of costs related to pixel density and frame usage for image and video",
-        "Evaluation of preprocessing costs for audio (e.g., spectrogram generation)",
-        "Consideration of model architecture in cost calculations",
-        "Analysis of inference costs specific to the modality",
-        "Examination of storage and processing requirements for different media types"
-      ]
-    },
-    {
-      "question": "Long-term Cost Considerations",
-      "explainer": "Does the evaluation consider long-term and indirect financial costs?",
-      "details": [
-        "Assessment of pre- and post-deployment costs",
-        "Consideration of human labor and hidden costs",
-        "Tracking of changes in costs and economy of components over time",
-        "Evaluation of costs not directly tied to the system alone",
-        "Analysis of potential future cost fluctuations"
-      ]
-    },
-    {
-      "question": "API Cost Evaluation",
-      "explainer": "For API-accessible models, has the cost structure been evaluated?",
-      "details": [
-        "Assessment of token-usage based pricing",
-        "Evaluation of cost variations based on initial prompt length and requested token response length",
-        "Analysis of cost differences across model versions",
-        "Examination of pricing structures for different types of requests",
-        "Consideration of volume discounts or tiered pricing models"
-      ]
-    },
-    {
-      "question": "Comprehensive Cost Tracking",
-      "explainer": "Does the evaluation attempt to track and account for both direct and indirect costs, including those not immediately tied to the system?",
-      "details": [
-        "Assessment of costs related to broader infrastructure or organizational changes",
-        "Evaluation of long-term maintenance and update costs",
-        "Analysis of costs associated with complementary technologies or processes",
-        "Consideration of costs related to regulatory compliance or legal considerations"
-      ]
-    }
-  ]
-}
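The token-based API cost items in the deleted template above amount to simple arithmetic over prompt and completion token counts. A hedged sketch; the per-1k-token prices are invented placeholders, not any provider's actual rates.

def api_cost_usd(prompt_tokens, completion_tokens,
                 price_in_per_1k=0.0005, price_out_per_1k=0.0015):
    # Per-1k-token prices here are made-up placeholders; substitute the provider's real rates.
    return (prompt_tokens / 1000) * price_in_per_1k + (completion_tokens / 1000) * price_out_per_1k

# e.g., a 1,200-token prompt with a 300-token response, at 50,000 requests per day
per_request = api_cost_usd(1200, 300)
print(f"per request: ${per_request:.6f}; per day at 50k requests: ${per_request * 50_000:.2f}")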
scorecard_templates/privacy_data_protection.json
DELETED
@@ -1,91 +0,0 @@
-{
-  "name": "Privacy and Data Protection Evaluation",
-  "questions": [
-    {
-      "question": "Data Minimization and Consent Practices",
-      "explainer": "Has the system been evaluated for its adherence to data minimization and consent practices?",
-      "details": [
-        "Implementation of data minimization practices",
-        "Use of opt-in data collection methods",
-        "Assessment of active consent for collecting, processing, and sharing data",
-        "Evaluation of compliance with privacy regulations (e.g., CCPA)",
-        "Measures for dataset transparency and accountability"
-      ]
-    },
-    {
-      "question": "Memorization and Data Leakage Evaluation",
-      "explainer": "Has the system been assessed for unintended memorization and data leakage?",
-      "details": [
-        "Examination of the maximum amount of discoverable information given training data",
-        "Evaluation of extractable information without training data access",
-        "Analysis of out-of-distribution data revelation",
-        "Assessment of factors increasing likelihood of memorization (e.g., parameter count, sample repetitions)",
-        "Use of Membership Inference Attacks (MIA) or similar techniques"
-      ]
-    },
-    {
-      "question": "Personal Information Revelation Assessment",
-      "explainer": "Has the system been evaluated for its potential to reveal personal or sensitive information?",
-      "details": [
-        "Direct prompting tests to reveal Personally Identifiable Information (PII)",
-        "Use of tools like ProPILE to audit PII revelation likelihood",
-        "Evaluation of the system's ability to infer personal attributes",
-        "Assessment of privacy violations based on Contextual Integrity and Theory of Mind",
-        "Analysis of the system's understanding of privacy context and purpose"
-      ]
-    },
-    {
-      "question": "Image and Audio Privacy Evaluation",
-      "explainer": "For image and audio generation systems, has privacy been evaluated?",
-      "details": [
-        "Assessment of training data memorization in image generation",
-        "Use of adversarial Membership Inference Attacks for images",
-        "Evaluation of the proportion of generated images with high similarity to training data",
-        "Detection of memorized prompts in image generation",
-        "Scrutiny of audio generation models' ability to synthesize particular individuals' audio"
-      ]
-    },
-    {
-      "question": "Intellectual Property and Copyright Evaluation",
-      "explainer": "Has the system been evaluated for its handling of intellectual property and copyrighted content?",
-      "details": [
-        "Assessment of the system's ability to generate copyrighted content",
-        "Evaluation of intellectual property concerns in generated content",
-        "Analysis of the system's handling of highly sensitive documents",
-        "Measures to prevent unauthorized use or reproduction of copyrighted material"
-      ]
-    },
-    {
-      "question": "Retroactive Privacy Protection",
-      "explainer": "Has the system been evaluated for its ability to implement retroactive privacy protections?",
-      "details": [
-        "Assessment of the system's capability to retroactively retrain in accordance with privacy policies",
-        "Evaluation of processes for removing specific data points upon request",
-        "Analysis of the system's adaptability to changing privacy regulations",
-        "Examination of the impact of data removal on model performance",
-        "Assessment of the timeframe and effectiveness of retroactive privacy measures"
-      ]
-    },
-    {
-      "question": "Third-party Hosting Privacy Evaluation",
-      "explainer": "For third-party hosted systems, has privacy been evaluated in the context of system prompts and hidden inputs?",
-      "details": [
-        "Assessment of potential leakage of private input data in generations",
-        "Evaluation of system prompt privacy, especially for prompts containing proprietary information",
-        "Analysis of the system's handling of sensitive database records in context learning",
-        "Examination of privacy measures for prepended system prompts",
-        "Assessment of the system's ability to maintain confidentiality of hidden inputs"
-      ]
-    },
-    {
-      "question": "Generative AI-Specific Privacy Measures",
-      "explainer": "Has the evaluation considered the challenges of applying traditional privacy protection methods to generative AI?",
-      "details": [
-        "Assessment of the applicability of data sanitization techniques to generative models",
-        "Evaluation of differential privacy approaches in the context of generative AI",
-        "Analysis of novel privacy protection methods designed specifically for generative models",
-        "Examination of the trade-offs between privacy protection and model performance in generative AI"
-      ]
-    }
-  ]
-}
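The memorization item in the deleted privacy template mentions Membership Inference Attacks (MIA). In its simplest form this can be approximated by a loss-threshold test; the sketch below is a generic illustration with placeholder loss values, not the audit procedure the template refers to.

import numpy as np

def loss_threshold_mia_accuracy(member_losses, nonmember_losses, threshold):
    # Guess "member" whenever the per-example loss is below the threshold;
    # unusually low loss on training examples is one symptom of memorization.
    member_hits = np.sum(np.array(member_losses) < threshold)
    nonmember_hits = np.sum(np.array(nonmember_losses) >= threshold)
    return (member_hits + nonmember_hits) / (len(member_losses) + len(nonmember_losses))

# Placeholder per-example losses from a model on training (member) vs. held-out data.
members = [0.4, 0.3, 0.6, 0.2]
nonmembers = [1.1, 0.9, 1.4, 0.8]
print(loss_threshold_mia_accuracy(members, nonmembers, threshold=0.7))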