NimaBoscarino committed on
Commit
f8c21da
·
1 Parent(s): 490bc75

WIP: Slight refactor, IntendedPurposeCheck w/ edge cases

Browse files
compliance_checks/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from compliance_checks.base import (
2
+ ComplianceSuite,
3
+ )
4
+
5
+ from compliance_checks.intended_purpose import (
6
+ IntendedPurposeCheck, IntendedPurposeResult
7
+ )
compliance_checks.py → compliance_checks/base.py RENAMED
@@ -5,7 +5,7 @@ import markdown
5
  from bs4 import BeautifulSoup, Comment
6
 
7
 
8
- def walk_to_next_heading(card, heading, heading_text):
9
  stop_at = [heading, f"h{int(heading[1]) - 1}"]
10
 
11
  try:
@@ -22,11 +22,11 @@ def walk_to_next_heading(card, heading, heading_text):
22
  sibling = next(sibling_gen, None)
23
 
24
  if content.strip() == "[More Information Needed]":
25
- return False, None
26
 
27
- return True, content
28
  except AttributeError:
29
- return False, None
30
 
31
 
32
  class ComplianceResult(ABC):
@@ -94,55 +94,6 @@ class ModelProviderIdentityCheck(ComplianceCheck):
94
  return ModelProviderIdentityResult()
95
 
96
 
97
- class IntendedPurposeResult(ComplianceResult):
98
- name = "Intended Purpose"
99
-
100
- def __init__(
101
- self,
102
- direct_use: str = None,
103
- downstream_use: str = None,
104
- out_of_scope_use: str = None,
105
- *args,
106
- **kwargs,
107
- ):
108
- super().__init__(*args, **kwargs)
109
- self.direct_use = direct_use
110
- self.downstream_use = downstream_use
111
- self.out_of_scope_use = out_of_scope_use
112
-
113
- def __eq__(self, other):
114
- if isinstance(other, IntendedPurposeResult):
115
- if super().__eq__(other):
116
- try:
117
- assert self.direct_use == other.direct_use
118
- assert self.downstream_use == other.downstream_use
119
- assert self.out_of_scope_use == other.out_of_scope_use
120
- return True
121
- except AssertionError:
122
- return False
123
- else:
124
- return False
125
-
126
- def to_string(self):
127
- return str((self.direct_use, self.direct_use, self.out_of_scope_use))
128
-
129
-
130
- class IntendedPurposeCheck(ComplianceCheck):
131
- name = "Intended Purpose"
132
-
133
- def run_check(self, card: BeautifulSoup):
134
- direct_use_check, direct_use_content = walk_to_next_heading(card, "h3", "Direct Use")
135
- # TODO: Handle [optional], which doesn't exist in BLOOM, e.g.
136
- downstream_use_check, downstream_use_content = walk_to_next_heading(card, "h3", "Downstream Use [optional]")
137
- out_of_scope_use_check, out_of_scope_use_content = walk_to_next_heading(card, "h3", "Out-of-Scope Use")
138
- return IntendedPurposeResult(
139
- status=direct_use_check and out_of_scope_use_check,
140
- direct_use=direct_use_content,
141
- downstream_use=downstream_use_content,
142
- out_of_scope_use=out_of_scope_use_content
143
- )
144
-
145
-
146
  class GeneralLimitationsResult(ComplianceResult):
147
  name = "General Limitations"
148
 
 
5
  from bs4 import BeautifulSoup, Comment
6
 
7
 
8
+ def walk_to_next_heading(card, heading, heading_text) -> bool:
9
  stop_at = [heading, f"h{int(heading[1]) - 1}"]
10
 
11
  try:
 
22
  sibling = next(sibling_gen, None)
23
 
24
  if content.strip() == "[More Information Needed]":
25
+ return False # , None
26
 
27
+ return True # , content
28
  except AttributeError:
29
+ return False # , None
30
 
31
 
32
  class ComplianceResult(ABC):
 
94
  return ModelProviderIdentityResult()
95
 
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  class GeneralLimitationsResult(ComplianceResult):
98
  name = "General Limitations"
99
 
compliance_checks/intended_purpose.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from compliance_checks.base import ComplianceResult, ComplianceCheck, walk_to_next_heading
2
+ from bs4 import BeautifulSoup
3
+
4
+
5
class IntendedPurposeResult(ComplianceResult):
    """Result of the "Intended Purpose" compliance check.

    Holds the optional text captured for each usage-related section of a
    model card. Any extra positional/keyword arguments (for example the
    ``status`` keyword passed by ``IntendedPurposeCheck``) are forwarded
    unchanged to ``ComplianceResult``.
    """

    name = "Intended Purpose"

    def __init__(
        self,
        direct_use: str = None,
        downstream_use: str = None,
        out_of_scope_use: str = None,
        *args,
        **kwargs,
    ):
        """Store the section contents and delegate the rest to the base class.

        Args:
            direct_use: Text of the direct-use section, or ``None``.
            downstream_use: Text of the downstream-use section, or ``None``.
            out_of_scope_use: Text of the out-of-scope-use section, or ``None``.
            *args: Forwarded to ``ComplianceResult.__init__``.
            **kwargs: Forwarded to ``ComplianceResult.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.direct_use = direct_use
        self.downstream_use = downstream_use
        self.out_of_scope_use = out_of_scope_use

    def __eq__(self, other):
        """Return True when ``other`` is an equal ``IntendedPurposeResult``.

        Equality is delegated to the base class comparison only. The
        per-section contents are deliberately NOT compared at the moment.
        """
        if not isinstance(other, IntendedPurposeResult):
            # Always answer with a real bool instead of falling through to None.
            return False
        # TODO: Either compare direct_use / downstream_use / out_of_scope_use
        # here as well, or drop those fields from the result entirely.
        return bool(super().__eq__(other))

    def to_string(self):
        """Return the three section contents rendered as a tuple string."""
        # Previously ``direct_use`` was emitted twice and ``downstream_use``
        # was never shown.
        return str((self.direct_use, self.downstream_use, self.out_of_scope_use))
37
+
38
+
39
class IntendedPurposeCheck(ComplianceCheck):
    """Compliance check that looks for a usage/intended-purpose section."""

    name = "Intended Purpose"

    def run_check(self, card: BeautifulSoup):
        """Probe the card for any of the known "uses" headings.

        Each (heading tag, heading text) pair is handed to
        ``walk_to_next_heading`` in order; probing stops at the first pair
        for which it returns a truthy value.

        Args:
            card: Parsed model card.

        Returns:
            ``IntendedPurposeResult(status=True)`` if any heading matched,
            otherwise a default-constructed ``IntendedPurposeResult``.
        """
        # Heading variants seen across model cards, tried in this order.
        candidate_headings = (
            ("h2", "Intended uses & limitations"),
            ("h1", "Uses"),
            ("h2", "Uses"),
            ("h2", "Model Use"),
            ("h2", "Intended uses"),
        )

        # any() short-circuits, so later headings are not probed once one hits.
        section_found = any(
            walk_to_next_heading(card, tag, title)
            for tag, title in candidate_headings
        )

        if section_found:
            return IntendedPurposeResult(
                status=True,
            )
        return IntendedPurposeResult()
tests/conftest.py CHANGED
@@ -5,6 +5,9 @@ from pathlib import Path
5
 
6
  # TODO: I have the option of maybe making a check for accuracy/metrics?
7
 
 
 
 
8
  # Intended Purpose, General Limitations, Computational Requirements
9
  expected_check_results = {
10
  "albert-base-v2": [True, True, False],
@@ -22,7 +25,7 @@ expected_check_results = {
22
  "gpt2": [True, True, False],
23
  "Helsinki-NLP___opus-mt-en-es": [False, False, False],
24
  "jonatasgrosman___wav2vec2-large-xlsr-53-english": [False, False, False],
25
- "microsoft___layoutlmv3-base": [True, False, False],
26
  "openai___clip-vit-base-patch32": [True, True, False],
27
  "openai___clip-vit-large-patch14": [True, True, False],
28
  "philschmid___bart-large-cnn-samsum": [False, False, False],
@@ -36,7 +39,7 @@ expected_check_results = {
36
  "t5-small": [True, False, False],
37
  "xlm-roberta-base": [True, False, False],
38
  "xlm-roberta-large": [True, False, False],
39
- "yiyanghkust___finbert-tone": [True, False, False],
40
  }
41
 
42
 
 
5
 
6
  # TODO: I have the option of maybe making a check for accuracy/metrics?
7
 
8
+ # Note, some of these are marked as FALSE instead of TRUE because the
9
+ # information is hidden somewhere non-standard, e.g. described in prose
10
+
11
  # Intended Purpose, General Limitations, Computational Requirements
12
  expected_check_results = {
13
  "albert-base-v2": [True, True, False],
 
25
  "gpt2": [True, True, False],
26
  "Helsinki-NLP___opus-mt-en-es": [False, False, False],
27
  "jonatasgrosman___wav2vec2-large-xlsr-53-english": [False, False, False],
28
+ "microsoft___layoutlmv3-base": [False, False, False],
29
  "openai___clip-vit-base-patch32": [True, True, False],
30
  "openai___clip-vit-large-patch14": [True, True, False],
31
  "philschmid___bart-large-cnn-samsum": [False, False, False],
 
39
  "t5-small": [True, False, False],
40
  "xlm-roberta-base": [True, False, False],
41
  "xlm-roberta-large": [True, False, False],
42
+ "yiyanghkust___finbert-tone": [False, False, False],
43
  }
44
 
45
 
tests/test_compliance_checks.py CHANGED
@@ -4,8 +4,6 @@ from unittest.mock import MagicMock
4
  from compliance_checks import (
5
  ComplianceSuite,
6
  IntendedPurposeCheck,
7
- GeneralLimitationsCheck,
8
- ComputationalRequirementsCheck,
9
  )
10
 
11
 
@@ -59,8 +57,8 @@ class TestComplianceSuite:
59
  def test_end_to_end_compliance_suite(real_model_card, expected_check_results):
60
  suite = ComplianceSuite(checks=[
61
  IntendedPurposeCheck(),
62
- GeneralLimitationsCheck(),
63
- ComputationalRequirementsCheck()
64
  ])
65
 
66
  results = suite.run(real_model_card)
 
4
  from compliance_checks import (
5
  ComplianceSuite,
6
  IntendedPurposeCheck,
 
 
7
  )
8
 
9
 
 
57
  def test_end_to_end_compliance_suite(real_model_card, expected_check_results):
58
  suite = ComplianceSuite(checks=[
59
  IntendedPurposeCheck(),
60
+ # GeneralLimitationsCheck(),
61
+ # ComputationalRequirementsCheck()
62
  ])
63
 
64
  results = suite.run(real_model_card)
tests/test_intended_purpose_check.py CHANGED
@@ -2,18 +2,12 @@ import pytest
2
 
3
  import markdown
4
  from bs4 import BeautifulSoup
5
- from compliance_checks import (
6
  IntendedPurposeCheck, IntendedPurposeResult,
7
  )
8
 
9
 
10
- @pytest.fixture
11
- def intended_purpose_model_card():
12
- return """
13
- # Model Card for Sample Model
14
-
15
- Some random info...
16
-
17
  ## Uses
18
 
19
  <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
@@ -33,67 +27,69 @@ Here is some info about direct uses...
33
  <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
34
 
35
  Here is some info about out-of-scope uses...
 
 
 
36
 
37
- ## Bias, Risks, and Limitations
38
-
39
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
40
-
41
- [More Information Needed]
42
- """
43
-
44
-
45
- @pytest.fixture
46
- def bad_intended_purpose_model_card():
47
- return """
48
- # Model Card for Sample Model
49
-
50
- Some random info...
51
 
52
  ## Uses
53
 
54
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
 
 
 
55
 
56
- ### Direct Use
57
 
58
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
 
59
 
60
- [More Information Needed]
 
61
 
62
- ### Downstream Use [optional]
 
63
 
64
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
 
65
 
66
- [More Information Needed]
67
-
68
- ### Out-of-Scope Use
69
 
70
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
 
 
71
 
72
- [More Information Needed]
 
73
 
74
- ## Bias, Risks, and Limitations
75
 
76
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
 
77
 
78
- [More Information Needed]
79
- """
 
80
 
81
 
82
- @pytest.mark.parametrize("check,card,expected", [
83
- (IntendedPurposeCheck(), "intended_purpose_model_card", IntendedPurposeResult(
84
- status=True,
85
- direct_use="Here is some info about direct uses...",
86
- downstream_use=None,
87
- out_of_scope_use="Here is some info about out-of-scope uses...",
88
- )),
89
- (IntendedPurposeCheck(), "bad_intended_purpose_model_card", IntendedPurposeResult()),
90
  ])
91
- def test_run_checks(check, card, expected, request):
92
- card = request.getfixturevalue(card)
93
-
94
  model_card_html = markdown.markdown(card)
95
  card_soup = BeautifulSoup(model_card_html, features="html.parser")
96
 
97
- results = check.run_check(card_soup)
98
 
99
- assert results == expected
 
2
 
3
  import markdown
4
  from bs4 import BeautifulSoup
5
+ from compliance_checks.intended_purpose import (
6
  IntendedPurposeCheck, IntendedPurposeResult,
7
  )
8
 
9
 
10
+ model_card_template = """\
 
 
 
 
 
 
11
  ## Uses
12
 
13
  <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
 
27
  <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
28
 
29
  Here is some info about out-of-scope uses...
30
+ """
31
+ albert_base_v2 = """\
32
+ # ALBERT Base v2
33
 
34
+ ## Intended uses & limitations
35
+ Here is some info about direct uses...
36
+ """
37
+ distilbert_base_cased_distilled_squad = """\
38
+ # DistilBERT base cased distilled SQuAD
 
 
 
 
 
 
 
 
 
39
 
40
  ## Uses
41
 
42
+ This model can be used for question answering.
43
+ """
44
+ distilroberta_base = """\
45
+ # Model Card for DistilRoBERTa base
46
 
47
+ # Uses
48
 
49
+ You can use the raw model for masked language modeling, but it's mostly intended to be fine-tuned on a downstream task.
50
+ """
51
 
52
+ openai_clip_vit_base_patch = """\
53
+ # Model Card: CLIP
54
 
55
+ ## Model Use
56
+ Stuff.
57
 
58
+ ### Intended Use
59
+ Stuff.
60
 
61
+ #### Primary intended uses
62
+ Stuff.
 
63
 
64
+ ### Out-of-Scope Use Cases
65
+ Stuff.
66
+ """
67
 
68
+ sentence_transformers = """\
69
+ # all-MiniLM-L6-v2
70
 
71
+ ## Intended uses
72
 
73
+ Our model is intented to be used as a sentence and short paragraph encoder.
74
+ """
75
 
76
+ success_result = IntendedPurposeResult(
77
+ status=True
78
+ )
79
 
80
 
81
@pytest.mark.parametrize("card", [
    model_card_template,
    albert_base_v2,
    distilbert_base_cased_distilled_squad,
    distilroberta_base,
    openai_clip_vit_base_patch,
    sentence_transformers,
])
def test_run_checks(card):
    """Every sample card must yield a passing intended-purpose result."""
    # Render the markdown card to HTML, then parse it for the check.
    rendered_html = markdown.markdown(card)
    soup = BeautifulSoup(rendered_html, features="html.parser")

    outcome = IntendedPurposeCheck().run_check(soup)

    assert outcome == success_result