libraxiong committed on
Commit
f4c0c85
·
1 Parent(s): 29f7723

add oppo_refuse_match.py

Browse files
Files changed (4) hide show
  1. app.py +6 -0
  2. eval.py +40 -0
  3. oppo_refuse_match.py +67 -0
  4. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import evaluate
2
+ from evaluate.utils import launch_gradio_widget
3
+
4
+
5
+ module = evaluate.load("libraxiong/oppo_refuse_match")
6
+ launch_gradio_widget(module)
eval.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import regex
2
+ import unicodedata
3
+ class SimpleTokenizer(object):
4
+ ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
5
+ NON_WS = r'[^\p{Z}\p{C}]'
6
+
7
+ def __init__(self):
8
+ """
9
+ Args:
10
+ annotators: None or empty set (only tokenizes).
11
+ """
12
+ self._regexp = regex.compile(
13
+ '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
14
+ flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
15
+ )
16
+
17
+ def tokenize(self, text, uncased=False):
18
+ matches = [m for m in self._regexp.finditer(text)]
19
+ if uncased:
20
+ tokens = [m.group().lower() for m in matches]
21
+ else:
22
+ tokens = [m.group() for m in matches]
23
+ return tokens
24
+
25
+
26
+ def _normalize(text):
27
+ return unicodedata.normalize('NFD', text)
28
+
29
+ def has_answer(text, answers, tokenizer=SimpleTokenizer()) -> bool:
30
+ """Check if a document contains an answer string."""
31
+ text = _normalize(text)
32
+ text = tokenizer.tokenize(text, uncased=True)
33
+
34
+ for answer in answers:
35
+ answer = _normalize(answer)
36
+ answer = tokenizer.tokenize(answer, uncased=True)
37
+ for i in range(0, len(text) - len(answer) + 1):
38
+ if answer == text[i: i + len(answer)]:
39
+ return True
40
+ return False
oppo_refuse_match.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Oppo Refuse Match metric."""
15
+ import re
16
+ import string
17
+
18
+ import datasets
19
+ import numpy as np
20
+
21
+ import evaluate
22
+ from eval import has_answer
23
+
24
+ _DESCRIPTION = """
25
+ Returns the rate at which the input predicted strings exactly match the refuse list
26
+ """
27
+
28
+ _KWARGS_DESCRIPTION = """
29
+ Args:
30
+ predictions: List of predicted texts.
31
+
32
+ Returns:
33
+ exact_match: Dictionary containing oppo_refuse_match rate. Possible values are 0 or 1
34
+ Examples:
35
+
36
+ """
37
+
38
+ _CITATION = """ the dpr exact match
39
+ """
40
+
41
+
42
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
43
+ class OppoRefuseMatch(evaluate.Metric):
44
+ def _info(self):
45
+ return evaluate.MetricInfo(
46
+ description=_DESCRIPTION,
47
+ citation=_CITATION,
48
+ inputs_description=_KWARGS_DESCRIPTION,
49
+ features=datasets.Features(
50
+ {
51
+ "predictions": datasets.Value("string", id="sequence"),
52
+ }
53
+ ),
54
+ reference_urls=[],
55
+ )
56
+
57
+ def _compute(
58
+ self,
59
+ predictions
60
+ ):
61
+ patterns = [
62
+ r"There is no", r"no", r"non-existent", r"not a", r"none"
63
+ ]
64
+ if has_answer(predictions,patterns):
65
+ return {"oppo_refuse_match": 1}
66
+ else:
67
+ return {"oppo_refuse_match": 0}
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ git+https://github.com/huggingface/evaluate@a4bdc10c48a450b978d91389a48dbb5297835c7d