# Tokenization utilities for checking whether a document contains an answer string.
import regex
import unicodedata
class SimpleTokenizer(object):
    """Regex-based word tokenizer.

    Splits text into runs of alphanumeric characters (letters, digits,
    combining marks) and single non-whitespace characters (punctuation,
    symbols). Requires the third-party `regex` module for `\p{...}`
    Unicode property classes.
    """

    # One or more Unicode letters, digits, or combining marks.
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
    # Any single character that is neither a separator nor a control char.
    NON_WS = r'[^\p{Z}\p{C}]'

    def __init__(self):
        """
        Args:
            annotators: None or empty set (only tokenizes).
        """
        # Alternation prefers a full alphanumeric run; otherwise it grabs a
        # single non-whitespace character.
        # NOTE: flags are combined with `|` (bitwise OR) — the idiomatic
        # form; the original `+` happened to work only because these flag
        # bits are distinct.
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE | regex.UNICODE | regex.MULTILINE,
        )

    def tokenize(self, text, uncased=False):
        """Return the list of token strings found in `text`.

        Args:
            text: input string to tokenize.
            uncased: if True, lowercase every token.

        Returns:
            List of token strings, in order of appearance.
        """
        # Single pass over the matches; lowercase afterwards if requested,
        # instead of duplicating the comprehension across branches.
        tokens = [m.group() for m in self._regexp.finditer(text)]
        if uncased:
            tokens = [t.lower() for t in tokens]
        return tokens
def _normalize(text): | |
return unicodedata.normalize('NFD', text) | |
def has_answer(text, answers, tokenizer=None) -> bool:
    """Check if a document contains an answer string.

    Both document and answers are NFD-normalized and tokenized uncased;
    an answer counts as present when its token sequence appears as a
    contiguous subsequence of the document's tokens.

    Args:
        text: document string to search.
        answers: iterable of candidate answer strings.
        tokenizer: optional object with a `tokenize(text, uncased=...)`
            method; defaults to a fresh SimpleTokenizer.

    Returns:
        True iff any answer's token sequence occurs in the document.
    """
    if tokenizer is None:
        # Constructed lazily rather than in the signature: a default
        # argument is evaluated once at import time, which shares a single
        # instance across all calls and compiles the regex even when every
        # caller supplies its own tokenizer.
        tokenizer = SimpleTokenizer()
    text = tokenizer.tokenize(_normalize(text), uncased=True)
    for answer in answers:
        answer = tokenizer.tokenize(_normalize(answer), uncased=True)
        n = len(answer)
        # Slide a window of n tokens over the document tokens.
        for i in range(0, len(text) - n + 1):
            if answer == text[i: i + n]:
                return True
    return False