import unicodedata

import regex


class SimpleTokenizer(object):
    # Runs of letters, digits, or combining marks form one token.
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
    # Any single character that is not whitespace or a control character
    # (e.g. punctuation) forms its own token.
    NON_WS = r'[^\p{Z}\p{C}]'

    def __init__(self):
        """Compile a pattern matching either an alphanumeric run or a single
        non-whitespace character."""
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE,
        )

    def tokenize(self, text, uncased=False):
        matches = list(self._regexp.finditer(text))
        if uncased:
            tokens = [m.group().lower() for m in matches]
        else:
            tokens = [m.group() for m in matches]
        return tokens


def _normalize(text):
    return unicodedata.normalize('NFD', text)


def has_answer(text, answers, tokenizer=SimpleTokenizer()) -> bool:
    """Check if a document contains an answer string."""
    text = _normalize(text)
    text = tokenizer.tokenize(text, uncased=True)

    for answer in answers:
        answer = _normalize(answer)
        answer = tokenizer.tokenize(answer, uncased=True)
        # Slide a window of len(answer) over the document tokens and
        # look for an exact token-sequence match.
        for i in range(0, len(text) - len(answer) + 1):
            if answer == text[i: i + len(answer)]:
                return True
    return False
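
# Illustrative usage (not part of the original module): has_answer performs
# an uncased, whole-token match, so substrings of a token do not count.
# The passage strings below are hypothetical examples.
if __name__ == '__main__':
    passage = "The Eiffel Tower is located in Paris, France."
    print(has_answer(passage, ["PARIS"]))        # True: case-insensitive token match
    print(has_answer(passage, ["Par"]))          # False: "Par" is not a full token
    print(has_answer(passage, ["Paris, France"]))  # True: punctuation tokenizes separately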