"""Text normalizers: homoglyph replacement, Unicode clean-up and true-casing."""

from collections import defaultdict
from functools import cache
import re
import unicodedata

import components.homoglyphs as hg


def get_normalizer(strategy_name: str) -> object:
    """Return a freshly constructed normalizer for ``strategy_name``.

    Args:
        strategy_name: One of ``"unicode"``, ``"homoglyph"`` or ``"truecase"``.

    Returns:
        A new ``UnicodeNormalizer``, ``HomoglyphNormalizer`` or
        ``TrueCaseNormalizer`` built with its default arguments, or ``None``
        when the name is not recognised.
    """
    # Classes (not instances) are stored so that only the requested
    # normalizer is constructed; TrueCaseNormalizer loads a model on init.
    registry = {
        "unicode": UnicodeNormalizer,
        "homoglyph": HomoglyphNormalizer,
        "truecase": TrueCaseNormalizer,
    }
    normalizer_cls = registry.get(strategy_name)
    if normalizer_cls is None:
        return None
    return normalizer_cls()


@cache
def _build_homoglyph_map(target_category: str, all_categories: tuple) -> dict:
    """Build (and memoise) the replacement table for a category combination.

    Kept at module level so the cache key does not include a normalizer
    instance; caching a bound method would keep every instance alive for the
    lifetime of the cache.

    Args:
        target_category: Category the text should be normalized towards.
        all_categories: Hashable tuple of every category seen in the text.

    Returns:
        Whatever ``HomoglyphManager._generate_restricted_table`` returns;
        it is indexed by character in ``_replace_homoglyphs``.
    """
    homoglyphs = hg.HomoglyphManager(categories=(target_category, "COMMON"))
    source_alphabet = hg.UnicodeCategories.get_category_alphabet(all_categories)
    return homoglyphs._generate_restricted_table(source_alphabet, homoglyphs.alphabet)


class HomoglyphNormalizer:
    """Detects homoglyph attacks and normalizes text to a consistent canonical form."""

    def __init__(self):
        # Not read anywhere in this class; kept so existing attribute access
        # on instances keeps working.
        self.homoglyphs = None

    def __call__(self, text: str) -> str:
        """Replace characters outside the dominant category of ``text``.

        Args:
            text: Input string.

        Returns:
            The normalized string. Empty input is returned unchanged, because
            there is no dominant category to compute for it.
        """
        if not text:
            return text
        target_category, all_categories = self._identify_categories(text)
        homoglyph_map = self._load_homoglyph_map(target_category, all_categories)
        return self._replace_homoglyphs(target_category, homoglyph_map, text)

    def _identify_categories(self, text: str) -> tuple:
        """Return ``(most_frequent_category, all_categories)`` for ``text``.

        Categories come from ``hg.UnicodeCategories.identify_category``. Ties
        for the most frequent category go to the one encountered first.

        Raises:
            ValueError: If ``text`` is empty (``max`` of an empty mapping).
        """
        category_count = defaultdict(int)
        for char in text:
            category_count[hg.UnicodeCategories.identify_category(char)] += 1
        target_category = max(category_count, key=category_count.get)
        all_categories = tuple(category_count)
        return target_category, all_categories

    def _load_homoglyph_map(self, target_category: str, all_categories: tuple) -> dict:
        """Return the cached replacement table for this category combination."""
        return _build_homoglyph_map(target_category, all_categories)

    def _replace_homoglyphs(self, target_category: str, homoglyph_map: dict, text: str) -> str:
        """Rewrite ``text`` using ``homoglyph_map``.

        Characters whose category contains ``target_category`` or ``"COMMON"``,
        or whose category is empty, are kept as they are. Any other character
        is replaced by the first candidate listed for it in ``homoglyph_map``.

        Args:
            target_category: Category that needs no replacement.
            homoglyph_map: Mapping from character to a collection of candidates.
            text: Input string.

        Returns:
            The rewritten string.
        """
        identify = hg.UnicodeCategories.identify_category
        pieces = []
        for char in text:
            cat = identify(char)
            if target_category in cat or "COMMON" in cat or len(cat) == 0:
                pieces.append(char)
                continue
            # .get() avoids a KeyError for unmapped characters and does not
            # insert into the shared cached table.
            # NOTE(review): keeping an unmapped character unchanged is a
            # judgement call; the original raised here. Confirm this is the
            # desired behaviour.
            candidates = homoglyph_map.get(char)
            pieces.append(next(iter(candidates)) if candidates else char)
        return "".join(pieces)


class UnicodeNormalizer:
    """Normalizes Unicode text according to specified rulesets."""

    def __init__(self, ruleset="whitespace"):
        """Compile the pattern of characters that will be turned into spaces.

        Args:
            ruleset: ``"whitespace"`` or ``"IDN"`` select an explicit list of
                code points; any other value selects every run of non-ASCII
                characters.
        """
        if ruleset == "whitespace":
            self.pattern = re.compile(
                r"[\u00A0\u1680\u180E\u2000-\u200B\u200C\u200D\u200E\u200F\u2060\u2063\u202F\u205F\u3000\uFEFF\uFFA0\uFFF9\uFFFA\uFFFB"
                r"\uFE00\uFE01\uFE02\uFE03\uFE04\uFE05\uFE06\uFE07\uFE08\uFE09\uFE0A\uFE0B\uFE0C\uFE0D\uFE0E\uFE0F\u3164\u202A\u202B\u202C\u202D"
                r"\u202E\u202F]"
            )
        elif ruleset == "IDN":
            # The previous second alternative, [\uDB40\uDC20-\uDB40\uDC7F],
            # contained the reversed range \uDC20-\uDB40, so re.compile raised
            # "bad character range" and this ruleset could never be built.
            # It appears to be a UTF-16 surrogate-pair spelling of
            # U+E0020..U+E007F, which is written directly here.
            # NOTE(review): the remaining surrogate ranges also look ported
            # from a UTF-16 regex; on a Python str they can only match lone
            # surrogates. Confirm whether all supplementary-plane characters
            # were meant to be matched.
            self.pattern = re.compile(
                r"[\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u2060\u2063\uFEFF\uFFF9-\uFFFB\uD800-\uDB7F\uDB80-\uDBFF]"
                r"[\uDC00-\uDFFF]?|[\U000E0020-\U000E007F]"
            )
        else:
            self.pattern = re.compile(r"[^\x00-\x7F]+")

    def __call__(self, text: str) -> str:
        """Normalize ``text``.

        Steps, in order: NFC normalization, replace every pattern match with
        a space, collapse runs of spaces, then drop characters whose Unicode
        category is ``Cc`` (this includes newlines and tabs).
        """
        text = unicodedata.normalize("NFC", text)
        text = self.pattern.sub(" ", text)
        text = re.sub(" +", " ", text)
        return "".join(c for c in text if unicodedata.category(c) != "Cc")


class TrueCaseNormalizer:
    """Normalizes text to its true capitalization using POS tagging."""

    # spaCy coarse POS tags whose tokens get capitalized.
    upper_pos_tags = ["PROPN"]

    def __init__(self, backend="spacy"):
        """Select and initialise the tagging backend.

        Args:
            backend: ``"spacy"`` loads the ``en_core_web_sm`` model; any other
                value uses NLTK and downloads the resources it needs.
        """
        if backend == "spacy":
            import spacy

            self.nlp = spacy.load("en_core_web_sm")
            self.normalize = self._spacy_normalize
        else:
            import nltk

            nltk.download("punkt")
            nltk.download("averaged_perceptron_tagger")
            nltk.download("universal_tagset")
            self.normalize = self._nltk_normalize

    def __call__(self, text: str) -> str:
        """Return ``text`` re-cased by the backend chosen at construction."""
        return self.normalize(text)

    def _spacy_normalize(self, text: str) -> str:
        """Lower-case ``text``, then capitalize proper nouns and sentence starts.

        Each token's trailing whitespace is kept via ``text_with_ws``.
        """
        doc = self.nlp(text.lower())
        return "".join(
            w.text_with_ws.capitalize()
            if w.pos_ in self.upper_pos_tags or w.is_sent_start
            else w.text_with_ws
            for w in doc
        )

    def _nltk_normalize(self, text: str) -> str:
        """Lower-case ``text``, then capitalize tokens tagged ``NNP``/``NNPS``.

        Tokens are re-joined with single spaces, so the original spacing of
        the input is not preserved.
        """
        from nltk import pos_tag, word_tokenize

        POS_TAGS = ["NNP", "NNPS"]
        tagged_text = pos_tag(word_tokenize(text.lower()))
        return " ".join(w.capitalize() if p in POS_TAGS else w for w, p in tagged_text)