""" Updated version of core.py from https://github.com/yamatt/homoglyphs/tree/main/homoglyphs_fork for modern python3 """ from collections import defaultdict import json from itertools import product import os import unicodedata # Actions if char not in alphabet ACTION_LOAD = 1 # load category for this char ACTION_IGNORE = 2 # add char to result ACTION_REMOVE = 3 # remove char from result ASCII_CHAR_RANGE = range(128) CURRENT_DIRECTORY = os.path.dirname(os.path.abspath(__file__)) DATA_DIRECTORY = os.path.join(CURRENT_DIRECTORY, "data") class UnicodeCategories: """ Work with aliases from ISO 15924. https://en.wikipedia.org/wiki/ISO_15924#List_of_codes """ file_path = os.path.join(DATA_DIRECTORY, "categories.json") @classmethod def _get_unicode_ranges(cls, categories): """ :return: iter: (start code, end code) :rtype: list """ with open(cls.file_path, encoding="utf-8") as file: data = json.load(file) for category in categories: if category not in data["aliases"]: raise ValueError(f"Invalid category: {category}") for point in data["points"]: if point[2] in categories: yield point[:2] @classmethod def get_category_alphabet(cls, categories): """ :return: set of chars in alphabet by categories list :rtype: set """ alphabet = set() for start, end in cls._get_unicode_ranges(categories): chars = (chr(code) for code in range(start, end + 1)) alphabet.update(chars) return alphabet @classmethod def identify_category(cls, char): """ :return: category :rtype: str """ with open(cls.file_path, encoding="utf-8") as file: data = json.load(file) # try detect category by unicodedata try: category = unicodedata.name(char).split()[0] except (TypeError, ValueError): pass else: if category in data["aliases"]: return category # try detect category by ranges from JSON file. code = ord(char) for point in data["points"]: if point[0] <= code <= point[1]: return point[2] @classmethod def get_all_categories(cls): with open(cls.file_path, encoding="utf-8") as file: data = json.load(file) return set(data["aliases"]) class LanguageIdentifiers: file_path = os.path.join(DATA_DIRECTORY, "languages.json") @classmethod def get_language_alphabet(cls, languages): """ :return: set of chars in alphabet by languages list :rtype: set """ with open(cls.file_path, encoding="utf-8") as file: data = json.load(file) alphabet = set() for lang in languages: if lang not in data: raise ValueError(f"Invalid language code: {lang}") alphabet.update(data[lang]) return alphabet @classmethod def identify_languages(cls, char): """ :return: set of languages which alphabet contains passed char. :rtype: set """ with open(cls.file_path, encoding="utf-8") as file: data = json.load(file) languages = set() for lang, alphabet in data.items(): if char in alphabet: languages.add(lang) return languages @classmethod def get_all_languages(cls): with open(cls.file_path, encoding="utf-8") as file: data = json.load(file) return set(data.keys()) class HomoglyphManager: def __init__( self, categories=None, languages=None, alphabet=None, strategy=ACTION_IGNORE, ascii_strategy=ACTION_IGNORE, ascii_range=ASCII_CHAR_RANGE, ): # strategies if strategy not in (ACTION_LOAD, ACTION_IGNORE, ACTION_REMOVE): raise ValueError("Invalid strategy") self.strategy = strategy self.ascii_strategy = ascii_strategy self.ascii_range = ascii_range # HomoglyphManager must be initialized by any alphabet for correct work if not categories and not languages and not alphabet: categories = ("LATIN", "COMMON") # cats and langs self.categories = set(categories or []) self.languages = set(languages or []) # alphabet self.alphabet = set(alphabet or []) if self.categories: alphabet = UnicodeCategories.get_category_alphabet(self.categories) self.alphabet.update(alphabet) if self.languages: alphabet = LanguageIdentifiers.get_language_alphabet(self.languages) self.alphabet.update(alphabet) self.table = self._generate_homoglyph_table(self.alphabet) @staticmethod def _generate_homoglyph_table(alphabet): table = defaultdict(set) with open(os.path.join(DATA_DIRECTORY, "confusables.json")) as file: data = json.load(file) for char in alphabet: if char in data: for homoglyph in data[char]: if homoglyph in alphabet: table[char].add(homoglyph) return table @staticmethod def _generate_restricted_table(source_alphabet, target_alphabet): table = defaultdict(set) with open(os.path.join(DATA_DIRECTORY, "confusables.json")) as file: data = json.load(file) for char in source_alphabet: if char in data: for homoglyph in data[char]: if homoglyph in target_alphabet: table[char].add(homoglyph) return table @staticmethod def _uniq_and_sort(data): result = list(set(data)) result.sort(key=lambda x: (-len(x), x)) return result def _update_alphabet_with_char(self, char): # try detect languages langs = LanguageIdentifiers.identify_languages(char) if langs: self.languages.update(langs) alphabet = LanguageIdentifiers.get_language_alphabet(langs) self.alphabet.update(alphabet) else: # try detect categories category = UnicodeCategories.identify_category(char) if category is None: return False self.categories.add(category) alphabet = UnicodeCategories.get_category_alphabet([category]) self.alphabet.update(alphabet) # update table for new alphabet self.table = self._generate_homoglyph_table(self.alphabet) return True def _get_char_variants(self, char): if char not in self.alphabet: if self.strategy == ACTION_LOAD: if not self._update_alphabet_with_char(char): return [] elif self.strategy == ACTION_IGNORE: return [char] elif self.strategy == ACTION_REMOVE: return [] # find alternative chars for current char alt_chars = self.table.get(char, set()) if alt_chars: # find alternative chars for alternative chars for current char alt_chars2 = [self.table.get(alt_char, set()) for alt_char in alt_chars] # combine all alternatives alt_chars.update(*alt_chars2) # add current char to alternatives alt_chars.add(char) # uniq, sort and return return self._uniq_and_sort(alt_chars) def _get_combinations(self, text, ascii=False): variations = [] for char in text: alt_chars = self._get_char_variants(char) if ascii: alt_chars = [char for char in alt_chars if ord(char) in self.ascii_range] if not alt_chars and self.ascii_strategy == ACTION_IGNORE: return if alt_chars: variations.append(alt_chars) if variations: for variant in product(*variations): yield "".join(variant) def get_all_combinations(self, text): return list(self._get_combinations(text)) def _convert_to_ascii(self, text): for variant in self._get_combinations(text, ascii=True): if max(map(ord, variant)) in self.ascii_range: yield variant def convert_to_ascii(self, text): return self._uniq_and_sort(self._convert_to_ascii(text))