|
""" |
|
Updated version of core.py from |
|
https://github.com/yamatt/homoglyphs/tree/main/homoglyphs_fork |
|
for modern python3 |
|
""" |
|
|
|
from collections import defaultdict |
|
import json |
|
from itertools import product |
|
import os |
|
import unicodedata |
|
|
|
|
|
# Strategies HomoglyphManager applies to a character that is not in its
# alphabet:
#   ACTION_LOAD   - extend the alphabet with the character's language or
#                   category, then look the character up again
#   ACTION_IGNORE - pass the character through unchanged
#   ACTION_REMOVE - drop the character from the output
ACTION_LOAD, ACTION_IGNORE, ACTION_REMOVE = 1, 2, 3

# Code points accepted as ASCII by default (0..127 inclusive).
ASCII_CHAR_RANGE = range(0, 128)

# The JSON data files are looked up in a "data" directory next to this module.
CURRENT_DIRECTORY = os.path.dirname(os.path.abspath(__file__))
DATA_DIRECTORY = os.path.join(CURRENT_DIRECTORY, "data")
|
|
|
|
|
class UnicodeCategories:
    """
    Look up Unicode script categories and the code points they cover.

    Category names are the aliases from ISO 15924:
    https://en.wikipedia.org/wiki/ISO_15924#List_of_codes

    The data is read from ``categories.json`` on every call. The code reads
    two keys from it: ``"aliases"`` (the known category names) and
    ``"points"`` (entries indexed as ``[start code, end code, category]``).
    """

    file_path = os.path.join(DATA_DIRECTORY, "categories.json")

    @classmethod
    def _load_data(cls):
        """
        :return: parsed content of the categories JSON file
        """
        with open(cls.file_path, encoding="utf-8") as handle:
            return json.load(handle)

    @classmethod
    def _get_unicode_ranges(cls, categories):
        """
        Lazily yield the code-point ranges that belong to ``categories``.

        Because this is a generator, the data file is only read and the
        category names are only validated once iteration starts.

        :param categories: container of category names
        :return: iterator over the first two items (start code, end code)
            of every matching point entry
        :raises ValueError: if a category is not a known alias
        """
        data = cls._load_data()
        known_aliases = data["aliases"]

        # Reject unknown names before producing any range.
        for category in categories:
            if category not in known_aliases:
                raise ValueError(f"Invalid category: {category}")

        yield from (
            point[:2] for point in data["points"] if point[2] in categories
        )

    @classmethod
    def get_category_alphabet(cls, categories):
        """
        :param categories: container of category names
        :return: set of chars in alphabet by categories list
        :rtype: set
        :raises ValueError: if a category is not a known alias
        """
        chars = set()
        for first, last in cls._get_unicode_ranges(categories):
            # Ranges are inclusive on both ends.
            chars.update(map(chr, range(first, last + 1)))
        return chars

    @classmethod
    def identify_category(cls, char):
        """
        Find the category of a single character.

        The first word of the character's Unicode name is tried first; if it
        is not a known alias, the code-point ranges are searched instead.

        :param char: a single character
        :return: category name, or None when no range contains the character
        :rtype: str
        """
        data = cls._load_data()

        # Fast path: names such as "LATIN SMALL LETTER A" start with the
        # category. Characters without a name fall through to the range scan.
        try:
            name_prefix = unicodedata.name(char).split()[0]
        except (TypeError, ValueError):
            name_prefix = None
        if name_prefix is not None and name_prefix in data["aliases"]:
            return name_prefix

        # Slow path: first point entry whose inclusive range holds the code.
        code = ord(char)
        return next(
            (
                point[2]
                for point in data["points"]
                if point[0] <= code <= point[1]
            ),
            None,
        )

    @classmethod
    def get_all_categories(cls):
        """
        :return: every category alias known to the data file
        :rtype: set
        """
        return set(cls._load_data()["aliases"])
|
|
|
|
|
class LanguageIdentifiers:
    """
    Look up alphabets by language code.

    The data is read from ``languages.json`` on every call; the code treats
    it as a mapping from language code to that language's alphabet.
    """

    file_path = os.path.join(DATA_DIRECTORY, "languages.json")

    @classmethod
    def _load_data(cls):
        """
        :return: parsed content of the languages JSON file
        """
        with open(cls.file_path, encoding="utf-8") as handle:
            return json.load(handle)

    @classmethod
    def get_language_alphabet(cls, languages):
        """
        :param languages: iterable of language codes
        :return: set of chars in alphabet by languages list
        :rtype: set
        :raises ValueError: if a language code is not in the data file
        """
        data = cls._load_data()
        chars = set()
        for code in languages:
            if code not in data:
                raise ValueError(f"Invalid language code: {code}")
            chars.update(data[code])
        return chars

    @classmethod
    def identify_languages(cls, char):
        """
        :param char: character to look for
        :return: set of languages which alphabet contains passed char.
        :rtype: set
        """
        data = cls._load_data()
        return {code for code, letters in data.items() if char in letters}

    @classmethod
    def get_all_languages(cls):
        """
        :return: every language code present in the data file
        :rtype: set
        """
        return set(cls._load_data())
|
|
|
|
|
class HomoglyphManager:
    """
    Generate homoglyph (look-alike character) variants of text.

    The manager keeps an alphabet of allowed characters, built from Unicode
    categories, language codes and/or an explicit character collection, and
    a table mapping each alphabet character to the look-alikes (taken from
    ``confusables.json``) that are also in the alphabet.
    """

    def __init__(
        self,
        categories=None,
        languages=None,
        alphabet=None,
        strategy=ACTION_IGNORE,
        ascii_strategy=ACTION_IGNORE,
        ascii_range=ASCII_CHAR_RANGE,
    ):
        """
        :param categories: iterable of category names to allow
        :param languages: iterable of language codes to allow
        :param alphabet: iterable of extra characters to allow
        :param strategy: what to do with a character outside the alphabet:
            ACTION_LOAD, ACTION_IGNORE or ACTION_REMOVE
        :param ascii_strategy: what to do, during ASCII conversion, with a
            character that has no variant in ``ascii_range``: ACTION_IGNORE
            makes the conversion produce no result at all; any other value
            drops the character
        :param ascii_range: container of code points accepted as ASCII
        :raises ValueError: if ``strategy`` is not one of the three actions,
            or if a category / language is unknown
        """
        if strategy not in (ACTION_LOAD, ACTION_IGNORE, ACTION_REMOVE):
            raise ValueError("Invalid strategy")
        self.strategy = strategy
        self.ascii_strategy = ascii_strategy
        self.ascii_range = ascii_range

        # With nothing requested, fall back to the LATIN + COMMON categories.
        if not categories and not languages and not alphabet:
            categories = ("LATIN", "COMMON")

        self.categories = set(categories or [])
        self.languages = set(languages or [])

        # The alphabet is the union of the explicit characters and the
        # characters of every requested category and language.
        self.alphabet = set(alphabet or [])
        if self.categories:
            self.alphabet.update(
                UnicodeCategories.get_category_alphabet(self.categories)
            )
        if self.languages:
            self.alphabet.update(
                LanguageIdentifiers.get_language_alphabet(self.languages)
            )
        self.table = self._generate_homoglyph_table(self.alphabet)

    @staticmethod
    def _generate_homoglyph_table(alphabet):
        """
        Build the look-alike table for characters within one alphabet.

        :param alphabet: set of allowed characters
        :return: defaultdict(set) mapping char -> look-alikes in ``alphabet``
        """
        return HomoglyphManager._generate_restricted_table(alphabet, alphabet)

    @staticmethod
    def _generate_restricted_table(source_alphabet, target_alphabet):
        """
        Build a look-alike table from one alphabet into another.

        :param source_alphabet: characters to find look-alikes for
        :param target_alphabet: characters allowed as look-alikes
        :return: defaultdict(set) mapping each source char that has at least
            one allowed look-alike to the set of those look-alikes
        """
        table = defaultdict(set)
        # Explicit UTF-8, like the other data files: the platform default
        # encoding is locale-dependent and may not be able to decode the file.
        confusables_path = os.path.join(DATA_DIRECTORY, "confusables.json")
        with open(confusables_path, encoding="utf-8") as file:
            data = json.load(file)
        for char in source_alphabet:
            if char in data:
                for homoglyph in data[char]:
                    if homoglyph in target_alphabet:
                        table[char].add(homoglyph)
        return table

    @staticmethod
    def _uniq_and_sort(data):
        """
        :param data: iterable of strings
        :return: list of the distinct strings, longest first, ties broken
            in ascending string order
        """
        return sorted(set(data), key=lambda item: (-len(item), item))

    def _update_alphabet_with_char(self, char):
        """
        Extend the alphabet so that it covers ``char`` and rebuild the table.

        The languages containing the character are preferred; when there are
        none, the character's Unicode category is used instead.

        :return: True if the alphabet was extended, False if neither a
            language nor a category could be found for the character
        """
        langs = LanguageIdentifiers.identify_languages(char)
        if langs:
            self.languages.update(langs)
            alphabet = LanguageIdentifiers.get_language_alphabet(langs)
            self.alphabet.update(alphabet)
        else:
            category = UnicodeCategories.identify_category(char)
            if category is None:
                return False
            self.categories.add(category)
            alphabet = UnicodeCategories.get_category_alphabet([category])
            self.alphabet.update(alphabet)

        self.table = self._generate_homoglyph_table(self.alphabet)
        return True

    def _get_char_variants(self, char):
        """
        :param char: a single character
        :return: sorted list of ``char`` plus its look-alikes and the
            look-alikes of those; ``[char]`` or ``[]`` for a character
            outside the alphabet, depending on ``self.strategy``
        """
        if char not in self.alphabet:
            if self.strategy == ACTION_LOAD:
                if not self._update_alphabet_with_char(char):
                    return []
            elif self.strategy == ACTION_IGNORE:
                return [char]
            elif self.strategy == ACTION_REMOVE:
                return []

        # Work on a copy: updating the set stored in self.table would widen
        # the table on every call and make results depend on call history.
        direct = self.table.get(char, ())
        variants = set(direct)
        # Add the look-alikes of each direct look-alike (one extra level).
        for alt_char in direct:
            variants.update(self.table.get(alt_char, ()))
        variants.add(char)

        return self._uniq_and_sort(variants)

    def _get_combinations(self, text, ascii=False):
        """
        Lazily yield every string obtained by replacing each character of
        ``text`` with one of its variants.

        :param text: string to expand
        :param ascii: when true, keep only variants whose code point is in
            ``self.ascii_range``
        :return: generator of strings; yields nothing for empty text, or
            when every character ends up without variants
        """
        variations = []
        for char in text:
            alt_chars = self._get_char_variants(char)

            if ascii:
                alt_chars = [
                    alt for alt in alt_chars if ord(alt) in self.ascii_range
                ]
                # A character that cannot be represented aborts the whole
                # conversion under ACTION_IGNORE; otherwise it is dropped.
                if not alt_chars and self.ascii_strategy == ACTION_IGNORE:
                    return

            if alt_chars:
                variations.append(alt_chars)

        if variations:
            for variant in product(*variations):
                yield "".join(variant)

    def get_all_combinations(self, text):
        """
        :return: list of all homoglyph variants of ``text``
        """
        return list(self._get_combinations(text))

    def _convert_to_ascii(self, text):
        """
        :return: generator of the variants of ``text`` made only of
            characters in ``self.ascii_range``
        """
        for variant in self._get_combinations(text, ascii=True):
            # Defensive re-check; the characters were already filtered.
            if max(map(ord, variant)) in self.ascii_range:
                yield variant

    def convert_to_ascii(self, text):
        """
        :return: sorted list of the distinct ASCII variants of ``text``
        """
        return self._uniq_and_sort(self._convert_to_ascii(text))
|
|