# Scraped page header (not part of the module): charlesapochi, "Initial commit", 41aed49
"""
Updated version of core.py from
https://github.com/yamatt/homoglyphs/tree/main/homoglyphs_fork
for modern Python 3.
"""
from collections import defaultdict
import json
from itertools import product
import os
import unicodedata
# What to do with a character that is missing from the active alphabet.
ACTION_LOAD = 1  # extend the alphabet with this character's language/category
ACTION_IGNORE = 2  # keep the character in the result unchanged
ACTION_REMOVE = 3  # drop the character from the result

# Code points accepted as ASCII by default: 0 through 127.
ASCII_CHAR_RANGE = range(0, 128)

# The JSON data files are read from a "data" directory next to this module.
CURRENT_DIRECTORY = os.path.dirname(os.path.abspath(__file__))
DATA_DIRECTORY = os.path.join(CURRENT_DIRECTORY, "data")
class UnicodeCategories:
    """
    Work with aliases from ISO 15924.
    https://en.wikipedia.org/wiki/ISO_15924#List_of_codes

    Every lookup re-reads ``file_path``, a JSON document of which two keys
    are used here: ``"aliases"`` (the valid category names) and ``"points"``
    (entries whose first three items are start code, end code and category).
    """

    file_path = os.path.join(DATA_DIRECTORY, "categories.json")

    @classmethod
    def _load_data(cls):
        """
        Parse the JSON document stored at ``cls.file_path``.

        :return: the decoded JSON document
        """
        with open(cls.file_path, encoding="utf-8") as file:
            return json.load(file)

    @classmethod
    def _get_unicode_ranges(cls, categories):
        """
        Yield the code point ranges that belong to ``categories``.

        :param categories: iterable of category aliases; it is materialized
            once, so one-shot iterators work as well as lists and sets
        :return: iter: (start code, end code), both ends inclusive
        :rtype: generator
        :raises ValueError: if a category is not a known alias; because this
            is a generator, the error surfaces when iteration starts
        """
        data = cls._load_data()
        # Materialize first: the values are needed twice (validation, then
        # filtering) and a one-shot iterator would be empty the second time.
        wanted = list(categories)
        for category in wanted:
            if category not in data["aliases"]:
                raise ValueError(f"Invalid category: {category}")
        # Everything left is a known alias, so a set gives cheap membership.
        wanted = set(wanted)
        for point in data["points"]:
            if point[2] in wanted:
                yield point[:2]

    @classmethod
    def get_category_alphabet(cls, categories):
        """
        Collect every character that belongs to the given categories.

        :param categories: iterable of category aliases
        :return: set of chars in alphabet by categories list
        :rtype: set
        :raises ValueError: if a category is not a known alias
        """
        alphabet = set()
        for start, end in cls._get_unicode_ranges(categories):
            # Ranges are inclusive on both ends.
            alphabet.update(chr(code) for code in range(start, end + 1))
        return alphabet

    @classmethod
    def identify_category(cls, char):
        """
        Find the category of a single character.

        The first word of the character's Unicode name is used when it is a
        known alias; otherwise the code point is looked up in the ranges
        from the JSON file.

        :param char: a single character
        :return: category, or ``None`` when no range contains the character
        :rtype: str or None
        """
        data = cls._load_data()
        # try detect category by unicodedata
        try:
            category = unicodedata.name(char).split()[0]
        except (TypeError, ValueError):
            # No usable Unicode name; fall through to the range lookup.
            pass
        else:
            if category in data["aliases"]:
                return category
        # try detect category by ranges from JSON file.
        code = ord(char)
        for point in data["points"]:
            if point[0] <= code <= point[1]:
                return point[2]
        return None

    @classmethod
    def get_all_categories(cls):
        """
        :return: every category alias listed in the JSON file
        :rtype: set
        """
        return set(cls._load_data()["aliases"])
class LanguageIdentifiers:
    """
    Look up per-language alphabets stored in ``languages.json``.

    The file is read on every call; its top level maps a language code to
    that language's alphabet (a container of characters).
    """

    file_path = os.path.join(DATA_DIRECTORY, "languages.json")

    @classmethod
    def get_language_alphabet(cls, languages):
        """
        Merge the alphabets of several languages.

        :param languages: iterable of language codes
        :return: set of chars in alphabet by languages list
        :rtype: set
        :raises ValueError: if a language code is not present in the file
        """
        with open(cls.file_path, encoding="utf-8") as handle:
            alphabets = json.load(handle)
        merged = set()
        for code in languages:
            if code not in alphabets:
                raise ValueError(f"Invalid language code: {code}")
            merged |= set(alphabets[code])
        return merged

    @classmethod
    def identify_languages(cls, char):
        """
        Find the languages that use a character.

        :param char: character to look for
        :return: set of languages which alphabet contains passed char.
        :rtype: set
        """
        with open(cls.file_path, encoding="utf-8") as handle:
            alphabets = json.load(handle)
        return {code for code, chars in alphabets.items() if char in chars}

    @classmethod
    def get_all_languages(cls):
        """
        :return: every language code present in the file
        :rtype: set
        """
        with open(cls.file_path, encoding="utf-8") as handle:
            alphabets = json.load(handle)
        return set(alphabets)
class HomoglyphManager:
    """
    Build homoglyph variants of text within a working alphabet.

    The alphabet is the union of the explicit ``alphabet`` characters, the
    characters of the requested Unicode ``categories`` and the alphabets of
    the requested ``languages``.  ``self.table`` maps a character of the
    alphabet to the confusable characters that are also in the alphabet.
    """

    def __init__(
        self,
        categories=None,
        languages=None,
        alphabet=None,
        strategy=ACTION_IGNORE,
        ascii_strategy=ACTION_IGNORE,
        ascii_range=ASCII_CHAR_RANGE,
    ):
        """
        :param categories: iterable of category aliases; ``("LATIN",
            "COMMON")`` is used when categories, languages and alphabet are
            all empty
        :param languages: iterable of language codes
        :param alphabet: iterable of extra characters
        :param strategy: what to do with a character outside the alphabet:
            ``ACTION_LOAD``, ``ACTION_IGNORE`` or ``ACTION_REMOVE``
        :param ascii_strategy: only compared with ``ACTION_IGNORE`` during
            ASCII conversion; it is not validated here
        :param ascii_range: container of code points accepted as ASCII
        :raises ValueError: if ``strategy`` is not one of the three actions
        """
        # strategies
        if strategy not in (ACTION_LOAD, ACTION_IGNORE, ACTION_REMOVE):
            raise ValueError("Invalid strategy")
        self.strategy = strategy
        self.ascii_strategy = ascii_strategy
        self.ascii_range = ascii_range
        # HomoglyphManager must be initialized by any alphabet for correct work
        if not categories and not languages and not alphabet:
            categories = ("LATIN", "COMMON")
        # cats and langs
        self.categories = set(categories or [])
        self.languages = set(languages or [])
        # alphabet
        self.alphabet = set(alphabet or [])
        if self.categories:
            self.alphabet.update(
                UnicodeCategories.get_category_alphabet(self.categories)
            )
        if self.languages:
            self.alphabet.update(
                LanguageIdentifiers.get_language_alphabet(self.languages)
            )
        self.table = self._generate_homoglyph_table(self.alphabet)

    @staticmethod
    def _generate_homoglyph_table(alphabet):
        """
        Map each character of ``alphabet`` to its confusables in ``alphabet``.

        :param alphabet: container of characters
        :return: defaultdict(set) with an entry for every character that has
            at least one confusable inside the alphabet
        """
        # Same lookup as the restricted table, with source == target.
        return HomoglyphManager._generate_restricted_table(alphabet, alphabet)

    @staticmethod
    def _generate_restricted_table(source_alphabet, target_alphabet):
        """
        Map characters of one alphabet to their confusables in another.

        :param source_alphabet: characters to find confusables for
        :param target_alphabet: characters allowed as confusables
        :return: defaultdict(set) with an entry for every source character
            that has at least one confusable inside ``target_alphabet``
        """
        table = defaultdict(set)
        # Explicit UTF-8, like the other data files: without it the decoding
        # depends on the locale's preferred encoding.
        path = os.path.join(DATA_DIRECTORY, "confusables.json")
        with open(path, encoding="utf-8") as file:
            data = json.load(file)
        for char in source_alphabet:
            if char in data:
                for homoglyph in data[char]:
                    if homoglyph in target_alphabet:
                        table[char].add(homoglyph)
        return table

    @staticmethod
    def _uniq_and_sort(data):
        """
        Deduplicate ``data`` and sort it, longest items first, then by value.

        :param data: iterable of strings
        :rtype: list
        """
        return sorted(set(data), key=lambda item: (-len(item), item))

    def _update_alphabet_with_char(self, char):
        """
        Extend the alphabet so that it covers ``char`` and rebuild the table.

        Languages containing the character are preferred; when there are
        none, the character's Unicode category is loaded instead.

        :return: ``False`` when neither a language nor a category was found
            (nothing is changed), ``True`` otherwise
        :rtype: bool
        """
        # try detect languages
        langs = LanguageIdentifiers.identify_languages(char)
        if langs:
            self.languages.update(langs)
            self.alphabet.update(LanguageIdentifiers.get_language_alphabet(langs))
        else:
            # try detect categories
            category = UnicodeCategories.identify_category(char)
            if category is None:
                return False
            self.categories.add(category)
            self.alphabet.update(
                UnicodeCategories.get_category_alphabet([category])
            )
        # update table for new alphabet
        self.table = self._generate_homoglyph_table(self.alphabet)
        return True

    def _get_char_variants(self, char):
        """
        List ``char`` together with its homoglyphs and their homoglyphs.

        A character outside the alphabet is handled according to
        ``self.strategy``: the alphabet is extended (``ACTION_LOAD``), the
        character is returned alone (``ACTION_IGNORE``) or nothing is
        returned (``ACTION_REMOVE``).

        :rtype: list
        """
        if char not in self.alphabet:
            if self.strategy == ACTION_LOAD:
                if not self._update_alphabet_with_char(char):
                    return []
            elif self.strategy == ACTION_IGNORE:
                return [char]
            elif self.strategy == ACTION_REMOVE:
                return []
        # Work on a copy: updating the set stored in self.table would grow
        # the table on every call and make repeated calls return more and
        # more variants for the same character.
        alt_chars = set(self.table.get(char, ()))
        # add the alternatives of every direct alternative (one extra level);
        # iterate over a snapshot because alt_chars grows inside the loop
        for alt_char in tuple(alt_chars):
            alt_chars.update(self.table.get(alt_char, ()))
        # add current char to alternatives
        alt_chars.add(char)
        # uniq, sort and return
        return self._uniq_and_sort(alt_chars)

    def _get_combinations(self, text, ascii=False):
        """
        Yield every string made by picking one variant per character.

        :param text: string to expand
        :param ascii: when true, keep only variants whose code point is in
            ``self.ascii_range``; if a character is left without variants
            and ``self.ascii_strategy`` is ``ACTION_IGNORE``, nothing is
            yielded at all
        :return: generator of strings
        """
        variations = []
        for char in text:
            alt_chars = self._get_char_variants(char)
            if ascii:
                alt_chars = [
                    alt for alt in alt_chars if ord(alt) in self.ascii_range
                ]
                if not alt_chars and self.ascii_strategy == ACTION_IGNORE:
                    return
            # characters without any variant are skipped
            if alt_chars:
                variations.append(alt_chars)
        if variations:
            for variant in product(*variations):
                yield "".join(variant)

    def get_all_combinations(self, text):
        """
        :return: every homoglyph combination of ``text``
        :rtype: list
        """
        return list(self._get_combinations(text))

    def _convert_to_ascii(self, text):
        """
        Yield the combinations of ``text`` restricted to ``self.ascii_range``.

        :return: generator of strings
        """
        for variant in self._get_combinations(text, ascii=True):
            # keep the variant when its highest code point is in the range
            if max(map(ord, variant)) in self.ascii_range:
                yield variant

    def convert_to_ascii(self, text):
        """
        :return: unique ASCII variants of ``text``, longest first
        :rtype: list
        """
        return self._uniq_and_sort(self._convert_to_ascii(text))