from difflib import Differ from typing import Tuple, Iterable import logging def compute_diff(text1, text2) -> Iterable[Tuple[str, str | None]]: d = Differ() pairs = [ (token[2:], token[0] if token[0] != " " else None) for token in d.compare(text1, text2) ] return _postprocess_compute_diff(pairs) def _postprocess_compute_diff(pairs: Iterable[Tuple[str, str | None]]) -> Iterable[Tuple[str, str | None]]: """Whitespace deletions add additions are missed by the diff component.""" for idx, (char, flag) in enumerate(pairs): if char == " " and flag in ["+", "-"]: if idx > 0 and idx < len(pairs): if pairs[idx - 1][1] == flag or pairs[idx + 1][1] == flag: yield (" ", flag) else: yield ("^", "+") if flag == "+" else ("#", "-") else: yield (char, flag) def get_logger(): logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) return logging.getLogger(__name__) LANGS = { "All": None, "English": "en", "French": "fr", "German": "de", "Spanish": "es", "Italian": "it", "Dutch": "nl", "Polish": "pl", "Portuguese": "pt", "Swedish": "sv", "Bulgarian": "bg", "Romanian": "ro", "Finnish": "fi", "Russian": "ru", "Norwegian Bokmål": "nb", "Czech": "cs", "Thai": "th", "Danish": "da", "Croatian": "hr", "Hungarian": "hu", "Arabic": "ar", "Greek": "el", "Japanese": "ja", "Catalan": "ca", "Serbian": "sr", "Slovenian": "sl", "Slovak": "sk", "Turkish": "tr", "Lithuanian": "lt", "Chinese": "zh", "Estonian": "et", "Latvian": "lv", "Undefined": "xx", "Ukrainian": "uk", "Indonesian": "id", "Hebrew": "he", "Vietnamese": "vi", "Icelandic": "is", "Latin": "la", "Korean": "ko", "Albanian": "sq", "Georgian": "ka", "Malay": "ms", "Bosnian": "bs", "Persian": "fa", "Bengali": "bn", "Galician": "gl", "Kazakh": "kk", "Macedonian": "mk", "Norwegian Nynorsk": "nn", "Hindi": "hi", "Afar": "aa", "Uzbek": "uz", "Somali": "so", "Afrikaans": "af" }