mohammadkrb committed on
Commit
6227608
1 Parent(s): 3fd00d4

init streamlit based app

Files changed (11)
  1. OneShotTransformer.py +600 -0
  2. VerbHandler.py +350 -0
  3. app.py +128 -0
  4. config.yml +5 -0
  5. download_utils.py +65 -0
  6. formality_transformer.py +40 -0
  7. kenlm_wrapper.py +31 -0
  8. main.py +96 -0
  9. requirements.txt +7 -0
  10. tokenizer.py +184 -0
  11. utils.py +302 -0
OneShotTransformer.py ADDED
@@ -0,0 +1,600 @@
1
+ import re
2
+ import itertools
3
+ import string
4
+ import utils
5
+
6
+
7
+ class InformalWord:
8
+ def __init__(self, lemma, prefixs=None, postfixs=None, pos=None, append_h=False):
9
+ if prefixs is None:
10
+ prefixs = []
11
+ if postfixs is None:
12
+ postfixs = []
13
+ self.is_verb = False
14
+ self.is_mapper = False
15
+ self.semi_mapper = False
16
+ self.append_h = append_h
17
+ self.lemma = lemma
18
+ self.prefixs = prefixs
19
+ self.postfixs = postfixs
20
+ self.pos = pos
21
+
22
+ class Prefix:
23
+ def __init__(self, word, level, formal=None, ignore_poses=None, poses=None, non_connecting_chars=None, connector='nim'):
24
+ if non_connecting_chars is None:
25
+ non_connecting_chars = []
26
+ self.word = word
27
+ self.level = level
28
+ self.ignore_poses = ignore_poses
29
+ self.poses = poses
30
+ self.connector = connector
31
+ if formal is None:
32
+ self.formal = word
33
+ else:
34
+ self.formal = formal
35
+ self.non_connecting_chars = non_connecting_chars
36
+ class Postfix:
37
+ def __init__(self, word, level, formal=None, ignore_poses=None, non_connecting_chars=None, poses=None, connector='nim'):
38
+ if non_connecting_chars is None:
39
+ non_connecting_chars = []
40
+ self.word = word
41
+ self.level = level
42
+ self.ignore_poses = ignore_poses
43
+ self.poses = poses
44
+ self.connector = connector
45
+ if formal is None:
46
+ self.formal = word
47
+ else:
48
+ self.formal = formal
49
+ self.non_connecting_chars = non_connecting_chars
50
+
51
+
52
+
53
+ class OneShotTransformer:
54
+
55
+ NIM_FASELE = chr(8204)
56
+ # prefixs
57
+ HAMUN = Prefix('همون', 1, 'همان',connector='fasele',non_connecting_chars=['ه'])
58
+ HAMIN = Prefix('همین', 1,connector='fasele')
59
+ HAR = Prefix('هر', 1,connector='fasele')
60
+ UN = Prefix('اون', 1, 'آن',connector='fasele',non_connecting_chars=['ه'])
61
+ IN = Prefix('این', 1,connector='fasele',non_connecting_chars=['ه'])
62
+ HICH = Prefix('هیچ', 1,connector='nim',non_connecting_chars=['ه', 'ا', 'آ'])
63
+ B = Prefix('ب', 1, 'به', ignore_poses=['VERB', 'CCONJ', 'SCONJ'],connector='fasele',non_connecting_chars=['ا', 'ه', 'آ'])
64
+ Y = Prefix('ی', 1, 'یک', ignore_poses=['VERB', 'CCONJ', 'SCONJ'],connector='fasele',non_connecting_chars=['ا', 'آ'])
65
+ BI = Prefix('بی', 1, ignore_poses=['VERB'],connector='nim',non_connecting_chars=['ا'])
66
+ POR = Prefix('پر', 1, ignore_poses=['VERB'],connector='nim')
67
+ pres = [[HAMIN, HAMUN, UN, IN, HAMIN, BI, B, Y, POR, HAR]]
68
+ #postfixs
69
+ Y1 = Postfix('ی', 0, ignore_poses=['VERB'], connector='none',non_connecting_chars=['ی', 'ا', 'و', 'آ', 'اً'])
70
+ TAR = Postfix('تر', 1, connector='nim')
71
+ TARIN = Postfix('ترین', 1, connector='nim')
72
+ HAY = Postfix('های', 2, connector='nim')
73
+ HA = Postfix('ها', 2, connector='nim')
74
+ A = Postfix('ا', 2, 'ها', ignore_poses=['VERB'], connector='nim',non_connecting_chars=['ا', 'و', 'آ', 'اً'])
75
+ A1 = Postfix('ای', 2, 'های', ignore_poses=['VERB'], connector='nim',non_connecting_chars=['ا', 'و', 'آ', 'اً'])
76
+ YY = Postfix('یی', 3, 'یی', ignore_poses=['VERB'], connector='none')
77
+ M = Postfix('م', 3, ignore_poses=['VERB'], connector='none')
78
+ M_MAN = Postfix('م', 3, 'من', ignore_poses=['VERB'], connector='fasele')
79
+ T = Postfix('ت', 3, connector='none')
80
+ T1 = Postfix('ت', 3, 'تو', connector='fasele')
81
+ # T2 = Postfix('ت', 3, 'خود', ignore_poses=['VERB'], connector='fasele')
82
+ SH = Postfix('ش', 3, connector='none')
83
+ # SH1 = Postfix('ش', 3, 'خود', connector='fasele')
84
+ # SH2 = Postfix('ش', 3, 'آن', connector='fasele')
85
+ # SH3 = Postfix('ش', 3, 'او', connector='fasele')
86
+ MAN = Postfix('مان', 3, connector='nim')
87
+ MAN1 = Postfix('مان', 3, 'ما', connector='fasele')
88
+ # MAN2 = Postfix('مان', 3, 'خود', connector='fasele')
89
+ MUN = Postfix('مون', 3, 'مان', connector='nim')
90
+ # MUN1 = Postfix('مون', 3, 'خود', connector='fasele')
91
+ MUN2 = Postfix('مون', 3, 'ما', connector='fasele')
92
+ TAN = Postfix('تان', 3, connector='nim')
93
+ # TAN1 = Postfix('تان', 3, 'خود', connector='fasele')
94
+ TAN2 = Postfix('تان', 3, 'شما', connector='fasele')
95
+ TUN = Postfix('تون', 3, 'تان', connector='nim')
96
+ # TUN1 = Postfix('تون', 3, 'خود', connector='fasele')
97
+ TUN2 = Postfix('تون', 3, 'شما', connector='fasele')
98
+ SHAN = Postfix('شان', 3, connector='nim')
99
+ # SHAN1 = Postfix('شان', 3, 'خود', connector='fasele')
100
+ SHAN2 = Postfix('شان', 3, 'آنان', connector='fasele')
101
+ SHUN = Postfix('شون', 3, 'شان', connector='nim')
102
+ # SHUN1 = Postfix('شون', 3, 'خود', connector='fasele')
103
+ SHUN2 = Postfix('شون', 3, 'آنان', connector='fasele')
104
+ N = Postfix('ن', 4, 'هستند', ignore_poses=['VERB', 'CCONJ', 'SCONJ'], connector='fasele', non_connecting_chars=['ی'])
105
+ SHAM = Postfix('شم', 4, 'بشوم',ignore_poses=['VERB'], connector='fasele')
106
+ SHI= Postfix('شی', 4, 'بشوی',ignore_poses=['VERB'], connector='fasele')
107
+ SHE= Postfix('شه', 4, 'شود',ignore_poses=['VERB'], connector='fasele')
108
+ SHIN= Postfix('شین', 4, 'شوید',ignore_poses=['VERB'], connector='fasele')
109
+ SHID= Postfix('شید', 4, 'شوید',ignore_poses=['VERB'], connector='fasele')
110
+ SHAAN= Postfix('شن', 4, 'شوند',ignore_poses=['VERB'], connector='fasele')
111
+ SHAND= Postfix('شند', 4, 'شوند',ignore_poses=['VERB'], connector='fasele')
112
+ M2 = Postfix('م', 4, 'هم',ignore_poses=['VERB'], connector='fasele')
113
+ V = Postfix('و', 4, 'را', connector='fasele', non_connecting_chars=['ا', 'ای', 'آ', 'اً'])
114
+ V1 = Postfix('رو', 4, 'را', connector='fasele')
115
+ H = Postfix('ه', 4, '', ignore_poses=['VERB', 'CCONJ', 'SCONJ'], connector='none')
116
+ # H2 = Postfix('ه', 4)
117
+ M1 = Postfix('م', 4, 'هستم',ignore_poses=['VERB'], connector='fasele')
118
+ Y2 = Postfix('ی', 4, 'ی', ignore_poses=['VERB'], connector='none')
119
+ H1 = Postfix('ه', 4, 'است', ignore_poses=['VERB'], connector='fasele', non_connecting_chars=['ا', 'آ', 'اً'])
120
+ S = Postfix('س', 4, 'است', connector='fasele')
121
+ ST = Postfix('ست', 4, 'است', connector='fasele')
122
+ ED = Postfix('ید', 4, 'هستید', ignore_poses=['VERB'], connector='fasele')
123
+ EN = Postfix('ین', 4, 'هستید', ignore_poses=['VERB'], connector='fasele', non_connecting_chars=['تر'])
124
+ EM = Postfix('یم', 4, 'هستیم', ignore_poses=['VERB'], connector='fasele')
125
+ ND = Postfix('ند', 4, 'هستند', ignore_poses=['VERB'], connector='fasele')
126
+ # posts = [[Y1], [TAR, TARIN], [HA, HAY, A, A1], [M, T, SH, MAN, MUN, TAN, TUN, SHAN, SHUN], [N, S, ST, M1, M2, V, V1,Y2, H, H1, ED, EN, EM, ND, SHAM, SHI, SHID, SHE, SHAND, SHIN, SHAAN]]
127
+ # posts = [[Y1], [TAR, TARIN], [HA, HAY, A, A1], [YY, M, M_MAN, T, T1, T2, SH, MAN, MAN1, MAN2,MUN,MUN1,MUN2, TAN,TAN1,TAN2, TUN,TUN1,TUN2, SHAN,SHAN1,SHAN2, SHUN, SHUN1, SHUN2], [N, S, ST, M1, M2, V, V1,Y2, H1, ED, EN, EM, ND, SHAM, SHI, SHID, SHE, SHAND, SHIN, SHAAN]]
128
+ posts = [[Y1], [TAR, TARIN], [HA, HAY, A, A1], [YY, M, M_MAN, T, T1, SH, MAN, MAN1,MUN,MUN2, TAN,TAN2, TUN,TUN2, SHAN,SHAN2, SHUN, SHUN2], [N, S, ST, M1, M2, V, V1,Y2, H1, ED, EN, EM, ND, SHAM, SHI, SHID, SHE, SHAND, SHIN, SHAAN]]
129
+ PossessiveـPronouns = [M,T,SH, MAN, MUN, TAN, TUN, SHAN, SHUN]
130
+ cant_append_h_posts = [Y1, TAR, TARIN]
131
+ As = [A, A1]
132
+
133
+ def get_separator(self, w1, w2, append_h):
134
+ connector_2_str = {'none': '', 'nim': OneShotTransformer.NIM_FASELE, 'fasele': ' '}
135
+ not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
136
+ # if w2 == OneShotTransformer.Y2:
137
+ # return ''
138
+ # if w2 in [OneShotTransformer.M, OneShotTransformer.T, OneShotTransformer.SH] and ( type(w1) == str and w1[-1] in ['ا', 'و']):
139
+ # return 'ی'
140
+ # if type(w1) != str and w1.level == 1:
141
+ # return ' '
142
+ # not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
143
+ # if w1 in [OneShotTransformer.Y, OneShotTransformer.B, OneShotTransformer.HAMIN, OneShotTransformer.IN, OneShotTransformer.HAMUN] or w2 in [OneShotTransformer.ED, OneShotTransformer.EN, OneShotTransformer.EM, OneShotTransformer.ND, OneShotTransformer.H1, OneShotTransformer.M1, OneShotTransformer.S, OneShotTransformer.ST, OneShotTransformer.V, OneShotTransformer.N, OneShotTransformer.M2]:
144
+ # return ' '
145
+ #
146
+ # if ((type(w1) == str and len(w1)> 0 and w1[-1] in ['ا', 'و']) or (type(w1) != str and w1.formal[-1] in [ 'ا', 'و']))and w2.level == 3 :
147
+ # return 'ی' + '‌'
148
+ # if (type(w1) == str and len(w1)> 0 and w1[-1] in not_connect_chars) or (type(w1) != str and w1.word[-1] in not_connect_chars):
149
+ # return ''
150
+ all_pres = [p for pres in OneShotTransformer.pres for p in pres]
151
+ all_posts = [p for posts in OneShotTransformer.posts for p in posts]
152
+ if type(w1) == str:
153
+ last_ch = w1[-1]
154
+ else:
155
+ last_ch = w1.word[-1]
156
+ separator = ''
157
+ extra_sep = ''
158
+ if type(w1) == str and append_h and w2 in [OneShotTransformer.M, OneShotTransformer.T, OneShotTransformer.SH]:
159
+ extra_sep = OneShotTransformer.NIM_FASELE + 'ا'
160
+ if w2 in [OneShotTransformer.M, OneShotTransformer.T, OneShotTransformer.SH, OneShotTransformer.MAN, OneShotTransformer.MUN, OneShotTransformer.TAN, OneShotTransformer.TUN, OneShotTransformer.SHAN, OneShotTransformer.SHUN] and ( last_ch in ['ا', 'و']) :
161
+ extra_sep = 'ی'
162
+ if w1 in all_pres:
163
+ separator = connector_2_str[w1.connector]
164
+ if w2 in all_posts:
165
+ separator = connector_2_str[w2.connector]
166
+
167
+ # replace nim_fasele with '' for non connected words
168
+
169
+ if last_ch in not_connect_chars and separator == OneShotTransformer.NIM_FASELE:
170
+ separator = ''
171
+ return extra_sep + separator
172
+
173
+ def lemma_to_formals(self, iword):
174
+ out_iwords = [iword]
175
+ if iword.lemma in self.mapper and self.iword2str(iword) != self.mapper[iword.lemma]:
176
+ for map_words in self.mapper[iword.lemma]:
177
+ new_iw = InformalWord(lemma=map_words,prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos, append_h=iword.append_h)
178
+ if not iword.prefixs and not iword.postfixs:
179
+ new_iw.is_mapper = True
180
+ new_iw.semi_mapper = True
181
+ else:
182
+ new_iw.semi_mapper = True
183
+ out_iwords.append(new_iw)
184
+ formal_verbs = self.verb_to_formal_func(iword.lemma)
185
+ if formal_verbs is not None:
186
+ for f_v in formal_verbs:
187
+ new_iw = InformalWord(lemma=f_v,prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos, append_h=iword.append_h)
188
+ new_iw.is_verb = True
189
+ out_iwords.append(new_iw)
190
+ return out_iwords
191
+
192
+
193
+ def should_ignore_by_postagg(self, iword):
194
+ post_pres = [pre for pre in iword.prefixs] + [post for post in iword.postfixs]
195
+ for p in post_pres:
196
+ if (p.ignore_poses and iword.pos in p.ignore_poses) or (p.poses and iword.pos not in p.poses):
197
+ return True
198
+ return False
199
+
200
+ def filtered_based_on_rules(self, iword):
201
+ #YY
202
+ ha_p = [OneShotTransformer.A, OneShotTransformer.HA]
203
+ if iword.postfixs and OneShotTransformer.YY in iword.postfixs and not all(p in ha_p + [OneShotTransformer.YY] for p in iword.postfixs):
204
+ return True
205
+ #hasti!
206
+ if (iword.postfixs and len(iword.postfixs) == 1 and OneShotTransformer.Y2 in iword.postfixs and iword.lemma and iword.lemma[-1] in ['و', 'ا']) or (iword.postfixs and len(iword.postfixs) == 2 and OneShotTransformer.Y2 in iword.postfixs and iword.postfixs[0] in [OneShotTransformer.A, OneShotTransformer.HA]):
207
+ return True
208
+ #non connecting chars
209
+ if iword.prefixs:
210
+ last_pre = iword.prefixs[-1]
211
+ if last_pre.non_connecting_chars and iword.lemma and any(iword.lemma.startswith(ch) for ch in last_pre.non_connecting_chars):
212
+ return True
213
+ if iword.postfixs:
214
+ first_post = iword.postfixs[0]
215
+ if first_post.non_connecting_chars and iword.lemma and any(iword.lemma.endswith(ch) for ch in first_post.non_connecting_chars):
216
+ return True
217
+ #hidden H # goshnashe
218
+ if not iword.semi_mapper and not iword.append_h and iword.lemma and iword.lemma[-1] == 'ه' and iword.postfixs and iword.lemma not in self.non_hidden_h_words:
219
+ return True
220
+ # h + h
221
+ if iword.prefixs and iword.postfixs and len(iword.lemma) < 2:
222
+ return True
223
+ # خونهه - خونششونه
224
+ if iword.append_h and (OneShotTransformer.H in iword.postfixs or (len(iword.postfixs) == 1 and OneShotTransformer.H1 in iword.postfixs) ):
225
+ return True
226
+ if iword.prefixs and (OneShotTransformer.B in iword.prefixs or OneShotTransformer.Y in iword.prefixs) and (iword.lemma and iword.lemma[0] in ['ا', 'ی', 'و']):
227
+ return True
228
+ if iword.lemma in self.isolated_words and (iword.prefixs or iword.postfixs):
229
+ return True
230
+ # verb + postfixs ex: برنامه
231
+ if (iword.is_verb and iword.prefixs) or(iword.is_verb and iword.postfixs and (len(iword.postfixs) > 1 or not any(p in iword.postfixs for p in OneShotTransformer.PossessiveـPronouns +[OneShotTransformer.V]))):
232
+ return True
233
+ return False
234
+
235
+ def iword2str(self, iword):
236
+ sorted_prefixs = list(sorted(iword.prefixs, key=lambda prefix: prefix.level))
237
+ sorted_postfixs = list(sorted(iword.postfixs, key=lambda postfix: postfix.level))
238
+ concated_str = ''
239
+ zipped_prefixs = [(sorted_prefixs[i], sorted_prefixs[i + 1]) if i < len(sorted_prefixs) - 1 else (
240
+ sorted_prefixs[i], iword.lemma) for i in range(len(sorted_prefixs))]
241
+ for prev_prefix, prefix in zipped_prefixs:
242
+ separator = self.get_separator(prev_prefix, prefix, append_h=False)
243
+ prefix_formal = prev_prefix.formal
244
+ concated_str += prefix_formal
245
+ concated_str += separator
246
+
247
+ concated_str += iword.lemma
248
+
249
+ zipped_postfix = [(sorted_postfixs[i - 1], sorted_postfixs[i]) if i > 0 else (iword.lemma, sorted_postfixs[i])
250
+ for i in range(len(sorted_postfixs))]
251
+ for postfix, next_postfix in zipped_postfix:
252
+ separator = self.get_separator(postfix, next_postfix, append_h=iword.append_h)
253
+ concated_str += separator
254
+ postfix_formal = next_postfix.formal
255
+ concated_str += postfix_formal
256
+ return concated_str
257
+
258
+ def to_formals(self, iword):
259
+ str_iwords = []
260
+ all_iwords = self.lemma_to_formals(iword)
261
+ for iword in all_iwords:
262
+ # if iword.lemma == 'اون':
263
+ # print('')
264
+ if len(iword.lemma) == 1 and iword.lemma != 'و':
265
+ str_iwords.append(('', None))
266
+ continue
267
+ if self.filtered_based_on_rules(iword):
268
+ str_iwords.append(('', None))
269
+ continue
270
+ if self.should_ignore_by_postagg(iword):
271
+ str_iwords.append(('', None))
272
+ continue
273
+ if not iword.is_verb and not iword.semi_mapper and iword.lemma not in self.vocab:
274
+ str_iwords.append(('', None))
275
+ continue
276
+ concated_str = self.iword2str(iword)
277
+ str_iwords.append((concated_str, iword))
278
+ return str_iwords
279
+
280
+ def un_in(self, iword):
281
+ new_lemma = iword.lemma.replace('ون', 'ان')
282
+ if new_lemma != iword.lemma:
283
+ return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
284
+ else:
285
+ return False
286
+
287
+ def prefix_obj(self, word):
288
+ op_separete = {'م': 'من', 'ت': 'تو', 'ش': 'آن', 'تان': 'شما', 'تون': 'شما', 'شون': 'آنان', 'شان': 'آنان',
289
+ 'مان': 'ما', 'مون': 'ما'}
290
+ candidates = []
291
+ formal = ''
292
+ m = self.pre_obj_pattern.match(word)
293
+ if m:
294
+ tokens = m.groups()
295
+ if tokens[0] == 'باها':
296
+ formal += 'با'
297
+ else:
298
+ formal += tokens[0]
299
+ formal_obj = op_separete[tokens[1]]
300
+ formal += ' '
301
+ formal += formal_obj
302
+ if tokens[2] is not None:
303
+ formal += ' '
304
+ formal += 'هم'
305
+ alts = {'هم': 'هستم', 'آن': 'او'}
306
+ tokens = [[w] for w in formal.split()]
307
+ for t in tokens:
308
+ if t[0] in alts:
309
+ t.append(alts[t[0]])
310
+
311
+ candidates = itertools.product(*tokens)
312
+ candidates = [' '.join(cnd) for cnd in candidates]
313
+
314
+ return [(c, c) for c in candidates]
315
+
316
+
317
+
318
+ def append_tanvin_hat(self, iword):
319
+ if len(iword.lemma) > 1 and iword.lemma[0] == 'ا' and iword.lemma[-1] != 'ا':
320
+ new_lemma = 'آ' + iword.lemma[1:]
321
+ return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
322
+ if len(iword.lemma) > 1 and iword.lemma[-1] == 'ا':
323
+ new_lemma = iword.lemma[:-1] + 'اً'
324
+ return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
325
+ return False
326
+
327
+ def append_h(self, iword):
328
+ not_apply = self.verb_to_formal_func(iword.lemma) or (iword.lemma and iword.lemma[-1] in ['ا', 'و', 'ی']) or len(iword.lemma) <= 1 or iword.lemma =='' or iword.lemma[-1] == 'ه' or (OneShotTransformer.H in iword.postfixs and len(iword.postfixs) == 1) or any(p in iword.postfixs for p in OneShotTransformer.As) or(OneShotTransformer.V in iword.postfixs) or (iword.postfixs and iword.postfixs[0].word[0] in ['ی', 'و','ا'])
329
+ ######## when add h?
330
+ new_lemma = iword.lemma + 'ه'
331
+ ############# new_lemma in self.vocab
332
+ if len(iword.postfixs) > 0 and not any([p in OneShotTransformer.cant_append_h_posts for p in iword.postfixs]) and not not_apply and new_lemma not in self.non_hidden_h_words:
333
+ # if len(iword.postfixs) > 0 and not not_apply and new_lemma in self.vocab and new_lemma not in self.non_hidden_h_words:
334
+ return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos, append_h= True)
335
+ return False
336
+
337
+ def __init__(self, vocab, mapper, verb_to_formal_func, ignore_words, postfix_mapper, isolated_words, non_hidden_h_words):
338
+ self.vocab = vocab
339
+ self.mapper = mapper
340
+ self.verb_to_formal_func = verb_to_formal_func
341
+ self.ignore_words = ignore_words
342
+ self.postfix_mapper = postfix_mapper
343
+ self.isolated_words = isolated_words
344
+ self.non_hidden_h_words = non_hidden_h_words
345
+ self.operators = [self.un_in, self.append_h, self.append_tanvin_hat]
346
+ patt = r'(از|به|باها)(مان|شون|شان|مون|م|تون|تان|ت|ش)(م)?$'
347
+ self.pre_obj_pattern = re.compile(patt)
348
+
349
+ def all_sequence_of_postfixs(self, word, index):
350
+ all_seqs =[]
351
+ for p in OneShotTransformer.posts[index]:
352
+ p_w = p.word
353
+ if word.startswith(p_w):
354
+ w = word[len(p_w):]
355
+ if len(w) == 0:
356
+ all_seqs.append(p)
357
+ else:
358
+ if index < len(OneShotTransformer.posts) -1 :
359
+ resp = self.all_sequence_of_postfixs(w, index+1)
360
+ if len(resp) > 0:
361
+ for item in resp:
362
+ if type(item) == list:
363
+ item.append(p)
364
+ sequence_with_p = item
365
+ else:
366
+ sequence_with_p = [p, item]
367
+ all_seqs.append(sequence_with_p)
368
+ if index < len(OneShotTransformer.posts) - 1:
369
+ resp = self.all_sequence_of_postfixs(word, index + 1)
370
+ all_seqs.extend(resp)
371
+ else:
372
+ return all_seqs
373
+ return all_seqs
374
+
375
+ def combine(self, l1, l2):
376
+ if len(l1) == 0:
377
+ return l2
378
+ elif len(l2) == 0:
379
+ return l1
380
+ return list(itertools.product(l1, l2))
381
+
382
+
383
+ def get_expand(self, iword):
384
+ all_possible_words = []
385
+ for subset_operators in utils.powerset(self.operators):
386
+ new_iword = InformalWord(lemma=iword.lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
387
+ for so in subset_operators:
388
+ so_resp = so(new_iword)
389
+ if so_resp:
390
+ new_iword = so_resp
391
+ all_possible_words.append(new_iword)
392
+ return all_possible_words
393
+
394
+
395
+ def match_postfixs(self, word, pos):
396
+ possible_combinatios = []
397
+ for i in range(len(OneShotTransformer.posts)):
398
+ for p in OneShotTransformer.posts[i]:
399
+ p_word = p.word
400
+ p_indxs = [indx for indx, ch in enumerate(word) if word[indx:indx+len(p_word)] == p_word]
401
+ for p_indx in p_indxs:
402
+ if p_indx != -1:
403
+ lemma = word[:p_indx]
404
+ pp = word[p_indx + len(p_word):]
405
+ if len(pp) ==0:
406
+ iw = InformalWord(lemma=lemma, postfixs=[p], pos=pos)
407
+ possible_combinatios.append(iw)
408
+ continue
409
+ if i < len(OneShotTransformer.posts) -1:
410
+ all_postfix = self.all_sequence_of_postfixs(pp, index=i+1)
411
+ if len(all_postfix) > 0:
412
+ for pfixs in all_postfix:
413
+ if type(pfixs) == list:
414
+ pfixs.append(p)
415
+ else:
416
+ pfixs = [p, pfixs]
417
+ iw = InformalWord(lemma=lemma, postfixs=pfixs, pos=pos)
418
+ possible_combinatios.append(iw)
419
+ elif len(pp) == 0:
420
+ iw = InformalWord(lemma=lemma, postfixs=[p], pos=pos)
421
+ possible_combinatios.append(iw)
422
+
423
+ return possible_combinatios
424
+
425
+ def match_prefixs(self, word, pos):
426
+ possible_combinatios = []
427
+ for i in range(len(OneShotTransformer.pres)):
428
+ for p in OneShotTransformer.pres[i]:
429
+ if word.startswith(p.word):
430
+ lemma = word[len(p.word):]
431
+ prefixs = [p]
432
+ iw = InformalWord(lemma=lemma, prefixs=prefixs, postfixs=[], pos=pos)
433
+ possible_combinatios.append(iw)
434
+ return possible_combinatios
435
+ return []
436
+
437
+ def parse_word(self, iword):
438
+ parsed_resp = []
439
+ prefixed_word = self.match_prefixs(iword.lemma,pos=iword.pos)
440
+ prefixed_word.append(iword)
441
+ parsed_resp.extend(prefixed_word)
442
+ for pw in prefixed_word:
443
+ postfixed_iwords = self.match_postfixs(pw.lemma,pos=iword.pos)
444
+ for piw in postfixed_iwords:
445
+ piw.prefixs = pw.prefixs
446
+ parsed_resp.append(piw)
447
+ return parsed_resp
448
+
449
+ def is_seqs_of_verbs(self, txt):
450
+ words = txt.split()
451
+ if len(words) < 2:
452
+ return False
453
+ for w in words:
454
+ formal_verb = self.verb_to_formal_func(w)
455
+ if formal_verb is None:
456
+ return False
457
+ if words[-1] in ['است', 'هست']:
458
+ return False
459
+ return True
460
+
461
+ def filter_results(self, word_lemmas):
462
+ return list(filter(lambda wl: len(wl[0])>0 and wl[0][-1] != '‌' and not self.is_seqs_of_verbs(wl[0]), word_lemmas))
463
+
464
+ def concatenate_formal_words(self, pre, next):
465
+ """
466
+ خانه +‌ ت -> خانه‌ات
467
+ دیگر + ای -> دیگری
468
+ """
469
+ nim_fasele = '‌'
470
+ not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
471
+ if len(pre) < 1 :
472
+ return next
473
+ if pre[-1] in ['ه'] and next in ['م', 'ت', 'ش']:
474
+ return pre + nim_fasele + 'ا' + next
475
+ if pre[-1] == 'ا'and next.split() and next.split()[0] in ['م', 'ت', 'ش', 'مان', 'تان', 'شان']:
476
+ return pre + nim_fasele + 'ی' + next
477
+ if pre[-1] not in ['ه'] and next in ['ای']:
478
+ return pre + 'ی'
479
+ out = pre + next
480
+ if pre[-1] not in not_connect_chars or next.startswith('ها') or pre[-1] in ['ه'] or pre + nim_fasele + next in self.vocab:
481
+ out = pre + nim_fasele + next
482
+ if self.verb_to_formal_func(next):
483
+ out = pre + ' ' + next
484
+ return out
485
+
486
+ def handle_nim_fasele_words(self, word, pos):
487
+ def extract_lemma_nim_fasele_words(word, pos):
488
+ formal_prefixs = []
489
+ formal_postfixs = []
490
+ prefixs = {'اون': 'آن', 'همون': 'همین'}
491
+ postfixs = self.postfix_mapper
492
+ tokens = word.split('‌')
493
+ index = 0
494
+ for i in range(len(tokens)):
495
+ index = i
496
+ if tokens[i] not in prefixs:
497
+ break
498
+ else:
499
+ formal_prefixs.append(prefixs[tokens[i]])
500
+
501
+ for i in range(len(tokens), index, -1):
502
+ current_tok = '‌'.join(tokens[index:i])
503
+ if current_tok in self.vocab or tokens[i - 1] not in postfixs:
504
+ return formal_prefixs, current_tok, formal_postfixs
505
+ else:
506
+ formal_postfixs.append(postfixs[tokens[i - 1]])
507
+ return formal_prefixs, current_tok, formal_postfixs
508
+ nim_fasele = '‌'
509
+ candidates = []
510
+ formal_word = ''
511
+ verbs = self.verb_to_formal_func(word)
512
+ if verbs:
513
+ return [(v, v) for v in verbs]
514
+ all_candidates = set()
515
+ # lemma
516
+ formal_prefixs, lemma, formal_postfixs = extract_lemma_nim_fasele_words(word, pos)
517
+ word_lemmas = self.transform(lemma, pos, ignore_nim_fasele=True)
518
+ # a lemma that takes postfixes should be a single token (len == 1)
519
+ one_token_words = [wl for wl in word_lemmas if len(wl[0].split()) == 1]
520
+ if formal_postfixs and one_token_words:
521
+ all_formal_lemma_candidates = one_token_words
522
+ else:
523
+ all_formal_lemma_candidates = word_lemmas
524
+ if not all_formal_lemma_candidates:
525
+ if formal_postfixs or formal_prefixs:
526
+ all_formal_lemma_candidates = [(lemma, lemma)]
527
+ else:
528
+ tokens = lemma.split(nim_fasele)
529
+ if all(self.transform(t, None, ignore_nim_fasele=True) for t in tokens):
530
+ w = ' '.join(tokens)
531
+ return [(w, w)]
532
+ else:
533
+ return []
534
+ for cnd_lemma, formal_word_lemma in all_formal_lemma_candidates:
535
+ formal_word = ''
536
+ toks = formal_prefixs + [cnd_lemma] + formal_postfixs
537
+ for index, t in enumerate(toks):
538
+ formal_word = self.concatenate_formal_words(formal_word, t)
539
+ all_candidates.add((formal_word, formal_word_lemma))
540
+ # if t in self.postfix_mapper:
541
+ # formal_t = self.postfix_mapper[t]
542
+ # else:
543
+ # transform_outputs = self.transform(t, pos)
544
+ # if not transform_outputs:
545
+ # formal_t = t
546
+ # else:
547
+ # one_word_outputs = [ft for ft in transform_outputs if len(ft.split()) == 1]
548
+ # if one_word_outputs:
549
+ # if t in one_word_outputs:
550
+ # formal_t = t
551
+ # else:
552
+ # formal_t = one_word_outputs[0]
553
+ # else:
554
+ # formal_t = transform_outputs.pop()
555
+ return all_candidates
556
+
557
+
558
+
559
+ def transform(self, word, pos, ignore_nim_fasele=False):
560
+ """ignore emoji , punctuation, numbers"""
561
+ ignore_chars = '.1234567890!@#$%^&*()_+۱۲۳۴۵۶۷۸۹÷؟×−+?><}،,{":' + string.ascii_lowercase + string.ascii_uppercase
562
+ if any(ic in word for ic in ignore_chars) or utils.if_emoji(word):
563
+ return [(word, word)]
564
+ """handle nim fasele"""
565
+ nim_fasele = '‌'
566
+ if not ignore_nim_fasele and nim_fasele in word:
567
+ return self.handle_nim_fasele_words(word, pos)
568
+ # pass ignore words and accept as correct informal word!
569
+ if word in self.ignore_words and not word in self.mapper:
570
+ return [(word, word)]
571
+ formal_prefix_obj = self.prefix_obj(word)
572
+ if formal_prefix_obj:
573
+ return formal_prefix_obj
574
+ iword = InformalWord(lemma=word, pos=pos)
575
+ expanded_candidates = []
576
+ candidates = self.parse_word(iword)
577
+ #just verbs
578
+ if any(c.is_verb for c in candidates):
579
+ candidates = [c for c in candidates if c.is_verb]
580
+ for cnd in candidates:
581
+ expanded_candidates.extend(self.get_expand(cnd))
582
+ word_iwords = []
583
+ for ec in expanded_candidates:
584
+ word_iwords.extend(self.to_formals(ec))
585
+ if any(f[1] and (f[1].is_mapper or f[1].is_verb) for f in word_iwords if f[1] is not None):
586
+ word_iwords = [f for f in word_iwords if f[1] and (f[1].is_mapper or f[1].is_verb)]
587
+ # else:
588
+ word_lemmas_set = [(w, iword.lemma) for w, iword in word_iwords if iword is not None]
589
+ word_lemmas_set = set(word_lemmas_set)
590
+ out = self.filter_results(word_lemmas_set)
591
+ # if type(out) == str:
592
+ # out = [out]
593
+ # out = set(out)
594
+ return out
595
+
596
+ if __name__ == '__main__':
597
+ transformer = OneShotTransformer(set(), {}, lambda w: None, set(), {}, set(), set())  # minimal stand-in resources; match_postfixs does not use them
598
+ candidates = transformer.match_postfixs('کارامم', pos=None)
599
+ print(candidates)
600
+
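A minimal usage sketch for OneShotTransformer (not part of the committed files): the resources below are hypothetical stand-ins, and it assumes the utils helpers added in this commit (powerset, if_emoji) are importable. In the real app the vocabulary, mapper and word lists come from the downloaded assets.pkl, and verb_to_formal_func is VerbHandler.informal_to_formal.

    # Sketch only: hand-built resources instead of the real assets.pkl contents.
    from OneShotTransformer import OneShotTransformer

    vocab = {'کتاب', 'خانه'}            # hypothetical vocabulary
    mapper = {'اون': ['آن']}             # hypothetical informal -> formal lemma mapper
    no_verb = lambda w: None             # stand-in for VerbHandler.informal_to_formal

    transformer = OneShotTransformer(
        vocab, mapper, no_verb,
        ignore_words=set(), postfix_mapper={},
        isolated_words=set(), non_hidden_h_words=set(),
    )
    # transform() returns (formal candidate, lemma) pairs for a single token.
    print(transformer.transform('کتابم', pos=None))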
VerbHandler.py ADDED
@@ -0,0 +1,350 @@
1
+ import re
2
+ from enum import Enum
3
+ from hazm import Normalizer
4
+ import pandas as pd
5
+
6
+
7
+ Formality = Enum('Formality', 'formal informal')
8
+ VerbTime = Enum('VerbTime', 'past present future')
9
+ Person = Enum('Person', 'Man To An Ma Shoma Anha')
10
+ Number = Enum('Number', 'Mofrad Jam')
11
+ class Verb:
12
+ def __init__(self, root, formality, time, pp, person, number):
13
+ self.root = root
14
+ self.formality = formality
15
+ self.time = time
16
+ self.person = person
17
+ self.number = number
18
+ self.pp = pp
19
+
20
+ class VerbHandler():
21
+ def __init__(self, csv_verb_addr, csv_irregular_verbs_mapper):
22
+ self.posfix_mapper = {'ه': 'د', 'ن': 'ند', 'ین': 'ید'}
23
+ self.objective_pr_mapper = {'شون':'شان', 'تون':'تان', 'مون':'مان'}
24
+ self.init_mapper = {'کنه': 'بکنه', 'کنم':'بکنم', 'کنی':'بکنی', 'کنیم': 'بکنیم', 'کنین':'بکنین', 'کنید':'بکنید', 'کنن':'بکنن', 'کنند':'بکنند'}
25
+ self.out_mapper = {'می‌ایی': 'می‌آیی'}
26
+ self.init_mapper.update({'شم':'بشم', 'شی':'بشی', 'شن':'بشن', 'شین':'بشین' ,'شه':'بشه', 'شیم': 'بشیم'})
27
+ self.bons = self.load_bons(csv_verb_addr)
28
+ self.irregular_verbs = self.load_irregular_mapper(csv_irregular_verbs_mapper)
29
+ self.informal_past_bons = self.get_bons(type=Formality.informal, time=VerbTime.past)
30
+ self.informal_present_bons = self.get_bons(type=Formality.informal, time=VerbTime.present)
31
+
32
+ self.formal_past_bons = self.get_bons(type=Formality.formal, time=VerbTime.past)
33
+ self.formal_present_bons =self.get_bons(type=Formality.formal, time=VerbTime.present) + ['هست']
34
+ self.all_past_bons = self.formal_past_bons + self.informal_past_bons
35
+ self.all_present_bons = self.formal_present_bons + self.informal_present_bons
36
+ self.verb_mapper = {b:{'formal':self.bons[b]['formal']} for b in self.bons if self.bons[b]['type'] == Formality.informal}
37
+ self.solve_alef_issue()
38
+ self.compile_patterns()
39
+
40
+
41
+ def load_irregular_mapper(self, csv_addr):
42
+ df = pd.read_csv(csv_addr)
43
+ mapper = {informal: formal for _, (informal, formal) in df.iterrows()}
44
+ return mapper
45
+
46
+ def load_bons(self, csv_addr):
47
+ normalizer = Normalizer()
48
+ df = pd.read_csv(csv_addr)
49
+ df = df.fillna('')
50
+ bons = {}
51
+ for i, row in df.iterrows():
52
+ if row[2]:
53
+ row[2] = normalizer.normalize(row[2])
54
+ bons[row[2]] = {'type': Formality.formal, 'time': VerbTime.past}
55
+ if row[3]:
56
+ row[3] = normalizer.normalize(row[3])
57
+ bons[row[3]] = {'type': Formality.formal, 'time': VerbTime.present}
58
+ if row[10]:
59
+ bs = row[10].split()
60
+ for b in bs:
61
+ bons[b] = {'type': Formality.informal, 'time': VerbTime.past, 'formal': row[2]}
62
+ if row[11]:
63
+ bs = row[11].split()
64
+ for b in bs:
65
+ bons[b] = {'type': Formality.informal, 'time': VerbTime.present, 'formal': row[3]}
66
+ return bons
67
+
68
+ def get_bons(self, type, time):
69
+ return [b for b in self.bons if self.bons[b]['type'] == type and self.bons[b]['time'] == time]
70
+
71
+ def solve_alef_issue(self):
72
+ replace_alef_y = lambda v : 'ی' + v[1:]
73
+ replace_A_YA = lambda v : 'یا' + v[1:]
74
+ informal_past_start_with_alef = list(map(replace_alef_y, [v for v in self.informal_past_bons if v.startswith('ا') and not v.startswith('ای')]))
75
+ formal_past_start_with_alef = list(map(replace_alef_y, [v for v in self.formal_past_bons if v.startswith('ا') and not v.startswith('ای')]))
76
+ informal_present_start_with_alef = list(map(replace_alef_y, [v for v in self.informal_present_bons if v.startswith('ا') and not v.startswith('ای')]))
77
+ formal_present_start_with_alef = list(map(replace_alef_y, [v for v in self.formal_present_bons if v.startswith('ا') and not v.startswith('ای')]))
78
+ self.alef_mapper = {}
79
+ self.informal_past_start_with_alef = informal_past_start_with_alef + list(
80
+ map(replace_A_YA, [v for v in self.informal_past_bons if v.startswith('آ')]))
81
+ self.informal_present_start_with_alef = informal_present_start_with_alef + list(
82
+ map(replace_A_YA, [v for v in self.informal_present_bons if v.startswith('آ')]))
83
+ self.formal_past_start_with_alef = formal_past_start_with_alef + list(
84
+ map(replace_A_YA, [v for v in self.formal_past_bons if v.startswith('آ')]))
85
+ self.formal_present_start_with_alef = formal_present_start_with_alef + list(
86
+ map(replace_A_YA, [v for v in self.formal_present_bons if v.startswith('آ')]))
87
+ for verb in self.informal_past_start_with_alef + self.informal_present_start_with_alef + self.formal_past_start_with_alef + self.formal_present_start_with_alef:
88
+ if verb[:2] == 'یا':
89
+ origin = 'آ' + verb[2:]
90
+ else:
91
+ origin = 'ا' + verb[1:]
92
+ self.alef_mapper[verb] = origin
93
+ self.alef_mapper['یای'] = 'آی'
94
+ remove_a_hat = lambda w: w.replace('آ', 'ا')
95
+ self.formal_past_bons = list(
96
+ filter(lambda w: w != '', map(remove_a_hat, self.formal_past_bons + self.formal_past_start_with_alef)))
97
+ self.formal_present_bons = list(map(remove_a_hat, self.formal_present_bons + self.formal_present_start_with_alef)) + [
98
+ 'یای'] + ['آی']
99
+ self.informal_past_bons = list(
100
+ filter(lambda w: w != '', map(remove_a_hat, self.informal_past_bons + self.informal_past_start_with_alef)))
101
+ self.informal_present_bons = list(
102
+ map(remove_a_hat, self.informal_present_bons + self.informal_present_start_with_alef)) + [
103
+ 'یای'] + ['آی']
104
+ # sorted by length
105
+ self.formal_present_bons = sorted(self.formal_present_bons, key=lambda w: -len(w))
106
+ self.formal_past_bons = sorted(self.formal_past_bons, key=lambda w: -len(w))
107
+ self.informal_present_bons = sorted(self.informal_present_bons, key=lambda w: -len(w))
108
+ self.informal_past_bons = sorted(self.informal_past_bons, key=lambda w: -len(w))
109
+ verb_v_keys = [word for word in self.verb_mapper if 'آ' in word]
110
+ alef_verb_v_keys = [word for word in self.alef_mapper if 'آ' in word]
111
+ for v in verb_v_keys:
112
+ self.verb_mapper[v.replace('آ', 'ا')] = self.verb_mapper[v]
113
+ for v in alef_verb_v_keys:
114
+ self.alef_mapper[v.replace('آ', 'ا')] = self.alef_mapper[v]
115
+
116
+
117
+ def compile_patterns(self):
118
+ ME_r = '|'.join(['می','می‌'])
119
+ B_r = 'ب'
120
+ not_r = 'ن'
121
+ past_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ید', 'ند', '', 'ن']
122
+ present_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ن', 'ید', 'ند', 'د', '']
123
+ naghli_ends = ['ه‌ام', 'ه‌ای', 'ه', 'ه‌ایم', 'ه‌اید', 'ه‌اند']
124
+ objective_pronouns = ['م', 'ت', 'ش', 'مون', 'تون', 'شون']
125
+
126
+ informal_past_r = '|'.join(self.informal_past_bons)
127
+ formal_past_r = '|'.join(self.formal_past_bons)
128
+ informal_present_r = '|'.join(self.informal_present_bons)
129
+ formal_present_r = '|'.join(self.formal_present_bons)
130
+ verb_postfix_past_r = '|'.join(past_ends)
131
+ verb_postfix__present_r = '|'.join(present_ends)
132
+ objective_pronouns_r = '|'.join(objective_pronouns)
133
+ naghli_ends_r = '|'.join(naghli_ends)
134
+ """
135
+ #گذشته‌ی ساده
136
+ # r1 = past_r + verb_postfix_r + objectiveـpronouns_r
137
+ #گذشته‌ی ناتمام
138
+ # r2 = '(' + ME + ')'+ past_r +verb_postfix_r + objectiveـpronouns_r
139
+
140
+ #گذشته‌ی استمراری
141
+ # r3 = '(' + DASHT + ')'+ past_r + verb_postfix_r +objectiveـpronouns_r
142
+
143
+ #گذشته‌ی نقلی
144
+ # r4 = past_r + '(' + '|'.join(naghli_ends) + ')' +objectiveـpronouns_r
145
+
146
+ #گذشته‌ی پیشین
147
+ # r5 = past_r + verb_postfix_r + '(' + BUD + ')' + verb_postfix_r + objectiveـpronouns_r
148
+
149
+ #حال ساده
150
+ # r6 = present_r + verb_postfix_r
151
+
152
+ #حال ناتمام
153
+ # r7 = '(' + ME + ')'+ present_r + verb_postfix_r + objectiveـpronouns_r
154
+
155
+ #حال استمراری
156
+ # r8 = '( ' + DAR + ')'+ verb_postfix_r + '(' + ME + ')' + present_r + verb_postfix_r+ objectiveـpronouns_r
157
+
158
+ #آینده‌ی ساده
159
+ # r9 = '( ' + KHAH + ')'+ verb_postfix_r + present_r +objectiveـpronouns_r
160
+
161
+ #التزامی - گذشته
162
+ # r10 = present_r + '(ه)'+ '(' + BASH + ')' + verb_postfix_r + objectiveـpronouns_r
163
+
164
+ #التزامی - حال
165
+ # r11 = '(ب)' + present_r + verb_postfix_r +objectiveـpronouns_r
166
+ """
167
+ #
168
+ # + : formal verb stems combined with informal suffixes, plus simple present for certain verbs (hast, kon)
169
+ # formal
170
+ formal_present_pattern_b = '({})({})({})?({})?$'.format(B_r, formal_present_r, verb_postfix__present_r, objective_pronouns_r)
171
+ formal_present_pattern_n_me = '({})?({})({})({})?({})?$'.format(not_r, ME_r, formal_present_r, verb_postfix__present_r, objective_pronouns_r)
172
+ formal_present_pattern_n = '({})?({})({})?({})?$'.format(not_r, formal_present_r, verb_postfix__present_r, objective_pronouns_r)
173
+ formal_past_pattern = '({})?({})?({})({}|{})({})?$'.format(not_r, ME_r, formal_past_r, naghli_ends_r,
174
+ verb_postfix_past_r, objective_pronouns_r)
175
+ self.formal_past_verb_pattern = re.compile(formal_past_pattern)
176
+ self.formal_present_verb_pattern_b = re.compile(formal_present_pattern_b)
177
+ self.formal_present_verb_pattern_n_me = re.compile(formal_present_pattern_n_me)
178
+ self.formal_present_verb_pattern_n = re.compile(formal_present_pattern_n)
179
+
180
+ #informal
181
+ informal_present_pattern_b = '({})({})({})?({})?$'.format(B_r, informal_present_r, verb_postfix__present_r,
182
+ objective_pronouns_r)
183
+ informal_present_pattern_n_me = '({})?({})({})({})?({})?$'.format(not_r, ME_r, informal_present_r,
184
+ verb_postfix__present_r, objective_pronouns_r)
185
+ informal_present_pattern_n = '({})?({})({})?({})?$'.format(not_r, informal_present_r, verb_postfix__present_r,
186
+ objective_pronouns_r)
187
+ informal_past_pattern = '({})?({})?({})({}|{})({})?$'.format(not_r, ME_r, informal_past_r, naghli_ends_r,
188
+ verb_postfix_past_r, objective_pronouns_r)
189
+ self.informal_past_verb_pattern = re.compile(informal_past_pattern)
190
+ self.informal_present_verb_pattern_b = re.compile(informal_present_pattern_b)
191
+ self.informal_present_verb_pattern_n_me = re.compile(informal_present_pattern_n_me)
192
+ self.informal_present_verb_pattern_n = re.compile(informal_present_pattern_n)
193
+
194
+
195
+ def parse(self, token):
196
+ outputs = []
197
+
198
+ match_dict_formal = {'tense': '', 'root': '', 'neg': '', 'postfix': '', 'not_r': '', 'op': '', 'b': '', 'me': '', 'naghli':''}
199
+ match_dict_informal = {'tense': '', 'root': '', 'neg': '', 'postfix': '', 'not_r': '', 'op': '', 'b': '', 'me': '', 'naghli':''}
200
+ formal_past_match = self.formal_past_verb_pattern.match(token)
201
+ informal_past_match = self.informal_past_verb_pattern.match(token)
202
+ formal_present_match_b = self.formal_present_verb_pattern_b.match(token)
203
+ informal_present_match_b = self.informal_present_verb_pattern_b.match(token)
204
+ formal_present_match_n_me = self.formal_present_verb_pattern_n_me.match(token)
205
+ informal_present_match_n_me = self.informal_present_verb_pattern_n_me.match(token)
206
+ formal_present_match_n = self.formal_present_verb_pattern_n.match(token)
207
+ informal_present_match_n = self.informal_present_verb_pattern_n.match(token)
208
+ present_group_to_dict_b = lambda g: {k:g[i] for i,k in enumerate(['b', 'root', 'postfix', 'op'])}
209
+ present_group_to_dict_n_me = lambda g: {k:g[i] for i,k in enumerate(['neg', 'me','root', 'postfix','op'])}
210
+ present_group_to_dict_n = lambda g: {k:g[i] for i,k in enumerate(['neg','root', 'postfix','op'])}
211
+ past_group_to_dict = lambda g: {k:g[i] for i,k in enumerate(['neg', 'me', 'root', 'postfix', 'op'])}
212
+ formal_match = formal_past_match or formal_present_match_b or formal_present_match_n_me or formal_present_match_n
213
+ informal_match = informal_past_match or informal_present_match_b or informal_present_match_n_me or informal_present_match_n
214
+ if formal_match:
215
+ if formal_past_match:
216
+ match_dict_formal = past_group_to_dict(formal_past_match.groups())
217
+ match_dict_formal['tense'] = 'past'
218
+ else:
219
+ if formal_present_match_b:
220
+ match_dict_formal = present_group_to_dict_b(formal_present_match_b.groups())
221
+ elif formal_present_match_n_me:
222
+ match_dict_formal = present_group_to_dict_n_me(formal_present_match_n_me.groups())
223
+ elif formal_present_match_n:
224
+ match_dict_formal = present_group_to_dict_n(formal_present_match_n.groups())
225
+ match_dict_formal['tense'] = 'present'
226
+ outputs.append(match_dict_formal)
227
+ if informal_match:
228
+ if informal_past_match:
229
+ match_dict_informal = past_group_to_dict(informal_past_match.groups())
230
+ match_dict_informal['tense'] = 'past'
231
+ else:
232
+ if informal_present_match_b:
233
+ match_dict_informal = present_group_to_dict_b(informal_present_match_b.groups())
234
+ elif informal_present_match_n_me:
235
+ match_dict_informal = present_group_to_dict_n_me(informal_present_match_n_me.groups())
236
+ elif informal_present_match_n:
237
+ match_dict_informal = present_group_to_dict_n(informal_present_match_n.groups())
238
+ match_dict_informal['tense'] = 'present'
239
+ outputs.append(match_dict_informal)
240
+ for match_dict in outputs:
241
+ for key,val in match_dict.items():
242
+ if val is None:
243
+ match_dict[key] = ''
244
+ # print(match_dict)
245
+ return outputs
246
+
247
+ def formal_concatenate(self, match_dict, should_smooth):
248
+ out_dict = {'بیای': 'بیا', 'نیای': 'نیا'}
249
+ if match_dict['root'] == 'است' and match_dict['neg'] != '':
250
+ return 'نیست' + match_dict['postfix']
251
+ if self.if_simple_present(match_dict) or self.if_only_me(match_dict):
252
+ return None
253
+ if should_smooth:
254
+ if match_dict['prefix'] != '' and match_dict['prefix'][0] == 'م':
255
+ pass
256
+ else:
257
+ match_dict['root'] = 'یا' + match_dict['root'][1:]
258
+ # if len(match_dict['prefix']) == 3:
259
+ # match_dict['prefix'] = 'می'
260
+ if match_dict['prefix'] == 'ب' and match_dict['root'] and match_dict['root'][0] == 'ا':
261
+ match_dict['root'] = 'ی' + match_dict['root'][1:]
262
+ out = match_dict['neg'] + match_dict['prefix'] + match_dict['root'] + match_dict['postfix'] + match_dict['op']
263
+ if out in out_dict:
264
+ out = out_dict[out]
265
+
266
+ return out
267
+
268
+ def _set_match_dict_prefix(self, match_dict):
269
+ match_dict['prefix'] = ''
270
+ if 'me' in match_dict and match_dict['me'] != '':
271
+ if len(match_dict['me']) < 3:
272
+ match_dict['me'] = 'می‌'
273
+ match_dict['prefix'] = match_dict['me']
274
+ elif 'b' in match_dict and match_dict['b'] != '':
275
+ match_dict['prefix'] = match_dict['b']
276
+ return match_dict
277
+
278
+ def if_simple_present(self, match_dict):
279
+ if match_dict['root'] != '' and match_dict['tense'] == 'present' and match_dict['prefix'] == '' and match_dict['neg'] == '':
280
+ if match_dict['root'] not in ['کن', 'هست', 'است', 'دار', 'نیست', 'باش']:
281
+ return True
282
+ return False
283
+
284
+ def if_only_me(self, match_dict):
285
+ if match_dict['root'] != '' and match_dict['tense'] == 'present' and match_dict['prefix'] !='' and match_dict['prefix'][0] == 'م' and match_dict['postfix'] == '':
286
+ return True
287
+ return False
288
+
289
+ def is_masdar(self, match_dict):
290
+ return match_dict['root'] in self.all_past_bons and match_dict['me'] == '' and match_dict['postfix'] =='ن' and match_dict['op'] == ''
291
+
292
+ def informal_to_formal(self, token):
293
+ # irregular verbs checking
294
+ if token in self.irregular_verbs:
295
+ return [self.irregular_verbs[token]]
296
+ if token in self.init_mapper:
297
+ token = self.init_mapper[token]
298
+ outputs = []
299
+ if len(token) < 3:
300
+ return None
301
+ should_smooth = False
302
+ all_match_dicts = self.parse(token)
303
+
304
+ ### بدهدم
305
+ #برد
306
+ if len(all_match_dicts) == 2 :
307
+ if all_match_dicts[1]['root'] in self.verb_mapper and self.verb_mapper[all_match_dicts[1]['root']]['formal'] == all_match_dicts[0]['root'] and all_match_dicts[1]['op'] != '':
308
+ del all_match_dicts[1]
309
+ elif all_match_dicts[1] == {'b': 'ب', 'root': 'ر', 'postfix': 'د', 'op': '', 'tense': 'present'}:
310
+ del all_match_dicts[1]
311
+ ##
312
+ is_masdar = False
313
+ for match_dict in all_match_dicts:
314
+ if self.is_masdar(match_dict):
315
+ is_masdar = True
316
+ #نان بان
317
+ if match_dict['root'] != '' and match_dict['root'][0] == 'ا' and 'me' not in match_dict and ('b' in match_dict or match_dict['neg'] == 'ن'):
318
+ return None
319
+ if match_dict['root'] != '':
320
+ root = match_dict['root']
321
+ objective_pr = match_dict['op']
322
+ postfix = match_dict['postfix']
323
+ if root in self.alef_mapper:
324
+ should_smooth = True
325
+ match_dict['root'] = self.alef_mapper[root]
326
+ if match_dict['root'] in self.verb_mapper:
327
+ match_dict['root'] = self.verb_mapper[ match_dict['root']]['formal']
328
+ if postfix in self.posfix_mapper:
329
+ match_dict['postfix'] = self.posfix_mapper[postfix]
330
+ if match_dict['postfix'] == 'د' and match_dict['tense'] == 'past':
331
+ match_dict['postfix'] = 'ه'
332
+ if objective_pr in self.objective_pr_mapper:
333
+ match_dict['op'] = self.objective_pr_mapper[objective_pr]
334
+ match_dict['prefix'] = ''
335
+ if 'neg' not in match_dict:
336
+ match_dict['neg'] = ''
337
+ match_dict = self._set_match_dict_prefix(match_dict)
338
+ formal_verb = self.formal_concatenate(match_dict, should_smooth)
339
+ outputs.append(formal_verb)
340
+ not_none_outpts = [o for o in outputs if o is not None]
341
+ for index, item in enumerate(not_none_outpts):
342
+ if item in self.out_mapper:
343
+ not_none_outpts[index] = self.out_mapper[item]
344
+ if not_none_outpts:
345
+ # append bon
346
+ if len(not_none_outpts) == 1 and is_masdar:
347
+ masdar = not_none_outpts[0][:-2] + 'ن'
348
+ not_none_outpts.append(masdar)
349
+ return not_none_outpts
350
+ return None
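A short sketch of using VerbHandler on its own (not part of the committed files); it assumes verbs.csv and irregular_verb_mapper.csv from config.yml have already been downloaded into the app's cache directory.

    import os
    from pathlib import Path
    from VerbHandler import VerbHandler

    cache_dir = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
    handler = VerbHandler(
        csv_verb_addr=os.path.join(cache_dir, 'verbs.csv'),
        csv_irregular_verbs_mapper=os.path.join(cache_dir, 'irregular_verb_mapper.csv'),
    )
    # informal_to_formal returns a list of formal candidates, or None if no verb pattern matches.
    print(handler.informal_to_formal('میرم'))   # illustrative; actual output depends on verbs.csv
    print(handler.informal_to_formal('کتاب'))   # likely None for a non-verb token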
app.py ADDED
@@ -0,0 +1,128 @@
1
+ import streamlit as st
2
+ import os
3
+ import itertools
5
+ from pathlib import Path
6
+ import yaml
7
+ from download_utils import download_dataset
8
+ import utils
9
+ from formality_transformer import FormalityTransformer
10
+ from hazm import SentenceTokenizer
11
+
12
+
13
+ def translate_short_sent(model, sent):
14
+ out_dict = {}
15
+ txt = utils.cleanify(sent)
16
+ is_valid = lambda w: model.oneshot_transformer.transform(w, None)
17
+ cnd_tokens = model.informal_tokenizer.tokenize(txt, is_valid)
18
+ for tokens in cnd_tokens:
19
+ tokens = [t for t in tokens if t != '']
20
+ new_tokens = []
21
+ for t in tokens:
22
+ new_tokens.extend(t.split())
23
+ txt = ' '.join(new_tokens)
24
+ tokens = txt.split()
25
+ candidates = []
26
+ for index in range(len(tokens)):
27
+ tok = tokens[index]
28
+ cnd = set()
29
+ pos = None
30
+ if model.verb_handler.informal_to_formal(tok):
31
+ pos = 'VERB'
32
+ f_words_lemma = model.oneshot_transformer.transform(tok, pos)
33
+ f_words_lemma = list(f_words_lemma)
34
+ for index, (word, lemma) in enumerate(f_words_lemma):
35
+ if pos != 'VERB' and tok not in model.mapper and model.should_filtered_by_one_bigram(lemma, word, tok):
36
+ f_words_lemma[index] = (tok, tok)
37
+ else:
38
+ word_toks = word.split()
39
+ word_repr = ''
40
+ for t in word_toks:
41
+ word_repr += ' ' + t
42
+ word_repr = word_repr.strip()
43
+ word_repr = model.repalce_for_gpt2(word_repr)
44
+ f_words_lemma[index] = (word, word_repr)
45
+ if f_words_lemma:
46
+ cnd.update(f_words_lemma)
47
+ else:
48
+ cnd = {(tok, tok)}
49
+ candidates.append(cnd)
50
+ all_combinations = itertools.product(*candidates)
51
+ all_combinations_list = list(all_combinations)
52
+ for id, cnd in enumerate(all_combinations_list):
53
+ normal_seq = ' '.join([c[0] for c in cnd])
54
+ lemma_seq = ' '.join([c[1] for c in cnd])
55
+ lemma_seq = utils.clean_text_for_lm(lemma_seq)
56
+ out_dict[id] = (normal_seq, lemma_seq)
57
+ candidates = [[item[0] for item in candidate_phrases] for candidate_phrases in candidates]
58
+ return model.lm_obj.get_best(candidates)
59
+
60
+
61
+ def translate(model, sentence_tokenizer, txt):
62
+ sents = sentence_tokenizer.tokenize(txt)
63
+ formal_output = ''
64
+ for sentence in sents:
65
+ formal_sentence = translate_short_sent(model, sentence)
66
+ formal_output += ' ' + formal_sentence
67
+ return formal_output
68
+
69
+
70
+ class Informal2Formal:
71
+ def __init__(self) -> None:
72
+ #download or load files
73
+ DEFAULT_CACHE_DIR = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
74
+ config = load_config('dadmatools/informal2formal/config.yml')
75
+ file_urls = config['files'].values()
76
+ download_dataset(file_urls, DEFAULT_CACHE_DIR, filename=None)
77
+
78
+ # set assets files address
79
+ verbs_csv_addr = os.path.join(DEFAULT_CACHE_DIR, 'verbs.csv')
80
+ irregular_verbs_mapper = os.path.join(DEFAULT_CACHE_DIR, 'irregular_verb_mapper.csv')
81
+ lm_addr = os.path.join(DEFAULT_CACHE_DIR,'3gram.bin')
82
+ assets_file_addr = os.path.join(DEFAULT_CACHE_DIR,'assets.pkl')
83
+ self.sentence_tokenizer = SentenceTokenizer()
84
+ self.model = FormalityTransformer(asset_file_addr=assets_file_addr,
85
+ irregular_verbs_mapper_addr=irregular_verbs_mapper, verbs_csv_addr=verbs_csv_addr, lm_addr=lm_addr)
86
+
87
+
88
+ def load_config(config_file):
89
+ with open(config_file, "r") as file:
90
+ config = yaml.safe_load(file)
91
+ return config
92
+
93
+
94
+ # st.cache(suppress_st_warning=True, allow_output_mutation=True)  # no-op as a bare statement; kept disabled like the decorator below
95
+ st.set_page_config(page_title="Persian Informal to formal translator")
96
+
97
+
98
+ # @st.cache(suppress_st_warning=True, allow_output_mutation=True)
99
+ def load_model():
100
+ DEFAULT_CACHE_DIR = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
101
+ config = load_config('config.yml')
102
+ file_urls = config['files'].values()
103
+ download_dataset(file_urls, DEFAULT_CACHE_DIR, filename=None)
104
+ # set assets files address
105
+ verbs_csv_addr = os.path.join(DEFAULT_CACHE_DIR, 'verbs.csv')
106
+ irregular_verbs_mapper = os.path.join(DEFAULT_CACHE_DIR, 'irregular_verb_mapper.csv')
107
+ lm_addr = os.path.join(DEFAULT_CACHE_DIR,'3gram.bin')
108
+ assets_file_addr = os.path.join(DEFAULT_CACHE_DIR,'assets.pkl')
109
+ model = FormalityTransformer(asset_file_addr=assets_file_addr,
110
+ irregular_verbs_mapper_addr=irregular_verbs_mapper, verbs_csv_addr=verbs_csv_addr, lm_addr=lm_addr)
111
+ return model
112
+ st.title("Persian/Farsi Formality Transformer")
113
+ st.write("Translate informal Persian texts to formal")
114
+
115
+
116
+
117
+ user_input: str = st.text_area(
118
+ "Input text",
119
+ height=200,
120
+ max_chars=5120,
121
+ )
122
+
123
+
124
+ if st.button("Run"):
125
+ model = load_model()
126
+ sentence_tokenizer = SentenceTokenizer()
127
+ translated_text = translate(model, sentence_tokenizer, user_input)
128
+ st.success(translated_text)
config.yml ADDED
@@ -0,0 +1,5 @@
1
+ files:
2
+ lm: https://huggingface.co/datasets/Dadmatech/informal2formal/resolve/main/3gram.bin
3
+ assets: https://huggingface.co/datasets/Dadmatech/informal2formal/resolve/main/assets.pkl
4
+ irregular_verb: https://huggingface.co/datasets/Dadmatech/informal2formal/raw/main/irregular_verb_mapper.csv
5
+ verbs: https://huggingface.co/datasets/Dadmatech/informal2formal/raw/main/verbs.csv
download_utils.py ADDED
@@ -0,0 +1,65 @@
1
+ import os
2
+ import sys
3
+
4
+ import requests
5
+ from tqdm import tqdm
6
+ def download_dataset(urls, dest_dir, filename=None):
7
+ # source_code: https://github.com/sirbowen78/lab/blob/master/file_handling/dl_file1.py
8
+ # Adapted from the script above, which originally downloaded a Python installer for macOS.
9
+
10
+ # Home directory of Mac, pathlib.Path module make this easy.
11
+ # home_path = Path.home()
12
+ # This is the sub directory under home directory.
13
+ # sub_path = "tmp"
14
+ # The header of the dl link has a Content-Length which is in bytes.
15
+ # The bytes is in string hence has to convert to integer.
16
+
17
+ os.makedirs(dest_dir, exist_ok=True)
18
+ for url in urls:
19
+ if 'drive.google' in url:
20
+ import gdown
21
+ # import os
22
+ # print('gdown downloadddd output: ', dest_dir )
23
+ # print(dest_dir, filename)
24
+ # dest_dir = os.path.join(dest_dir,'peyma.zip')
25
+ return gdown.download(url, quiet=False, output=filename)
26
+ try:
27
+ filesize = int(requests.head(url).headers["Content-Length"])
28
+ except KeyError:
29
+ print('unknown file length')
30
+ filesize = -1
31
+ # os.path.basename returns python-3.8.5-macosx10.9.pkg,
32
+ # without this module I will have to manually split the url by "/"
33
+ # then get the last index with -1.
34
+ # Example:
35
+ # url.split("/")[-1]
36
+ filename = os.path.basename(url)
37
+
38
+ # make the sub directory, exists_ok=True will not have exception if the sub dir does not exists.
39
+ # the dir will be created if not exists.
40
+ os.makedirs(dest_dir, exist_ok=True)
41
+
42
+ # The absolute path to download the python program to.
43
+ dl_path = os.path.join(dest_dir, filename)
44
+ chunk_size = 1024
45
+ if os.path.exists(dl_path):
46
+ print(f'file {dl_path} already exist')
47
+ return dl_path
48
+ # Use the requests.get with stream enable, with iter_content by chunk size,
49
+ # the contents will be written to the dl_path.
50
+ # tqdm tracks the progress by progress.update(datasize)
51
+ with requests.get(url, stream=True) as r, open(dl_path, "wb") as f, tqdm(
52
+ unit="B", # unit string to be displayed.
53
+ unit_scale=True, # let tqdm to determine the scale in kilo, mega..etc.
54
+ unit_divisor=1024, # is used when unit_scale is true
55
+ total=filesize, # the total iteration.
56
+ file=sys.stdout, # default goes to stderr, this is the display on console.
57
+ desc=filename # prefix to be displayed on progress bar.
58
+ ) as progress:
59
+ for chunk in r.iter_content(chunk_size=chunk_size):
60
+ # download the file chunk by chunk
61
+ datasize = f.write(chunk)
62
+ # on each chunk update the progress bar.
63
+ progress.update(datasize)
64
+
65
+ return True
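A short usage sketch (not part of the committed files) showing how download_dataset is driven by config.yml, mirroring load_model in app.py:

    import os
    from pathlib import Path
    import yaml
    from download_utils import download_dataset

    cache_dir = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
    with open('config.yml') as f:
        config = yaml.safe_load(f)
    # Downloads 3gram.bin, assets.pkl, irregular_verb_mapper.csv and verbs.csv; existing files are skipped.
    download_dataset(config['files'].values(), cache_dir, filename=None)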
formality_transformer.py ADDED
@@ -0,0 +1,40 @@
1
+
2
+ import pickle
3
+ from kenlm_wrapper import Kelm_Wrapper
4
+ from OneShotTransformer import OneShotTransformer
5
+ from VerbHandler import VerbHandler
6
+ import kenlm
7
+ from tokenizer import InformalTokenizer
8
+
9
+
10
+ class FormalityTransformer:
11
+ def __init__(self, asset_file_addr, verbs_csv_addr, irregular_verbs_mapper_addr, lm_addr ):
12
+ assets = pickle.load(open(asset_file_addr, 'rb'))
13
+ self.vocab = assets['vocab']
14
+ self.word_ends_tanvin = assets['word_ends_tanvin']
15
+ self.non_hidden_h_words = assets['non_hidden_h_words']
16
+ self.isolated_words = assets['isolated_words']
17
+ self.ignore_words = assets['ignore_words']
18
+ self.mapper = assets['mapper']
19
+ self.postfix_mapper = assets['postfix_mapper']
20
+ postfixes = assets['postfixes']
21
+
22
+ self.informal_tokenizer = InformalTokenizer(self.vocab, postfixes)
23
+ self.verb_handler = VerbHandler(csv_verb_addr=verbs_csv_addr, csv_irregular_verbs_mapper=irregular_verbs_mapper_addr)
24
+ self.oneshot_transformer = OneShotTransformer(self.vocab, self.mapper, self.verb_handler.informal_to_formal,
25
+ ignore_words=self.ignore_words,
26
+ postfix_mapper=self.postfix_mapper,
27
+ isolated_words=self.isolated_words,
28
+ non_hidden_h_words=self.non_hidden_h_words)
29
+ lm_model = kenlm.Model(lm_addr)
30
+ self.lm_obj = Kelm_Wrapper(lm_model)
31
+
32
+
33
+ def should_filtered_by_one_bigram(self, lemma, word, original_word):
34
+ NIM_FASELE = '‌'
35
+ return original_word in self.vocab and (len(word.split()) > 1 or NIM_FASELE in word)
36
+
37
+ def repalce_for_gpt2(self, word_repr):
38
+ if word_repr in self.word_ends_tanvin:
39
+ return word_repr[:-2] + 'ا'
40
+ return word_repr
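
To make the helper methods above concrete, here is a toy restatement of the rule should_filtered_by_one_bigram encodes; the vocabulary and words are invented for illustration, while the real vocab comes from assets.pkl.

NIM_FASELE = '\u200c'   # zero-width non-joiner
toy_vocab = {'کتاب'}    # stand-in for the vocab loaded from assets.pkl

def should_filter(candidate, original_word):
    # A formal candidate is dropped when the original token is already in the
    # vocabulary and the candidate spans several tokens or contains a ZWNJ.
    return original_word in toy_vocab and (len(candidate.split()) > 1 or NIM_FASELE in candidate)

print(should_filter('کتاب خوب', 'کتاب'))  # True  -> candidate is filtered out
print(should_filter('کتاب', 'کتاب'))      # False -> candidate is kept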
kenlm_wrapper.py ADDED
@@ -0,0 +1,31 @@
1
+
2
+ class Kelm_Wrapper:
3
+ def __init__(self, model):
4
+ self.model = model
5
+ def get_best_candidate_word(self, default_phrases, candidate_phrases, index):
6
+ candidate_texts = [' '.join(default_phrases[:index]) + ' ' + cnd + ' ' + ' '.join(default_phrases[index+1:]) for cnd in candidate_phrases]
7
+ scores = list(map(self.model.score, candidate_texts))
8
+ return scores.index(max(scores))
9
+
10
+
11
+ def get_best_ongram_phrases(self, candidates_list):
12
+ bests = []
13
+ for candidate_phrase in candidates_list:
14
+ scores = list(map(self.model.score, candidate_phrase))
15
+ best_phrase = candidate_phrase[scores.index(max(scores))]
16
+ bests.append(best_phrase)
17
+ return bests
18
+
19
+
20
+ def get_best(self, candidates_list):
21
+ bests = []
22
+ default_phrases = self.get_best_ongram_phrases(candidates_list)
23
+ # print(default_phrases)
24
+ for index in range(len(candidates_list)):
25
+ if len(candidates_list[index]) > 1:
26
+ best_phrase_index = self.get_best_candidate_word(default_phrases, candidates_list[index], index)
27
+ bests.append(candidates_list[index][best_phrase_index])
28
+ else:
29
+ bests.append(candidates_list[index][0])
30
+ return ' '.join(bests)
31
+
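
Kelm_Wrapper only needs an object exposing a score(text) method, so its selection logic can be sanity-checked without the trained 3-gram binary. A minimal sketch with a stub scorer (purely illustrative; not the real KenLM model):

from kenlm_wrapper import Kelm_Wrapper

class StubLM:
    def score(self, text):
        # Toy "log-probability": pretend longer strings are more likely.
        return len(text)

wrapper = Kelm_Wrapper(StubLM())
candidates = [['hello'], ['wrld', 'world'], ['!']]
print(wrapper.get_best(candidates))  # -> 'hello world !'

For each position with more than one candidate, get_best rescores the whole sentence with every alternative substituted in and keeps the highest-scoring one.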
main.py ADDED
@@ -0,0 +1,96 @@
1
+ import itertools
2
+ import os
3
+ from pathlib import Path
4
+ import yaml
5
+ from download_utils import download_dataset
6
+ import utils
7
+ from formality_transformer import FormalityTransformer
8
+ from hazm import SentenceTokenizer
9
+
10
+
11
+
12
+ def translate_short_sent(model, sent):
13
+ out_dict = {}
14
+ txt = utils.cleanify(sent)
15
+ is_valid = lambda w: model.oneshot_transformer.transform(w, None)
16
+ cnd_tokens = model.informal_tokenizer.tokenize(txt, is_valid)
17
+ for tokens in cnd_tokens:
18
+ tokens = [t for t in tokens if t != '']
19
+ new_tokens = []
20
+ for t in tokens:
21
+ new_tokens.extend(t.split())
22
+ txt = ' '.join(new_tokens)
23
+ tokens = txt.split()
24
+ candidates = []
25
+ for index in range(len(tokens)):
26
+ tok = tokens[index]
27
+ cnd = set()
28
+ pos = None
29
+ if model.verb_handler.informal_to_formal(tok):
30
+ pos = 'VERB'
31
+ f_words_lemma = model.oneshot_transformer.transform(tok, pos)
32
+ f_words_lemma = list(f_words_lemma)
33
+ for index, (word, lemma) in enumerate(f_words_lemma):
34
+ if pos != 'VERB' and tok not in model.mapper and model.should_filtered_by_one_bigram(lemma, word, tok):
35
+ f_words_lemma[index] = (tok, tok)
36
+ else:
37
+ word_toks = word.split()
38
+ word_repr = ''
39
+ for t in word_toks:
40
+ word_repr += ' ' + t
41
+ word_repr = word_repr.strip()
42
+ word_repr = model.repalce_for_gpt2(word_repr)
43
+ f_words_lemma[index] = (word, word_repr)
44
+ if f_words_lemma:
45
+ cnd.update(f_words_lemma)
46
+ else:
47
+ cnd = {(tok, tok)}
48
+ candidates.append(cnd)
49
+ all_combinations = itertools.product(*candidates)
50
+ all_combinations_list = list(all_combinations)
51
+ for id, cnd in enumerate(all_combinations_list):
52
+ normal_seq = ' '.join([c[0] for c in cnd])
53
+ lemma_seq = ' '.join([c[1] for c in cnd])
54
+ lemma_seq = utils.clean_text_for_lm(lemma_seq)
55
+ out_dict[id] = (normal_seq, lemma_seq)
56
+ candidates = [[item[0] for item in candidate_phrases] for candidate_phrases in candidates]
57
+ return model.lm_obj.get_best(candidates)
58
+
59
+
60
+ def translate(model, sentence_tokenizer, txt):
61
+ sents = sentence_tokenizer.tokenize(txt)
62
+ formal_output = ''
63
+ for sentence in sents:
64
+ formal_sentence = translate_short_sent(model, sentence)
65
+ formal_output += ' ' + formal_sentence
66
+ return formal_output
67
+
68
+ def load_config(config_file):
69
+ with open(config_file, "r") as file:
70
+ config = yaml.safe_load(file)
71
+ return config
72
+
73
+
74
+
75
+
76
+ if __name__ == '__main__':
77
+
78
+ #download or load files
79
+ DEFAULT_CACHE_DIR = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
80
+ config = load_config('config.yml')
81
+ file_urls = config['files'].values()
82
+ download_dataset(file_urls, DEFAULT_CACHE_DIR, filename=None)
83
+
84
+ # set assets files address
85
+ verbs_csv_addr = os.path.join(DEFAULT_CACHE_DIR, 'verbs.csv')
86
+ irregular_verbs_mapper = os.path.join(DEFAULT_CACHE_DIR, 'irregular_verb_mapper.csv')
87
+ lm_addr = os.path.join(DEFAULT_CACHE_DIR,'3gram.bin')
88
+ assets_file_addr = os.path.join(DEFAULT_CACHE_DIR,'assets.pkl')
89
+
90
+ #test on a sample
91
+ sentence_tokenizer = SentenceTokenizer()
92
+ model = FormalityTransformer(asset_file_addr=assets_file_addr,
93
+ irregular_verbs_mapper_addr=irregular_verbs_mapper, verbs_csv_addr=verbs_csv_addr, lm_addr=lm_addr)
94
+ print(translate(model, sentence_tokenizer, 'اینو میشه واسه تبدیل تموم جملات محاوره استفاده کرد اگه خواستین'))
95
+
96
+
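
config.yml itself is only five lines and is not shown in this view. Judging from how main.py consumes it (config['files'].values() plus the four asset file names above), it presumably holds a single files mapping; the snippet below parses an assumed layout with placeholder URLs purely to illustrate the expected shape.

import yaml

example_config = """
files:
  verbs: https://example.com/verbs.csv
  irregular_verb_mapper: https://example.com/irregular_verb_mapper.csv
  lm: https://example.com/3gram.bin
  assets: https://example.com/assets.pkl
"""
config = yaml.safe_load(example_config)
print(list(config['files'].values()))  # the URL list handed to download_dataset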
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ pandas
2
+ hazm
3
+ datasets
4
+ PyYAML
5
+ # kenlm  # already provided by the git+ line below; listing it twice can make pip fail with a double-requirement error
6
+ streamlit
7
+ git+https://github.com/kpu/kenlm@master#egg=kenlm
tokenizer.py ADDED
@@ -0,0 +1,184 @@
1
+ import itertools
2
+ import utils
3
+ class InformalTokenizer:
4
+ def __init__(self, vocab, postfixes):
5
+ self.vocab = vocab
6
+ self.pres = InformalTokenizer.get_prefixs()
7
+ self.posts = postfixes
8
+
9
+ @staticmethod
10
+ def get_prefixs():
11
+ return ['نا', 'بی', 'هر', 'می']
12
+
13
+ @staticmethod
14
+ def get_postfixs(informal_postfix_addr):
15
+ with open(informal_postfix_addr, 'r') as f:
16
+ ps = f.read().splitlines()
17
+ return ps
18
+
19
+
20
+ def is_pre_post_word(self, w):
21
+ nim_fasele = '‌'
22
+ ws = w.split(nim_fasele)
23
+ pre, pos, v = [0,1,2]
24
+ is_pre_pos = False
25
+ state = pre
26
+ valid_w = ''
27
+ for w in ws:
28
+ if state == pre:
29
+ if w in self.pres:
30
+ valid_w += nim_fasele + w
31
+ is_pre_pos = True
32
+ continue
33
+ elif w in self.posts:
34
+ valid_w += nim_fasele + w
35
+ is_pre_pos = True
36
+ state = pos
37
+ continue
38
+ state = v
39
+ valid_w += nim_fasele + w
40
+ continue
41
+
42
+ if state == pos:
43
+ if w in self.posts:
44
+ valid_w += nim_fasele + w
45
+ continue
46
+ return False
47
+ if state == v:
48
+ if w in self.posts:
49
+ is_pre_pos = True
50
+ state = pos
51
+ valid_w += nim_fasele + w
52
+ continue
53
+ if w in self.vocab:
54
+ valid_w += nim_fasele + w
55
+ if valid_w not in self.vocab:
56
+ return False
57
+ continue
58
+
59
+ return False
60
+ if not is_pre_pos:
61
+ return False
62
+ return True
63
+
64
+
65
+ def get_valid_word(self, words):
66
+ seps = ['', '‌']
67
+ all_seqs = []
68
+ count = len(words)
69
+ lst = list(itertools.product(seps, repeat=count-1))
70
+ for item in lst:
71
+ seq = ''
72
+ for word, sep in zip(words[:-1], item):
73
+ seq += word + sep
74
+ seq += words[-1]
75
+ all_seqs.append(seq)
76
+ return [w for w in all_seqs if w in self.vocab or self.is_pre_post_word(w)]
77
+
78
+ def get_candidates(self, tokens, index=0, current_seq = ' '):
79
+ if index == len(tokens):
80
+ return current_seq
81
+ word = tokens[index]
82
+ next_word, next_next_word = [None, None]
83
+ if index < len(tokens) -1:
84
+ next_word = tokens[index+1]
85
+ if index < len(tokens) -2:
86
+ next_next_word = tokens[index+2]
87
+ cnds = []
88
+ if next_word is not None:
89
+ v_words = self.get_valid_word([word, next_word])
90
+ if v_words:
91
+ for v_w in v_words:
92
+ current_seq1 = current_seq + ' ' + v_w
93
+ cnds2 = self.get_candidates(tokens,index+2, current_seq1)
94
+ if type(cnds2) == str:
95
+ cnds.append(cnds2)
96
+ else:
97
+ cnds.extend(cnds2)
98
+ if next_next_word is not None:
99
+ v_words = self.get_valid_word([word, next_word, next_next_word])
100
+ if v_words:
101
+ for v_w in v_words:
102
+ current_seq2 = current_seq + ' ' + v_w
103
+ cnds3 = self.get_candidates(tokens,index+3, current_seq2)
104
+ if type(cnds3) == str:
105
+ cnds.append(cnds3)
106
+ else:
107
+ cnds.extend(cnds3)
108
+ current_seq = current_seq + ' ' + word
109
+ cnds1 = self.get_candidates(tokens,index+1, current_seq)
110
+ if type(cnds1) == str:
111
+ cnds.append(cnds1)
112
+ else:
113
+ cnds.extend(cnds1)
114
+ return [c.strip() for c in cnds]
115
+
116
+ def seperate_conjs(self, word, validator):
117
+ conjs = ['و', 'در', 'با', 'تا', 'که', 'از', 'تو', 'من', 'شما']
118
+ cnds = utils.split_conj_words(word, conjs)
119
+ valid_cnds = [c for c in cnds if validator(c)]
120
+ if valid_cnds:
121
+ return valid_cnds
122
+ return [word]
123
+
124
+ def tokenize(self, txt, validator):
125
+ tokens = txt.split()
126
+ all_cnds = []
127
+ for t in tokens:
128
+ if not validator(t):
129
+ ws = self.seperate_conjs(t, validator)
130
+ else:
131
+ ws = [t]
132
+ all_cnds.append(ws)
133
+ all_cnd_tokens = itertools.product(*all_cnds)
134
+ txts = list(map(self.get_dense_tokens, all_cnd_tokens))
135
+ return txts
136
+
137
+ def get_dense_tokens(self, tokens):
138
+ PRE, WORD, POST = 0,1,2
139
+ out_tokens = []
140
+ nim_fasele = '‌'
141
+ current_word = ''
142
+ state = WORD
143
+ for i, t in enumerate(tokens):
144
+ if state == WORD:
145
+ if t in self.pres:
146
+ out_tokens.append(current_word)
147
+ current_word = t
148
+ state = PRE
149
+ if t in self.posts:
150
+ current_word += nim_fasele
151
+ current_word += t
152
+ state = POST
153
+ if t not in self.pres and t not in self.posts:
154
+ out_tokens.append(current_word)
155
+ current_word = t
156
+ continue
157
+ if state == PRE:
158
+ if t in self.pres:
159
+ current_word += nim_fasele
160
+ current_word += t
161
+ if t in self.posts:
162
+ out_tokens.append(current_word)
163
+ current_word = t
164
+ state = WORD
165
+ if t not in self.pres and t not in self.posts:
166
+ current_word += nim_fasele
167
+ current_word += t
168
+ state = WORD
169
+ continue
170
+ if state == POST:
171
+ if t in self.pres:
172
+ out_tokens.append(current_word)
173
+ current_word = t
174
+ state = PRE
175
+ if t in self.posts:
176
+ current_word += nim_fasele
177
+ current_word += t
178
+ if t not in self.pres and t not in self.posts:
179
+ out_tokens.append(current_word)
180
+ current_word = t
181
+ state = WORD
182
+ if not out_tokens or out_tokens[-1] != current_word:  # guard against an empty list before comparing
183
+ out_tokens.append(current_word)
184
+ return out_tokens
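
The heart of get_valid_word above is an enumeration of every way to glue consecutive tokens together with either nothing or a zero-width non-joiner, keeping only variants found in the vocabulary. A self-contained sketch of that enumeration, with Latin placeholders so the combinatorics are easy to see:

import itertools

ZWNJ = '\u200c'  # zero-width non-joiner, the same separator used above

def candidate_joins(words):
    # Try every choice of separator ('' or ZWNJ) between consecutive words.
    joins = []
    for combo in itertools.product(['', ZWNJ], repeat=len(words) - 1):
        seq = ''.join(w + sep for w, sep in zip(words[:-1], combo)) + words[-1]
        joins.append(seq)
    return joins

print(candidate_joins(['foo', 'bar']))        # ['foobar', 'foo\u200cbar']
print(len(candidate_joins(['a', 'b', 'c'])))  # 4 combinations for 3 tokens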
utils.py ADDED
@@ -0,0 +1,302 @@
1
+ from functools import reduce
2
+ import itertools
3
+ import json
4
+ import re
5
+ import string
6
+ import pandas as pd
7
+ from hazm import Normalizer, WordTokenizer
8
+
9
+ normalizer = Normalizer()
10
+ tokenizer = WordTokenizer(separate_emoji=True)
11
+
12
+
13
+ def seprate_emoji_string(txt):
14
+ try:
15
+ oRes = re.compile(u'(['
16
+ u'\U0001F300-\U0001F64F'
17
+ u'\U0001F680-\U0001F6FF'
18
+ u'\u2600-\u26FF\u2700-\u27BF]+)',
19
+ re.UNICODE)
20
+ except re.error:
21
+ oRes = re.compile(u'(('
22
+ u'\ud83c[\udf00-\udfff]|'
23
+ u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
24
+ u'[\u2600-\u26FF\u2700-\u27BF])+)',
25
+ re.UNICODE)
26
+
27
+ return oRes.sub(r' \1 ', txt)
28
+
29
+ def cleanify(txt):
30
+ txt = txt.strip()
31
+ txt = re.sub(r'\s+', ' ', txt)
32
+ txt = re.sub('\u200f', '', txt)
33
+ txt = re.sub('‌+', '‌', txt)
34
+ txt = re.sub('‌ ', ' ', txt)
35
+ txt = re.sub(' ‌', ' ', txt)
36
+ txt = normalizer.normalize(txt)
37
+ txt = seprate_emoji_string(txt)
38
+ txt = ' '.join(tokenizer.tokenize(txt))
39
+ return txt
40
+
41
+
42
+
43
+
44
+ def clean_text_for_lm(txt):
45
+ ignore_chars = '.1234567890!@#$%^&*()_+۱۲۳۴۵۶۷۸۹÷؟×−+?><}،,{":' + string.ascii_lowercase + string.ascii_uppercase
46
+ tokens = txt.split()
47
+ clean_tokens = [t for t in tokens if not (any(ic in t for ic in ignore_chars) or if_emoji(t))]
48
+ return ' '.join(clean_tokens)
49
+
50
+
51
+ def add_to_mapper(mapping_list):
52
+ print(len(mapping_list))
53
+ df = pd.read_csv('resources/mapper.csv', delimiter=',', index_col=None)
54
+ print(df.columns)
55
+ for item in mapping_list:
56
+ df = df.append({'formal': item[1], 'informal': item[0]}, ignore_index=True)
57
+ df.to_csv('resources/mapper.csv', index=False)
58
+
59
+
60
+ def extract_non_convertable_words(corpus_addr, tokenizer, normalizer, transformer, output_addr, vocab):
61
+ f = open(corpus_addr)
62
+ non_convertables = {}
63
+ seen_words = set()
64
+ nim_fasele = '‌'
65
+ for i, line in enumerate(f):
66
+ print(i)
67
+ # if i > 500:
68
+ # break
69
+ line = normalizer.normalize(line)
70
+ tokens = tokenizer.tokenize(line)
71
+ for t in tokens:
72
+ # if nim_fasele in t:
73
+ # print(t)
74
+ if t in seen_words:
75
+ if t in non_convertables:
76
+ non_convertables[t] += 1
77
+ else:
78
+ candidates = transformer.transform(t, None)
79
+ # if not candidates and any(t.startswith(pre) for pre in ['از', 'در', 'چند', 'هر', 'هیچ', 'هم', 'با', 'بی', 'تا', 'و']):
80
+ # print(t)
81
+ if not candidates:
82
+ non_convertables[t] = 1
83
+ seen_words.add(t)
84
+ words_count = sorted([(word, count) for word, count in non_convertables.items()], key=lambda item: item[1], reverse=True)
85
+ words_count = [str(word) + ' ########### ' + str(count) for (word, count) in words_count]
86
+ with open(output_addr, 'w+') as f:
87
+ f.write('\n'.join(words_count))
88
+
89
+
90
+ def generate_irrgular_informal_verbs():
91
+ """
92
+ برمیگرده میوفته برمیداره برمیگردونه درمیاره ایستادن نمیومد وامیسته
93
+
94
+ اومد
95
+ نیومد
96
+ اومدی
97
+ نیومدی
98
+ میومدی
99
+ نیومده
100
+ یومد
101
+ میومده
102
+ """
103
+
104
+ mapping_verbs = []
105
+ past_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ید', 'ند', '', 'ن']
106
+ neg = ['ن', '']
107
+ pre = ['می', 'ب']
108
+ pre_verbs = [('بر', 'دار'), ('در', 'یار'), ('وا', 'ست'), ('بر', 'گرد'), ('ور', 'دار'), ('بر', 'گشت')]
109
+ extras = ['ن', 'نمی', 'می']
110
+ mapper = {'ه':'د', 'ن': 'ند', 'ین': 'ید', 'ور': 'بر', 'ست':'ایست', 'وا':'', 'یار':'آور'}
111
+ for item in pre_verbs:
112
+ for pe in past_ends:
113
+ for ex in extras:
114
+ p_end = pe
115
+ item0 = item[0]
116
+ item1 = item[1]
117
+ inf = item0 + ex + item1 + p_end
118
+ inf = inf.replace('یی', 'ی')
119
+ if item0 in mapper:
120
+ item0 = mapper[item0]
121
+ if item1 in mapper:
122
+ item1 = mapper[item1]
123
+ if p_end in mapper:
124
+ p_end = mapper[p_end]
125
+ formal = item0 + ex + item1 + p_end
126
+ formal = formal.replace('می', 'می‌')
127
+ formal = formal.replace('نآ', 'نیا')
128
+ mapping_verbs.append([formal, inf])
129
+ bons = ['یومد', 'یوفت']
130
+ v_mapper = {'یومد': 'یامد', 'یوفت': 'افت'}
131
+ verbs = itertools.product(neg, pre, bons, past_ends)
132
+ for v in verbs:
133
+ if v[0] == 'ن' and v[1] == 'ب' or (v[2] == 'یومد' and v[1] == 'ب'):
134
+ continue
135
+ inf = v[0] + v[1] + v[2] + v[3]
136
+ inf = inf.replace('یی', 'ی')
137
+ pe = v[3]
138
+ if pe in mapper:
139
+ pe = mapper[pe]
140
+ formal = v[0] + v[1] + '‌' + v_mapper[v[2]] + pe
141
+ formal = formal.replace('ی‌ی', 'ی')
142
+ formal = formal.replace('یا', 'ی‌آ')
143
+ formal = formal.replace('دد', 'ده')
144
+ formal = formal.replace('ب‌ا', 'بی')
145
+ mapping_verbs.append([formal, inf])
146
+ add_to_mapper(mapping_verbs)
147
+
148
+
149
+
150
+ def load_vocab(vocab_addr='resources/words.dat'):
151
+ vocab = {}
152
+ with open(vocab_addr, 'r', encoding='utf-8') as f:
153
+ for line in f:
154
+ try:
155
+ word, freq, p_tags = line.strip().split('\t')
156
+ vocab[word] = {'freq': freq, 'tags': p_tags}
157
+ except ValueError:  # line does not have the word<TAB>freq<TAB>tags format
158
+ word = line.strip()
159
+ vocab[word] = {'freq': 1, 'tags': 'NUM'}
160
+ return vocab
161
+
162
+ def if_connect(word1, word2):
163
+ not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
164
+ if any(w =='' for w in [word1, word2]) or word1[-1] in not_connect_chars:
165
+ return True
166
+ return False
167
+ def split_conj_words(word, conjs):
168
+ candidates = set()
169
+ sorted_conjs = sorted(conjs, key=lambda x: len(x), reverse=True)
170
+ for c in sorted_conjs:
171
+ indx = word.find(c)
172
+ if indx != -1 and indx in [0, len(word)-1]:
173
+ pre_w = word[:indx]
174
+ next_w = word[indx+len(c) :]
175
+ if if_connect(pre_w, c) and if_connect(c, next_w):
176
+ cnd = ' '.join([pre_w, c, next_w])
177
+ cnd = cnd.strip()
178
+ candidates.add(cnd)
179
+ return list(candidates)
180
+
181
+
182
+ def is_formal_prefixed(word, vocab):
183
+ not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
184
+ nim_fasele = '‌'
185
+ m1 = re.match('(.+)های(م|ت|ش|مان|تان|شان)?$', word)
186
+ m2 = re.match('(.+[ا|و|ی])ی(م|ت|ش|مان|تان|شان)$', word)
187
+ m3 = re.match('(.+[^ا^و^ی])(م|ت|ش|مان|تان|شان)$', word)
188
+ m4 = re.match('(.+)(ها)$', word)
189
+ m5 = re.match('(.+[ه|ی]‌)(اش|ام|ات)$', word)
190
+ if m3 or m2:
191
+ prefix_word = list(filter(lambda m: m is not None, [m3, m2]))[0].group(1)
192
+ if prefix_word in vocab:
193
+ return True
194
+ m_fired = list(filter(lambda m: m is not None, [m1, m4, m5]))
195
+ if len(m_fired) > 0:
196
+ # print(word, m_fired[0].groups())
197
+ prefix_word = m_fired[0].group(1)
198
+ if prefix_word[-1] != nim_fasele and prefix_word[-1] not in not_connect_chars:
199
+ return False
200
+ if prefix_word[-1] == nim_fasele and not (prefix_word[:-1] in vocab):
201
+ return False
202
+ if prefix_word[-1] != nim_fasele and not (prefix_word in vocab):
203
+ return False
204
+ return True
205
+ return False
206
+
207
+
208
+ def spelling_similairty(word):
209
+ all_possible = []
210
+ possible_repeated = get_possible_repeated_word(word)
211
+ all_possible = possible_repeated
212
+ if word in all_possible:
213
+ all_possible.remove(word)
214
+ return all_possible
215
+
216
+ def add_nim_alef_hat_dictionary(vocab):
217
+ word_with_hat = filter(lambda w: 'آ' in w, vocab)
218
+ word_with_nim = filter(lambda w: '‌' in w, vocab)
219
+ mapper1 = {w.replace('آ', 'ا').replace('‌', ''): w for w in word_with_hat}
220
+ mapper2 = {w.replace('‌', ''): w for w in word_with_nim}
221
+ mapper1.update(mapper2)
222
+ return mapper1
223
+
224
+ def generate_spell_mapper(vocab):
225
+ hat = 'آ'
226
+ tanvin = 'اً'
227
+ nim = '‌'
228
+ hamzeh = 'أ'
229
+ hamzeh_y = 'ئ'
230
+ sp_mapper = {hamzeh_y: ['ی'], hat: ['ا'], tanvin: ['ن', 'ا'], nim:['', ' '], hamzeh:['ا', '']}
231
+ special_chars = [hat, tanvin, nim, hamzeh, hamzeh_y]  # include hamzeh_y so its mapping above is actually applied
232
+ out = {}
233
+ for word in vocab:
234
+ p_words = [word.replace(sp, sp_alt) for sp in special_chars for sp_alt in sp_mapper[sp]]
235
+ spell_errors = []
236
+ p_words = list(set(p_words) - set([word]))
237
+ for pw in p_words:
238
+ if pw in out:
239
+ out[pw].add(word)
240
+ else:
241
+ out[pw] = {word}
242
+ out = {w: list(out[w]) for w in out}
243
+ with open('spell_checker_mapper.json', 'w+', encoding='utf-8') as f:
244
+ json.dump(out, f, ensure_ascii=False, indent=1)
245
+
246
+
247
+
248
+ def create_mapper_tanvin_hamze_hat_nim_fasele():
249
+ mapper = {}
250
+ hats_word = open('resources/spell/words_with_hat.txt').read().splitlines()
251
+ nim_words = open('resources/spell/words_with_nim.txt').read().splitlines()
252
+ tanvin_words = open('resources/spell/words_with_tanvin.txt').read().splitlines()
253
+ hat_ch = 'آ'
254
+ nim_fasele = '‌'
255
+ for w in hats_word:
256
+ w_without_h = w.replace(hat_ch, 'ا')
257
+ mapper[w_without_h] = w
258
+ for w in nim_words:
259
+ w_without_nim = w.replace(nim_fasele, '')  # str has no .remove(); strip the ZWNJ instead
260
+ mapper[w_without_nim] = w
261
+ w_space_instead_nim = w.replace(nim_fasele, ' ')
262
+ mapper[w_space_instead_nim] = w
+ # NOTE: tanvin_words is read above but not mapped yet.
+ return mapper
263
+
264
+ def extract_lemma_nim_fasele_words(word, vocab):
265
+ prefixs = ['اون']
266
+ postfixs = {'ست': 'است', 'هام':'هایم', 'ام':'ام', 'ها':'ها', 'هامون':'هایمان', 'ترین': 'ترین', 'هایشان':'هایشان'}
267
+ tokens = word.split('‌')
268
+ index = 0
269
+ for i in range(len(tokens)):
270
+ index = i
271
+ if tokens[i] not in prefixs:
272
+ break
273
+
274
+ for i in range(len(tokens), 0, -1):
275
+ current_tok = '‌'.join(tokens[index:i])
276
+ if current_tok in vocab or tokens[i-1] not in postfixs:
277
+ return current_tok
278
+
279
+
280
+ def if_emoji(text):
281
+ # Wide UCS-4 build
282
+ try:
283
+ oRes = re.compile(u'(['
284
+ u'\U0001F300-\U0001F64F'
285
+ u'\U0001F680-\U0001F6FF'
286
+ u'\u2600-\u26FF\u2700-\u27BF]+)',
287
+ re.UNICODE)
288
+
289
+ except re.error:
290
+ # Narrow UCS-2 build
291
+ oRes = re.compile(u'(('
292
+ u'\ud83c[\udf00-\udfff]|'
293
+ u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
294
+ u'[\u2600-\u26FF\u2700-\u27BF])+)',
295
+ re.UNICODE)
296
+
297
+ return oRes.findall(text)
298
+
299
+
300
+ def powerset(lst):
301
+ return reduce(lambda result, x: result + [subset + [x] for subset in result],
302
+ lst, [[]])
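
As a quick sanity check of the reduce-based powerset helper that closes the file:

from utils import powerset

print(powerset([1, 2]))      # [[], [1], [2], [1, 2]]
print(len(powerset('abc')))  # 8 subsets for a 3-element iterable

Each new element doubles the number of subsets, which the reduce expresses by appending x to every subset accumulated so far.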