File size: 1,896 Bytes
6227608
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

import pickle
from kenlm_wrapper import Kelm_Wrapper
from OneShotTransformer import OneShotTransformer
from VerbHandler import VerbHandler
import kenlm
from tokenizer import InformalTokenizer


class FormalityTransformer:
    """Load and hold the resources used for formality transformation.

    The constructor reads a pickled asset dictionary and wires up an
    ``InformalTokenizer``, a ``VerbHandler``, a ``OneShotTransformer`` and a
    KenLM model wrapped in ``Kelm_Wrapper``. The two helper methods below are
    small predicates/normalizers that consult the loaded assets.
    """

    def __init__(self, asset_file_addr, verbs_csv_addr, irregular_verbs_mapper_addr, lm_addr):
        """Build every component from the on-disk resources.

        Args:
            asset_file_addr: Path to a pickle file holding a mapping with the
                keys ``vocab``, ``word_ends_tanvin``, ``non_hidden_h_words``,
                ``isolated_words``, ``ignore_words``, ``mapper``,
                ``postfix_mapper`` and ``postfixes``.
            verbs_csv_addr: Forwarded to ``VerbHandler`` as ``csv_verb_addr``.
            irregular_verbs_mapper_addr: Forwarded to ``VerbHandler`` as
                ``csv_irregular_verbs_mapper``.
            lm_addr: Forwarded to ``kenlm.Model``.

        Raises:
            KeyError: If the asset mapping lacks one of the keys listed above.
        """
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only point asset_file_addr at files from a trusted source.
        # The context manager closes the handle even if unpickling fails.
        with open(asset_file_addr, 'rb') as asset_file:
            assets = pickle.load(asset_file)

        self.vocab = assets['vocab']
        self.word_ends_tanvin = assets['word_ends_tanvin']
        self.non_hidden_h_words = assets['non_hidden_h_words']
        self.isolated_words = assets['isolated_words']
        self.ignore_words = assets['ignore_words']
        self.mapper = assets['mapper']
        self.postfix_mapper = assets['postfix_mapper']
        # Only the tokenizer needs the postfix list, so it is not kept on self.
        postfixes = assets['postfixes']

        self.informal_tokenizer = InformalTokenizer(self.vocab, postfixes)
        self.verb_handler = VerbHandler(csv_verb_addr=verbs_csv_addr, csv_irregular_verbs_mapper=irregular_verbs_mapper_addr)
        self.oneshot_transformer = OneShotTransformer(self.vocab, self.mapper, self.verb_handler.informal_to_formal,
                                                      ignore_words=self.ignore_words,
                                                      postfix_mapper=self.postfix_mapper,
                                                      isolated_words=self.isolated_words,
                                                      non_hidden_h_words=self.non_hidden_h_words)
        lm_model = kenlm.Model(lm_addr)
        self.lm_obj = Kelm_Wrapper(lm_model)

    def should_filtered_by_one_bigram(self, lemma, word, original_word):
        """Tell whether a candidate splits an in-vocabulary word into pieces.

        Args:
            lemma: Unused; kept so existing callers keep working.
            word: Candidate string to inspect.
            original_word: The word the candidate was derived from.

        Returns:
            True when ``original_word`` is in ``self.vocab`` and ``word`` either
            has more than one whitespace-separated token or contains the
            half-space character; False otherwise.
        """
        # U+200C ZERO WIDTH NON-JOINER (the "half space"). Written as an escape
        # because the raw character is invisible in most editors and diffs.
        NIM_FASELE = '\u200c'
        return original_word in self.vocab and (len(word.split()) > 1 or NIM_FASELE in word)

    def repalce_for_gpt2(self, word_repr):
        """Normalize the ending of a word listed in ``self.word_ends_tanvin``.

        Args:
            word_repr: The word string to normalize.

        Returns:
            ``word_repr`` with its last two characters replaced by a single
            'ا' when it is a member of ``self.word_ends_tanvin``; otherwise
            ``word_repr`` unchanged.
        """
        if word_repr in self.word_ends_tanvin:
            return word_repr[:-2] + 'ا'
        return word_repr