|
|
|
import pickle |
|
from kenlm_wrapper import Kelm_Wrapper |
|
from OneShotTransformer import OneShotTransformer |
|
from VerbHandler import VerbHandler |
|
import kenlm |
|
from tokenizer import InformalTokenizer |
|
|
|
|
|
class FormalityTransformer:
    """Converts informal Persian text toward formal register.

    Combines pickled lexicon assets, a rule-based verb handler, a
    one-shot word transformer, and a KenLM language model (via
    Kelm_Wrapper) used for scoring candidates.
    """

    def __init__(self, asset_file_addr, verbs_csv_addr, irregular_verbs_mapper_addr, lm_addr):
        """Load lexicon assets and build the tokenizer, verb handler,
        one-shot transformer and language-model wrapper.

        Parameters
        ----------
        asset_file_addr : str
            Path to a pickle holding the lexicon dictionaries/sets.
        verbs_csv_addr : str
            CSV of verb conjugations consumed by VerbHandler.
        irregular_verbs_mapper_addr : str
            CSV mapping irregular verbs, consumed by VerbHandler.
        lm_addr : str
            Path to the KenLM language-model file.
        """
        # NOTE(review): pickle.load is unsafe on untrusted data; the asset
        # file is assumed to ship with the project.
        # Use a context manager so the file handle is closed (the original
        # `pickle.load(open(...))` leaked it).
        with open(asset_file_addr, 'rb') as asset_file:
            assets = pickle.load(asset_file)

        self.vocab = assets['vocab']
        self.word_ends_tanvin = assets['word_ends_tanvin']
        self.non_hidden_h_words = assets['non_hidden_h_words']
        self.isolated_words = assets['isolated_words']
        self.ignore_words = assets['ignore_words']
        self.mapper = assets['mapper']
        self.postfix_mapper = assets['postfix_mapper']
        postfixes = assets['postfixes']

        self.informal_tokenizer = InformalTokenizer(self.vocab, postfixes)
        self.verb_handler = VerbHandler(csv_verb_addr=verbs_csv_addr,
                                        csv_irregular_verbs_mapper=irregular_verbs_mapper_addr)
        self.oneshot_transformer = OneShotTransformer(
            self.vocab, self.mapper, self.verb_handler.informal_to_formal,
            ignore_words=self.ignore_words,
            postfix_mapper=self.postfix_mapper,
            isolated_words=self.isolated_words,
            non_hidden_h_words=self.non_hidden_h_words)
        lm_model = kenlm.Model(lm_addr)
        self.lm_obj = Kelm_Wrapper(lm_model)

    def should_filtered_by_one_bigram(self, lemma, word, original_word):
        """Return True when `original_word` is in the vocabulary and the
        candidate `word` is multi-token (contains a space or a
        zero-width non-joiner).

        `lemma` is accepted for interface compatibility but unused here.
        """
        # BUG FIX: NIM_FASELE was an empty string (the invisible ZWNJ was
        # presumably lost in an encoding round-trip). Since '' in word is
        # always True, the multi-token check was trivially satisfied.
        # Use the real half-space character U+200C.
        NIM_FASELE = '\u200c'
        return original_word in self.vocab and (len(word.split()) > 1 or NIM_FASELE in word)

    def repalce_for_gpt2(self, word_repr):
        """Normalize a tanvin-ending word for GPT-2 scoring: drop its
        last two characters and append a plain alef ('ا').

        Words not listed in `word_ends_tanvin` are returned unchanged.
        (The 'repalce' typo in the name is kept for caller compatibility.)
        """
        if word_repr in self.word_ends_tanvin:
            return word_repr[:-2] + 'ا'
        return word_repr
|
|