import pickle

import kenlm

from kenlm_wrapper import Kelm_Wrapper
from OneShotTransformer import OneShotTransformer
from tokenizer import InformalTokenizer
from VerbHandler import VerbHandler


class FormalityTransformer:
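    """Informal-to-formal text transformer (role inferred from the member
    names: wires together an informal tokenizer, a verb handler, a one-shot
    word mapper, and a KenLM language model)."""
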
    def __init__(self, asset_file_addr, verbs_csv_addr, irregular_verbs_mapper_addr, lm_addr):
        # Load the pickled asset bundle; `with` ensures the file handle is closed.
        with open(asset_file_addr, 'rb') as asset_file:
            assets = pickle.load(asset_file)
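        # Unpack the precomputed resources (vocabulary, mapper tables, word lists).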
        self.vocab = assets['vocab']
        self.word_ends_tanvin = assets['word_ends_tanvin']
        self.non_hidden_h_words = assets['non_hidden_h_words']
        self.isolated_words = assets['isolated_words']
        self.ignore_words = assets['ignore_words']
        self.mapper = assets['mapper']
        self.postfix_mapper = assets['postfix_mapper']
        postfixes = assets['postfixes']
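        # Tokenizer built from the vocabulary and the known informal postfixes.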
        self.informal_tokenizer = InformalTokenizer(self.vocab, postfixes)
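        # Verb handler loaded from the regular-verb CSV plus the irregular-verb mapping.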
        self.verb_handler = VerbHandler(csv_verb_addr=verbs_csv_addr,
                                        csv_irregular_verbs_mapper=irregular_verbs_mapper_addr)
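        # Word-level informal-to-formal rewriter; verb forms are delegated to
        # VerbHandler.informal_to_formal.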
        self.oneshot_transformer = OneShotTransformer(self.vocab, self.mapper,
                                                      self.verb_handler.informal_to_formal,
                                                      ignore_words=self.ignore_words,
                                                      postfix_mapper=self.postfix_mapper,
                                                      isolated_words=self.isolated_words,
                                                      non_hidden_h_words=self.non_hidden_h_words)
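        # KenLM n-gram language model, accessed through the Kelm_Wrapper helper.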
        lm_model = kenlm.Model(lm_addr)
        self.lm_obj = Kelm_Wrapper(lm_model)

    def should_filtered_by_one_bigram(self, lemma, word, original_word):
        # Filter when the original word is in-vocabulary and the proposed
        # rewrite is multi-word or contains a half-space.
        NIM_FASELE = '\u200c'  # zero-width non-joiner, the Persian half-space
        return original_word in self.vocab and (len(word.split()) > 1 or NIM_FASELE in word)

    def repalce_for_gpt2(self, word_repr):
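        # Rewrite a word ending in tanvin (e.g. "اً", two code points) to end in
        # a bare alef; the method name suggests this normalizes input for a
        # GPT-2 model (intent inferred, not documented in the source).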
        if word_repr in self.word_ends_tanvin:
            return word_repr[:-2] + 'ا'
        return word_repr
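
# Minimal usage sketch (hypothetical file names; the asset pickle, verb CSVs,
# and KenLM binary are artifacts shipped separately from this module):
#
#     transformer = FormalityTransformer(
#         asset_file_addr='assets.pkl',
#         verbs_csv_addr='verbs.csv',
#         irregular_verbs_mapper_addr='irregular_verbs_mapper.csv',
#         lm_addr='lm.binary',
#     )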