import itertools

import utils  # local helper module providing split_conj_words


class InformalTokenizer:
    """Tokenizer for informal Persian text.

    Rebuilds words whose prefixes/postfixes were written apart by gluing
    them back together with the zero-width non-joiner (ZWNJ, the Persian
    nim-fasele), and enumerates candidate re-tokenizations against a
    known vocabulary.
    """

    def __init__(self, vocab, postfixes):
        self.vocab = vocab
        self.pres = InformalTokenizer.get_prefixs()
        self.posts = postfixes

    @staticmethod
    def get_prefixs():
        return ['نا', 'بی', 'هر', 'می']

    @staticmethod
    def get_postfixs(informal_postfix_addr):
        # One postfix per line; the file is Persian text, so read it as UTF-8.
        with open(informal_postfix_addr, 'r', encoding='utf-8') as f:
            return f.read().splitlines()

    def is_pre_post_word(self, w):
        """Return True if `w` is a valid ZWNJ-joined prefix/postfix compound.

        Walks the ZWNJ-separated parts with a small state machine:
        PRE (leading prefixes) -> V (the host word) -> POST (trailing
        postfixes). At least one prefix or postfix must actually occur.
        """
        nim_fasele = '\u200c'  # zero-width non-joiner
        PRE, POST, V = 0, 1, 2

        def join(acc, part):
            # Glue with a ZWNJ, avoiding a spurious leading separator
            # (the original unconditionally prepended the ZWNJ, so the
            # rebuilt word could never match the vocabulary).
            return acc + nim_fasele + part if acc else part

        is_pre_pos = False
        state = PRE
        valid_w = ''
        for part in w.split(nim_fasele):
            if state == PRE:
                if part in self.pres:
                    valid_w = join(valid_w, part)
                    is_pre_pos = True
                elif part in self.posts:
                    valid_w = join(valid_w, part)
                    is_pre_pos = True
                    state = POST
                else:
                    valid_w = join(valid_w, part)
                    state = V
                continue
            if state == POST:
                if part in self.posts:
                    valid_w = join(valid_w, part)
                    continue
                return False  # nothing may follow the postfix run
            if state == V:
                if part in self.posts:
                    is_pre_pos = True
                    state = POST
                    valid_w = join(valid_w, part)
                    continue
                if part in self.vocab:
                    valid_w = join(valid_w, part)
                    if valid_w not in self.vocab:
                        return False  # the joined compound itself must be known
                    continue
                return False
        return is_pre_pos

    def get_valid_word(self, words):
        """Glue `words` together with every combination of '' and ZWNJ
        separators, keeping the variants that are known words or valid
        prefix/postfix compounds."""
        seps = ['', '\u200c']
        all_seqs = []
        for combo in itertools.product(seps, repeat=len(words) - 1):
            seq = ''
            for word, sep in zip(words[:-1], combo):
                seq += word + sep
            seq += words[-1]
            all_seqs.append(seq)
        return [s for s in all_seqs if s in self.vocab or self.is_pre_post_word(s)]

    def get_candidates(self, tokens, index=0, current_seq=' '):
        """Recursively enumerate candidate sentences, merging each token
        with the next one or two tokens whenever the merge yields a valid
        word, and always keeping the unmerged reading as well."""
        if index == len(tokens):
            return current_seq  # base case: one finished sequence (a str)

        word = tokens[index]
        next_word = tokens[index + 1] if index < len(tokens) - 1 else None
        next_next_word = tokens[index + 2] if index < len(tokens) - 2 else None

        cnds = []

        def collect(sub):
            # Recursive calls return a str at the base case, else a list.
            if isinstance(sub, str):
                cnds.append(sub)
            else:
                cnds.extend(sub)

        if next_word is not None:
            # Try merging this token with the next one.
            for v_w in self.get_valid_word([word, next_word]):
                collect(self.get_candidates(tokens, index + 2, current_seq + ' ' + v_w))
        if next_next_word is not None:
            # Try merging this token with the next two.
            for v_w in self.get_valid_word([word, next_word, next_next_word]):
                collect(self.get_candidates(tokens, index + 3, current_seq + ' ' + v_w))
        # Always also keep the token as-is.
        collect(self.get_candidates(tokens, index + 1, current_seq + ' ' + word))
        return [c.strip() for c in cnds]

    def seperate_conjs(self, word, validator):
        """Split a word glued to a conjunction or pronoun; fall back to the
        word itself if no split validates."""
        conjs = ['و', 'در', 'با', 'تا', 'که', 'از', 'تو', 'من', 'شما']
        cnds = utils.split_conj_words(word, conjs)
        valid_cnds = [c for c in cnds if validator(c)]
        return valid_cnds if valid_cnds else [word]

    def tokenize(self, txt, validator):
        """Tokenize `txt`: split glued conjunctions off invalid tokens, then
        densify every combination of the resulting candidate tokens."""
        all_cnds = []
        for t in txt.split():
            all_cnds.append([t] if validator(t) else self.seperate_conjs(t, validator))
        return [self.get_dense_tokens(cnd) for cnd in itertools.product(*all_cnds)]
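    # get_dense_tokens re-attaches standalone prefix/postfix tokens to their
    # host word with a ZWNJ, driven by a three-state machine:
    #   WORD -> PRE   the token is a bare prefix; hold it for the next word
    #   WORD -> POST  the token is a postfix; glue it onto the current word
    #   PRE  -> WORD  the host word arrives and is glued to the held prefix
    #   POST -> WORD  a plain token closes the postfix run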
    def get_dense_tokens(self, tokens):
        PRE, WORD, POST = 0, 1, 2
        nim_fasele = '\u200c'  # zero-width non-joiner
        out_tokens = []
        current_word = ''
        state = WORD
        for t in tokens:
            if state == WORD:
                if t in self.pres:
                    if current_word:  # guard: don't emit the initial empty word
                        out_tokens.append(current_word)
                    current_word = t
                    state = PRE
                elif t in self.posts:
                    current_word += nim_fasele + t
                    state = POST
                else:
                    if current_word:
                        out_tokens.append(current_word)
                    current_word = t
                continue
            if state == PRE:
                if t in self.pres:
                    current_word += nim_fasele + t
                elif t in self.posts:
                    # A postfix cannot follow a bare prefix: flush and restart.
                    out_tokens.append(current_word)
                    current_word = t
                    state = WORD
                else:
                    current_word += nim_fasele + t
                    state = WORD
                continue
            if state == POST:
                if t in self.pres:
                    out_tokens.append(current_word)
                    current_word = t
                    state = PRE
                elif t in self.posts:
                    current_word += nim_fasele + t
                else:
                    out_tokens.append(current_word)
                    current_word = t
                    state = WORD
        # Flush the last word; the original indexed out_tokens[-1] directly
        # and crashed on single-token input while out_tokens was still empty.
        if current_word and (not out_tokens or out_tokens[-1] != current_word):
            out_tokens.append(current_word)
        return out_tokens
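

# A minimal usage sketch. The vocabulary, postfix list, and validator below
# are hypothetical stand-ins for illustration; real callers load the vocab
# and the postfix file (see get_postfixs) from their own resources.
if __name__ == '__main__':
    vocab = {'کتاب', 'خانه', 'کتاب‌ها'}  # assumed vocabulary ('کتاب‌ها' contains a ZWNJ)
    postfixes = ['ها', 'هایی', 'تر']  # assumed informal postfixes

    tokenizer = InformalTokenizer(vocab, postfixes)

    def validator(w):
        # Stand-in validator: a real one would wrap a dictionary lookup.
        return w in vocab or w in postfixes

    # 'کتاب ها' ("book" + plural marker written apart) is re-joined with a
    # ZWNJ into the single token 'کتاب‌ها'.
    print(tokenizer.tokenize('کتاب ها', validator))  # [['کتاب‌ها']]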