import itertools

import utils  # local helper module providing split_conj_words


class InformalTokenizer:
    """Tokenizer for informal Persian text.

    Rebuilds words whose prefixes/postfixes were written apart by gluing
    them back together with the zero-width non-joiner (ZWNJ, the Persian
    nim-fasele), and enumerates candidate re-tokenizations against a
    known vocabulary.
    """

    def __init__(self, vocab, postfixes):
        self.vocab = vocab
        self.pres = InformalTokenizer.get_prefixs()
        self.posts = postfixes

    @staticmethod
    def get_prefixs():
        return ['نا', 'بی', 'هر', 'می']

    @staticmethod
    def get_postfixs(informal_postfix_addr):
        # One postfix per line; the file is Persian text, so read it as UTF-8.
        with open(informal_postfix_addr, 'r', encoding='utf-8') as f:
            return f.read().splitlines()

    def is_pre_post_word(self, w):
        """Return True if `w` is a valid ZWNJ-joined prefix/postfix compound.

        Walks the ZWNJ-separated parts with a small state machine:
        PRE (leading prefixes) -> V (the host word) -> POST (trailing
        postfixes). At least one prefix or postfix must actually occur.
        """
        nim_fasele = '\u200c'  # zero-width non-joiner
        PRE, POST, V = 0, 1, 2

        def join(acc, part):
            # Glue with a ZWNJ, avoiding a spurious leading separator
            # (the original unconditionally prepended the ZWNJ, so the
            # rebuilt word could never match the vocabulary).
            return acc + nim_fasele + part if acc else part

        is_pre_pos = False
        state = PRE
        valid_w = ''
        for part in w.split(nim_fasele):
            if state == PRE:
                if part in self.pres:
                    valid_w = join(valid_w, part)
                    is_pre_pos = True
                elif part in self.posts:
                    valid_w = join(valid_w, part)
                    is_pre_pos = True
                    state = POST
                else:
                    valid_w = join(valid_w, part)
                    state = V
                continue
            if state == POST:
                if part in self.posts:
                    valid_w = join(valid_w, part)
                    continue
                return False  # nothing may follow the postfix run
            if state == V:
                if part in self.posts:
                    is_pre_pos = True
                    state = POST
                    valid_w = join(valid_w, part)
                    continue
                if part in self.vocab:
                    valid_w = join(valid_w, part)
                    if valid_w not in self.vocab:
                        return False  # the joined compound itself must be known
                    continue
                return False
        return is_pre_pos

    def get_valid_word(self, words):
        """Glue `words` together with every combination of '' and ZWNJ
        separators, keeping the variants that are known words or valid
        prefix/postfix compounds."""
        seps = ['', '\u200c']
        all_seqs = []
        for combo in itertools.product(seps, repeat=len(words) - 1):
            seq = ''
            for word, sep in zip(words[:-1], combo):
                seq += word + sep
            seq += words[-1]
            all_seqs.append(seq)
        return [s for s in all_seqs if s in self.vocab or self.is_pre_post_word(s)]

    def get_candidates(self, tokens, index=0, current_seq=' '):
        """Recursively enumerate candidate sentences, merging each token
        with the next one or two tokens whenever the merge yields a valid
        word, and always keeping the unmerged reading as well."""
        if index == len(tokens):
            return current_seq  # base case: one finished sequence (a str)

        word = tokens[index]
        next_word = tokens[index + 1] if index < len(tokens) - 1 else None
        next_next_word = tokens[index + 2] if index < len(tokens) - 2 else None

        cnds = []

        def collect(sub):
            # Recursive calls return a str at the base case, else a list.
            if isinstance(sub, str):
                cnds.append(sub)
            else:
                cnds.extend(sub)

        if next_word is not None:
            # Try merging this token with the next one.
            for v_w in self.get_valid_word([word, next_word]):
                collect(self.get_candidates(tokens, index + 2, current_seq + ' ' + v_w))
        if next_next_word is not None:
            # Try merging this token with the next two.
            for v_w in self.get_valid_word([word, next_word, next_next_word]):
                collect(self.get_candidates(tokens, index + 3, current_seq + ' ' + v_w))
        # Always also keep the token as-is.
        collect(self.get_candidates(tokens, index + 1, current_seq + ' ' + word))
        return [c.strip() for c in cnds]

    def seperate_conjs(self, word, validator):
        """Split a word glued to a conjunction or pronoun; fall back to the
        word itself if no split validates."""
        conjs = ['و', 'در', 'با', 'تا', 'که', 'از', 'تو', 'من', 'شما']
        cnds = utils.split_conj_words(word, conjs)
        valid_cnds = [c for c in cnds if validator(c)]
        return valid_cnds if valid_cnds else [word]

    def tokenize(self, txt, validator):
        """Tokenize `txt`: split glued conjunctions off invalid tokens, then
        densify every combination of the resulting candidate tokens."""
        all_cnds = []
        for t in txt.split():
            all_cnds.append([t] if validator(t) else self.seperate_conjs(t, validator))
        return [self.get_dense_tokens(cnd) for cnd in itertools.product(*all_cnds)]
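    # get_dense_tokens re-attaches standalone prefix/postfix tokens to their
    # host word with a ZWNJ, driven by a three-state machine:
    #   WORD -> PRE   the token is a bare prefix; hold it for the next word
    #   WORD -> POST  the token is a postfix; glue it onto the current word
    #   PRE  -> WORD  the host word arrives and is glued to the held prefix
    #   POST -> WORD  a plain token closes the postfix run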
    def get_dense_tokens(self, tokens):
        PRE, WORD, POST = 0, 1, 2
        nim_fasele = '\u200c'  # zero-width non-joiner
        out_tokens = []
        current_word = ''
        state = WORD
        for t in tokens:
            if state == WORD:
                if t in self.pres:
                    if current_word:  # guard: don't emit the initial empty word
                        out_tokens.append(current_word)
                    current_word = t
                    state = PRE
                elif t in self.posts:
                    current_word += nim_fasele + t
                    state = POST
                else:
                    if current_word:
                        out_tokens.append(current_word)
                    current_word = t
                continue
            if state == PRE:
                if t in self.pres:
                    current_word += nim_fasele + t
                elif t in self.posts:
                    # A postfix cannot follow a bare prefix: flush and restart.
                    out_tokens.append(current_word)
                    current_word = t
                    state = WORD
                else:
                    current_word += nim_fasele + t
                    state = WORD
                continue
            if state == POST:
                if t in self.pres:
                    out_tokens.append(current_word)
                    current_word = t
                    state = PRE
                elif t in self.posts:
                    current_word += nim_fasele + t
                else:
                    out_tokens.append(current_word)
                    current_word = t
                    state = WORD
        # Flush the last word; the original indexed out_tokens[-1] directly
        # and crashed on single-token input while out_tokens was still empty.
        if current_word and (not out_tokens or out_tokens[-1] != current_word):
            out_tokens.append(current_word)
        return out_tokens
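

# A minimal usage sketch. The vocabulary, postfix list, and validator below
# are hypothetical stand-ins for illustration; real callers load the vocab
# and the postfix file (see get_postfixs) from their own resources.
if __name__ == '__main__':
    vocab = {'کتاب', 'خانه', 'کتاب‌ها'}  # assumed vocabulary ('کتاب‌ها' contains a ZWNJ)
    postfixes = ['ها', 'هایی', 'تر']  # assumed informal postfixes

    tokenizer = InformalTokenizer(vocab, postfixes)

    def validator(w):
        # Stand-in validator: a real one would wrap a dictionary lookup.
        return w in vocab or w in postfixes

    # 'کتاب ها' ("book" + plural marker written apart) is re-joined with a
    # ZWNJ into the single token 'کتاب‌ها'.
    print(tokenizer.tokenize('کتاب ها', validator))  # [['کتاب‌ها']]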