File size: 20,015 Bytes
6227608
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
import re
from enum import Enum
from hazm import Normalizer
import pandas as pd


# Closed vocabularies used to tag verb stems and analysed verb forms.
# Register of a stem: standard written form vs. colloquial form.
Formality = Enum('Formality', ['formal', 'informal'])
# Tense of a stem or of a conjugated form.
VerbTime = Enum('VerbTime', ['past', 'present', 'future'])
# Grammatical person; the member names look like transliterated Persian
# pronouns (presumably I / you / he-she / we / you-plural / they).
Person = Enum('Person', ['Man', 'To', 'An', 'Ma', 'Shoma', 'Anha'])
# Grammatical number (presumably "Mofrad" = singular, "Jam" = plural).
Number = Enum('Number', ['Mofrad', 'Jam'])
class Verb:
    """Plain record describing one analysed verb form.

    Each constructor argument is stored unchanged on the attribute of the
    same name; no validation or conversion is performed.
    """

    def __init__(self, root, formality, time, pp, person, number):
        self.root, self.formality, self.time = root, formality, time
        self.person, self.number = person, number
        self.pp = pp

class VerbHandler():
    def __init__(self, csv_verb_addr, csv_irregular_verbs_mapper):
        """Load the stem tables and prepare every lookup used for conversion.

        Args:
            csv_verb_addr: CSV source handed to ``load_bons`` (verb stems).
            csv_irregular_verbs_mapper: CSV source handed to
                ``load_irregular_mapper`` (irregular informal -> formal forms).
        """
        # Rewrites applied to the person ending and to the attached pronoun
        # of a parsed verb.
        self.posfix_mapper = {'ه': 'د', 'ن': 'ند', 'ین': 'ید'}
        self.objective_pr_mapper = {'شون':'شان', 'تون':'تان', 'مون':'مان'}
        # Bare forms that are replaced by their 'ب'-prefixed spelling before
        # being parsed.
        self.init_mapper = {
            'کنه': 'بکنه', 'کنم':'بکنم', 'کنی':'بکنی', 'کنیم': 'بکنیم',
            'کنین':'بکنین', 'کنید':'بکنید', 'کنن':'بکنن', 'کنند':'بکنند',
            'شم':'بشم', 'شی':'بشی', 'شن':'بشن', 'شین':'بشین' ,'شه':'بشه', 'شیم': 'بشیم',
        }
        # Last-step spelling corrections applied to produced forms.
        self.out_mapper = {'می‌ایی': 'می‌آیی'}

        self.bons = self.load_bons(csv_verb_addr)
        self.irregular_verbs = self.load_irregular_mapper(csv_irregular_verbs_mapper)

        # Stem lists split by register and tense.
        self.informal_past_bons = self.get_bons(Formality.informal, VerbTime.past)
        self.informal_present_bons = self.get_bons(Formality.informal, VerbTime.present)
        self.formal_past_bons = self.get_bons(Formality.formal, VerbTime.past)
        formal_present = self.get_bons(Formality.formal, VerbTime.present)
        self.formal_present_bons = formal_present + ['هست']
        self.all_past_bons = self.formal_past_bons + self.informal_past_bons
        self.all_present_bons = self.formal_present_bons + self.informal_present_bons

        # informal stem -> {'formal': formal stem of the same CSV row}
        self.verb_mapper = {}
        for bon, info in self.bons.items():
            if info['type'] == Formality.informal:
                self.verb_mapper[bon] = {'formal': info['formal']}

        # Both steps below read and rewrite the stem lists built above.
        self.solve_alef_issue()
        self.compile_patterns()


    def load_irregular_mapper(self, csv_addr):
        df = pd.read_csv(csv_addr)
        mapper = {informal: formal for _, (informal, formal) in df.iterrows()}
        return mapper

    def load_bons(self, csv_addr):
        """Read the verb-stem CSV and index every stem by its surface form.

        Columns are addressed by position: column 2 holds the formal past
        stem, column 3 the formal present stem, and columns 10 and 11 hold
        whitespace-separated informal past and informal present stems.
        Empty cells are skipped.  Formal stems are passed through the hazm
        ``Normalizer``; informal stems are stored as written.

        Args:
            csv_addr: path or buffer accepted by ``pandas.read_csv``.

        Returns:
            dict mapping stem -> {'type': Formality, 'time': VerbTime}.
            Informal entries additionally carry 'formal': the formal stem
            of the same row and tense (normalized when non-empty).  A stem
            that occurs more than once keeps the entry written last.
        """
        normalizer = Normalizer()
        table = pd.read_csv(csv_addr).fillna('')
        bons = {}
        # Plain tuples keep the access purely positional.  Integer keys on
        # the label-indexed Series yielded by iterrows() (``row[2]``) rely on
        # a positional fallback that pandas has deprecated.
        for row in table.itertuples(index=False, name=None):
            formal_past, formal_present = row[2], row[3]
            if formal_past:
                formal_past = normalizer.normalize(formal_past)
                bons[formal_past] = {'type': Formality.formal, 'time': VerbTime.past}
            if formal_present:
                formal_present = normalizer.normalize(formal_present)
                bons[formal_present] = {'type': Formality.formal, 'time': VerbTime.present}
            if row[10]:
                for stem in row[10].split():
                    bons[stem] = {'type': Formality.informal, 'time': VerbTime.past, 'formal': formal_past}
            if row[11]:
                for stem in row[11].split():
                    bons[stem] = {'type': Formality.informal, 'time': VerbTime.present, 'formal': formal_present}
        return bons

    def get_bons(self, type, time):
        return [b for b in self.bons if self.bons[b]['type'] == type and self.bons[b]['time'] == time]

    def solve_alef_issue(self):
        replace_alef_y = lambda v : 'ی' + v[1:]
        replace_A_YA = lambda v : 'یا' + v[1:]
        informal_past_start_with_alef = list(map(replace_alef_y, [v for v in self.informal_past_bons if v.startswith('ا') and not v.startswith('ای')]))
        formal_past_start_with_alef = list(map(replace_alef_y, [v for v in self.formal_past_bons if v.startswith('ا') and not v.startswith('ای')]))
        informal_present_start_with_alef = list(map(replace_alef_y, [v for v in self.informal_present_bons if v.startswith('ا') and not v.startswith('ای')]))
        formal_present_start_with_alef = list(map(replace_alef_y, [v for v in self.formal_present_bons if v.startswith('ا') and not v.startswith('ای')]))
        self.alef_mapper = {}
        self.informal_past_start_with_alef = informal_past_start_with_alef + list(
            map(replace_A_YA, [v for v in self.informal_past_bons if v.startswith('آ')]))
        self.informal_present_start_with_alef = informal_present_start_with_alef + list(
            map(replace_A_YA, [v for v in self.informal_present_bons if v.startswith('آ')]))
        self.formal_past_start_with_alef = formal_past_start_with_alef + list(
            map(replace_A_YA, [v for v in self.formal_past_bons if v.startswith('آ')]))
        self.formal_present_start_with_alef = formal_present_start_with_alef + list(
            map(replace_A_YA, [v for v in self.formal_present_bons if v.startswith('آ')]))
        for verb in self.informal_past_start_with_alef + self.informal_present_start_with_alef + self.formal_past_start_with_alef + self.formal_present_start_with_alef:
            if verb[:2] == 'یا':
                origin = 'آ' + verb[2:]
            else:
                origin = 'ا' + verb[1:]
            self.alef_mapper[verb] = origin
        self.alef_mapper['یای'] = 'آی'
        remove_a_hat = lambda w: w.replace('آ', 'ا')
        self.formal_past_bons = list(
            filter(lambda w: w != '', map(remove_a_hat, self.formal_past_bons + self.formal_past_start_with_alef)))
        self.formal_present_bons = list(map(remove_a_hat, self.formal_present_bons + self.formal_present_start_with_alef)) + [
            'یای'] + ['آی']
        self.informal_past_bons = list(
            filter(lambda w: w != '', map(remove_a_hat, self.informal_past_bons + self.informal_past_start_with_alef)))
        self.informal_present_bons = list(
            map(remove_a_hat, self.informal_present_bons + self.informal_present_start_with_alef)) + [
                                       'یای'] + ['آی']
        # sorted by length
        self.formal_present_bons = sorted(self.formal_present_bons, key=lambda w: -len(w))
        self.formal_past_bons = sorted(self.formal_past_bons, key=lambda w: -len(w))
        self.informal_present_bons = sorted(self.informal_present_bons, key=lambda w: -len(w))
        self.informal_past_bons = sorted(self.informal_past_bons, key=lambda w: -len(w))
        verb_v_keys = [word for word in self.verb_mapper if 'آ' in word]
        alef_verb_v_keys = [word for word in self.alef_mapper if 'آ' in word]
        for v in verb_v_keys:
            self.verb_mapper[v.replace('آ', 'ا')] = self.verb_mapper[v]
        for v in alef_verb_v_keys:
            self.alef_mapper[v.replace('آ', 'ا')] = self.alef_mapper[v]


    def compile_patterns(self):
            ME_r = '|'.join(['می','می‌'])
            B_r = 'ب'
            not_r = 'ن'
            past_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ید', 'ند', '', 'ن']
            present_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ن', 'ید', 'ند', 'د', '']
            naghli_ends = ['ه‌ام', 'ه‌ای', 'ه', 'ه‌ایم', 'ه‌اید', 'ه‌اند']
            objective_pronouns = ['م', 'ت', 'ش', 'مون', 'تون', 'شون']

            informal_past_r = '|'.join(self.informal_past_bons)
            formal_past_r = '|'.join(self.formal_past_bons)
            informal_present_r = '|'.join(self.informal_present_bons)
            formal_present_r = '|'.join(self.formal_present_bons)
            verb_postfix_past_r = '|'.join(past_ends)
            verb_postfix__present_r = '|'.join(present_ends)
            objective_pronouns_r = '|'.join(objective_pronouns)
            naghli_ends_r = '|'.join(naghli_ends)
            """
            #گذشته‌ی ساده
            # r1 =  past_r + verb_postfix_r + objectiveـpronouns_r
            #گذشته‌ی ناتمام
            # r2  = '(' + ME + ')'+ past_r +verb_postfix_r + objectiveـpronouns_r
    
            #گذشته‌ی استمراری
            # r3 =  '(' + DASHT + ')'+ past_r +  verb_postfix_r +objectiveـpronouns_r
    
            #گذشته‌ی نقلی
            # r4 = past_r + '(' + '|'.join(naghli_ends) + ')' +objectiveـpronouns_r
    
            #گذشته‌ی پیشین
            # r5 = past_r + verb_postfix_r + '(' + BUD + ')' + verb_postfix_r + objectiveـpronouns_r
    
            #حال ساده
            # r6 = present_r + verb_postfix_r
    
           #حال ناتمام
            # r7 =  '(' + ME + ')'+ present_r +  verb_postfix_r + objectiveـpronouns_r
    
            #حال استمراری
            # r8 = '( ' + DAR + ')'+ verb_postfix_r + '(' + ME + ')' + present_r + verb_postfix_r+ objectiveـpronouns_r
    
            #آینده‌ی ساده
            # r9 = '( ' + KHAH + ')'+ verb_postfix_r + present_r +objectiveـpronouns_r
    
            #التزامی - گذشته
            # r10 = present_r + '(ه)'+  '(' + BASH +  ')' + verb_postfix_r + objectiveـpronouns_r
    
            #التزامی - حال
            # r11 = '(ب)' + present_r + verb_postfix_r +objectiveـpronouns_r
            """
            #
            # + : fealhaye rasmi + pasvan informal , hale sade baraye bazi fela ( hast, kon)
            # formal
            formal_present_pattern_b = '({})({})({})?({})?$'.format(B_r, formal_present_r, verb_postfix__present_r, objective_pronouns_r)
            formal_present_pattern_n_me = '({})?({})({})({})?({})?$'.format(not_r, ME_r, formal_present_r, verb_postfix__present_r, objective_pronouns_r)
            formal_present_pattern_n = '({})?({})({})?({})?$'.format(not_r, formal_present_r, verb_postfix__present_r, objective_pronouns_r)
            formal_past_pattern = '({})?({})?({})({}|{})({})?$'.format(not_r, ME_r, formal_past_r, naghli_ends_r,
                                                                        verb_postfix_past_r, objective_pronouns_r)
            self.formal_past_verb_pattern = re.compile(formal_past_pattern)
            self.formal_present_verb_pattern_b = re.compile(formal_present_pattern_b)
            self.formal_present_verb_pattern_n_me = re.compile(formal_present_pattern_n_me)
            self.formal_present_verb_pattern_n = re.compile(formal_present_pattern_n)

            #informal
            informal_present_pattern_b = '({})({})({})?({})?$'.format(B_r, informal_present_r, verb_postfix__present_r,
                                                                     objective_pronouns_r)
            informal_present_pattern_n_me = '({})?({})({})({})?({})?$'.format(not_r, ME_r, informal_present_r,
                                                                             verb_postfix__present_r, objective_pronouns_r)
            informal_present_pattern_n = '({})?({})({})?({})?$'.format(not_r, informal_present_r, verb_postfix__present_r,
                                                                      objective_pronouns_r)
            informal_past_pattern = '({})?({})?({})({}|{})({})?$'.format(not_r, ME_r, informal_past_r, naghli_ends_r,
                                                                        verb_postfix_past_r, objective_pronouns_r)
            self.informal_past_verb_pattern = re.compile(informal_past_pattern)
            self.informal_present_verb_pattern_b = re.compile(informal_present_pattern_b)
            self.informal_present_verb_pattern_n_me = re.compile(informal_present_pattern_n_me)
            self.informal_present_verb_pattern_n = re.compile(informal_present_pattern_n)


    def parse(self, token):
        outputs = []

        match_dict_formal = {'tense': '', 'root': '', 'neg': '', 'postfix': '', 'not_r': '', 'op': '', 'b': '', 'me': '', 'naghli':''}
        match_dict_informal = {'tense': '', 'root': '', 'neg': '', 'postfix': '', 'not_r': '', 'op': '', 'b': '', 'me': '', 'naghli':''}
        formal_past_match = self.formal_past_verb_pattern.match(token)
        informal_past_match = self.informal_past_verb_pattern.match(token)
        formal_present_match_b = self.formal_present_verb_pattern_b.match(token)
        informal_present_match_b = self.informal_present_verb_pattern_b.match(token)
        formal_present_match_n_me = self.formal_present_verb_pattern_n_me.match(token)
        informal_present_match_n_me = self.informal_present_verb_pattern_n_me.match(token)
        formal_present_match_n = self.formal_present_verb_pattern_n.match(token)
        informal_present_match_n = self.informal_present_verb_pattern_n.match(token)
        present_group_to_dict_b = lambda g: {k:g[i] for i,k in enumerate(['b', 'root', 'postfix', 'op'])}
        present_group_to_dict_n_me = lambda g: {k:g[i] for i,k in enumerate(['neg', 'me','root', 'postfix','op'])}
        present_group_to_dict_n = lambda g: {k:g[i] for i,k in enumerate(['neg','root', 'postfix','op'])}
        past_group_to_dict = lambda g: {k:g[i] for i,k in enumerate(['neg', 'me', 'root', 'postfix', 'op'])}
        formal_match = formal_past_match or formal_present_match_b or formal_present_match_n_me or formal_present_match_n
        informal_match = informal_past_match or informal_present_match_b or informal_present_match_n_me or informal_present_match_n
        if formal_match:
            if formal_past_match:
                match_dict_formal = past_group_to_dict(formal_past_match.groups())
                match_dict_formal['tense'] = 'past'
            else:
                if formal_present_match_b:
                    match_dict_formal = present_group_to_dict_b(formal_present_match_b.groups())
                elif formal_present_match_n_me:
                    match_dict_formal = present_group_to_dict_n_me(formal_present_match_n_me.groups())
                elif formal_present_match_n:
                    match_dict_formal = present_group_to_dict_n(formal_present_match_n.groups())
                match_dict_formal['tense'] = 'present'
            outputs.append(match_dict_formal)
        if informal_match:
            if informal_past_match:
                match_dict_informal = past_group_to_dict(informal_past_match.groups())
                match_dict_informal['tense'] = 'past'
            else:
                if informal_present_match_b:
                    match_dict_informal = present_group_to_dict_b(informal_present_match_b.groups())
                elif informal_present_match_n_me:
                    match_dict_informal = present_group_to_dict_n_me(informal_present_match_n_me.groups())
                elif informal_present_match_n:
                    match_dict_informal = present_group_to_dict_n(informal_present_match_n.groups())
                match_dict_informal['tense'] = 'present'
            outputs.append(match_dict_informal)
        for match_dict in outputs:
            for key,val in match_dict.items():
                if val is None:
                    match_dict[key] = ''
            # print(match_dict)
        return outputs

    def formal_concatenate(self, match_dict, should_smooth):
        out_dict = {'بیای': 'بیا', 'نیای': 'نیا'}
        if match_dict['root'] == 'است' and match_dict['neg'] != '':
            return 'نیست' + match_dict['postfix']
        if self.if_simple_present(match_dict) or self.if_only_me(match_dict):
            return None
        if should_smooth:
            if match_dict['prefix'] != '' and match_dict['prefix'][0] == 'م':
                pass
            else:
                match_dict['root'] = 'یا' + match_dict['root'][1:]
            # if len(match_dict['prefix']) == 3:
            #     match_dict['prefix'] = 'می'
        if match_dict['prefix'] == 'ب' and match_dict['root'] and match_dict['root'][0] == 'ا':
            match_dict['root'] = 'ی' + match_dict['root'][1:]
        out = match_dict['neg'] + match_dict['prefix'] + match_dict['root'] + match_dict['postfix'] + match_dict['op']
        if out in out_dict:
            out = out_dict[out]

        return out

    def _set_match_dict_prefix(self, match_dict):
        match_dict['prefix'] = ''
        if 'me' in match_dict and match_dict['me'] != '':
            if len(match_dict['me']) < 3:
                match_dict['me'] = 'می‌'
            match_dict['prefix'] = match_dict['me']
        elif 'b' in match_dict and match_dict['b'] != '':
            match_dict['prefix'] = match_dict['b']
        return match_dict

    def if_simple_present(self, match_dict):
        if match_dict['root'] != '' and match_dict['tense'] == 'present' and match_dict['prefix'] == '' and match_dict['neg'] == '':
            if match_dict['root'] not in ['کن', 'هست', 'است', 'دار', 'نیست', 'باش']:
                return True
        return False

    def if_only_me(self, match_dict):
        if match_dict['root'] != '' and match_dict['tense'] == 'present' and match_dict['prefix'] !='' and match_dict['prefix'][0] == 'م' and match_dict['postfix'] == '':
            return True
        return False

    def is_masdar(self, match_dict):
        return  match_dict['root'] in self.all_past_bons and match_dict['me'] == '' and match_dict['postfix'] =='ن' and match_dict['op'] == ''

    def informal_to_formal(self, token):
        # irregular verbs checking
        if token in self.irregular_verbs:
            return [self.irregular_verbs[token]]
        if token in self.init_mapper:
            token = self.init_mapper[token]
        outputs = []
        if len(token) < 3:
            return None
        should_smooth = False
        all_match_dicts = self.parse(token)

        ### بدهدم
        #برد
        if len(all_match_dicts) == 2 :
            if all_match_dicts[1]['root'] in self.verb_mapper and self.verb_mapper[all_match_dicts[1]['root']]['formal'] == all_match_dicts[0]['root'] and all_match_dicts[1]['op'] != '':
                del all_match_dicts[1]
            elif all_match_dicts[1] == {'b': 'ب', 'root': 'ر', 'postfix': 'د', 'op': '', 'tense': 'present'}:
                del all_match_dicts[1]
        ##
        is_masdar = False
        for match_dict in all_match_dicts:
            if self.is_masdar(match_dict):
                is_masdar = True
            #نان بان
            if match_dict['root'] != '' and match_dict['root'][0] == 'ا' and 'me' not in match_dict and ('b' in match_dict or match_dict['neg'] == 'ن'):
                return None
            if match_dict['root'] != '':
                root = match_dict['root']
                objective_pr = match_dict['op']
                postfix = match_dict['postfix']
                if root in self.alef_mapper:
                    should_smooth = True
                    match_dict['root'] = self.alef_mapper[root]
                if match_dict['root'] in self.verb_mapper:
                    match_dict['root'] = self.verb_mapper[ match_dict['root']]['formal']
                if postfix in self.posfix_mapper:
                    match_dict['postfix'] = self.posfix_mapper[postfix]
                if match_dict['postfix'] == 'د' and match_dict['tense'] == 'past':
                    match_dict['postfix'] = 'ه'
                if objective_pr in self.objective_pr_mapper:
                    match_dict['op'] = self.objective_pr_mapper[objective_pr]
                match_dict['prefix'] = ''
                if 'neg' not in match_dict:
                    match_dict['neg'] = ''
                match_dict = self._set_match_dict_prefix(match_dict)
                formal_verb = self.formal_concatenate(match_dict, should_smooth)
                outputs.append(formal_verb)
        not_none_outpts = [o for o in outputs if o is not None]
        for index, item in enumerate(not_none_outpts):
            if item in self.out_mapper:
                not_none_outpts[index] = self.out_mapper[item]
        if not_none_outpts:
            # append bon
            if len(not_none_outpts) == 1 and is_masdar:
                masdar = not_none_outpts[0][:-2] + 'ن'
                not_none_outpts.append(masdar)
            return not_none_outpts
        return None