|
import re |
|
from enum import Enum |
|
from hazm import Normalizer |
|
import pandas as pd |
|
|
|
|
|
class Formality(Enum):
    """Register of a verb form or stem."""

    formal = 1
    informal = 2


class VerbTime(Enum):
    """Tense of a verb form or stem."""

    past = 1
    present = 2
    future = 3


class Person(Enum):
    """Grammatical person.

    Member names are presumably transliterated Persian personal pronouns.
    """

    Man = 1
    To = 2
    An = 3
    Ma = 4
    Shoma = 5
    Anha = 6


class Number(Enum):
    """Grammatical number (presumably Mofrad = singular, Jam = plural)."""

    Mofrad = 1
    Jam = 2
|
class Verb:
    """Plain record holding the attributes of one verb form."""

    def __init__(self, root, formality, time, pp, person, number):
        """Store every argument unchanged as a same-named attribute.

        Args:
            root: Stored as ``self.root``.
            formality: Stored as ``self.formality``.
            time: Stored as ``self.time``.
            pp: Stored as ``self.pp``.
            person: Stored as ``self.person``.
            number: Stored as ``self.number``.
        """
        self.root, self.formality, self.time = root, formality, time
        self.person, self.number = person, number
        self.pp = pp
|
|
|
class VerbHandler():
    """Rewrite informal Persian verb tokens into formal spellings.

    The handler loads verb stems (called "bons" in this code) from a CSV
    file, derives spelling variants for stems that start with alef, compiles
    regular expressions that split a token into negation / prefix / stem /
    ending / object-pronoun slots, and re-assembles the token using the
    formal stem and formal endings.
    """

    # Capture-group names, in order, for each compiled pattern.
    _PAST_KEYS = ('neg', 'me', 'root', 'postfix', 'op')
    _PRESENT_B_KEYS = ('b', 'root', 'postfix', 'op')
    _PRESENT_N_ME_KEYS = ('neg', 'me', 'root', 'postfix', 'op')
    _PRESENT_N_KEYS = ('neg', 'root', 'postfix', 'op')

    def __init__(self, csv_verb_addr, csv_irregular_verbs_mapper):
        """Load the stem tables and compile the matching patterns.

        Args:
            csv_verb_addr: CSV location handed to ``load_bons``.
            csv_irregular_verbs_mapper: CSV location handed to
                ``load_irregular_mapper``.
        """
        # Informal personal ending -> formal personal ending.
        self.posfix_mapper = {'ه': 'د', 'ن': 'ند', 'ین': 'ید'}
        # Informal object pronoun -> formal object pronoun.
        self.objective_pr_mapper = {'شون':'شان', 'تون':'تان', 'مون':'مان'}
        # Whole tokens that are rewritten before parsing.
        self.init_mapper = {
            'کنه': 'بکنه', 'کنم':'بکنم', 'کنی':'بکنی', 'کنیم': 'بکنیم',
            'کنین':'بکنین', 'کنید':'بکنید', 'کنن':'بکنن', 'کنند':'بکنند',
            'شم':'بشم', 'شی':'بشی', 'شن':'بشن', 'شین':'بشین' ,'شه':'بشه',
            'شیم': 'بشیم',
        }
        # Whole outputs that are rewritten after conversion.
        self.out_mapper = {'میایی': 'میآیی'}

        self.bons = self.load_bons(csv_verb_addr)
        self.irregular_verbs = self.load_irregular_mapper(csv_irregular_verbs_mapper)

        self.informal_past_bons = self.get_bons(type=Formality.informal, time=VerbTime.past)
        self.informal_present_bons = self.get_bons(type=Formality.informal, time=VerbTime.present)
        self.formal_past_bons = self.get_bons(type=Formality.formal, time=VerbTime.past)
        self.formal_present_bons = self.get_bons(type=Formality.formal, time=VerbTime.present) + ['هست']

        # Snapshots taken before solve_alef_issue() rewrites the four lists.
        self.all_past_bons = self.formal_past_bons + self.informal_past_bons
        self.all_present_bons = self.formal_present_bons + self.informal_present_bons

        # Informal stem -> {'formal': formal stem}.
        self.verb_mapper = {
            bon: {'formal': info['formal']}
            for bon, info in self.bons.items()
            if info['type'] == Formality.informal
        }
        self.solve_alef_issue()
        self.compile_patterns()

    def load_irregular_mapper(self, csv_addr):
        """Read the irregular-verb table into an ``{informal: formal}`` dict.

        Args:
            csv_addr: Anything ``pandas.read_csv`` accepts.

        Returns:
            dict mapping each value of the first column to the value of the
            second column in the same row.
        """
        df = pd.read_csv(csv_addr)
        # Only the first two columns are used, so extra columns are tolerated.
        return dict(zip(df.iloc[:, 0], df.iloc[:, 1]))

    def load_bons(self, csv_addr):
        """Read the verb-stem table.

        Columns are addressed by position: column 2 holds the formal past
        stem, column 3 the formal present stem, column 10 whitespace-separated
        informal past stems and column 11 whitespace-separated informal
        present stems. Formal stems are passed through the ``Normalizer``
        before being stored; empty cells are skipped.

        Args:
            csv_addr: Anything ``pandas.read_csv`` accepts.

        Returns:
            dict mapping each stem to ``{'type': Formality, 'time': VerbTime}``;
            informal stems additionally carry ``'formal'``, the (normalized)
            formal stem from the same row.
        """
        normalizer = Normalizer()
        table = pd.read_csv(csv_addr).fillna('')
        bons = {}
        # Plain tuples give true positional access; integer keys on a
        # label-indexed row Series are deprecated in pandas.
        for row in table.itertuples(index=False, name=None):
            formal_past, formal_present = row[2], row[3]
            informal_past, informal_present = row[10], row[11]
            if formal_past:
                formal_past = normalizer.normalize(formal_past)
                bons[formal_past] = {'type': Formality.formal, 'time': VerbTime.past}
            if formal_present:
                formal_present = normalizer.normalize(formal_present)
                bons[formal_present] = {'type': Formality.formal, 'time': VerbTime.present}
            if informal_past:
                for bon in informal_past.split():
                    bons[bon] = {'type': Formality.informal, 'time': VerbTime.past, 'formal': formal_past}
            if informal_present:
                for bon in informal_present.split():
                    bons[bon] = {'type': Formality.informal, 'time': VerbTime.present, 'formal': formal_present}
        return bons

    def get_bons(self, type, time):
        """Return the stems in ``self.bons`` with the given type and time.

        Args:
            type: Value compared against each entry's ``'type'``.
            time: Value compared against each entry's ``'time'``.

        Returns:
            list of matching stems in the insertion order of ``self.bons``.
        """
        return [
            bon for bon, info in self.bons.items()
            if info['type'] == type and info['time'] == time
        ]

    @staticmethod
    def _alef_variants(bons):
        """Return the 'ی'-initial respellings of stems that start with alef."""
        y_forms = [
            'ی' + bon[1:] for bon in bons
            if bon.startswith('ا') and not bon.startswith('ای')
        ]
        ya_forms = ['یا' + bon[1:] for bon in bons if bon.startswith('آ')]
        return y_forms + ya_forms

    def solve_alef_issue(self):
        """Add respelled variants for alef-initial stems and index them.

        For each of the four stem lists this:
          * builds variants where a leading 'ا' becomes 'ی' and a leading 'آ'
            becomes 'یا' (stored in ``self.*_start_with_alef``);
          * records ``variant -> original stem`` in ``self.alef_mapper``;
          * replaces 'آ' by 'ا' in every stem, appends the variants, and
            sorts the list longest-first so regex alternation tries longer
            stems before shorter ones.

        Finally, keys of ``verb_mapper`` and ``alef_mapper`` that contain 'آ'
        are aliased under their 'ا' spelling.
        """
        self.informal_past_start_with_alef = self._alef_variants(self.informal_past_bons)
        self.informal_present_start_with_alef = self._alef_variants(self.informal_present_bons)
        self.formal_past_start_with_alef = self._alef_variants(self.formal_past_bons)
        self.formal_present_start_with_alef = self._alef_variants(self.formal_present_bons)

        self.alef_mapper = {}
        all_variants = (
            self.informal_past_start_with_alef
            + self.informal_present_start_with_alef
            + self.formal_past_start_with_alef
            + self.formal_present_start_with_alef
        )
        for variant in all_variants:
            if variant[:2] == 'یا':
                origin = 'آ' + variant[2:]
            else:
                origin = 'ا' + variant[1:]
            self.alef_mapper[variant] = origin
        self.alef_mapper['یای'] = 'آی'

        def without_hat(words):
            return [word.replace('آ', 'ا') for word in words]

        def longest_first(words):
            # sorted() is stable, so equal-length stems keep their order.
            return sorted(words, key=lambda word: -len(word))

        self.formal_past_bons = longest_first(
            [w for w in without_hat(self.formal_past_bons + self.formal_past_start_with_alef) if w != ''])
        self.formal_present_bons = longest_first(
            without_hat(self.formal_present_bons + self.formal_present_start_with_alef) + ['یای', 'آی'])
        self.informal_past_bons = longest_first(
            [w for w in without_hat(self.informal_past_bons + self.informal_past_start_with_alef) if w != ''])
        self.informal_present_bons = longest_first(
            without_hat(self.informal_present_bons + self.informal_present_start_with_alef) + ['یای', 'آی'])

        # Make both lookup tables reachable through the hat-less spelling too.
        for mapper in (self.verb_mapper, self.alef_mapper):
            for key in [word for word in mapper if 'آ' in word]:
                mapper[key.replace('آ', 'ا')] = mapper[key]

    def compile_patterns(self):
        """Compile the eight token patterns (formal/informal x four shapes).

        Every pattern is anchored at the end with ``$`` and is used with
        ``match`` (so it is anchored at the start as well). The shapes are:
          * past:          (neg)? (me)? stem (ending) (object pronoun)?
          * present, b:    (b) stem (ending)? (object pronoun)?
          * present, n_me: (neg)? (me) stem (ending)? (object pronoun)?
          * present, n:    (neg)? stem (ending)? (object pronoun)?
        Stems are inserted into the alternation as-is (not regex-escaped).
        """
        # NOTE(review): both alternatives are identical as written; one of
        # them was presumably meant to carry a zero-width non-joiner - confirm.
        ME_r = '|'.join(['می','می'])
        B_r = 'ب'
        not_r = 'ن'
        past_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ید', 'ند', '', 'ن']
        present_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ن', 'ید', 'ند', 'د', '']
        naghli_ends = ['هام', 'های', 'ه', 'هایم', 'هاید', 'هاند']
        objective_pronouns = ['م', 'ت', 'ش', 'مون', 'تون', 'شون']

        verb_postfix_past_r = '|'.join(past_ends)
        verb_postfix__present_r = '|'.join(present_ends)
        objective_pronouns_r = '|'.join(objective_pronouns)
        naghli_ends_r = '|'.join(naghli_ends)

        # Design sketch of the tenses (kept from the original notes; only
        # the four shapes described in the docstring are actually compiled):
        #   simple past:          past + ending + object pronoun
        #   imperfect past:       (ME) + past + ending + object pronoun
        #   past continuous:      (DASHT) + past + ending + object pronoun
        #   perfect ("naghli"):   past + naghli ending + object pronoun
        #   past perfect:         past + ending + (BUD) + ending + object pronoun
        #   simple present:       present + ending
        #   imperfect present:    (ME) + present + ending + object pronoun
        #   present continuous:   (DAR) + ending + (ME) + present + ending + object pronoun
        #   simple future:        (KHAH) + ending + present + object pronoun
        #   subjunctive, past:    present + (h) + (BASH) + ending + object pronoun
        #   subjunctive, present: (b) + present + ending + object pronoun

        def compile_family(past_r, present_r):
            """Compile (past, present_b, present_n_me, present_n) for one register."""
            past = '({})?({})?({})({}|{})({})?$'.format(
                not_r, ME_r, past_r, naghli_ends_r, verb_postfix_past_r, objective_pronouns_r)
            present_b = '({})({})({})?({})?$'.format(
                B_r, present_r, verb_postfix__present_r, objective_pronouns_r)
            present_n_me = '({})?({})({})({})?({})?$'.format(
                not_r, ME_r, present_r, verb_postfix__present_r, objective_pronouns_r)
            present_n = '({})?({})({})?({})?$'.format(
                not_r, present_r, verb_postfix__present_r, objective_pronouns_r)
            return tuple(re.compile(p) for p in (past, present_b, present_n_me, present_n))

        (self.formal_past_verb_pattern,
         self.formal_present_verb_pattern_b,
         self.formal_present_verb_pattern_n_me,
         self.formal_present_verb_pattern_n) = compile_family(
            '|'.join(self.formal_past_bons), '|'.join(self.formal_present_bons))

        (self.informal_past_verb_pattern,
         self.informal_present_verb_pattern_b,
         self.informal_present_verb_pattern_n_me,
         self.informal_present_verb_pattern_n) = compile_family(
            '|'.join(self.informal_past_bons), '|'.join(self.informal_present_bons))

    def parse(self, token):
        """Split ``token`` into verb slots using the compiled patterns.

        The formal patterns are tried first, then the informal ones. Within
        each register the order is past, present-b, present-n_me, present-n
        and the first pattern that matches wins.

        Args:
            token: The word to analyse.

        Returns:
            list with zero, one or two dicts (formal parse first). Each dict
            maps the pattern's slot names (a subset of ``'neg'``, ``'me'``,
            ``'b'``, ``'root'``, ``'postfix'``, ``'op'``) to the captured
            text, with unmatched optional slots as ``''``, plus ``'tense'``
            set to ``'past'`` or ``'present'``.
        """
        registers = (
            (
                (self.formal_past_verb_pattern, self._PAST_KEYS, 'past'),
                (self.formal_present_verb_pattern_b, self._PRESENT_B_KEYS, 'present'),
                (self.formal_present_verb_pattern_n_me, self._PRESENT_N_ME_KEYS, 'present'),
                (self.formal_present_verb_pattern_n, self._PRESENT_N_KEYS, 'present'),
            ),
            (
                (self.informal_past_verb_pattern, self._PAST_KEYS, 'past'),
                (self.informal_present_verb_pattern_b, self._PRESENT_B_KEYS, 'present'),
                (self.informal_present_verb_pattern_n_me, self._PRESENT_N_ME_KEYS, 'present'),
                (self.informal_present_verb_pattern_n, self._PRESENT_N_KEYS, 'present'),
            ),
        )
        outputs = []
        for candidates in registers:
            for pattern, keys, tense in candidates:
                match = pattern.match(token)
                if match is None:
                    continue
                parsed = {
                    key: '' if value is None else value
                    for key, value in zip(keys, match.groups())
                }
                parsed['tense'] = tense
                outputs.append(parsed)
                break  # first matching shape wins for this register
        return outputs

    def formal_concatenate(self, match_dict, should_smooth):
        """Join the (already formalized) slots of a parse into one word.

        Args:
            match_dict: Parse dict that must contain ``'root'``, ``'neg'``,
                ``'prefix'``, ``'postfix'``, ``'op'`` and ``'tense'``. Its
                ``'root'`` entry may be modified in place.
            should_smooth: When true and the prefix does not start with 'م',
                the first character of the root is replaced by 'یا'.

        Returns:
            The assembled word, or ``None`` when ``if_simple_present`` or
            ``if_only_me`` rejects the parse.
        """
        out_dict = {'بیای': 'بیا', 'نیای': 'نیا'}
        if match_dict['root'] == 'است' and match_dict['neg'] != '':
            return 'نیست' + match_dict['postfix']
        if self.if_simple_present(match_dict) or self.if_only_me(match_dict):
            return None
        if should_smooth and not match_dict['prefix'].startswith('م'):
            match_dict['root'] = 'یا' + match_dict['root'][1:]
        # After the 'ب' prefix a leading alef of the root is written as 'ی'.
        if match_dict['prefix'] == 'ب' and match_dict['root'].startswith('ا'):
            match_dict['root'] = 'ی' + match_dict['root'][1:]
        out = (match_dict['neg'] + match_dict['prefix'] + match_dict['root']
               + match_dict['postfix'] + match_dict['op'])
        return out_dict.get(out, out)

    def _set_match_dict_prefix(self, match_dict):
        """Fill ``match_dict['prefix']`` from its ``'me'`` or ``'b'`` slot.

        A non-empty ``'me'`` takes precedence over ``'b'``; when neither is
        present and non-empty the prefix is ``''``. A ``'me'`` value shorter
        than three characters is overwritten with the literal below.

        Returns:
            The same ``match_dict`` object, modified in place.
        """
        match_dict['prefix'] = ''
        if match_dict.get('me', '') != '':
            if len(match_dict['me']) < 3:
                match_dict['me'] = 'می'
            match_dict['prefix'] = match_dict['me']
        elif match_dict.get('b', '') != '':
            match_dict['prefix'] = match_dict['b']
        return match_dict

    def if_simple_present(self, match_dict):
        """Return True for a bare present-tense parse of an ordinary stem.

        "Bare" means no prefix and no negation; the stems listed below are
        exempt and always yield False.
        """
        return (
            match_dict['root'] != ''
            and match_dict['tense'] == 'present'
            and match_dict['prefix'] == ''
            and match_dict['neg'] == ''
            and match_dict['root'] not in ['کن', 'هست', 'است', 'دار', 'نیست', 'باش']
        )

    def if_only_me(self, match_dict):
        """Return True for a present-tense parse with a 'م'-initial prefix and no ending."""
        return (
            match_dict['root'] != ''
            and match_dict['tense'] == 'present'
            and match_dict['prefix'].startswith('م')
            and match_dict['postfix'] == ''
        )

    def is_masdar(self, match_dict):
        """Return True when the parse looks like stem + 'ن'.

        The root must be in ``self.all_past_bons``, the ending must be 'ن',
        and there must be neither a 'me' prefix nor an object pronoun.
        Parses produced by patterns without a 'me' slot are treated as
        having no 'me' prefix (instead of raising ``KeyError``).
        """
        return (
            match_dict['root'] in self.all_past_bons
            and match_dict.get('me', '') == ''
            and match_dict['postfix'] == 'ن'
            and match_dict['op'] == ''
        )

    def informal_to_formal(self, token):
        """Return the formal spelling(s) of an informal verb token.

        Steps: look the token up in ``irregular_verbs``; rewrite it through
        ``init_mapper``; give up on tokens shorter than three characters;
        parse it; then, for every parse, map the stem, ending and object
        pronoun to their formal forms and concatenate.

        Args:
            token: The word to convert.

        Returns:
            list of candidate formal words, or ``None`` when nothing could be
            produced. When there is exactly one candidate and some parse
            satisfied ``is_masdar``, a second candidate is appended: the
            first one with its last two characters replaced by 'ن'.
        """
        if token in self.irregular_verbs:
            return [self.irregular_verbs[token]]
        token = self.init_mapper.get(token, token)
        if len(token) < 3:
            return None

        all_match_dicts = self.parse(token)

        # With both a formal and an informal parse, drop the informal one
        # when it is the same verb plus an object pronoun, or when it is one
        # specific hard-coded parse.
        if len(all_match_dicts) == 2:
            formal_parse, informal_parse = all_match_dicts
            same_verb_with_pronoun = (
                informal_parse['root'] in self.verb_mapper
                and self.verb_mapper[informal_parse['root']]['formal'] == formal_parse['root']
                and informal_parse['op'] != ''
            )
            if same_verb_with_pronoun or informal_parse == {
                    'b': 'ب', 'root': 'ر', 'postfix': 'د', 'op': '', 'tense': 'present'}:
                del all_match_dicts[1]

        outputs = []
        found_masdar = False
        # NOTE(review): should_smooth is never reset, so once one parse sets
        # it, every later parse of the same token is smoothed too - confirm
        # this carry-over is intended.
        should_smooth = False
        for match_dict in all_match_dicts:
            if self.is_masdar(match_dict):
                found_masdar = True

            root = match_dict['root']
            # An alef-initial root in a parse without a 'me' slot that has a
            # 'b' slot or is negated rejects the whole token.
            if (root != '' and root[0] == 'ا' and 'me' not in match_dict
                    and ('b' in match_dict or match_dict['neg'] == 'ن')):
                return None
            if root == '':
                continue

            objective_pr = match_dict['op']
            postfix = match_dict['postfix']
            if root in self.alef_mapper:
                should_smooth = True
                match_dict['root'] = self.alef_mapper[root]
            if match_dict['root'] in self.verb_mapper:
                match_dict['root'] = self.verb_mapper[match_dict['root']]['formal']
            if postfix in self.posfix_mapper:
                match_dict['postfix'] = self.posfix_mapper[postfix]
            if match_dict['postfix'] == 'د' and match_dict['tense'] == 'past':
                match_dict['postfix'] = 'ه'
            if objective_pr in self.objective_pr_mapper:
                match_dict['op'] = self.objective_pr_mapper[objective_pr]
            match_dict.setdefault('neg', '')
            match_dict = self._set_match_dict_prefix(match_dict)
            outputs.append(self.formal_concatenate(match_dict, should_smooth))

        candidates = [
            self.out_mapper.get(word, word) for word in outputs if word is not None
        ]
        if not candidates:
            return None
        if len(candidates) == 1 and found_masdar:
            candidates.append(candidates[0][:-2] + 'ن')
        return candidates