# persian_informal_translator / VerbHandler.py
# Author avatar caption: mohammadkrb's picture
# Commit message: init streamlit based app
# Commit: 6227608
import re
from enum import Enum
from hazm import Normalizer
import pandas as pd
class Formality(Enum):
    """Register a verb stem belongs to."""

    formal = 1
    informal = 2


class VerbTime(Enum):
    """Tense a verb stem is used for."""

    past = 1
    present = 2
    future = 3


class Person(Enum):
    """Grammatical person, named with transliterated Persian pronouns."""

    Man = 1
    To = 2
    An = 3
    Ma = 4
    Shoma = 5
    Anha = 6


class Number(Enum):
    """Grammatical number (transliterated Persian member names)."""

    Mofrad = 1
    Jam = 2
class Verb:
    """Plain record describing one analysed verb.

    The constructor stores each argument unchanged on the attribute of the
    same name; no validation or conversion is performed.
    """

    def __init__(self, root, formality, time, pp, person, number):
        # Stem-level description of the verb.
        self.root, self.formality, self.time = root, formality, time
        # Agreement features.
        self.person, self.number = person, number
        # Stored as given; its meaning is not visible in this file.
        self.pp = pp
class VerbHandler():
    """Parse Persian verb tokens and rewrite informal forms as formal ones.

    The handler loads verb stems (called "bons" in this file) from a CSV
    file, derives spelling variants for stems that start with alef, builds
    regular expressions for formal and informal past/present verb shapes and
    uses them to split a token into negation, prefix, root, ending and
    object-pronoun parts.

    String literals containing the invisible zero-width non-joiner are
    written with an explicit ``\\u200c`` escape.
    """

    def __init__(self, csv_verb_addr, csv_irregular_verbs_mapper):
        """Load the stem tables and prepare all lookup tables and patterns.

        Args:
            csv_verb_addr: path of the CSV read by ``load_bons``.
            csv_irregular_verbs_mapper: path of the two-column CSV read by
                ``load_irregular_mapper``.
        """
        # Informal ending -> formal ending.
        self.posfix_mapper = {'ه': 'د', 'ن': 'ند', 'ین': 'ید'}
        # Informal object pronoun -> formal object pronoun.
        self.objective_pr_mapper = {'شون': 'شان', 'تون': 'تان', 'مون': 'مان'}
        # Tokens rewritten (a leading 'ب' is added) before parsing.
        self.init_mapper = {'کنه': 'بکنه', 'کنم': 'بکنم', 'کنی': 'بکنی', 'کنیم': 'بکنیم', 'کنین': 'بکنین', 'کنید': 'بکنید', 'کنن': 'بکنن', 'کنند': 'بکنند'}
        # Final spelling corrections applied to produced formal verbs.
        self.out_mapper = {'می\u200cایی': 'می\u200cآیی'}
        self.init_mapper.update({'شم': 'بشم', 'شی': 'بشی', 'شن': 'بشن', 'شین': 'بشین', 'شه': 'بشه', 'شیم': 'بشیم'})
        self.bons = self.load_bons(csv_verb_addr)
        self.irregular_verbs = self.load_irregular_mapper(csv_irregular_verbs_mapper)
        self.informal_past_bons = self.get_bons(type=Formality.informal, time=VerbTime.past)
        self.informal_present_bons = self.get_bons(type=Formality.informal, time=VerbTime.present)
        self.formal_past_bons = self.get_bons(type=Formality.formal, time=VerbTime.past)
        self.formal_present_bons = self.get_bons(type=Formality.formal, time=VerbTime.present) + ['هست']
        # Snapshots taken before solve_alef_issue() adds spelling variants.
        self.all_past_bons = self.formal_past_bons + self.informal_past_bons
        self.all_present_bons = self.formal_present_bons + self.informal_present_bons
        # Informal stem -> {'formal': corresponding formal stem}.
        self.verb_mapper = {b: {'formal': self.bons[b]['formal']} for b in self.bons if self.bons[b]['type'] == Formality.informal}
        self.solve_alef_issue()
        self.compile_patterns()

    def load_irregular_mapper(self, csv_addr):
        """Read a two-column CSV and return ``{first column: second column}``.

        Each row is unpacked into exactly two values (informal, formal), so
        a CSV with a different number of columns raises ``ValueError``.
        """
        df = pd.read_csv(csv_addr)
        mapper = {informal: formal for _, (informal, formal) in df.iterrows()}
        return mapper

    def load_bons(self, csv_addr):
        """Read the verb CSV and return a dict describing every stem.

        Columns are addressed by position: 2 and 3 hold the formal past and
        present stems (normalised with hazm's ``Normalizer``); 10 and 11 hold
        whitespace-separated informal past and present stems, which are
        stored as they appear in the file.

        Returns:
            ``{stem: {'type': Formality, 'time': VerbTime}}``; informal
            entries additionally carry ``'formal'``, the (normalised) formal
            stem from the same row.
        """
        normalizer = Normalizer()
        df = pd.read_csv(csv_addr)
        df = df.fillna('')
        bons = {}
        for _, row in df.iterrows():
            # .iloc is explicit positional access; plain row[2] on a
            # string-labelled Series depends on a deprecated fallback.
            formal_past = row.iloc[2]
            formal_present = row.iloc[3]
            informal_past = row.iloc[10]
            informal_present = row.iloc[11]
            if formal_past:
                formal_past = normalizer.normalize(formal_past)
                bons[formal_past] = {'type': Formality.formal, 'time': VerbTime.past}
            if formal_present:
                formal_present = normalizer.normalize(formal_present)
                bons[formal_present] = {'type': Formality.formal, 'time': VerbTime.present}
            if informal_past:
                for b in informal_past.split():
                    bons[b] = {'type': Formality.informal, 'time': VerbTime.past, 'formal': formal_past}
            if informal_present:
                for b in informal_present.split():
                    bons[b] = {'type': Formality.informal, 'time': VerbTime.present, 'formal': formal_present}
        return bons

    def get_bons(self, type, time):
        """Return the stems in ``self.bons`` whose type and time both match."""
        return [b for b, info in self.bons.items() if info['type'] == type and info['time'] == time]

    def solve_alef_issue(self):
        """Add spelling variants for stems that start with alef.

        For every stem starting with 'ا' (but not 'ای') a variant starting
        with 'ی' is generated; for every stem starting with 'آ' a variant
        starting with 'یا' is generated.  ``self.alef_mapper`` maps each
        variant back to its original stem.  The four stem lists are then
        extended with the variants, have 'آ' replaced by 'ا', and are sorted
        longest-first.  Finally, ``verb_mapper`` and ``alef_mapper`` keys
        containing 'آ' get an alias spelled with 'ا'.
        """
        def y_variants(bons):
            # Variants for plain-alef stems first, then for 'آ' stems.
            from_alef = ['ی' + v[1:] for v in bons if v.startswith('ا') and not v.startswith('ای')]
            from_alef_madda = ['یا' + v[1:] for v in bons if v.startswith('آ')]
            return from_alef + from_alef_madda

        self.informal_past_start_with_alef = y_variants(self.informal_past_bons)
        self.informal_present_start_with_alef = y_variants(self.informal_present_bons)
        self.formal_past_start_with_alef = y_variants(self.formal_past_bons)
        self.formal_present_start_with_alef = y_variants(self.formal_present_bons)

        self.alef_mapper = {}
        all_variants = (self.informal_past_start_with_alef + self.informal_present_start_with_alef
                        + self.formal_past_start_with_alef + self.formal_present_start_with_alef)
        for verb in all_variants:
            if verb[:2] == 'یا':
                origin = 'آ' + verb[2:]
            else:
                origin = 'ا' + verb[1:]
            self.alef_mapper[verb] = origin
        self.alef_mapper['یای'] = 'آی'

        def without_madda(bons):
            return [w.replace('آ', 'ا') for w in bons]

        extra_present = ['یای', 'آی']
        formal_past = [w for w in without_madda(self.formal_past_bons + self.formal_past_start_with_alef) if w != '']
        formal_present = without_madda(self.formal_present_bons + self.formal_present_start_with_alef) + extra_present
        informal_past = [w for w in without_madda(self.informal_past_bons + self.informal_past_start_with_alef) if w != '']
        informal_present = without_madda(self.informal_present_bons + self.informal_present_start_with_alef) + extra_present

        # Longest stems first, so the regex alternation prefers them.
        self.formal_present_bons = sorted(formal_present, key=len, reverse=False and True or False) if False else sorted(formal_present, key=lambda w: -len(w))
        self.formal_past_bons = sorted(formal_past, key=lambda w: -len(w))
        self.informal_present_bons = sorted(informal_present, key=lambda w: -len(w))
        self.informal_past_bons = sorted(informal_past, key=lambda w: -len(w))

        # Keys are collected first because the dicts grow inside the loops.
        verb_v_keys = [word for word in self.verb_mapper if 'آ' in word]
        alef_verb_v_keys = [word for word in self.alef_mapper if 'آ' in word]
        for v in verb_v_keys:
            self.verb_mapper[v.replace('آ', 'ا')] = self.verb_mapper[v]
        for v in alef_verb_v_keys:
            self.alef_mapper[v.replace('آ', 'ا')] = self.alef_mapper[v]

    def compile_patterns(self):
        """Compile the eight verb regexes (formal/informal x four shapes).

        Shapes, with groups in this order:
            past:          neg? me? root (perfect-ending | past-ending) op?
            present "b":   b root ending? op?
            present "me":  neg? me root ending? op?
            present plain: neg? root ending? op?

        Tenses the original author listed as the design target (translated):
        simple past, imperfect past, past progressive, present perfect, past
        perfect, simple present, imperfect present, present progressive,
        simple future, past subjunctive, present subjunctive.  Only the four
        shapes above are actually compiled here.
        """
        ME_r = '|'.join(['می', 'می\u200c'])
        B_r = 'ب'
        not_r = 'ن'
        past_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ید', 'ند', '', 'ن']
        present_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ن', 'ید', 'ند', 'د', '']
        naghli_ends = ['ه\u200cام', 'ه\u200cای', 'ه', 'ه\u200cایم', 'ه\u200cاید', 'ه\u200cاند']
        objective_pronouns = ['م', 'ت', 'ش', 'مون', 'تون', 'شون']

        def stem_alternation(stems):
            # Stems come from a data file: match them literally.
            return '|'.join(re.escape(stem) for stem in stems)

        verb_postfix_past_r = '|'.join(past_ends)
        verb_postfix__present_r = '|'.join(present_ends)
        objective_pronouns_r = '|'.join(objective_pronouns)
        naghli_ends_r = '|'.join(naghli_ends)

        def build(past_r, present_r):
            """Return compiled (past, present_b, present_n_me, present_n)."""
            pattern_b = '({})({})({})?({})?$'.format(B_r, present_r, verb_postfix__present_r, objective_pronouns_r)
            pattern_n_me = '({})?({})({})({})?({})?$'.format(not_r, ME_r, present_r, verb_postfix__present_r, objective_pronouns_r)
            pattern_n = '({})?({})({})?({})?$'.format(not_r, present_r, verb_postfix__present_r, objective_pronouns_r)
            pattern_past = '({})?({})?({})({}|{})({})?$'.format(not_r, ME_r, past_r, naghli_ends_r,
                                                                 verb_postfix_past_r, objective_pronouns_r)
            return (re.compile(pattern_past), re.compile(pattern_b),
                    re.compile(pattern_n_me), re.compile(pattern_n))

        # Original note (translated): formal verbs may carry an informal
        # suffix; simple present is kept for some verbs (hast, kon).
        (self.formal_past_verb_pattern,
         self.formal_present_verb_pattern_b,
         self.formal_present_verb_pattern_n_me,
         self.formal_present_verb_pattern_n) = build(
            stem_alternation(self.formal_past_bons), stem_alternation(self.formal_present_bons))

        (self.informal_past_verb_pattern,
         self.informal_present_verb_pattern_b,
         self.informal_present_verb_pattern_n_me,
         self.informal_present_verb_pattern_n) = build(
            stem_alternation(self.informal_past_bons), stem_alternation(self.informal_present_bons))

    def parse(self, token):
        """Split ``token`` into verb parts using the compiled patterns.

        For the formal and then the informal register, the first matching
        shape (past, present-b, present-me, present-plain) is used.

        Returns:
            A list with zero, one or two dicts (formal parse first).  Each
            dict holds ``'tense'`` ('past' or 'present') plus one key per
            regex group of the matching shape; unmatched optional groups are
            stored as ``''``.  Keys of other shapes are deliberately absent
            (e.g. a present-b parse has no ``'me'`` or ``'neg'`` key).
        """
        past_keys = ['neg', 'me', 'root', 'postfix', 'op']
        b_keys = ['b', 'root', 'postfix', 'op']
        n_me_keys = ['neg', 'me', 'root', 'postfix', 'op']
        n_keys = ['neg', 'root', 'postfix', 'op']
        registers = (
            (
                (self.formal_past_verb_pattern, past_keys, 'past'),
                (self.formal_present_verb_pattern_b, b_keys, 'present'),
                (self.formal_present_verb_pattern_n_me, n_me_keys, 'present'),
                (self.formal_present_verb_pattern_n, n_keys, 'present'),
            ),
            (
                (self.informal_past_verb_pattern, past_keys, 'past'),
                (self.informal_present_verb_pattern_b, b_keys, 'present'),
                (self.informal_present_verb_pattern_n_me, n_me_keys, 'present'),
                (self.informal_present_verb_pattern_n, n_keys, 'present'),
            ),
        )
        outputs = []
        for shapes in registers:
            for pattern, keys, tense in shapes:
                match = pattern.match(token)
                if match is None:
                    continue
                match_dict = {key: ('' if value is None else value)
                              for key, value in zip(keys, match.groups())}
                match_dict['tense'] = tense
                outputs.append(match_dict)
                break  # only the first matching shape per register is used
        return outputs

    def formal_concatenate(self, match_dict, should_smooth):
        """Assemble the formal verb string from a processed parse.

        ``match_dict`` must already contain ``'prefix'`` and ``'neg'``; its
        ``'root'`` may be modified in place.

        Args:
            match_dict: parse dict prepared by ``informal_to_formal``.
            should_smooth: True when the root was restored through
                ``alef_mapper``; unless the prefix starts with 'م', the
                first root letter is then replaced by 'یا'.

        Returns:
            The formal verb, or ``None`` when the parse is a bare simple
            present or a 'می'-prefixed present without an ending.
        """
        out_dict = {'بیای': 'بیا', 'نیای': 'نیا'}
        if match_dict['root'] == 'است' and match_dict['neg'] != '':
            return 'نیست' + match_dict['postfix']
        if self.if_simple_present(match_dict) or self.if_only_me(match_dict):
            return None
        prefix = match_dict['prefix']
        starts_with_me = prefix != '' and prefix[0] == 'م'
        if should_smooth and not starts_with_me:
            match_dict['root'] = 'یا' + match_dict['root'][1:]
        # After 'ب' a root-initial alef is written as 'ی'.
        if prefix == 'ب' and match_dict['root'] and match_dict['root'][0] == 'ا':
            match_dict['root'] = 'ی' + match_dict['root'][1:]
        out = match_dict['neg'] + prefix + match_dict['root'] + match_dict['postfix'] + match_dict['op']
        return out_dict.get(out, out)

    def _set_match_dict_prefix(self, match_dict):
        """Set ``match_dict['prefix']`` from its 'me' or 'b' part.

        A 'me' part shorter than three characters (no ZWNJ) is first
        replaced by the ZWNJ-terminated spelling.  Returns ``match_dict``.
        """
        match_dict['prefix'] = ''
        if match_dict.get('me', '') != '':
            if len(match_dict['me']) < 3:
                match_dict['me'] = 'می\u200c'
            match_dict['prefix'] = match_dict['me']
        elif match_dict.get('b', '') != '':
            match_dict['prefix'] = match_dict['b']
        return match_dict

    def if_simple_present(self, match_dict):
        """Return True for an un-prefixed, un-negated present parse whose
        root is not one of the listed exception roots."""
        is_bare_present = (match_dict['root'] != '' and match_dict['tense'] == 'present'
                           and match_dict['prefix'] == '' and match_dict['neg'] == '')
        if not is_bare_present:
            return False
        return match_dict['root'] not in ['کن', 'هست', 'است', 'دار', 'نیست', 'باش']

    def if_only_me(self, match_dict):
        """Return True for a present parse whose prefix starts with 'م' and
        that has no ending."""
        return (match_dict['root'] != '' and match_dict['tense'] == 'present'
                and match_dict['prefix'] != '' and match_dict['prefix'][0] == 'م'
                and match_dict['postfix'] == '')

    def is_masdar(self, match_dict):
        """Return True when the parse is a known past stem followed by 'ن'
        with no 'me'/'b' prefix and no object pronoun.

        Parses from the present "b" and plain shapes have no ``'me'`` key;
        a missing 'me' or 'b' key is treated as empty instead of raising.
        """
        return (match_dict['root'] in self.all_past_bons
                and match_dict.get('me', '') == ''
                and match_dict.get('b', '') == ''
                and match_dict['postfix'] == 'ن'
                and match_dict['op'] == '')

    def informal_to_formal(self, token):
        """Return the formal candidates for an informal verb token.

        Returns:
            A list of formal strings, or ``None`` when the token is shorter
            than three characters, cannot be parsed, or is rejected.  When
            exactly one candidate is produced and a parse looked like a past
            stem + 'ن', a second candidate is appended: the first one with
            its last two characters replaced by 'ن'.
        """
        # Irregular verbs are answered straight from the lookup table.
        if token in self.irregular_verbs:
            return [self.irregular_verbs[token]]
        token = self.init_mapper.get(token, token)
        if len(token) < 3:
            return None
        all_match_dicts = self.parse(token)
        # Drop an informal parse that merely duplicates the formal one
        # (original examples: 'بدهدم' and 'برد').
        if len(all_match_dicts) == 2:
            formal_parse, informal_parse = all_match_dicts
            informal_root = informal_parse['root']
            if (informal_root in self.verb_mapper
                    and self.verb_mapper[informal_root]['formal'] == formal_parse['root']
                    and informal_parse['op'] != ''):
                del all_match_dicts[1]
            elif informal_parse == {'b': 'ب', 'root': 'ر', 'postfix': 'د', 'op': '', 'tense': 'present'}:
                del all_match_dicts[1]

        outputs = []
        is_masdar = False
        for match_dict in all_match_dicts:
            if self.is_masdar(match_dict):
                is_masdar = True
            root = match_dict['root']
            # Reject 'ب'/'ن' + alef-initial root without 'می'
            # (original examples: 'نان', 'بان').
            if root != '' and root[0] == 'ا' and 'me' not in match_dict and ('b' in match_dict or match_dict['neg'] == 'ن'):
                return None
            if root == '':
                continue
            objective_pr = match_dict['op']
            postfix = match_dict['postfix']
            # Decided per parse, so one alef-mapped root cannot leak its
            # smoothing onto the other parse of the same token.
            should_smooth = False
            if root in self.alef_mapper:
                should_smooth = True
                match_dict['root'] = self.alef_mapper[root]
            if match_dict['root'] in self.verb_mapper:
                match_dict['root'] = self.verb_mapper[match_dict['root']]['formal']
            if postfix in self.posfix_mapper:
                match_dict['postfix'] = self.posfix_mapper[postfix]
                # In the past tense an 'ه' ending stays 'ه'.
                if match_dict['postfix'] == 'د' and match_dict['tense'] == 'past':
                    match_dict['postfix'] = 'ه'
            if objective_pr in self.objective_pr_mapper:
                match_dict['op'] = self.objective_pr_mapper[objective_pr]
            match_dict.setdefault('neg', '')
            match_dict = self._set_match_dict_prefix(match_dict)
            outputs.append(self.formal_concatenate(match_dict, should_smooth))

        not_none_outpts = [self.out_mapper.get(o, o) for o in outputs if o is not None]
        if not not_none_outpts:
            return None
        if len(not_none_outpts) == 1 and is_masdar:
            not_none_outpts.append(not_none_outpts[0][:-2] + 'ن')
        return not_none_outpts