mohammadkrb
committed on
Commit • 6227608
1 Parent(s): 3fd00d4
init streamlit based app
- OneShotTransformer.py +600 -0
- VerbHandler.py +350 -0
- app.py +128 -0
- config.yml +5 -0
- download_utils.py +65 -0
- formality_transformer.py +40 -0
- kenlm_wrapper.py +31 -0
- main.py +96 -0
- requirements.txt +7 -0
- tokenizer.py +184 -0
- utils.py +302 -0
OneShotTransformer.py
ADDED
@@ -0,0 +1,600 @@
import re
import itertools
import string
import utils


class InformalWord:
    def __init__(self, lemma, prefixs=None, postfixs=None, pos=None, append_h=False):
        if prefixs is None:
            prefixs = []
        if postfixs is None:
            postfixs = []
        self.is_verb = False
        self.is_mapper = False
        self.semi_mapper = False
        self.append_h = append_h
        self.lemma = lemma
        self.prefixs = prefixs
        self.postfixs = postfixs
        self.pos = pos


class Prefix:
    def __init__(self, word, level, formal=None, ignore_poses=None, poses=None, non_connecting_chars=None, connector='nim'):
        if non_connecting_chars is None:
            non_connecting_chars = []
        self.word = word
        self.level = level
        self.ignore_poses = ignore_poses
        self.poses = poses
        self.connector = connector
        if formal is None:
            self.formal = word
        else:
            self.formal = formal
        self.non_connecting_chars = non_connecting_chars


class Postfix:
    def __init__(self, word, level, formal=None, ignore_poses=None, non_connecting_chars=None, poses=None, connector='nim'):
        if non_connecting_chars is None:
            non_connecting_chars = []
        self.word = word
        self.level = level
        self.ignore_poses = ignore_poses
        self.poses = poses
        self.connector = connector
        if formal is None:
            self.formal = word
        else:
            self.formal = formal
        self.non_connecting_chars = non_connecting_chars


class OneShotTransformer:

    NIM_FASELE = chr(8204)
    # prefixs
    HAMUN = Prefix('همون', 1, 'همان', connector='fasele', non_connecting_chars=['ه'])
    HAMIN = Prefix('همین', 1, connector='fasele')
    HAR = Prefix('هر', 1, connector='fasele')
    UN = Prefix('اون', 1, 'آن', connector='fasele', non_connecting_chars=['ه'])
    IN = Prefix('این', 1, connector='fasele', non_connecting_chars=['ه'])
    HICH = Prefix('هیچ', 1, connector='nim', non_connecting_chars=['ه', 'ا', 'آ'])
    B = Prefix('ب', 1, 'به', ignore_poses=['VERB', 'CCONJ', 'SCONJ'], connector='fasele', non_connecting_chars=['ا', 'ه', 'آ'])
    Y = Prefix('ی', 1, 'یک', ignore_poses=['VERB', 'CCONJ', 'SCONJ'], connector='fasele', non_connecting_chars=['ا', 'آ'])
    BI = Prefix('بی', 1, ignore_poses=['VERB'], connector='nim', non_connecting_chars=['ا'])
    POR = Prefix('پر', 1, ignore_poses=['VERB'], connector='nim')
    pres = [[HAMIN, HAMUN, UN, IN, HAMIN, BI, B, Y, POR, HAR]]
    # postfixs
    Y1 = Postfix('ی', 0, ignore_poses=['VERB'], connector='none', non_connecting_chars=['ی', 'ا', 'و', 'آ', 'اً'])
    TAR = Postfix('تر', 1, connector='nim')
    TARIN = Postfix('ترین', 1, connector='nim')
    HAY = Postfix('های', 2, connector='nim')
    HA = Postfix('ها', 2, connector='nim')
    A = Postfix('ا', 2, 'ها', ignore_poses=['VERB'], connector='nim', non_connecting_chars=['ا', 'و', 'آ', 'اً'])
    A1 = Postfix('ای', 2, 'های', ignore_poses=['VERB'], connector='nim', non_connecting_chars=['ا', 'و', 'آ', 'اً'])
    YY = Postfix('یی', 3, 'یی', ignore_poses=['VERB'], connector='none')
    M = Postfix('م', 3, ignore_poses=['VERB'], connector='none')
    M_MAN = Postfix('م', 3, 'من', ignore_poses=['VERB'], connector='fasele')
    T = Postfix('ت', 3, connector='none')
    T1 = Postfix('ت', 3, 'تو', connector='fasele')
    # T2 = Postfix('ت', 3, 'خود', ignore_poses=['VERB'], connector='fasele')
    SH = Postfix('ش', 3, connector='none')
    # SH1 = Postfix('ش', 3, 'خود', connector='fasele')
    # SH2 = Postfix('ش', 3, 'آن', connector='fasele')
    # SH3 = Postfix('ش', 3, 'او', connector='fasele')
    MAN = Postfix('مان', 3, connector='nim')
    MAN1 = Postfix('مان', 3, 'ما', connector='fasele')
    # MAN2 = Postfix('مان', 3, 'خود', connector='fasele')
    MUN = Postfix('مون', 3, 'مان', connector='nim')
    # MUN1 = Postfix('مون', 3, 'خود', connector='fasele')
    MUN2 = Postfix('مون', 3, 'ما', connector='fasele')
    TAN = Postfix('تان', 3, connector='nim')
    # TAN1 = Postfix('تان', 3, 'خود', connector='fasele')
    TAN2 = Postfix('تان', 3, 'شما', connector='fasele')
    TUN = Postfix('تون', 3, 'تان', connector='nim')
    # TUN1 = Postfix('تون', 3, 'خود', connector='fasele')
    TUN2 = Postfix('تون', 3, 'شما', connector='fasele')
    SHAN = Postfix('شان', 3, connector='nim')
    # SHAN1 = Postfix('شان', 3, 'خود', connector='fasele')
    SHAN2 = Postfix('شان', 3, 'آنان', connector='fasele')
    SHUN = Postfix('شون', 3, 'شان', connector='nim')
    # SHUN1 = Postfix('شون', 3, 'خود', connector='fasele')
    SHUN2 = Postfix('شون', 3, 'آنان', connector='fasele')
    N = Postfix('ن', 4, 'هستند', ignore_poses=['VERB', 'CCONJ', 'SCONJ'], connector='fasele', non_connecting_chars=['ی'])
    SHAM = Postfix('شم', 4, 'بشوم', ignore_poses=['VERB'], connector='fasele')
    SHI = Postfix('شی', 4, 'بشوی', ignore_poses=['VERB'], connector='fasele')
    SHE = Postfix('شه', 4, 'شود', ignore_poses=['VERB'], connector='fasele')
    SHIN = Postfix('شین', 4, 'شوید', ignore_poses=['VERB'], connector='fasele')
    SHID = Postfix('شید', 4, 'شوید', ignore_poses=['VERB'], connector='fasele')
    SHAAN = Postfix('شن', 4, 'شوند', ignore_poses=['VERB'], connector='fasele')
    SHAND = Postfix('شند', 4, 'شوند', ignore_poses=['VERB'], connector='fasele')
    M2 = Postfix('م', 4, 'هم', ignore_poses=['VERB'], connector='fasele')
    V = Postfix('و', 4, 'را', connector='fasele', non_connecting_chars=['ا', 'ای', 'آ', 'اً'])
    V1 = Postfix('رو', 4, 'را', connector='fasele')
    H = Postfix('ه', 4, '', ignore_poses=['VERB', 'CCONJ', 'SCONJ'], connector='none')
    # H2 = Postfix('ه', 4)
    M1 = Postfix('م', 4, 'هستم', ignore_poses=['VERB'], connector='fasele')
    Y2 = Postfix('ی', 4, 'ی', ignore_poses=['VERB'], connector='none')
    H1 = Postfix('ه', 4, 'است', ignore_poses=['VERB'], connector='fasele', non_connecting_chars=['ا', 'آ', 'اً'])
    S = Postfix('س', 4, 'است', connector='fasele')
    ST = Postfix('ست', 4, 'است', connector='fasele')
    ED = Postfix('ید', 4, 'هستید', ignore_poses=['VERB'], connector='fasele')
    EN = Postfix('ین', 4, 'هستید', ignore_poses=['VERB'], connector='fasele', non_connecting_chars=['تر'])
    EM = Postfix('یم', 4, 'هستیم', ignore_poses=['VERB'], connector='fasele')
    ND = Postfix('ند', 4, 'هستند', ignore_poses=['VERB'], connector='fasele')
    # posts = [[Y1], [TAR, TARIN], [HA, HAY, A, A1], [M, T, SH, MAN, MUN, TAN, TUN, SHAN, SHUN], [N, S, ST, M1, M2, V, V1, Y2, H, H1, ED, EN, EM, ND, SHAM, SHI, SHID, SHE, SHAND, SHIN, SHAAN]]
    # posts = [[Y1], [TAR, TARIN], [HA, HAY, A, A1], [YY, M, M_MAN, T, T1, T2, SH, MAN, MAN1, MAN2, MUN, MUN1, MUN2, TAN, TAN1, TAN2, TUN, TUN1, TUN2, SHAN, SHAN1, SHAN2, SHUN, SHUN1, SHUN2], [N, S, ST, M1, M2, V, V1, Y2, H1, ED, EN, EM, ND, SHAM, SHI, SHID, SHE, SHAND, SHIN, SHAAN]]
    posts = [[Y1], [TAR, TARIN], [HA, HAY, A, A1], [YY, M, M_MAN, T, T1, SH, MAN, MAN1, MUN, MUN2, TAN, TAN2, TUN, TUN2, SHAN, SHAN2, SHUN, SHUN2], [N, S, ST, M1, M2, V, V1, Y2, H1, ED, EN, EM, ND, SHAM, SHI, SHID, SHE, SHAND, SHIN, SHAAN]]
    PossessiveـPronouns = [M, T, SH, MAN, MUN, TAN, TUN, SHAN, SHUN]
    cant_append_h_posts = [Y1, TAR, TARIN]
    As = [A, A1]

    def get_separator(self, w1, w2, append_h):
        connector_2_str = {'none': '', 'nim': OneShotTransformer.NIM_FASELE, 'fasele': ' '}
        not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
        # if w2 == OneShotTransformer.Y2:
        #     return ''
        # if w2 in [OneShotTransformer.M, OneShotTransformer.T, OneShotTransformer.SH] and ( type(w1) == str and w1[-1] in ['ا', 'و']):
        #     return 'ی'
        # if type(w1) != str and w1.level == 1:
        #     return ' '
        # not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
        # if w1 in [OneShotTransformer.Y, OneShotTransformer.B, OneShotTransformer.HAMIN, OneShotTransformer.IN, OneShotTransformer.HAMUN] or w2 in [OneShotTransformer.ED, OneShotTransformer.EN, OneShotTransformer.EM, OneShotTransformer.ND, OneShotTransformer.H1, OneShotTransformer.M1, OneShotTransformer.S, OneShotTransformer.ST, OneShotTransformer.V, OneShotTransformer.N, OneShotTransformer.M2]:
        #     return ' '
        #
        # if ((type(w1) == str and len(w1)> 0 and w1[-1] in ['ا', 'و']) or (type(w1) != str and w1.formal[-1] in [ 'ا', 'و'])) and w2.level == 3:
        #     return 'ی' + ''
        # if (type(w1) == str and len(w1)> 0 and w1[-1] in not_connect_chars) or (type(w1) != str and w1.word[-1] in not_connect_chars):
        #     return ''
        all_pres = [p for pres in OneShotTransformer.pres for p in pres]
        all_posts = [p for posts in OneShotTransformer.posts for p in posts]
        if type(w1) == str:
            last_ch = w1[-1]
        else:
            last_ch = w1.word[-1]
        separator = ''
        extra_sep = ''
        if type(w1) == str and append_h and w2 in [OneShotTransformer.M, OneShotTransformer.T, OneShotTransformer.SH]:
            extra_sep = OneShotTransformer.NIM_FASELE + 'ا'
        if w2 in [OneShotTransformer.M, OneShotTransformer.T, OneShotTransformer.SH, OneShotTransformer.MAN, OneShotTransformer.MUN, OneShotTransformer.TAN, OneShotTransformer.TUN, OneShotTransformer.SHAN, OneShotTransformer.SHUN] and (last_ch in ['ا', 'و']):
            extra_sep = 'ی'
        if w1 in all_pres:
            separator = connector_2_str[w1.connector]
        if w2 in all_posts:
            separator = connector_2_str[w2.connector]

        # replace nim_fasele with '' for non connected words

        if last_ch in not_connect_chars and separator == OneShotTransformer.NIM_FASELE:
            separator = ''
        return extra_sep + separator

    def lemma_to_formals(self, iword):
        out_iwords = [iword]
        if iword.lemma in self.mapper and self.iword2str(iword) != self.mapper[iword.lemma]:
            for map_words in self.mapper[iword.lemma]:
                new_iw = InformalWord(lemma=map_words, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos, append_h=iword.append_h)
                if not iword.prefixs and not iword.postfixs:
                    new_iw.is_mapper = True
                    new_iw.semi_mapper = True
                else:
                    new_iw.semi_mapper = True
                out_iwords.append(new_iw)
        formal_verbs = self.verb_to_formal_func(iword.lemma)
        if formal_verbs is not None:
            for f_v in formal_verbs:
                new_iw = InformalWord(lemma=f_v, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos, append_h=iword.append_h)
                new_iw.is_verb = True
                out_iwords.append(new_iw)
        return out_iwords

    def should_ignore_by_postagg(self, iword):
        post_pres = [pre for pre in iword.prefixs] + [post for post in iword.postfixs]
        for p in post_pres:
            if (p.ignore_poses and iword.pos in p.ignore_poses) or (p.poses and iword.pos not in p.poses):
                return True
        return False

    def filtered_based_on_rules(self, iword):
        # YY
        ha_p = [OneShotTransformer.A, OneShotTransformer.HA]
        if iword.postfixs and OneShotTransformer.YY in iword.postfixs and not all(p in ha_p + [OneShotTransformer.YY] for p in iword.postfixs):
            return True
        # hasti!
        if (iword.postfixs and len(iword.postfixs) == 1 and OneShotTransformer.Y2 in iword.postfixs and iword.lemma and iword.lemma[-1] in ['و', 'ا']) or (iword.postfixs and len(iword.postfixs) == 2 and OneShotTransformer.Y2 in iword.postfixs and iword.postfixs[0] in [OneShotTransformer.A, OneShotTransformer.HA]):
            return True
        # non connecting chars
        if iword.prefixs:
            last_pre = iword.prefixs[-1]
            if last_pre.non_connecting_chars and iword.lemma and any(iword.lemma.startswith(ch) for ch in last_pre.non_connecting_chars):
                return True
        if iword.postfixs:
            first_post = iword.postfixs[0]
            if first_post.non_connecting_chars and iword.lemma and any(iword.lemma.endswith(ch) for ch in first_post.non_connecting_chars):
                return True
        # hidden H # goshnashe
        if not iword.semi_mapper and not iword.append_h and iword.lemma and iword.lemma[-1] == 'ه' and iword.postfixs and iword.lemma not in self.non_hidden_h_words:
            return True
        # h + h
        if iword.prefixs and iword.postfixs and len(iword.lemma) < 2:
            return True
        # خونهه - خونششونه
        if iword.append_h and (OneShotTransformer.H in iword.postfixs or (len(iword.postfixs) == 1 and OneShotTransformer.H1 in iword.postfixs)):
            return True
        if iword.prefixs and (OneShotTransformer.B in iword.prefixs or OneShotTransformer.Y in iword.prefixs) and (iword.lemma and iword.lemma[0] in ['ا', 'ی', 'و']):
            return True
        if iword.lemma in self.isolated_words and (iword.prefixs or iword.postfixs):
            return True
        # verb + postfixs ex: برنامه
        if (iword.is_verb and iword.prefixs) or (iword.is_verb and iword.postfixs and (len(iword.postfixs) > 1 or not any(p in iword.postfixs for p in OneShotTransformer.PossessiveـPronouns + [OneShotTransformer.V]))):
            return True
        return False

    def iword2str(self, iword):
        sorted_prefixs = list(sorted(iword.prefixs, key=lambda prefix: prefix.level))
        sorted_postfixs = list(sorted(iword.postfixs, key=lambda postfix: postfix.level))
        concated_str = ''
        zipped_prefixs = [(sorted_prefixs[i], sorted_prefixs[i + 1]) if i < len(sorted_prefixs) - 1 else (
            sorted_prefixs[i], iword.lemma) for i in range(len(sorted_prefixs))]
        for prev_prefix, prefix in zipped_prefixs:
            separator = self.get_separator(prev_prefix, prefix, append_h=False)
            prefix_formal = prev_prefix.formal
            concated_str += prefix_formal
            concated_str += separator

        concated_str += iword.lemma

        zipped_postfix = [(sorted_postfixs[i - 1], sorted_postfixs[i]) if i > 0 else (iword.lemma, sorted_postfixs[i])
                          for i in range(len(sorted_postfixs))]
        for postfix, next_postfix in zipped_postfix:
            separator = self.get_separator(postfix, next_postfix, append_h=iword.append_h)
            concated_str += separator
            postfix_formal = next_postfix.formal
            concated_str += postfix_formal
        return concated_str

    def to_formals(self, iword):
        str_iwords = []
        all_iwords = self.lemma_to_formals(iword)
        for iword in all_iwords:
            # if iword.lemma == 'اون':
            #     print('')
            if len(iword.lemma) == 1 and iword.lemma != 'و':
                str_iwords.append(('', None))
                continue
            if self.filtered_based_on_rules(iword):
                str_iwords.append(('', None))
                continue
            if self.should_ignore_by_postagg(iword):
                str_iwords.append(('', None))
                continue
            if not iword.is_verb and not iword.semi_mapper and iword.lemma not in self.vocab:
                str_iwords.append(('', None))
                continue
            concated_str = self.iword2str(iword)
            str_iwords.append((concated_str, iword))
        return str_iwords

    def un_in(self, iword):
        new_lemma = iword.lemma.replace('ون', 'ان')
        if new_lemma != iword.lemma:
            return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
        else:
            return False

    def prefix_obj(self, word):
        op_separete = {'م': 'من', 'ت': 'تو', 'ش': 'آن', 'تان': 'شما', 'تون': 'شما', 'شون': 'آنان', 'شان': 'آنان',
                       'مان': 'ما', 'مون': 'ما'}
        candidates = []
        formal = ''
        m = self.pre_obj_pattern.match(word)
        if m:
            tokens = m.groups()
            if tokens[0] == 'باها':
                formal += 'با'
            else:
                formal += tokens[0]
            formal_obj = op_separete[tokens[1]]
            formal += ' '
            formal += formal_obj
            if tokens[2] is not None:
                formal += ' '
                formal += 'هم'
            alts = {'هم': 'هستم', 'آن': 'او'}
            tokens = [[w] for w in formal.split()]
            for t in tokens:
                if t[0] in alts:
                    t.append(alts[t[0]])

            candidates = itertools.product(*tokens)
            candidates = [' '.join(cnd) for cnd in candidates]

        return [(c, c) for c in candidates]

    def append_tanvin_hat(self, iword):
        if len(iword.lemma) > 1 and iword.lemma[0] == 'ا' and iword.lemma[-1] != 'ا':
            new_lemma = 'آ' + iword.lemma[1:]
            return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
        if len(iword.lemma) > 1 and iword.lemma[-1] == 'ا':
            new_lemma = iword.lemma[:-1] + 'اً'
            return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
        return False

    def append_h(self, iword):
        not_apply = self.verb_to_formal_func(iword.lemma) or (iword.lemma and iword.lemma[-1] in ['ا', 'و', 'ی']) or len(iword.lemma) <= 1 or iword.lemma == '' or iword.lemma[-1] == 'ه' or (OneShotTransformer.H in iword.postfixs and len(iword.postfixs) == 1) or any(p in iword.postfixs for p in OneShotTransformer.As) or (OneShotTransformer.V in iword.postfixs) or (iword.postfixs and iword.postfixs[0].word[0] in ['ی', 'و', 'ا'])
        ######## when add h?
        new_lemma = iword.lemma + 'ه'
        ############# new_lemma in self.vocab
        if len(iword.postfixs) > 0 and not any([p in OneShotTransformer.cant_append_h_posts for p in iword.postfixs]) and not not_apply and new_lemma not in self.non_hidden_h_words:
            # if len(iword.postfixs) > 0 and not not_apply and new_lemma in self.vocab and new_lemma not in self.non_hidden_h_words:
            return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos, append_h=True)
        return False

    def __init__(self, vocab, mapper, verb_to_formal_func, ignore_words, postfix_mapper, isolated_words, non_hidden_h_words):
        self.vocab = vocab
        self.mapper = mapper
        self.verb_to_formal_func = verb_to_formal_func
        self.ignore_words = ignore_words
        self.postfix_mapper = postfix_mapper
        self.isolated_words = isolated_words
        self.non_hidden_h_words = non_hidden_h_words
        self.operators = [self.un_in, self.append_h, self.append_tanvin_hat]
        patt = r'(از|به|باها)(مان|شون|شان|مون|م|تون|تان|ت|ش)(م)?$'
        self.pre_obj_pattern = re.compile(patt)

    def all_sequence_of_postfixs(self, word, index):
        all_seqs = []
        for p in OneShotTransformer.posts[index]:
            p_w = p.word
            if word.startswith(p_w):
                w = word[len(p_w):]
                if len(w) == 0:
                    all_seqs.append(p)
                else:
                    if index < len(OneShotTransformer.posts) - 1:
                        resp = self.all_sequence_of_postfixs(w, index + 1)
                        if len(resp) > 0:
                            for item in resp:
                                if type(item) == list:
                                    item.append(p)
                                    sequence_with_p = item
                                else:
                                    sequence_with_p = [p, item]
                                all_seqs.append(sequence_with_p)
        if index < len(OneShotTransformer.posts) - 1:
            resp = self.all_sequence_of_postfixs(word, index + 1)
            all_seqs.extend(resp)
        else:
            return all_seqs
        return all_seqs

    def combine(self, l1, l2):
        if len(l1) == 0:
            return l2
        elif len(l2) == 0:
            return l1
        return list(itertools.product(l1, l2))

    def get_expand(self, iword):
        all_possible_words = []
        for subset_operators in utils.powerset(self.operators):
            new_iword = InformalWord(lemma=iword.lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
            for so in subset_operators:
                so_resp = so(new_iword)
                if so_resp:
                    new_iword = so_resp
            all_possible_words.append(new_iword)
        return all_possible_words

    def match_postfixs(self, word, pos):
        possible_combinatios = []
        for i in range(len(OneShotTransformer.posts)):
            for p in OneShotTransformer.posts[i]:
                p_word = p.word
                p_indxs = [indx for indx, ch in enumerate(word) if word[indx:indx + len(p_word)] == p_word]
                for p_indx in p_indxs:
                    if p_indx != -1:
                        lemma = word[:p_indx]
                        pp = word[p_indx + len(p_word):]
                        if len(pp) == 0:
                            iw = InformalWord(lemma=lemma, postfixs=[p], pos=pos)
                            possible_combinatios.append(iw)
                            continue
                        if i < len(OneShotTransformer.posts) - 1:
                            all_postfix = self.all_sequence_of_postfixs(pp, index=i + 1)
                            if len(all_postfix) > 0:
                                for pfixs in all_postfix:
                                    if type(pfixs) == list:
                                        pfixs.append(p)
                                    else:
                                        pfixs = [p, pfixs]
                                    iw = InformalWord(lemma=lemma, postfixs=pfixs, pos=pos)
                                    possible_combinatios.append(iw)
                        elif len(pp) == 0:
                            iw = InformalWord(lemma=lemma, postfixs=[p], pos=pos)
                            possible_combinatios.append(iw)

        return possible_combinatios

    def match_prefixs(self, word, pos):
        possible_combinatios = []
        for i in range(len(OneShotTransformer.pres)):
            for p in OneShotTransformer.pres[i]:
                if word.startswith(p.word):
                    lemma = word[len(p.word):]
                    prefixs = [p]
                    iw = InformalWord(lemma=lemma, prefixs=prefixs, postfixs=[], pos=pos)
                    possible_combinatios.append(iw)
        return possible_combinatios
        return []

    def parse_word(self, iword):
        parsed_resp = []
        prefixed_word = self.match_prefixs(iword.lemma, pos=iword.pos)
        prefixed_word.append(iword)
        parsed_resp.extend(prefixed_word)
        for pw in prefixed_word:
            postfixed_iwords = self.match_postfixs(pw.lemma, pos=iword.pos)
            for piw in postfixed_iwords:
                piw.prefixs = pw.prefixs
                parsed_resp.append(piw)
        return parsed_resp

    def is_seqs_of_verbs(self, txt):
        words = txt.split()
        if len(words) < 2:
            return False
        for w in words:
            formal_verb = self.verb_to_formal_func(w)
            if formal_verb is None:
                return False
        if words[-1] in ['است', 'هست']:
            return False
        return True

    def filter_results(self, word_lemmas):
        return list(filter(lambda wl: len(wl[0]) > 0 and wl[0][-1] != '' and not self.is_seqs_of_verbs(wl[0]), word_lemmas))

    def concatenate_formal_words(self, pre, next):
        """
        خانه + ت -> خانهات
        دیگر + ای -> دیگری
        """
        nim_fasele = '\u200c'  # zero-width non-joiner (U+200C)
        not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
        if len(pre) < 1:
            return next
        if pre[-1] in ['ه'] and next in ['م', 'ت', 'ش']:
            return pre + nim_fasele + 'ا' + next
        if pre[-1] == 'ا' and next.split() and next.split()[0] in ['م', 'ت', 'ش', 'مان', 'تان', 'شان']:
            return pre + nim_fasele + 'ی' + next
        if pre[-1] not in ['ه'] and next in ['ای']:
            return pre + 'ی'
        out = pre + next
        if pre[-1] not in not_connect_chars or next.startswith('ها') or pre[-1] in ['ه'] or pre + nim_fasele + next in self.vocab:
            out = pre + nim_fasele + next
        if self.verb_to_formal_func(next):
            out = pre + ' ' + next
        return out

    def handle_nim_fasele_words(self, word, pos):
        def extract_lemma_nim_fasele_words(word, pos):
            formal_prefixs = []
            formal_postfixs = []
            prefixs = {'اون': 'آن', 'همون': 'همین'}
            postfixs = self.postfix_mapper
            tokens = word.split('\u200c')  # split on zero-width non-joiner
            index = 0
            for i in range(len(tokens)):
                index = i
                if tokens[i] not in prefixs:
                    break
                else:
                    formal_prefixs.append(prefixs[tokens[i]])

            for i in range(len(tokens), index, -1):
                current_tok = ''.join(tokens[index:i])
                if current_tok in self.vocab or tokens[i - 1] not in postfixs:
                    return formal_prefixs, current_tok, formal_postfixs
                else:
                    formal_postfixs.append(postfixs[tokens[i - 1]])
            return formal_prefixs, current_tok, formal_postfixs
        nim_fasele = '\u200c'  # zero-width non-joiner (U+200C)
        candidates = []
        formal_word = ''
        verbs = self.verb_to_formal_func(word)
        if verbs:
            return [(v, v) for v in verbs]
        all_candidates = set()
        # lemma
        formal_prefixs, lemma, formal_postfixs = extract_lemma_nim_fasele_words(word, pos)
        word_lemmas = self.transform(lemma, pos, ignore_nim_fasele=True)
        # lemma with postfix should len=1
        one_token_words = [wl for wl in word_lemmas if len(wl[0].split()) == 1]
        if formal_postfixs and one_token_words:
            all_formal_lemma_candidates = one_token_words
        else:
            all_formal_lemma_candidates = word_lemmas
        if not all_formal_lemma_candidates:
            if formal_postfixs or formal_prefixs:
                all_formal_lemma_candidates = [(lemma, lemma)]
            else:
                tokens = lemma.split(nim_fasele)
                if all(self.transform(t, None, ignore_nim_fasele=True) for t in tokens):
                    w = ' '.join(tokens)
                    return [(w, w)]
                else:
                    return []
        for cnd_lemma, formal_word_lemma in all_formal_lemma_candidates:
            formal_word = ''
            toks = formal_prefixs + [cnd_lemma] + formal_postfixs
            for index, t in enumerate(toks):
                formal_word = self.concatenate_formal_words(formal_word, t)
            all_candidates.add((formal_word, formal_word_lemma))
            # if t in self.postfix_mapper:
            #     formal_t = self.postfix_mapper[t]
            # else:
            #     transform_outputs = self.transform(t, pos)
            #     if not transform_outputs:
            #         formal_t = t
            #     else:
            #         one_word_outputs = [ft for ft in transform_outputs if len(ft.split()) == 1]
            #         if one_word_outputs:
            #             if t in one_word_outputs:
            #                 formal_t = t
            #             else:
            #                 formal_t = one_word_outputs[0]
            #         else:
            #             formal_t = transform_outputs.pop()
        return all_candidates

    def transform(self, word, pos, ignore_nim_fasele=False):
        """ignore emoji , punctuation, numbers"""
        ignore_chars = '.1234567890!@#$%^&*()_+۱۲۳۴۵۶۷۸۹÷؟×−+?><}،,{":' + string.ascii_lowercase + string.ascii_uppercase
        if any(ic in word for ic in ignore_chars) or utils.if_emoji(word):
            return [(word, word)]
        """handle nim fasele"""
        nim_fasele = '\u200c'  # zero-width non-joiner (U+200C)
        if not ignore_nim_fasele and nim_fasele in word:
            return self.handle_nim_fasele_words(word, pos)
        # pass ignore words and accept as correct informal word!
        if word in self.ignore_words and not word in self.mapper:
            return [(word, word)]
        formal_prefix_obj = self.prefix_obj(word)
        if formal_prefix_obj:
            return formal_prefix_obj
        iword = InformalWord(lemma=word, pos=pos)
        expanded_candidates = []
        candidates = self.parse_word(iword)
        # just verbs
        if any(c.is_verb for c in candidates):
            candidates = [c for c in candidates if c.is_verb]
        for cnd in candidates:
            expanded_candidates.extend(self.get_expand(cnd))
        word_iwords = []
        for ec in expanded_candidates:
            word_iwords.extend(self.to_formals(ec))
        if any(f[1] and (f[1].is_mapper or f[1].is_verb) for f in word_iwords if f[1] is not None):
            word_iwords = [f for f in word_iwords if f[1] and (f[1].is_mapper or f[1].is_verb)]
        # else:
        word_lemmas_set = [(w, iword.lemma) for w, iword in word_iwords if iword is not None]
        word_lemmas_set = set(word_lemmas_set)
        out = self.filter_results(word_lemmas_set)
        # if type(out) == str:
        #     out = [out]
        #     out = set(out)
        return out

if __name__ == '__main__':
    # smoke test with empty stand-in resources (the real resources are loaded from assets.pkl)
    transformer = OneShotTransformer(None, None, None, None, None, None, None)
    candidates = transformer.match_postfixs('کارامم', pos=None)
    print(candidates)
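For orientation only (this sketch is not part of the commit): a minimal way to exercise OneShotTransformer.transform with empty stand-in resources and a one-word vocabulary. It assumes the repo's utils module added in this commit (which provides the powerset and if_emoji helpers the class calls) is importable; with the real assets.pkl resources the candidate lists are richer.

    from OneShotTransformer import OneShotTransformer

    # Stand-in resources for illustration; the real app loads these from assets.pkl.
    transformer = OneShotTransformer(
        vocab={'کتاب'},                      # minimal vocabulary
        mapper={},                            # informal -> formal word mapper
        verb_to_formal_func=lambda w: None,   # no verb handling in this sketch
        ignore_words=set(),
        postfix_mapper={},
        isolated_words=set(),
        non_hidden_h_words=set(),
    )

    # The colloquial plural 'کتابا' is parsed as lemma 'کتاب' plus a plural postfix,
    # so the output should contain a candidate like ('کتاب‌ها', 'کتاب').
    print(transformer.transform('کتابا', pos='NOUN'))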
VerbHandler.py
ADDED
@@ -0,0 +1,350 @@
import re
from enum import Enum
from hazm import Normalizer
import pandas as pd


Formality = Enum('Formality', 'formal informal')
VerbTime = Enum('VerbTime', 'past present future')
Person = Enum('Person', 'Man To An Ma Shoma Anha')
Number = Enum('Number', 'Mofrad Jam')


class Verb:
    def __init__(self, root, formality, time, pp, person, number):
        self.root = root
        self.formality = formality
        self.time = time
        self.person = person
        self.number = number
        self.pp = pp


class VerbHandler():
    def __init__(self, csv_verb_addr, csv_irregular_verbs_mapper):
        self.posfix_mapper = {'ه': 'د', 'ن': 'ند', 'ین': 'ید'}
        self.objective_pr_mapper = {'شون': 'شان', 'تون': 'تان', 'مون': 'مان'}
        self.init_mapper = {'کنه': 'بکنه', 'کنم': 'بکنم', 'کنی': 'بکنی', 'کنیم': 'بکنیم', 'کنین': 'بکنین', 'کنید': 'بکنید', 'کنن': 'بکنن', 'کنند': 'بکنند'}
        self.out_mapper = {'میایی': 'میآیی'}
        self.init_mapper.update({'شم': 'بشم', 'شی': 'بشی', 'شن': 'بشن', 'شین': 'بشین', 'شه': 'بشه', 'شیم': 'بشیم'})
        self.bons = self.load_bons(csv_verb_addr)
        self.irregular_verbs = self.load_irregular_mapper(csv_irregular_verbs_mapper)
        self.informal_past_bons = self.get_bons(type=Formality.informal, time=VerbTime.past)
        self.informal_present_bons = self.get_bons(type=Formality.informal, time=VerbTime.present)

        self.formal_past_bons = self.get_bons(type=Formality.formal, time=VerbTime.past)
        self.formal_present_bons = self.get_bons(type=Formality.formal, time=VerbTime.present) + ['هست']
        self.all_past_bons = self.formal_past_bons + self.informal_past_bons
        self.all_present_bons = self.formal_present_bons + self.informal_present_bons
        self.verb_mapper = {b: {'formal': self.bons[b]['formal']} for b in self.bons if self.bons[b]['type'] == Formality.informal}
        self.solve_alef_issue()
        self.compile_patterns()

    def load_irregular_mapper(self, csv_addr):
        df = pd.read_csv(csv_addr)
        mapper = {informal: formal for _, (informal, formal) in df.iterrows()}
        return mapper

    def load_bons(self, csv_addr):
        normalizer = Normalizer()
        df = pd.read_csv(csv_addr)
        df = df.fillna('')
        bons = {}
        for i, row in df.iterrows():
            if row[2]:
                row[2] = normalizer.normalize(row[2])
                bons[row[2]] = {'type': Formality.formal, 'time': VerbTime.past}
            if row[3]:
                row[3] = normalizer.normalize(row[3])
                bons[row[3]] = {'type': Formality.formal, 'time': VerbTime.present}
            if row[10]:
                bs = row[10].split()
                for b in bs:
                    bons[b] = {'type': Formality.informal, 'time': VerbTime.past, 'formal': row[2]}
            if row[11]:
                bs = row[11].split()
                for b in bs:
                    bons[b] = {'type': Formality.informal, 'time': VerbTime.present, 'formal': row[3]}
        return bons

    def get_bons(self, type, time):
        return [b for b in self.bons if self.bons[b]['type'] == type and self.bons[b]['time'] == time]

    def solve_alef_issue(self):
        replace_alef_y = lambda v: 'ی' + v[1:]
        replace_A_YA = lambda v: 'یا' + v[1:]
        informal_past_start_with_alef = list(map(replace_alef_y, [v for v in self.informal_past_bons if v.startswith('ا') and not v.startswith('ای')]))
        formal_past_start_with_alef = list(map(replace_alef_y, [v for v in self.formal_past_bons if v.startswith('ا') and not v.startswith('ای')]))
        informal_present_start_with_alef = list(map(replace_alef_y, [v for v in self.informal_present_bons if v.startswith('ا') and not v.startswith('ای')]))
        formal_present_start_with_alef = list(map(replace_alef_y, [v for v in self.formal_present_bons if v.startswith('ا') and not v.startswith('ای')]))
        self.alef_mapper = {}
        self.informal_past_start_with_alef = informal_past_start_with_alef + list(
            map(replace_A_YA, [v for v in self.informal_past_bons if v.startswith('آ')]))
        self.informal_present_start_with_alef = informal_present_start_with_alef + list(
            map(replace_A_YA, [v for v in self.informal_present_bons if v.startswith('آ')]))
        self.formal_past_start_with_alef = formal_past_start_with_alef + list(
            map(replace_A_YA, [v for v in self.formal_past_bons if v.startswith('آ')]))
        self.formal_present_start_with_alef = formal_present_start_with_alef + list(
            map(replace_A_YA, [v for v in self.formal_present_bons if v.startswith('آ')]))
        for verb in self.informal_past_start_with_alef + self.informal_present_start_with_alef + self.formal_past_start_with_alef + self.formal_present_start_with_alef:
            if verb[:2] == 'یا':
                origin = 'آ' + verb[2:]
            else:
                origin = 'ا' + verb[1:]
            self.alef_mapper[verb] = origin
        self.alef_mapper['یای'] = 'آی'
        remove_a_hat = lambda w: w.replace('آ', 'ا')
        self.formal_past_bons = list(
            filter(lambda w: w != '', map(remove_a_hat, self.formal_past_bons + self.formal_past_start_with_alef)))
        self.formal_present_bons = list(map(remove_a_hat, self.formal_present_bons + self.formal_present_start_with_alef)) + [
            'یای'] + ['آی']
        self.informal_past_bons = list(
            filter(lambda w: w != '', map(remove_a_hat, self.informal_past_bons + self.informal_past_start_with_alef)))
        self.informal_present_bons = list(
            map(remove_a_hat, self.informal_present_bons + self.informal_present_start_with_alef)) + [
            'یای'] + ['آی']
        # sorted by length
        self.formal_present_bons = sorted(self.formal_present_bons, key=lambda w: -len(w))
        self.formal_past_bons = sorted(self.formal_past_bons, key=lambda w: -len(w))
        self.informal_present_bons = sorted(self.informal_present_bons, key=lambda w: -len(w))
        self.informal_past_bons = sorted(self.informal_past_bons, key=lambda w: -len(w))
        verb_v_keys = [word for word in self.verb_mapper if 'آ' in word]
        alef_verb_v_keys = [word for word in self.alef_mapper if 'آ' in word]
        for v in verb_v_keys:
            self.verb_mapper[v.replace('آ', 'ا')] = self.verb_mapper[v]
        for v in alef_verb_v_keys:
            self.alef_mapper[v.replace('آ', 'ا')] = self.alef_mapper[v]

    def compile_patterns(self):
        ME_r = '|'.join(['می\u200c', 'می'])  # the prefix 'می' with and without a trailing zero-width non-joiner
        B_r = 'ب'
        not_r = 'ن'
        past_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ید', 'ند', '', 'ن']
        present_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ن', 'ید', 'ند', 'د', '']
        naghli_ends = ['هام', 'های', 'ه', 'هایم', 'هاید', 'هاند']
        objective_pronouns = ['م', 'ت', 'ش', 'مون', 'تون', 'شون']

        informal_past_r = '|'.join(self.informal_past_bons)
        formal_past_r = '|'.join(self.formal_past_bons)
        informal_present_r = '|'.join(self.informal_present_bons)
        formal_present_r = '|'.join(self.formal_present_bons)
        verb_postfix_past_r = '|'.join(past_ends)
        verb_postfix__present_r = '|'.join(present_ends)
        objective_pronouns_r = '|'.join(objective_pronouns)
        naghli_ends_r = '|'.join(naghli_ends)
        """
        #گذشتهی ساده (simple past)
        # r1 = past_r + verb_postfix_r + objectiveـpronouns_r
        #گذشتهی ناتمام (past imperfective)
        # r2 = '(' + ME + ')'+ past_r +verb_postfix_r + objectiveـpronouns_r

        #گذشتهی استمراری (past progressive)
        # r3 = '(' + DASHT + ')'+ past_r + verb_postfix_r +objectiveـpronouns_r

        #گذشتهی نقلی (present perfect)
        # r4 = past_r + '(' + '|'.join(naghli_ends) + ')' +objectiveـpronouns_r

        #گذشتهی پیشین (past perfect)
        # r5 = past_r + verb_postfix_r + '(' + BUD + ')' + verb_postfix_r + objectiveـpronouns_r

        #حال ساده (simple present)
        # r6 = present_r + verb_postfix_r

        #حال ناتمام (present imperfective)
        # r7 = '(' + ME + ')'+ present_r + verb_postfix_r + objectiveـpronouns_r

        #حال استمراری (present progressive)
        # r8 = '( ' + DAR + ')'+ verb_postfix_r + '(' + ME + ')' + present_r + verb_postfix_r+ objectiveـpronouns_r

        #آیندهی ساده (simple future)
        # r9 = '( ' + KHAH + ')'+ verb_postfix_r + present_r +objectiveـpronouns_r

        #التزامی - گذشته (subjunctive, past)
        # r10 = present_r + '(ه)'+ '(' + BASH + ')' + verb_postfix_r + objectiveـpronouns_r

        #التزامی - حال (subjunctive, present)
        # r11 = '(ب)' + present_r + verb_postfix_r +objectiveـpronouns_r
        """
        #
        # covered: formal stems + informal suffixes; simple present only for certain verbs (hast, kon)
        # formal
        formal_present_pattern_b = '({})({})({})?({})?$'.format(B_r, formal_present_r, verb_postfix__present_r, objective_pronouns_r)
        formal_present_pattern_n_me = '({})?({})({})({})?({})?$'.format(not_r, ME_r, formal_present_r, verb_postfix__present_r, objective_pronouns_r)
        formal_present_pattern_n = '({})?({})({})?({})?$'.format(not_r, formal_present_r, verb_postfix__present_r, objective_pronouns_r)
        formal_past_pattern = '({})?({})?({})({}|{})({})?$'.format(not_r, ME_r, formal_past_r, naghli_ends_r,
                                                                   verb_postfix_past_r, objective_pronouns_r)
        self.formal_past_verb_pattern = re.compile(formal_past_pattern)
        self.formal_present_verb_pattern_b = re.compile(formal_present_pattern_b)
        self.formal_present_verb_pattern_n_me = re.compile(formal_present_pattern_n_me)
        self.formal_present_verb_pattern_n = re.compile(formal_present_pattern_n)

        # informal
        informal_present_pattern_b = '({})({})({})?({})?$'.format(B_r, informal_present_r, verb_postfix__present_r,
                                                                  objective_pronouns_r)
        informal_present_pattern_n_me = '({})?({})({})({})?({})?$'.format(not_r, ME_r, informal_present_r,
                                                                          verb_postfix__present_r, objective_pronouns_r)
        informal_present_pattern_n = '({})?({})({})?({})?$'.format(not_r, informal_present_r, verb_postfix__present_r,
                                                                   objective_pronouns_r)
        informal_past_pattern = '({})?({})?({})({}|{})({})?$'.format(not_r, ME_r, informal_past_r, naghli_ends_r,
                                                                     verb_postfix_past_r, objective_pronouns_r)
        self.informal_past_verb_pattern = re.compile(informal_past_pattern)
        self.informal_present_verb_pattern_b = re.compile(informal_present_pattern_b)
        self.informal_present_verb_pattern_n_me = re.compile(informal_present_pattern_n_me)
        self.informal_present_verb_pattern_n = re.compile(informal_present_pattern_n)

    def parse(self, token):
        outputs = []

        match_dict_formal = {'tense': '', 'root': '', 'neg': '', 'postfix': '', 'not_r': '', 'op': '', 'b': '', 'me': '', 'naghli': ''}
        match_dict_informal = {'tense': '', 'root': '', 'neg': '', 'postfix': '', 'not_r': '', 'op': '', 'b': '', 'me': '', 'naghli': ''}
        formal_past_match = self.formal_past_verb_pattern.match(token)
        informal_past_match = self.informal_past_verb_pattern.match(token)
        formal_present_match_b = self.formal_present_verb_pattern_b.match(token)
        informal_present_match_b = self.informal_present_verb_pattern_b.match(token)
        formal_present_match_n_me = self.formal_present_verb_pattern_n_me.match(token)
        informal_present_match_n_me = self.informal_present_verb_pattern_n_me.match(token)
        formal_present_match_n = self.formal_present_verb_pattern_n.match(token)
        informal_present_match_n = self.informal_present_verb_pattern_n.match(token)
        present_group_to_dict_b = lambda g: {k: g[i] for i, k in enumerate(['b', 'root', 'postfix', 'op'])}
        present_group_to_dict_n_me = lambda g: {k: g[i] for i, k in enumerate(['neg', 'me', 'root', 'postfix', 'op'])}
        present_group_to_dict_n = lambda g: {k: g[i] for i, k in enumerate(['neg', 'root', 'postfix', 'op'])}
        past_group_to_dict = lambda g: {k: g[i] for i, k in enumerate(['neg', 'me', 'root', 'postfix', 'op'])}
        formal_match = formal_past_match or formal_present_match_b or formal_present_match_n_me or formal_present_match_n
        informal_match = informal_past_match or informal_present_match_b or informal_present_match_n_me or informal_present_match_n
        if formal_match:
            if formal_past_match:
                match_dict_formal = past_group_to_dict(formal_past_match.groups())
                match_dict_formal['tense'] = 'past'
            else:
                if formal_present_match_b:
                    match_dict_formal = present_group_to_dict_b(formal_present_match_b.groups())
                elif formal_present_match_n_me:
                    match_dict_formal = present_group_to_dict_n_me(formal_present_match_n_me.groups())
                elif formal_present_match_n:
                    match_dict_formal = present_group_to_dict_n(formal_present_match_n.groups())
                match_dict_formal['tense'] = 'present'
            outputs.append(match_dict_formal)
        if informal_match:
            if informal_past_match:
                match_dict_informal = past_group_to_dict(informal_past_match.groups())
                match_dict_informal['tense'] = 'past'
            else:
                if informal_present_match_b:
                    match_dict_informal = present_group_to_dict_b(informal_present_match_b.groups())
                elif informal_present_match_n_me:
                    match_dict_informal = present_group_to_dict_n_me(informal_present_match_n_me.groups())
                elif informal_present_match_n:
                    match_dict_informal = present_group_to_dict_n(informal_present_match_n.groups())
                match_dict_informal['tense'] = 'present'
            outputs.append(match_dict_informal)
        for match_dict in outputs:
            for key, val in match_dict.items():
                if val is None:
                    match_dict[key] = ''
        # print(match_dict)
        return outputs

    def formal_concatenate(self, match_dict, should_smooth):
        out_dict = {'بیای': 'بیا', 'نیای': 'نیا'}
        if match_dict['root'] == 'است' and match_dict['neg'] != '':
            return 'نیست' + match_dict['postfix']
        if self.if_simple_present(match_dict) or self.if_only_me(match_dict):
            return None
        if should_smooth:
            if match_dict['prefix'] != '' and match_dict['prefix'][0] == 'م':
                pass
            else:
                match_dict['root'] = 'یا' + match_dict['root'][1:]
        # if len(match_dict['prefix']) == 3:
        #     match_dict['prefix'] = 'می'
        if match_dict['prefix'] == 'ب' and match_dict['root'] and match_dict['root'][0] == 'ا':
            match_dict['root'] = 'ی' + match_dict['root'][1:]
        out = match_dict['neg'] + match_dict['prefix'] + match_dict['root'] + match_dict['postfix'] + match_dict['op']
        if out in out_dict:
            out = out_dict[out]

        return out

    def _set_match_dict_prefix(self, match_dict):
        match_dict['prefix'] = ''
        if 'me' in match_dict and match_dict['me'] != '':
            if len(match_dict['me']) < 3:
                match_dict['me'] = 'می'
            match_dict['prefix'] = match_dict['me']
        elif 'b' in match_dict and match_dict['b'] != '':
            match_dict['prefix'] = match_dict['b']
        return match_dict

    def if_simple_present(self, match_dict):
        if match_dict['root'] != '' and match_dict['tense'] == 'present' and match_dict['prefix'] == '' and match_dict['neg'] == '':
            if match_dict['root'] not in ['کن', 'هست', 'است', 'دار', 'نیست', 'باش']:
                return True
        return False

    def if_only_me(self, match_dict):
        if match_dict['root'] != '' and match_dict['tense'] == 'present' and match_dict['prefix'] != '' and match_dict['prefix'][0] == 'م' and match_dict['postfix'] == '':
            return True
        return False

    def is_masdar(self, match_dict):
        return match_dict['root'] in self.all_past_bons and match_dict['me'] == '' and match_dict['postfix'] == 'ن' and match_dict['op'] == ''

    def informal_to_formal(self, token):
        # irregular verbs checking
        if token in self.irregular_verbs:
            return [self.irregular_verbs[token]]
        if token in self.init_mapper:
            token = self.init_mapper[token]
        outputs = []
        if len(token) < 3:
            return None
        should_smooth = False
        all_match_dicts = self.parse(token)

        ### بدهدم
        # برد
        if len(all_match_dicts) == 2:
            if all_match_dicts[1]['root'] in self.verb_mapper and self.verb_mapper[all_match_dicts[1]['root']]['formal'] == all_match_dicts[0]['root'] and all_match_dicts[1]['op'] != '':
                del all_match_dicts[1]
            elif all_match_dicts[1] == {'b': 'ب', 'root': 'ر', 'postfix': 'د', 'op': '', 'tense': 'present'}:
                del all_match_dicts[1]
        ##
        is_masdar = False
        for match_dict in all_match_dicts:
            if self.is_masdar(match_dict):
                is_masdar = True
            # نان بان
            if match_dict['root'] != '' and match_dict['root'][0] == 'ا' and 'me' not in match_dict and ('b' in match_dict or match_dict['neg'] == 'ن'):
                return None
            if match_dict['root'] != '':
                root = match_dict['root']
                objective_pr = match_dict['op']
                postfix = match_dict['postfix']
                if root in self.alef_mapper:
                    should_smooth = True
                    match_dict['root'] = self.alef_mapper[root]
                if match_dict['root'] in self.verb_mapper:
                    match_dict['root'] = self.verb_mapper[match_dict['root']]['formal']
                if postfix in self.posfix_mapper:
                    match_dict['postfix'] = self.posfix_mapper[postfix]
                if match_dict['postfix'] == 'د' and match_dict['tense'] == 'past':
                    match_dict['postfix'] = 'ه'
                if objective_pr in self.objective_pr_mapper:
                    match_dict['op'] = self.objective_pr_mapper[objective_pr]
                match_dict['prefix'] = ''
                if 'neg' not in match_dict:
                    match_dict['neg'] = ''
                match_dict = self._set_match_dict_prefix(match_dict)
                formal_verb = self.formal_concatenate(match_dict, should_smooth)
                outputs.append(formal_verb)
        not_none_outpts = [o for o in outputs if o is not None]
        for index, item in enumerate(not_none_outpts):
            if item in self.out_mapper:
                not_none_outpts[index] = self.out_mapper[item]
        if not_none_outpts:
            # append bon
            if len(not_none_outpts) == 1 and is_masdar:
                masdar = not_none_outpts[0][:-2] + 'ن'
                not_none_outpts.append(masdar)
            return not_none_outpts
        return None
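As an illustrative sketch (not part of the diff): VerbHandler only needs the two CSV assets whose URLs appear in config.yml. The file paths below are assumptions; the app itself downloads them into ~/.dadmatools/informal2formal, and the exact formal candidates returned depend on the contents of those verb tables.

    from VerbHandler import VerbHandler

    # Paths are placeholders; point them at the downloaded verbs.csv and irregular_verb_mapper.csv.
    vh = VerbHandler('verbs.csv', 'irregular_verb_mapper.csv')

    # Colloquial conjugations are mapped to formal candidates,
    # e.g. 'میخوام' would be expected to yield something like ['می‌خواهم'].
    print(vh.informal_to_formal('میخوام'))
    print(vh.informal_to_formal('نمیدونم'))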
app.py
ADDED
@@ -0,0 +1,128 @@
1 |
+
import streamlit as st
|
2 |
+
import os
|
3 |
+
import itertools
|
4 |
+
import os
|
5 |
+
from pathlib import Path
|
6 |
+
import yaml
|
7 |
+
from download_utils import download_dataset
|
8 |
+
import utils
|
9 |
+
from formality_transformer import FormalityTransformer
|
10 |
+
from hazm import SentenceTokenizer
|
11 |
+
|
12 |
+
|
13 |
+
def translate_short_sent(model, sent):
|
14 |
+
out_dict = {}
|
15 |
+
txt = utils.cleanify(sent)
|
16 |
+
is_valid = lambda w: model.oneshot_transformer.transform(w, None)
|
17 |
+
cnd_tokens = model.informal_tokenizer.tokenize(txt, is_valid)
|
18 |
+
for tokens in cnd_tokens:
|
19 |
+
tokens = [t for t in tokens if t != '']
|
20 |
+
new_tokens = []
|
21 |
+
for t in tokens:
|
22 |
+
new_tokens.extend(t.split())
|
23 |
+
txt = ' '.join(new_tokens)
|
24 |
+
tokens = txt.split()
|
25 |
+
candidates = []
|
26 |
+
for index in range(len(tokens)):
|
27 |
+
tok = tokens[index]
|
28 |
+
cnd = set()
|
29 |
+
pos = None
|
30 |
+
if model.verb_handler.informal_to_formal(tok):
|
31 |
+
pos = 'VERB'
|
32 |
+
f_words_lemma = model.oneshot_transformer.transform(tok, pos)
|
33 |
+
f_words_lemma = list(f_words_lemma)
|
34 |
+
for index, (word, lemma) in enumerate(f_words_lemma):
|
35 |
+
if pos != 'VERB' and tok not in model.mapper and model.should_filtered_by_one_bigram(lemma, word, tok):
|
36 |
+
f_words_lemma[index] = (tok, tok)
|
37 |
+
else:
|
38 |
+
word_toks = word.split()
|
39 |
+
word_repr = ''
|
40 |
+
for t in word_toks:
|
41 |
+
word_repr += ' ' + t
|
42 |
+
word_repr = word_repr.strip()
|
43 |
+
word_repr = model.repalce_for_gpt2(word_repr)
|
44 |
+
f_words_lemma[index] = (word, word_repr)
|
45 |
+
if f_words_lemma:
|
46 |
+
cnd.update(f_words_lemma)
|
47 |
+
else:
|
48 |
+
cnd = {(tok, tok)}
|
49 |
+
candidates.append(cnd)
|
50 |
+
all_combinations = itertools.product(*candidates)
|
51 |
+
all_combinations_list = list(all_combinations)
|
52 |
+
for id, cnd in enumerate(all_combinations_list):
|
53 |
+
normal_seq = ' '.join([c[0] for c in cnd])
|
54 |
+
lemma_seq = ' '.join([c[1] for c in cnd])
|
55 |
+
lemma_seq = utils.clean_text_for_lm(lemma_seq)
|
56 |
+
out_dict[id] = (normal_seq, lemma_seq)
|
57 |
+
candidates = [[item[0] for item in candidate_phrases] for candidate_phrases in candidates]
|
58 |
+
return model.lm_obj.get_best(candidates)
|
59 |
+
|
60 |
+
|
61 |
+
def translate(model, sentence_tokenizer, txt):
|
62 |
+
sents = sentence_tokenizer.tokenize(txt)
|
63 |
+
formal_output = ''
|
64 |
+
for sentence in sents:
|
65 |
+
formal_sentence = translate_short_sent(model, sentence)
|
66 |
+
formal_output += ' ' + formal_sentence
|
67 |
+
return formal_output
|
68 |
+
|
69 |
+
|
70 |
+
class Informal2Formal:
|
71 |
+
def __init__(self) -> None:
|
72 |
+
#download or load files
|
73 |
+
DEFAULT_CACHE_DIR = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
|
74 |
+
config = load_config('dadmatools/informal2formal/config.yml')
|
75 |
+
file_urls = config['files'].values()
|
76 |
+
download_dataset(file_urls, DEFAULT_CACHE_DIR, filename=None)
|
77 |
+
|
78 |
+
# set assets files address
|
79 |
+
verbs_csv_addr = os.path.join(DEFAULT_CACHE_DIR, 'verbs.csv')
|
80 |
+
irregular_verbs_mapper = os.path.join(DEFAULT_CACHE_DIR, 'irregular_verb_mapper.csv')
|
81 |
+
lm_addr = os.path.join(DEFAULT_CACHE_DIR,'3gram.bin')
|
82 |
+
assets_file_addr = os.path.join(DEFAULT_CACHE_DIR,'assets.pkl')
|
83 |
+
self.sentence_tokenizer = SentenceTokenizer()
|
84 |
+
self.model = FormalityTransformer(asset_file_addr=assets_file_addr,
|
85 |
+
irregular_verbs_mapper_addr=irregular_verbs_mapper, verbs_csv_addr=verbs_csv_addr, lm_addr=lm_addr)
|
86 |
+
|
87 |
+
|
88 |
+
def load_config(config_file):
|
89 |
+
with open(config_file, "r") as file:
|
90 |
+
config = yaml.safe_load(file)
|
91 |
+
return config
|
92 |
+
|
93 |
+
|
94 |
+
|
95 |
+
st.set_page_config(page_title="Persian Informal to formal translator")
|
96 |
+
|
97 |
+
|
98 |
+
@st.cache(suppress_st_warning=True, allow_output_mutation=True)
|
99 |
+
def load_model():
|
100 |
+
DEFAULT_CACHE_DIR = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
|
101 |
+
config = load_config('config.yml')
|
102 |
+
file_urls = config['files'].values()
|
103 |
+
download_dataset(file_urls, DEFAULT_CACHE_DIR, filename=None)
|
104 |
+
# set assets files address
|
105 |
+
verbs_csv_addr = os.path.join(DEFAULT_CACHE_DIR, 'verbs.csv')
|
106 |
+
irregular_verbs_mapper = os.path.join(DEFAULT_CACHE_DIR, 'irregular_verb_mapper.csv')
|
107 |
+
lm_addr = os.path.join(DEFAULT_CACHE_DIR,'3gram.bin')
|
108 |
+
assets_file_addr = os.path.join(DEFAULT_CACHE_DIR,'assets.pkl')
|
109 |
+
model = FormalityTransformer(asset_file_addr=assets_file_addr,
|
110 |
+
irregular_verbs_mapper_addr=irregular_verbs_mapper, verbs_csv_addr=verbs_csv_addr, lm_addr=lm_addr)
|
111 |
+
return model
|
112 |
+
st.title("Persian/Farsi Formality Transformer")
|
113 |
+
st.write("Translate informal Persian text into formal Persian")
|
114 |
+
|
115 |
+
|
116 |
+
|
117 |
+
user_input: str = st.text_area(
|
118 |
+
"Input text",
|
119 |
+
height=200,
|
120 |
+
max_chars=5120,
|
121 |
+
)
|
122 |
+
|
123 |
+
|
124 |
+
if st.button("Run"):
|
125 |
+
model = load_model()
|
126 |
+
sentence_tokenizer = SentenceTokenizer()
|
127 |
+
translated_text = translate(model, sentence_tokenizer, user_input)
|
128 |
+
st.success(translated_text)
|
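
Note on the candidate expansion in translate_short_sent above: for every token it collects a set of (formal word, lemma) candidates and then enumerates the full-sentence combinations with itertools.product before the per-position candidates are handed to the language model. A minimal, self-contained sketch of that expansion step, using made-up placeholder tokens rather than real Persian candidates:

# Toy illustration of the per-token candidate expansion in translate_short_sent.
# The tokens below are placeholders, not output of the real transformer.
import itertools

candidates = [['raft', 'rafte'], ['khane'], ['ast', 'hast']]  # one candidate list per position

for combination in itertools.product(*candidates):
    print(' '.join(combination))
# raft khane ast
# raft khane hast
# rafte khane ast
# rafte khane hast
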
config.yml
ADDED
@@ -0,0 +1,5 @@
1 |
+
files:
|
2 |
+
lm: https://huggingface.co/datasets/Dadmatech/informal2formal/resolve/main/3gram.bin
|
3 |
+
assets: https://huggingface.co/datasets/Dadmatech/informal2formal/resolve/main/assets.pkl
|
4 |
+
irregular_verb: https://huggingface.co/datasets/Dadmatech/informal2formal/raw/main/irregular_verb_mapper.csv
|
5 |
+
verbs: https://huggingface.co/datasets/Dadmatech/informal2formal/raw/main/verbs.csv
|
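
Each URL above is downloaded into the cache directory and later looked up by its basename, so this config maps to 3gram.bin, assets.pkl, irregular_verb_mapper.csv and verbs.csv under ~/.dadmatools/informal2formal. A small sketch of that mapping, assuming the same config layout:

# Sketch: list the cached file paths that load_model() expects, derived from config.yml.
import os
from pathlib import Path
import yaml

with open('config.yml') as f:
    config = yaml.safe_load(f)

cache_dir = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
for name, url in config['files'].items():
    print(name, '->', os.path.join(cache_dir, os.path.basename(url)))
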
download_utils.py
ADDED
@@ -0,0 +1,65 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
|
4 |
+
import requests
|
5 |
+
from tqdm import tqdm
|
6 |
+
def download_dataset(urls, dest_dir, filename=None):
|
7 |
+
# source_code: https://github.com/sirbowen78/lab/blob/master/file_handling/dl_file1.py
|
8 |
+
# Adapted from that example; this function downloads the informal2formal asset files listed in config.yml.
|
9 |
+
|
10 |
+
# Home directory of Mac, pathlib.Path module make this easy.
|
11 |
+
# home_path = Path.home()
|
12 |
+
# This is the sub directory under home directory.
|
13 |
+
# sub_path = "tmp"
|
14 |
+
# The header of the dl link has a Content-Length which is in bytes.
|
15 |
+
# The value is a string, so it has to be converted to an integer.
|
16 |
+
|
17 |
+
os.makedirs(dest_dir, exist_ok=True)
|
18 |
+
for url in urls:
|
19 |
+
if 'drive.google' in url:
|
20 |
+
import gdown
|
21 |
+
# import os
|
22 |
+
# print('gdown downloadddd output: ', dest_dir )
|
23 |
+
# print(dest_dir, filename)
|
24 |
+
# dest_dir = os.path.join(dest_dir,'peyma.zip')
|
25 |
+
return gdown.download(url, quiet=False, output=filename)
|
26 |
+
try:
|
27 |
+
filesize = int(requests.head(url).headers["Content-Length"])
|
28 |
+
except KeyError:
|
29 |
+
print('unknown file length')
|
30 |
+
filesize = -1
|
31 |
+
# os.path.basename returns python-3.8.5-macosx10.9.pkg,
|
32 |
+
# without this module I will have to manually split the url by "/"
|
33 |
+
# then get the last index with -1.
|
34 |
+
# Example:
|
35 |
+
# url.split("/")[-1]
|
36 |
+
filename = os.path.basename(url)
|
37 |
+
|
38 |
+
# Create the destination directory; exist_ok=True avoids an exception if it already exists.
|
39 |
+
# the dir will be created if not exists.
|
40 |
+
os.makedirs(dest_dir, exist_ok=True)
|
41 |
+
|
42 |
+
# The absolute path to download the file to.
|
43 |
+
dl_path = os.path.join(dest_dir, filename)
|
44 |
+
chunk_size = 1024
|
45 |
+
if os.path.exists(dl_path):
|
46 |
+
print(f'file {dl_path} already exists')
|
47 |
+
continue  # file is already cached; move on to the next URL instead of returning early
|
48 |
+
# Use the requests.get with stream enable, with iter_content by chunk size,
|
49 |
+
# the contents will be written to the dl_path.
|
50 |
+
# tqdm tracks the progress by progress.update(datasize)
|
51 |
+
with requests.get(url, stream=True) as r, open(dl_path, "wb") as f, tqdm(
|
52 |
+
unit="B", # unit string to be displayed.
|
53 |
+
unit_scale=True, # let tqdm to determine the scale in kilo, mega..etc.
|
54 |
+
unit_divisor=1024, # is used when unit_scale is true
|
55 |
+
total=filesize, # the total iteration.
|
56 |
+
file=sys.stdout, # default goes to stderr, this is the display on console.
|
57 |
+
desc=filename # prefix to be displayed on progress bar.
|
58 |
+
) as progress:
|
59 |
+
for chunk in r.iter_content(chunk_size=chunk_size):
|
60 |
+
# download the file chunk by chunk
|
61 |
+
datasize = f.write(chunk)
|
62 |
+
# on each chunk update the progress bar.
|
63 |
+
progress.update(datasize)
|
64 |
+
|
65 |
+
return True
|
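
A minimal usage sketch of download_dataset, assuming the URLs come from the config.yml shipped with this app:

# Sketch: fetch one of the published assets into the default cache directory.
import os
from pathlib import Path
from download_utils import download_dataset

cache_dir = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
urls = ['https://huggingface.co/datasets/Dadmatech/informal2formal/raw/main/verbs.csv']
download_dataset(urls, cache_dir, filename=None)  # skips files that are already cached
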
formality_transformer.py
ADDED
@@ -0,0 +1,40 @@
1 |
+
|
2 |
+
import pickle
|
3 |
+
from kenlm_wrapper import Kelm_Wrapper
|
4 |
+
from OneShotTransformer import OneShotTransformer
|
5 |
+
from VerbHandler import VerbHandler
|
6 |
+
import kenlm
|
7 |
+
from tokenizer import InformalTokenizer
|
8 |
+
|
9 |
+
|
10 |
+
class FormalityTransformer:
|
11 |
+
def __init__(self, asset_file_addr, verbs_csv_addr, irregular_verbs_mapper_addr, lm_addr ):
|
12 |
+
assets = pickle.load(open(asset_file_addr, 'rb'))
|
13 |
+
self.vocab = assets['vocab']
|
14 |
+
self.word_ends_tanvin = assets['word_ends_tanvin']
|
15 |
+
self.non_hidden_h_words = assets['non_hidden_h_words']
|
16 |
+
self.isolated_words = assets['isolated_words']
|
17 |
+
self.ignore_words = assets['ignore_words']
|
18 |
+
self.mapper = assets['mapper']
|
19 |
+
self.postfix_mapper = assets['postfix_mapper']
|
20 |
+
postfixes = assets['postfixes']
|
21 |
+
|
22 |
+
self.informal_tokenizer = InformalTokenizer(self.vocab, postfixes)
|
23 |
+
self.verb_handler = VerbHandler(csv_verb_addr=verbs_csv_addr, csv_irregular_verbs_mapper=irregular_verbs_mapper_addr)
|
24 |
+
self.oneshot_transformer = OneShotTransformer(self.vocab, self.mapper, self.verb_handler.informal_to_formal,
|
25 |
+
ignore_words=self.ignore_words,
|
26 |
+
postfix_mapper=self.postfix_mapper,
|
27 |
+
isolated_words=self.isolated_words,
|
28 |
+
non_hidden_h_words=self.non_hidden_h_words)
|
29 |
+
lm_model = kenlm.Model(lm_addr)
|
30 |
+
self.lm_obj = Kelm_Wrapper(lm_model)
|
31 |
+
|
32 |
+
|
33 |
+
def should_filtered_by_one_bigram(self, lemma, word, original_word):
|
34 |
+
NIM_FASELE = '\u200c'  # zero-width non-joiner (نیم‌فاصله / half-space)
|
35 |
+
return original_word in self.vocab and (len(word.split()) > 1 or NIM_FASELE in word)
|
36 |
+
|
37 |
+
def repalce_for_gpt2(self, word_repr):
|
38 |
+
if word_repr in self.word_ends_tanvin:
|
39 |
+
return word_repr[:-2] + 'ا'
|
40 |
+
return word_repr
|
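
should_filtered_by_one_bigram keeps the original token when it is already a known vocabulary word and the proposed formal form spans more than one word (or contains a half-space). A simplified, self-contained sketch of that rule, with a made-up vocabulary and transliterated placeholder words:

# Simplified sketch of the filtering rule (half-space check omitted, vocabulary is made up).
vocab = {'ketab'}

def keep_original(original_word, formal_candidate):
    return original_word in vocab and len(formal_candidate.split()) > 1

print(keep_original('ketab', 'ketab ra'))       # True  -> original token is kept
print(keep_original('mikhoonam', 'mi khanam'))  # False -> formal candidate is used
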
kenlm_wrapper.py
ADDED
@@ -0,0 +1,31 @@
1 |
+
|
2 |
+
class Kelm_Wrapper:
|
3 |
+
def __init__(self, model):
|
4 |
+
self.model = model
|
5 |
+
def get_best_candidate_word(self, default_phrases, candidate_phrases, index):
|
6 |
+
candidate_texts = [' '.join(default_phrases[:index]) + ' ' + cnd + ' ' + ' '.join(default_phrases[index+1:]) for cnd in candidate_phrases]
|
7 |
+
scores = list(map(self.model.score, candidate_texts))
|
8 |
+
return scores.index(max(scores))
|
9 |
+
|
10 |
+
|
11 |
+
def get_best_ongram_phrases(self, candidates_list):
|
12 |
+
bests = []
|
13 |
+
for candidate_phrase in candidates_list:
|
14 |
+
scores = list(map(self.model.score, candidate_phrase))
|
15 |
+
best_phrase = candidate_phrase[scores.index(max(scores))]
|
16 |
+
bests.append(best_phrase)
|
17 |
+
return bests
|
18 |
+
|
19 |
+
|
20 |
+
def get_best(self, candidates_list):
|
21 |
+
bests = []
|
22 |
+
default_phrases = self.get_best_ongram_phrases(candidates_list)
|
23 |
+
# print(default_phrases)
|
24 |
+
for index in range(len(candidates_list)):
|
25 |
+
if len(candidates_list[index]) > 1:
|
26 |
+
best_phrase_index = self.get_best_candidate_word(default_phrases, candidates_list[index], index)
|
27 |
+
bests.append(candidates_list[index][best_phrase_index])
|
28 |
+
else:
|
29 |
+
bests.append(candidates_list[index][0])
|
30 |
+
return ' '.join(bests)
|
31 |
+
|
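
get_best first picks a per-position default by scoring each candidate on its own, then rescores every multi-candidate position in the context of those defaults. The selection logic can be exercised with a stub scorer in place of a real kenlm.Model; the stub below is purely illustrative:

# Sketch: drive Kelm_Wrapper.get_best with a stub model instead of a KenLM binary.
from kenlm_wrapper import Kelm_Wrapper

class StubModel:
    def score(self, text):
        # Favour phrases containing 'good'; a real KenLM model returns a log10 probability.
        return text.count('good')

wrapper = Kelm_Wrapper(StubModel())
candidates = [['bad_a', 'good_a'], ['b'], ['bad_c', 'good_c']]
print(wrapper.get_best(candidates))  # -> 'good_a b good_c'
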
main.py
ADDED
@@ -0,0 +1,96 @@
1 |
+
import itertools
|
2 |
+
import os
|
3 |
+
from pathlib import Path
|
4 |
+
import yaml
|
5 |
+
from download_utils import download_dataset
|
6 |
+
import utils
|
7 |
+
from formality_transformer import FormalityTransformer
|
8 |
+
from hazm import SentenceTokenizer
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
def translate_short_sent(model, sent):
|
13 |
+
out_dict = {}
|
14 |
+
txt = utils.cleanify(sent)
|
15 |
+
is_valid = lambda w: model.oneshot_transformer.transform(w, None)
|
16 |
+
cnd_tokens = model.informal_tokenizer.tokenize(txt, is_valid)
|
17 |
+
for tokens in cnd_tokens:
|
18 |
+
tokens = [t for t in tokens if t != '']
|
19 |
+
new_tokens = []
|
20 |
+
for t in tokens:
|
21 |
+
new_tokens.extend(t.split())
|
22 |
+
txt = ' '.join(new_tokens)
|
23 |
+
tokens = txt.split()
|
24 |
+
candidates = []
|
25 |
+
for index in range(len(tokens)):
|
26 |
+
tok = tokens[index]
|
27 |
+
cnd = set()
|
28 |
+
pos = None
|
29 |
+
if model.verb_handler.informal_to_formal(tok):
|
30 |
+
pos = 'VERB'
|
31 |
+
f_words_lemma = model.oneshot_transformer.transform(tok, pos)
|
32 |
+
f_words_lemma = list(f_words_lemma)
|
33 |
+
for index, (word, lemma) in enumerate(f_words_lemma):
|
34 |
+
if pos != 'VERB' and tok not in model.mapper and model.should_filtered_by_one_bigram(lemma, word, tok):
|
35 |
+
f_words_lemma[index] = (tok, tok)
|
36 |
+
else:
|
37 |
+
word_toks = word.split()
|
38 |
+
word_repr = ''
|
39 |
+
for t in word_toks:
|
40 |
+
word_repr += ' ' + t
|
41 |
+
word_repr = word_repr.strip()
|
42 |
+
word_repr = model.repalce_for_gpt2(word_repr)
|
43 |
+
f_words_lemma[index] = (word, word_repr)
|
44 |
+
if f_words_lemma:
|
45 |
+
cnd.update(f_words_lemma)
|
46 |
+
else:
|
47 |
+
cnd = {(tok, tok)}
|
48 |
+
candidates.append(cnd)
|
49 |
+
all_combinations = itertools.product(*candidates)
|
50 |
+
all_combinations_list = list(all_combinations)
|
51 |
+
for id, cnd in enumerate(all_combinations_list):
|
52 |
+
normal_seq = ' '.join([c[0] for c in cnd])
|
53 |
+
lemma_seq = ' '.join([c[1] for c in cnd])
|
54 |
+
lemma_seq = utils.clean_text_for_lm(lemma_seq)
|
55 |
+
out_dict[id] = (normal_seq, lemma_seq)
|
56 |
+
candidates = [[item[0] for item in candidate_phrases] for candidate_phrases in candidates]
|
57 |
+
return model.lm_obj.get_best(candidates)
|
58 |
+
|
59 |
+
|
60 |
+
def translate(model, sentence_tokenizer, txt):
|
61 |
+
sents = sentence_tokenizer.tokenize(txt)
|
62 |
+
formal_output = ''
|
63 |
+
for sentence in sents:
|
64 |
+
formal_sentence = translate_short_sent(model, sentence)
|
65 |
+
formal_output += ' ' + formal_sentence
|
66 |
+
return formal_output
|
67 |
+
|
68 |
+
def load_config(config_file):
|
69 |
+
with open(config_file, "r") as file:
|
70 |
+
config = yaml.safe_load(file)
|
71 |
+
return config
|
72 |
+
|
73 |
+
|
74 |
+
|
75 |
+
|
76 |
+
if __name__ == '__main__':
|
77 |
+
|
78 |
+
#download or load files
|
79 |
+
DEFAULT_CACHE_DIR = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
|
80 |
+
config = load_config('config.yml')
|
81 |
+
file_urls = config['files'].values()
|
82 |
+
download_dataset(file_urls, DEFAULT_CACHE_DIR, filename=None)
|
83 |
+
|
84 |
+
# set assets files address
|
85 |
+
verbs_csv_addr = os.path.join(DEFAULT_CACHE_DIR, 'verbs.csv')
|
86 |
+
irregular_verbs_mapper = os.path.join(DEFAULT_CACHE_DIR, 'irregular_verb_mapper.csv')
|
87 |
+
lm_addr = os.path.join(DEFAULT_CACHE_DIR,'3gram.bin')
|
88 |
+
assets_file_addr = os.path.join(DEFAULT_CACHE_DIR,'assets.pkl')
|
89 |
+
|
90 |
+
#test on a sample
|
91 |
+
sentence_tokenizer = SentenceTokenizer()
|
92 |
+
model = FormalityTransformer(asset_file_addr=assets_file_addr,
|
93 |
+
irregular_verbs_mapper_addr=irregular_verbs_mapper, verbs_csv_addr=verbs_csv_addr, lm_addr=lm_addr)
|
94 |
+
print(translate(model, sentence_tokenizer, 'اینو میشه واسه تبدیل تموم جملات محاوره استفاده کرد اگه خواستین'))
|
95 |
+
|
96 |
+
|
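
Once the assets have been downloaded (for example by a first run of main.py), the same entry points can be reused programmatically; the paths below assume the default cache directory used throughout this repo:

# Sketch: programmatic use of the pipeline on custom text (assets assumed already cached).
import os
from pathlib import Path
from hazm import SentenceTokenizer
from formality_transformer import FormalityTransformer
from main import translate

cache = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
model = FormalityTransformer(
    asset_file_addr=os.path.join(cache, 'assets.pkl'),
    verbs_csv_addr=os.path.join(cache, 'verbs.csv'),
    irregular_verbs_mapper_addr=os.path.join(cache, 'irregular_verb_mapper.csv'),
    lm_addr=os.path.join(cache, '3gram.bin'),
)
print(translate(model, SentenceTokenizer(), 'اینو میشه واسه تبدیل تموم جملات محاوره استفاده کرد اگه خواستین'))
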
requirements.txt
ADDED
@@ -0,0 +1,7 @@
1 |
+
pandas
|
2 |
+
hazm
|
3 |
+
datasets
|
4 |
+
PyYAML
|
5 |
+
kenlm
|
6 |
+
streamlit
|
7 |
+
git+https://github.com/kpu/kenlm@master#egg=kenlm
|
tokenizer.py
ADDED
@@ -0,0 +1,184 @@
1 |
+
import itertools
|
2 |
+
import utils
|
3 |
+
class InformalTokenizer:
|
4 |
+
def __init__(self, vocab, postfixes):
|
5 |
+
self.vocab = vocab
|
6 |
+
self.pres = InformalTokenizer.get_prefixs()
|
7 |
+
self.posts = postfixes
|
8 |
+
|
9 |
+
@staticmethod
|
10 |
+
def get_prefixs():
|
11 |
+
return ['نا', 'بی', 'هر', 'می']
|
12 |
+
|
13 |
+
@staticmethod
|
14 |
+
def get_postfixs(informal_postfix_addr):
|
15 |
+
with open(informal_postfix_addr, 'r') as f:
|
16 |
+
ps = f.read().splitlines()
|
17 |
+
return ps
|
18 |
+
|
19 |
+
|
20 |
+
def is_pre_post_word(self, w):
|
21 |
+
nim_fasele = '\u200c'  # zero-width non-joiner (half-space)
|
22 |
+
ws = w.split(nim_fasele)
|
23 |
+
pre, pos, v = [0,1,2]
|
24 |
+
is_pre_pos = False
|
25 |
+
state = pre
|
26 |
+
valid_w = ''
|
27 |
+
for w in ws:
|
28 |
+
if state == pre:
|
29 |
+
if w in self.pres:
|
30 |
+
valid_w += nim_fasele + w
|
31 |
+
is_pre_pos = True
|
32 |
+
continue
|
33 |
+
elif w in self.posts:
|
34 |
+
valid_w += nim_fasele + w
|
35 |
+
is_pre_pos = True
|
36 |
+
state = pos
|
37 |
+
continue
|
38 |
+
state = v
|
39 |
+
valid_w += nim_fasele + w
|
40 |
+
continue
|
41 |
+
|
42 |
+
if state == pos:
|
43 |
+
if w in self.posts:
|
44 |
+
valid_w += nim_fasele + w
|
45 |
+
continue
|
46 |
+
return False
|
47 |
+
if state == v:
|
48 |
+
if w in self.posts:
|
49 |
+
is_pre_pos = True
|
50 |
+
state = pos
|
51 |
+
valid_w += nim_fasele + w
|
52 |
+
continue
|
53 |
+
if w in self.vocab:
|
54 |
+
valid_w += nim_fasele + w
|
55 |
+
if valid_w not in self.vocab:
|
56 |
+
return False
|
57 |
+
continue
|
58 |
+
|
59 |
+
return False
|
60 |
+
if not is_pre_pos:
|
61 |
+
return False
|
62 |
+
return True
|
63 |
+
|
64 |
+
|
65 |
+
def get_valid_word(self, words):
|
66 |
+
seps = ['', '']
|
67 |
+
all_seqs = []
|
68 |
+
count = len(words)
|
69 |
+
lst = list(itertools.product(seps, repeat=count-1))
|
70 |
+
for item in lst:
|
71 |
+
seq = ''
|
72 |
+
for word, sep in zip(words[:-1], item):
|
73 |
+
seq += word + sep
|
74 |
+
seq += words[-1]
|
75 |
+
all_seqs.append(seq)
|
76 |
+
return [w for w in all_seqs if w in self.vocab or self.is_pre_post_word(w)]
|
77 |
+
|
78 |
+
def get_candidates(self, tokens, index=0, current_seq = ' '):
|
79 |
+
if index == len(tokens):
|
80 |
+
return current_seq
|
81 |
+
word = tokens[index]
|
82 |
+
next_word, next_next_word = [None, None]
|
83 |
+
if index < len(tokens) -1:
|
84 |
+
next_word = tokens[index+1]
|
85 |
+
if index < len(tokens) -2:
|
86 |
+
next_next_word = tokens[index+2]
|
87 |
+
cnds = []
|
88 |
+
if next_word is not None:
|
89 |
+
v_words = self.get_valid_word([word, next_word])
|
90 |
+
if v_words:
|
91 |
+
for v_w in v_words:
|
92 |
+
current_seq1 = current_seq + ' ' + v_w
|
93 |
+
cnds2 = self.get_candidates(tokens,index+2, current_seq1)
|
94 |
+
if type(cnds2) == str:
|
95 |
+
cnds.append(cnds2)
|
96 |
+
else:
|
97 |
+
cnds.extend(cnds2)
|
98 |
+
if next_next_word is not None:
|
99 |
+
v_words = self.get_valid_word([word, next_word, next_next_word])
|
100 |
+
if v_words:
|
101 |
+
for v_w in v_words:
|
102 |
+
current_seq2 = current_seq + ' ' + v_w
|
103 |
+
cnds3 = self.get_candidates(tokens,index+3, current_seq2)
|
104 |
+
if type(cnds3) == str:
|
105 |
+
cnds.append(cnds3)
|
106 |
+
else:
|
107 |
+
cnds.extend(cnds3)
|
108 |
+
current_seq = current_seq + ' ' + word
|
109 |
+
cnds1 = self.get_candidates(tokens,index+1, current_seq)
|
110 |
+
if type(cnds1) == str:
|
111 |
+
cnds.append(cnds1)
|
112 |
+
else:
|
113 |
+
cnds.extend(cnds1)
|
114 |
+
return [c.strip() for c in cnds]
|
115 |
+
|
116 |
+
def seperate_conjs(self, word, validator):
|
117 |
+
conjs = ['و', 'در', 'با', 'تا', 'که', 'از', 'تو', 'من', 'شما']
|
118 |
+
cnds = utils.split_conj_words(word, conjs)
|
119 |
+
valid_cnds = [c for c in cnds if validator(c)]
|
120 |
+
if valid_cnds:
|
121 |
+
return valid_cnds
|
122 |
+
return [word]
|
123 |
+
|
124 |
+
def tokenize(self, txt, validator):
|
125 |
+
tokens = txt.split()
|
126 |
+
all_cnds = []
|
127 |
+
for t in tokens:
|
128 |
+
if not validator(t):
|
129 |
+
ws = self.seperate_conjs(t, validator)
|
130 |
+
else:
|
131 |
+
ws = [t]
|
132 |
+
all_cnds.append(ws)
|
133 |
+
all_cnd_tokens = itertools.product(*all_cnds)
|
134 |
+
txts = list(map(self.get_dense_tokens, all_cnd_tokens))
|
135 |
+
return txts
|
136 |
+
|
137 |
+
def get_dense_tokens(self, tokens):
|
138 |
+
PRE, WORD, POST = 0,1,2
|
139 |
+
out_tokens = []
|
140 |
+
nim_fasele = '\u200c'  # zero-width non-joiner (half-space)
|
141 |
+
current_word = ''
|
142 |
+
state = WORD
|
143 |
+
for i, t in enumerate(tokens):
|
144 |
+
if state == WORD:
|
145 |
+
if t in self.pres:
|
146 |
+
out_tokens.append(current_word)
|
147 |
+
current_word = t
|
148 |
+
state = PRE
|
149 |
+
if t in self.posts:
|
150 |
+
current_word += nim_fasele
|
151 |
+
current_word += t
|
152 |
+
state = POST
|
153 |
+
if t not in self.pres and t not in self.posts:
|
154 |
+
out_tokens.append(current_word)
|
155 |
+
current_word = t
|
156 |
+
continue
|
157 |
+
if state == PRE:
|
158 |
+
if t in self.pres:
|
159 |
+
current_word += nim_fasele
|
160 |
+
current_word += t
|
161 |
+
if t in self.posts:
|
162 |
+
out_tokens.append(current_word)
|
163 |
+
current_word = t
|
164 |
+
state = WORD
|
165 |
+
if t not in self.pres and t not in self.posts:
|
166 |
+
current_word += nim_fasele
|
167 |
+
current_word += t
|
168 |
+
state = WORD
|
169 |
+
continue
|
170 |
+
if state == POST:
|
171 |
+
if t in self.pres:
|
172 |
+
out_tokens.append(current_word)
|
173 |
+
current_word = t
|
174 |
+
state = PRE
|
175 |
+
if t in self.posts:
|
176 |
+
current_word += nim_fasele
|
177 |
+
current_word += t
|
178 |
+
if t not in self.pres and t not in self.posts:
|
179 |
+
out_tokens.append(current_word)
|
180 |
+
current_word = t
|
181 |
+
state = WORD
|
182 |
+
if out_tokens[-1] != current_word:
|
183 |
+
out_tokens.append(current_word)
|
184 |
+
return out_tokens
|
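
get_candidates explores whether adjacent tokens should be merged back into a single vocabulary word and returns every plausible segmentation. A toy run with Latin placeholder tokens (the real vocabulary is Persian); note that importing tokenizer pulls in utils, so hazm and pandas must be installed:

# Sketch: merge candidates for two tokens whose concatenation is an in-vocabulary word.
from tokenizer import InformalTokenizer

tok = InformalTokenizer(vocab={'bookstore'}, postfixes=[])
print(tok.get_candidates(['book', 'store']))
# includes both the merged 'bookstore' and the original split 'book store'
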
utils.py
ADDED
@@ -0,0 +1,302 @@
1 |
+
from functools import reduce
|
2 |
+
import itertools
|
3 |
+
import json
|
4 |
+
import re
|
5 |
+
import string
|
6 |
+
import pandas as pd
|
7 |
+
from hazm import Normalizer, WordTokenizer
|
8 |
+
|
9 |
+
normalizer = Normalizer()
|
10 |
+
tokenizer = WordTokenizer(separate_emoji=True)
|
11 |
+
|
12 |
+
|
13 |
+
def seprate_emoji_string(txt):
|
14 |
+
try:
|
15 |
+
oRes = re.compile(u'(['
|
16 |
+
u'\U0001F300-\U0001F64F'
|
17 |
+
u'\U0001F680-\U0001F6FF'
|
18 |
+
u'\u2600-\u26FF\u2700-\u27BF]+)',
|
19 |
+
re.UNICODE)
|
20 |
+
except re.error:
|
21 |
+
oRes = re.compile(u'(('
|
22 |
+
u'\ud83c[\udf00-\udfff]|'
|
23 |
+
u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
|
24 |
+
u'[\u2600-\u26FF\u2700-\u27BF])+)',
|
25 |
+
re.UNICODE)
|
26 |
+
|
27 |
+
return oRes.sub(r' \1 ', txt)
|
28 |
+
|
29 |
+
def cleanify(txt):
|
30 |
+
txt = txt.strip()
|
31 |
+
txt = re.sub('\s+', ' ', txt)
|
32 |
+
txt = re.sub('\u200f', '', txt)
|
33 |
+
txt = re.sub('+', '', txt)
|
34 |
+
txt = re.sub(' ', ' ', txt)
|
35 |
+
txt = re.sub(' ', ' ', txt)
|
36 |
+
txt = normalizer.normalize(txt)
|
37 |
+
txt = seprate_emoji_string(txt)
|
38 |
+
txt = ' '.join(tokenizer.tokenize(txt))
|
39 |
+
return txt
|
40 |
+
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
def clean_text_for_lm(txt):
|
45 |
+
ignore_chars = '.1234567890!@#$%^&*()_+۱۲۳۴۵۶۷۸۹÷؟×−+?><}،,{":' + string.ascii_lowercase + string.ascii_uppercase
|
46 |
+
tokens = txt.split()
|
47 |
+
clean_tokens = [t for t in tokens if not (any(ic in t for ic in ignore_chars) or if_emoji(t))]
|
48 |
+
return ' '.join(clean_tokens)
|
49 |
+
|
50 |
+
|
51 |
+
def add_to_mapper(mapping_list):
|
52 |
+
print(len(mapping_list))
|
53 |
+
df = pd.read_csv('resources/mapper.csv', delimiter=',', index_col=None)
|
54 |
+
print(df.columns)
|
55 |
+
for item in mapping_list:
|
56 |
+
df = df.append({'formal': item[1], 'informal': item[0]}, ignore_index=True)
|
57 |
+
df.to_csv('resources/mapper.csv', index=False)
|
58 |
+
|
59 |
+
|
60 |
+
def extract_non_convertable_words(corpus_addr, tokenizer, normalizer, transformer, output_addr, vocab):
|
61 |
+
f = open(corpus_addr)
|
62 |
+
non_convertables = {}
|
63 |
+
seen_words = set()
|
64 |
+
nim_fasele = ''
|
65 |
+
for i, line in enumerate(f):
|
66 |
+
print(i)
|
67 |
+
# if i > 500:
|
68 |
+
# break
|
69 |
+
line = normalizer.normalize(line)
|
70 |
+
tokens = tokenizer.tokenize(line)
|
71 |
+
for t in tokens:
|
72 |
+
# if nim_fasele in t:
|
73 |
+
# print(t)
|
74 |
+
if t in seen_words:
|
75 |
+
if t in non_convertables:
|
76 |
+
non_convertables[t] += 1
|
77 |
+
else:
|
78 |
+
candidates = transformer.transform(t, None)
|
79 |
+
# if not candidates and any(t.startswith(pre) for pre in ['از', 'در', 'چند', 'هر', 'هیچ', 'هم', 'با', 'بی', 'تا', 'و']):
|
80 |
+
# print(t)
|
81 |
+
if not candidates:
|
82 |
+
non_convertables[t] = 1
|
83 |
+
seen_words.add(t)
|
84 |
+
words_count = sorted([(word, count) for word, count in non_convertables.items()], key=lambda item: item[1], reverse=True)
|
85 |
+
words_count = [str(word) + ' ########### ' + str(count) for (word, count) in words_count]
|
86 |
+
with open(output_addr, 'w+') as f:
|
87 |
+
f.write('\n'.join(words_count))
|
88 |
+
|
89 |
+
|
90 |
+
def generate_irrgular_informal_verbs():
|
91 |
+
"""
|
92 |
+
برمیگرده میوفته برمیداره برمیگردونه درمیاره ایستادن نمیومد وامیسته
|
93 |
+
|
94 |
+
اومد
|
95 |
+
نیومد
|
96 |
+
اومدی
|
97 |
+
نیومدی
|
98 |
+
میومدی
|
99 |
+
نیومده
|
100 |
+
یومد
|
101 |
+
میومده
|
102 |
+
"""
|
103 |
+
|
104 |
+
mapping_verbs = []
|
105 |
+
past_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ید', 'ند', '', 'ن']
|
106 |
+
neg = ['ن', '']
|
107 |
+
pre = ['می', 'ب']
|
108 |
+
pre_verbs = [('بر', 'دار'), ('در', 'یار'), ('وا', 'ست'), ('بر', 'گرد'), ('ور', 'دار'), ('بر', 'گشت')]
|
109 |
+
extras = ['ن', 'نمی', 'می']
|
110 |
+
mapper = {'ه':'د', 'ن': 'ند', 'ین': 'ید', 'ور': 'بر', 'ست':'ایست', 'وا':'', 'یار':'آور'}
|
111 |
+
for item in pre_verbs:
|
112 |
+
for pe in past_ends:
|
113 |
+
for ex in extras:
|
114 |
+
p_end = pe
|
115 |
+
item0 = item[0]
|
116 |
+
item1 = item[1]
|
117 |
+
inf = item0 + ex + item1 + p_end
|
118 |
+
inf = inf.replace('یی', 'ی')
|
119 |
+
if item0 in mapper:
|
120 |
+
item0 = mapper[item0]
|
121 |
+
if item1 in mapper:
|
122 |
+
item1 = mapper[item1]
|
123 |
+
if p_end in mapper:
|
124 |
+
p_end = mapper[p_end]
|
125 |
+
formal = item0 + ex + item1 + p_end
|
126 |
+
formal = formal.replace('می', 'می')
|
127 |
+
formal = formal.replace('نآ', 'نیا')
|
128 |
+
mapping_verbs.append([formal, inf])
|
129 |
+
bons = ['یومد', 'یوفت']
|
130 |
+
v_mapper = {'یومد': 'یامد', 'یوفت': 'افت'}
|
131 |
+
verbs = itertools.product(neg, pre, bons, past_ends)
|
132 |
+
for v in verbs:
|
133 |
+
if v[0] == 'ن' and v[1] == 'ب' or (v[2] == 'یومد' and v[1] == 'ب'):
|
134 |
+
continue
|
135 |
+
inf = v[0] + v[1] + v[2] + v[3]
|
136 |
+
inf = inf.replace('یی', 'ی')
|
137 |
+
pe = v[3]
|
138 |
+
if pe in mapper:
|
139 |
+
pe = mapper[pe]
|
140 |
+
formal = v[0] + v[1] + '' + v_mapper[v[2]] + pe
|
141 |
+
formal = formal.replace('یی', 'ی')
|
142 |
+
formal = formal.replace('یا', 'یآ')
|
143 |
+
formal = formal.replace('دد', 'ده')
|
144 |
+
formal = formal.replace('با', 'بی')
|
145 |
+
mapping_verbs.append([formal, inf])
|
146 |
+
add_to_mapper(mapping_verbs)
|
147 |
+
|
148 |
+
|
149 |
+
|
150 |
+
def load_vocab(vocab_addr='resources/words.dat'):
|
151 |
+
vocab = {}
|
152 |
+
with open(vocab_addr, 'r', encoding='utf-8') as f:
|
153 |
+
for line in f:
|
154 |
+
try:
|
155 |
+
word, freq, p_tags = line.strip().split('\t')
|
156 |
+
vocab[word] = {'freq': freq, 'tags': p_tags}
|
157 |
+
except:
|
158 |
+
word = line.strip()
|
159 |
+
vocab[word] = {'freq': 1, 'tags': 'NUM'}
|
160 |
+
return vocab
|
161 |
+
|
162 |
+
def if_connect(word1, word2):
|
163 |
+
not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
|
164 |
+
if any(w =='' for w in [word1, word2]) or word1[-1] in not_connect_chars:
|
165 |
+
return True
|
166 |
+
return False
|
167 |
+
def split_conj_words(word, conjs):
|
168 |
+
candidates = set()
|
169 |
+
sorted_conjs = sorted(conjs, key=lambda x: len(x), reverse=True)
|
170 |
+
for c in sorted_conjs:
|
171 |
+
indx = word.find(c)
|
172 |
+
if indx != -1 and indx in [0, len(word)-1]:
|
173 |
+
pre_w = word[:indx]
|
174 |
+
next_w = word[indx+len(c) :]
|
175 |
+
if if_connect(pre_w, c) and if_connect(c, next_w):
|
176 |
+
cnd = ' '.join([pre_w, c, next_w])
|
177 |
+
cnd = cnd.strip()
|
178 |
+
candidates.add(cnd)
|
179 |
+
return list(candidates)
|
180 |
+
|
181 |
+
|
182 |
+
def is_formal_prefixed(word, vocab):
|
183 |
+
not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
|
184 |
+
nim_fasele = '\u200c'  # zero-width non-joiner (half-space)
|
185 |
+
m1 = re.match('(.+)های(م|ت|ش|مان|تان|شان)?$', word)
|
186 |
+
m2 = re.match('(.+[ا|و|ی])ی(م|ت|ش|مان|تان|شان)$', word)
|
187 |
+
m3 = re.match('(.+[^ا^و^ی])(م|ت|ش|مان|تان|شان)$', word)
|
188 |
+
m4 = re.match('(.+)(ها)$', word)
|
189 |
+
m5 = re.match('(.+[ه|ی])(اش|ام|ات)$', word)
|
190 |
+
if m3 or m2:
|
191 |
+
prefix_word = list(filter(lambda m: m is not None, [m3, m2]))[0].group(1)
|
192 |
+
if prefix_word in vocab:
|
193 |
+
return True
|
194 |
+
m_fired = list(filter(lambda m: m is not None, [m1, m4, m5]))
|
195 |
+
if len(m_fired) > 0:
|
196 |
+
# print(word, m_fired[0].groups())
|
197 |
+
prefix_word = m_fired[0].group(1)
|
198 |
+
if prefix_word[-1] != nim_fasele and prefix_word[-1] not in not_connect_chars:
|
199 |
+
return False
|
200 |
+
if prefix_word[-1] == nim_fasele and not (prefix_word[:-1] in vocab):
|
201 |
+
return False
|
202 |
+
if prefix_word[-1] != nim_fasele and not (prefix_word in vocab):
|
203 |
+
return False
|
204 |
+
return True
|
205 |
+
return False
|
206 |
+
|
207 |
+
|
208 |
+
def spelling_similairty(word):
|
209 |
+
all_possible = []
|
210 |
+
possible_repeated = get_possible_repeated_word(word)
|
211 |
+
all_possible = possible_repeated
|
212 |
+
if word in all_possible:
|
213 |
+
all_possible.remove(word)
|
214 |
+
return all_possible
|
215 |
+
|
216 |
+
def add_nim_alef_hat_dictionary(vocab):
|
217 |
+
word_with_hat = filter(lambda w: 'آ' in w, vocab)
|
218 |
+
word_with_nim = filter(lambda w: '' in w, vocab)
|
219 |
+
mapper1 = {w.replace('آ', 'ا').replace('', ''): w for w in word_with_hat}
|
220 |
+
mapper2 = {w.replace('', ''): w for w in word_with_nim}
|
221 |
+
mapper1.update(mapper2)
|
222 |
+
return mapper1
|
223 |
+
|
224 |
+
def generate_spell_mapper(vocab):
|
225 |
+
hat = 'آ'
|
226 |
+
tanvin = 'اً'
|
227 |
+
nim = '\u200c'  # zero-width non-joiner (half-space)
|
228 |
+
hamzeh = 'أ'
|
229 |
+
hamzeh_y = 'ئ'
|
230 |
+
sp_mapper = {hamzeh_y: ['ی'], hat: ['ا'], tanvin: ['ن', 'ا'], nim:['', ' '], hamzeh:['ا', '']}
|
231 |
+
special_chars = [hat, tanvin, nim, hamzeh]
|
232 |
+
out = {}
|
233 |
+
for word in vocab:
|
234 |
+
p_words = [word.replace(sp, sp_alt) for sp in special_chars for sp_alt in sp_mapper[sp]]
|
235 |
+
spell_errors = []
|
236 |
+
p_words = list(set(p_words) - set([word]))
|
237 |
+
for pw in p_words:
|
238 |
+
if pw in out:
|
239 |
+
out[pw].add(word)
|
240 |
+
else:
|
241 |
+
out[pw] = {word}
|
242 |
+
out = {w: list(out[w]) for w in out}
|
243 |
+
with open('spell_checker_mapper.json', 'w+', encoding='utf-8') as f:
|
244 |
+
json.dump(out, f, ensure_ascii=False, indent=1)
|
245 |
+
|
246 |
+
|
247 |
+
|
248 |
+
def create_mapper_tanvin_hamze_hat_nim_fasele():
|
249 |
+
mapper = {}
|
250 |
+
hats_word = open('resources/spell/words_with_hat.txt').read().splitlines()
|
251 |
+
nim_words = open('resources/spell/words_with_nim.txt').read().splitlines()
|
252 |
+
tanvin_words = open('resources/spell/words_with_tanvin.txt').read().splitlines()
|
253 |
+
hat_ch = 'آ'
|
254 |
+
nim_fasele = '\u200c'  # zero-width non-joiner (half-space)
|
255 |
+
for w in hats_word:
|
256 |
+
w_without_h = w.replace(hat_ch, 'ا')
|
257 |
+
mapper[w_without_h] = w
|
258 |
+
for w in nim_words:
|
259 |
+
w_without_nim = w.replace(nim_fasele, '')
|
260 |
+
mapper[w_without_nim] = w
|
261 |
+
w_space_instead_nim = w.replace(nim_fasele, ' ')
|
262 |
+
mapper[w_space_instead_nim] = w
|
263 |
+
|
264 |
+
def extract_lemma_nim_fasele_words(word, vocab):
|
265 |
+
prefixs = ['اون']
|
266 |
+
postfixs = {'ست': 'است', 'هام':'هایم', 'ام':'ام', 'ها':'ها', 'هامون':'هایمان', 'ترین': 'ترین', 'هایشان':'هایشان'}
|
267 |
+
tokens = word.split('')
|
268 |
+
index = 0
|
269 |
+
for i in range(len(tokens)):
|
270 |
+
index = i
|
271 |
+
if tokens[i] not in prefixs:
|
272 |
+
break
|
273 |
+
|
274 |
+
for i in range(len(tokens), 0, -1):
|
275 |
+
current_tok = ''.join(tokens[index:i])
|
276 |
+
if current_tok in vocab or tokens[i-1] not in postfixs:
|
277 |
+
return current_tok
|
278 |
+
|
279 |
+
|
280 |
+
def if_emoji(text):
|
281 |
+
# Wide UCS-4 build
|
282 |
+
try:
|
283 |
+
oRes = re.compile(u'(['
|
284 |
+
u'\U0001F300-\U0001F64F'
|
285 |
+
u'\U0001F680-\U0001F6FF'
|
286 |
+
u'\u2600-\u26FF\u2700-\u27BF]+)',
|
287 |
+
re.UNICODE)
|
288 |
+
|
289 |
+
except re.error:
|
290 |
+
# Narrow UCS-2 build
|
291 |
+
oRes = re.compile(u'(('
|
292 |
+
u'\ud83c[\udf00-\udfff]|'
|
293 |
+
u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
|
294 |
+
u'[\u2600-\u26FF\u2700-\u27BF])+)',
|
295 |
+
re.UNICODE)
|
296 |
+
|
297 |
+
return oRes.findall(text)
|
298 |
+
|
299 |
+
|
300 |
+
def powerset(lst):
|
301 |
+
return reduce(lambda result, x: result + [subset + [x] for subset in result],
|
302 |
+
lst, [[]])
|
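
powerset builds every subset by folding each element into all subsets collected so far. A tiny demonstration (importing utils requires hazm and pandas, since the module instantiates a Normalizer and WordTokenizer at load time):

# Sketch: powerset of a two-element list.
from utils import powerset

print(powerset([1, 2]))  # [[], [1], [2], [1, 2]]
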