mohammadkrb committed on
Commit
6227608
1 Parent(s): 3fd00d4

init streamlit based app

Files changed (11)
  1. OneShotTransformer.py +600 -0
  2. VerbHandler.py +350 -0
  3. app.py +128 -0
  4. config.yml +5 -0
  5. download_utils.py +65 -0
  6. formality_transformer.py +40 -0
  7. kenlm_wrapper.py +31 -0
  8. main.py +96 -0
  9. requirements.txt +7 -0
  10. tokenizer.py +184 -0
  11. utils.py +302 -0
OneShotTransformer.py ADDED
@@ -0,0 +1,600 @@
1
+ import re
2
+ import itertools
3
+ import string
4
+ import utils
5
+
6
+
7
+ class InformalWord:
8
+ def __init__(self, lemma, prefixs=None, postfixs=None, pos=None, append_h=False):
9
+ if prefixs is None:
10
+ prefixs = []
11
+ if postfixs is None:
12
+ postfixs = []
13
+ self.is_verb = False
14
+ self.is_mapper = False
15
+ self.semi_mapper = False
16
+ self.append_h = append_h
17
+ self.lemma = lemma
18
+ self.prefixs = prefixs
19
+ self.postfixs = postfixs
20
+ self.pos = pos
21
+
22
+ class Prefix:
23
+ def __init__(self, word, level, formal=None, ignore_poses=None, poses=None, non_connecting_chars=None, connector='nim'):
24
+ if non_connecting_chars is None:
25
+ non_connecting_chars = []
26
+ self.word = word
27
+ self.level = level
28
+ self.ignore_poses = ignore_poses
29
+ self.poses = poses
30
+ self.connector = connector
31
+ if formal is None:
32
+ self.formal = word
33
+ else:
34
+ self.formal = formal
35
+ self.non_connecting_chars = non_connecting_chars
36
+ class Postfix:
37
+ def __init__(self, word, level, formal=None, ignore_poses=None, non_connecting_chars=None, poses=None, connector='nim'):
38
+ if non_connecting_chars is None:
39
+ non_connecting_chars = []
40
+ self.word = word
41
+ self.level = level
42
+ self.ignore_poses = ignore_poses
43
+ self.poses = poses
44
+ self.connector = connector
45
+ if formal is None:
46
+ self.formal = word
47
+ else:
48
+ self.formal = formal
49
+ self.non_connecting_chars = non_connecting_chars
50
+
51
+
52
+
53
+ class OneShotTransformer:
54
+
55
+ NIM_FASELE = chr(8204)
56
+ # prefixs
57
+ HAMUN = Prefix('همون', 1, 'همان',connector='fasele',non_connecting_chars=['ه'])
58
+ HAMIN = Prefix('همین', 1,connector='fasele')
59
+ HAR = Prefix('هر', 1,connector='fasele')
60
+ UN = Prefix('اون', 1, 'آن',connector='fasele',non_connecting_chars=['ه'])
61
+ IN = Prefix('این', 1,connector='fasele',non_connecting_chars=['ه'])
62
+ HICH = Prefix('هیچ', 1,connector='nim',non_connecting_chars=['ه', 'ا', 'آ'])
63
+ B = Prefix('ب', 1, 'به', ignore_poses=['VERB', 'CCONJ', 'SCONJ'],connector='fasele',non_connecting_chars=['ا', 'ه', 'آ'])
64
+ Y = Prefix('ی', 1, 'یک', ignore_poses=['VERB', 'CCONJ', 'SCONJ'],connector='fasele',non_connecting_chars=['ا', 'آ'])
65
+ BI = Prefix('بی', 1, ignore_poses=['VERB'],connector='nim',non_connecting_chars=['ا'])
66
+ POR = Prefix('پر', 1, ignore_poses=['VERB'],connector='nim')
67
+ pres = [[HAMIN, HAMUN, UN, IN, HAMIN, BI, B, Y, POR, HAR]]
68
+ #postfixs
69
+ Y1 = Postfix('ی', 0, ignore_poses=['VERB'], connector='none',non_connecting_chars=['ی', 'ا', 'و', 'آ', 'اً'])
70
+ TAR = Postfix('تر', 1, connector='nim')
71
+ TARIN = Postfix('ترین', 1, connector='nim')
72
+ HAY = Postfix('های', 2, connector='nim')
73
+ HA = Postfix('ها', 2, connector='nim')
74
+ A = Postfix('ا', 2, 'ها', ignore_poses=['VERB'], connector='nim',non_connecting_chars=['ا', 'و', 'آ', 'اً'])
75
+ A1 = Postfix('ای', 2, 'های', ignore_poses=['VERB'], connector='nim',non_connecting_chars=['ا', 'و', 'آ', 'اً'])
76
+ YY = Postfix('یی', 3, 'یی', ignore_poses=['VERB'], connector='none')
77
+ M = Postfix('م', 3, ignore_poses=['VERB'], connector='none')
78
+ M_MAN = Postfix('م', 3, 'من', ignore_poses=['VERB'], connector='fasele')
79
+ T = Postfix('ت', 3, connector='none')
80
+ T1 = Postfix('ت', 3, 'تو', connector='fasele')
81
+ # T2 = Postfix('ت', 3, 'خود', ignore_poses=['VERB'], connector='fasele')
82
+ SH = Postfix('ش', 3, connector='none')
83
+ # SH1 = Postfix('ش', 3, 'خود', connector='fasele')
84
+ # SH2 = Postfix('ش', 3, 'آن', connector='fasele')
85
+ # SH3 = Postfix('ش', 3, 'او', connector='fasele')
86
+ MAN = Postfix('مان', 3, connector='nim')
87
+ MAN1 = Postfix('مان', 3, 'ما', connector='fasele')
88
+ # MAN2 = Postfix('مان', 3, 'خود', connector='fasele')
89
+ MUN = Postfix('مون', 3, 'مان', connector='nim')
90
+ # MUN1 = Postfix('مون', 3, 'خود', connector='fasele')
91
+ MUN2 = Postfix('مون', 3, 'ما', connector='fasele')
92
+ TAN = Postfix('تان', 3, connector='nim')
93
+ # TAN1 = Postfix('تان', 3, 'خود', connector='fasele')
94
+ TAN2 = Postfix('تان', 3, 'شما', connector='fasele')
95
+ TUN = Postfix('تون', 3, 'تان', connector='nim')
96
+ # TUN1 = Postfix('تون', 3, 'خود', connector='fasele')
97
+ TUN2 = Postfix('تون', 3, 'شما', connector='fasele')
98
+ SHAN = Postfix('شان', 3, connector='nim')
99
+ # SHAN1 = Postfix('شان', 3, 'خود', connector='fasele')
100
+ SHAN2 = Postfix('شان', 3, 'آنان', connector='fasele')
101
+ SHUN = Postfix('شون', 3, 'شان', connector='nim')
102
+ # SHUN1 = Postfix('شون', 3, 'خود', connector='fasele')
103
+ SHUN2 = Postfix('شون', 3, 'آنان', connector='fasele')
104
+ N = Postfix('ن', 4, 'هستند', ignore_poses=['VERB', 'CCONJ', 'SCONJ'], connector='fasele', non_connecting_chars=['ی'])
105
+ SHAM = Postfix('شم', 4, 'بشوم',ignore_poses=['VERB'], connector='fasele')
106
+ SHI= Postfix('شی', 4, 'بشوی',ignore_poses=['VERB'], connector='fasele')
107
+ SHE= Postfix('شه', 4, 'شود',ignore_poses=['VERB'], connector='fasele')
108
+ SHIN= Postfix('شین', 4, 'شوید',ignore_poses=['VERB'], connector='fasele')
109
+ SHID= Postfix('شید', 4, 'شوید',ignore_poses=['VERB'], connector='fasele')
110
+ SHAAN= Postfix('شن', 4, 'شوند',ignore_poses=['VERB'], connector='fasele')
111
+ SHAND= Postfix('شند', 4, 'شوند',ignore_poses=['VERB'], connector='fasele')
112
+ M2 = Postfix('م', 4, 'هم',ignore_poses=['VERB'], connector='fasele')
113
+ V = Postfix('و', 4, 'را', connector='fasele', non_connecting_chars=['ا', 'ای', 'آ', 'اً'])
114
+ V1 = Postfix('رو', 4, 'را', connector='fasele')
115
+ H = Postfix('ه', 4, '', ignore_poses=['VERB', 'CCONJ', 'SCONJ'], connector='none')
116
+ # H2 = Postfix('ه', 4)
117
+ M1 = Postfix('م', 4, 'هستم',ignore_poses=['VERB'], connector='fasele')
118
+ Y2 = Postfix('ی', 4, 'ی', ignore_poses=['VERB'], connector='none')
119
+ H1 = Postfix('ه', 4, 'است', ignore_poses=['VERB'], connector='fasele', non_connecting_chars=['ا', 'آ', 'اً'])
120
+ S = Postfix('س', 4, 'است', connector='fasele')
121
+ ST = Postfix('ست', 4, 'است', connector='fasele')
122
+ ED = Postfix('ید', 4, 'هستید', ignore_poses=['VERB'], connector='fasele')
123
+ EN = Postfix('ین', 4, 'هستید', ignore_poses=['VERB'], connector='fasele', non_connecting_chars=['تر'])
124
+ EM = Postfix('یم', 4, 'هستیم', ignore_poses=['VERB'], connector='fasele')
125
+ ND = Postfix('ند', 4, 'هستند', ignore_poses=['VERB'], connector='fasele')
126
+ # posts = [[Y1], [TAR, TARIN], [HA, HAY, A, A1], [M, T, SH, MAN, MUN, TAN, TUN, SHAN, SHUN], [N, S, ST, M1, M2, V, V1,Y2, H, H1, ED, EN, EM, ND, SHAM, SHI, SHID, SHE, SHAND, SHIN, SHAAN]]
127
+ # posts = [[Y1], [TAR, TARIN], [HA, HAY, A, A1], [YY, M, M_MAN, T, T1, T2, SH, MAN, MAN1, MAN2,MUN,MUN1,MUN2, TAN,TAN1,TAN2, TUN,TUN1,TUN2, SHAN,SHAN1,SHAN2, SHUN, SHUN1, SHUN2], [N, S, ST, M1, M2, V, V1,Y2, H1, ED, EN, EM, ND, SHAM, SHI, SHID, SHE, SHAND, SHIN, SHAAN]]
128
+ posts = [[Y1], [TAR, TARIN], [HA, HAY, A, A1], [YY, M, M_MAN, T, T1, SH, MAN, MAN1,MUN,MUN2, TAN,TAN2, TUN,TUN2, SHAN,SHAN2, SHUN, SHUN2], [N, S, ST, M1, M2, V, V1,Y2, H1, ED, EN, EM, ND, SHAM, SHI, SHID, SHE, SHAND, SHIN, SHAAN]]
129
+ PossessiveـPronouns = [M,T,SH, MAN, MUN, TAN, TUN, SHAN, SHUN]
130
+ cant_append_h_posts = [Y1, TAR, TARIN]
131
+ As = [A, A1]
132
+
133
+ def get_separator(self, w1, w2, append_h):
134
+ connector_2_str = {'none': '', 'nim': OneShotTransformer.NIM_FASELE, 'fasele': ' '}
135
+ not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
136
+ # if w2 == OneShotTransformer.Y2:
137
+ # return ''
138
+ # if w2 in [OneShotTransformer.M, OneShotTransformer.T, OneShotTransformer.SH] and ( type(w1) == str and w1[-1] in ['ا', 'و']):
139
+ # return 'ی'
140
+ # if type(w1) != str and w1.level == 1:
141
+ # return ' '
142
+ # not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
143
+ # if w1 in [OneShotTransformer.Y, OneShotTransformer.B, OneShotTransformer.HAMIN, OneShotTransformer.IN, OneShotTransformer.HAMUN] or w2 in [OneShotTransformer.ED, OneShotTransformer.EN, OneShotTransformer.EM, OneShotTransformer.ND, OneShotTransformer.H1, OneShotTransformer.M1, OneShotTransformer.S, OneShotTransformer.ST, OneShotTransformer.V, OneShotTransformer.N, OneShotTransformer.M2]:
144
+ # return ' '
145
+ #
146
+ # if ((type(w1) == str and len(w1)> 0 and w1[-1] in ['ا', 'و']) or (type(w1) != str and w1.formal[-1] in [ 'ا', 'و']))and w2.level == 3 :
147
+ # return 'ی' + '‌'
148
+ # if (type(w1) == str and len(w1)> 0 and w1[-1] in not_connect_chars) or (type(w1) != str and w1.word[-1] in not_connect_chars):
149
+ # return ''
150
+ all_pres = [p for pres in OneShotTransformer.pres for p in pres]
151
+ all_posts = [p for posts in OneShotTransformer.posts for p in posts]
152
+ if type(w1) == str:
153
+ last_ch = w1[-1]
154
+ else:
155
+ last_ch = w1.word[-1]
156
+ separator = ''
157
+ extra_sep = ''
158
+ if type(w1) == str and append_h and w2 in [OneShotTransformer.M, OneShotTransformer.T, OneShotTransformer.SH]:
159
+ extra_sep = OneShotTransformer.NIM_FASELE + 'ا'
160
+ if w2 in [OneShotTransformer.M, OneShotTransformer.T, OneShotTransformer.SH, OneShotTransformer.MAN, OneShotTransformer.MUN, OneShotTransformer.TAN, OneShotTransformer.TUN, OneShotTransformer.SHAN, OneShotTransformer.SHUN] and ( last_ch in ['ا', 'و']) :
161
+ extra_sep = 'ی'
162
+ if w1 in all_pres:
163
+ separator = connector_2_str[w1.connector]
164
+ if w2 in all_posts:
165
+ separator = connector_2_str[w2.connector]
166
+
167
+ # replace nim_fasele with '' for non connected words
168
+
169
+ if last_ch in not_connect_chars and separator == OneShotTransformer.NIM_FASELE:
170
+ separator = ''
171
+ return extra_sep + separator
172
+
173
+ def lemma_to_formals(self, iword):
174
+ out_iwords = [iword]
175
+ if iword.lemma in self.mapper and self.iword2str(iword) != self.mapper[iword.lemma]:
176
+ for map_words in self.mapper[iword.lemma]:
177
+ new_iw = InformalWord(lemma=map_words,prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos, append_h=iword.append_h)
178
+ if not iword.prefixs and not iword.postfixs:
179
+ new_iw.is_mapper = True
180
+ new_iw.semi_mapper = True
181
+ else:
182
+ new_iw.semi_mapper = True
183
+ out_iwords.append(new_iw)
184
+ formal_verbs = self.verb_to_formal_func(iword.lemma)
185
+ if formal_verbs is not None:
186
+ for f_v in formal_verbs:
187
+ new_iw = InformalWord(lemma=f_v,prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos, append_h=iword.append_h)
188
+ new_iw.is_verb = True
189
+ out_iwords.append(new_iw)
190
+ return out_iwords
191
+
192
+
193
+ def should_ignore_by_postagg(self, iword):
194
+ post_pres = [pre for pre in iword.prefixs] + [post for post in iword.postfixs]
195
+ for p in post_pres:
196
+ if (p.ignore_poses and iword.pos in p.ignore_poses) or (p.poses and iword.pos not in p.poses):
197
+ return True
198
+ return False
199
+
200
+ def filtered_based_on_rules(self, iword):
201
+ #YY
202
+ ha_p = [OneShotTransformer.A, OneShotTransformer.HA]
203
+ if iword.postfixs and OneShotTransformer.YY in iword.postfixs and not all(p in ha_p + [OneShotTransformer.YY] for p in iword.postfixs):
204
+ return True
205
+ #hasti!
206
+ if (iword.postfixs and len(iword.postfixs) == 1 and OneShotTransformer.Y2 in iword.postfixs and iword.lemma and iword.lemma[-1] in ['و', 'ا']) or (iword.postfixs and len(iword.postfixs) == 2 and OneShotTransformer.Y2 in iword.postfixs and iword.postfixs[0] in [OneShotTransformer.A, OneShotTransformer.HA]):
207
+ return True
208
+ #non connecting chars
209
+ if iword.prefixs:
210
+ last_pre = iword.prefixs[-1]
211
+ if last_pre.non_connecting_chars and iword.lemma and any(iword.lemma.startswith(ch) for ch in last_pre.non_connecting_chars):
212
+ return True
213
+ if iword.postfixs:
214
+ first_post = iword.postfixs[0]
215
+ if first_post.non_connecting_chars and iword.lemma and any(iword.lemma.endswith(ch) for ch in first_post.non_connecting_chars):
216
+ return True
217
+ #hidden H # goshnashe
218
+ if not iword.semi_mapper and not iword.append_h and iword.lemma and iword.lemma[-1] == 'ه' and iword.postfixs and iword.lemma not in self.non_hidden_h_words:
219
+ return True
220
+ # h + h
221
+ if iword.prefixs and iword.postfixs and len(iword.lemma) < 2:
222
+ return True
223
+ # خونهه - خونششونه
224
+ if iword.append_h and (OneShotTransformer.H in iword.postfixs or (len(iword.postfixs) == 1 and OneShotTransformer.H1 in iword.postfixs) ):
225
+ return True
226
+ if iword.prefixs and (OneShotTransformer.B in iword.prefixs or OneShotTransformer.Y in iword.prefixs) and (iword.lemma and iword.lemma[0] in ['ا', 'ی', 'و']):
227
+ return True
228
+ if iword.lemma in self.isolated_words and (iword.prefixs or iword.postfixs):
229
+ return True
230
+ # verb + postfixs ex: برنامه
231
+ if (iword.is_verb and iword.prefixs) or(iword.is_verb and iword.postfixs and (len(iword.postfixs) > 1 or not any(p in iword.postfixs for p in OneShotTransformer.PossessiveـPronouns +[OneShotTransformer.V]))):
232
+ return True
233
+ return False
234
+
235
+ def iword2str(self, iword):
236
+ sorted_prefixs = list(sorted(iword.prefixs, key=lambda prefix: prefix.level))
237
+ sorted_postfixs = list(sorted(iword.postfixs, key=lambda postfix: postfix.level))
238
+ concated_str = ''
239
+ zipped_prefixs = [(sorted_prefixs[i], sorted_prefixs[i + 1]) if i < len(sorted_prefixs) - 1 else (
240
+ sorted_prefixs[i], iword.lemma) for i in range(len(sorted_prefixs))]
241
+ for prev_prefix, prefix in zipped_prefixs:
242
+ separator = self.get_separator(prev_prefix, prefix, append_h=False)
243
+ prefix_formal = prev_prefix.formal
244
+ concated_str += prefix_formal
245
+ concated_str += separator
246
+
247
+ concated_str += iword.lemma
248
+
249
+ zipped_postfix = [(sorted_postfixs[i - 1], sorted_postfixs[i]) if i > 0 else (iword.lemma, sorted_postfixs[i])
250
+ for i in range(len(sorted_postfixs))]
251
+ for postfix, next_postfix in zipped_postfix:
252
+ separator = self.get_separator(postfix, next_postfix, append_h=iword.append_h)
253
+ concated_str += separator
254
+ postfix_formal = next_postfix.formal
255
+ concated_str += postfix_formal
256
+ return concated_str
257
+
258
+ def to_formals(self, iword):
259
+ str_iwords = []
260
+ all_iwords = self.lemma_to_formals(iword)
261
+ for iword in all_iwords:
262
+ # if iword.lemma == 'اون':
263
+ # print('')
264
+ if len(iword.lemma) == 1 and iword.lemma != 'و':
265
+ str_iwords.append(('', None))
266
+ continue
267
+ if self.filtered_based_on_rules(iword):
268
+ str_iwords.append(('', None))
269
+ continue
270
+ if self.should_ignore_by_postagg(iword):
271
+ str_iwords.append(('', None))
272
+ continue
273
+ if not iword.is_verb and not iword.semi_mapper and iword.lemma not in self.vocab:
274
+ str_iwords.append(('', None))
275
+ continue
276
+ concated_str = self.iword2str(iword)
277
+ str_iwords.append((concated_str, iword))
278
+ return str_iwords
279
+
280
+ def un_in(self, iword):
281
+ new_lemma = iword.lemma.replace('ون', 'ان')
282
+ if new_lemma != iword.lemma:
283
+ return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
284
+ else:
285
+ return False
286
+
287
+ def prefix_obj(self, word):
288
+ op_separete = {'م': 'من', 'ت': 'تو', 'ش': 'آن', 'تان': 'شما', 'تون': 'شما', 'شون': 'آنان', 'شان': 'آنان',
289
+ 'مان': 'ما', 'مون': 'ما'}
290
+ candidates = []
291
+ formal = ''
292
+ m = self.pre_obj_pattern.match(word)
293
+ if m:
294
+ tokens = m.groups()
295
+ if tokens[0] == 'باها':
296
+ formal += 'با'
297
+ else:
298
+ formal += tokens[0]
299
+ formal_obj = op_separete[tokens[1]]
300
+ formal += ' '
301
+ formal += formal_obj
302
+ if tokens[2] is not None:
303
+ formal += ' '
304
+ formal += 'هم'
305
+ alts = {'هم': 'هستم', 'آن': 'او'}
306
+ tokens = [[w] for w in formal.split()]
307
+ for t in tokens:
308
+ if t[0] in alts:
309
+ t.append(alts[t[0]])
310
+
311
+ candidates = itertools.product(*tokens)
312
+ candidates = [' '.join(cnd) for cnd in candidates]
313
+
314
+ return [(c, c) for c in candidates]
315
+
316
+
317
+
318
+ def append_tanvin_hat(self, iword):
319
+ if len(iword.lemma) > 1 and iword.lemma[0] == 'ا' and iword.lemma[-1] != 'ا':
320
+ new_lemma = 'آ' + iword.lemma[1:]
321
+ return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
322
+ if len(iword.lemma) > 1 and iword.lemma[-1] == 'ا':
323
+ new_lemma = iword.lemma[:-1] + 'اً'
324
+ return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
325
+ return False
326
+
327
+ def append_h(self, iword):
328
+ not_apply = self.verb_to_formal_func(iword.lemma) or (iword.lemma and iword.lemma[-1] in ['ا', 'و', 'ی']) or len(iword.lemma) <= 1 or iword.lemma =='' or iword.lemma[-1] == 'ه' or (OneShotTransformer.H in iword.postfixs and len(iword.postfixs) == 1) or any(p in iword.postfixs for p in OneShotTransformer.As) or(OneShotTransformer.V in iword.postfixs) or (iword.postfixs and iword.postfixs[0].word[0] in ['ی', 'و','ا'])
329
+ ######## when add h?
330
+ new_lemma = iword.lemma + 'ه'
331
+ ############# new_lemma in self.vocab
332
+ if len(iword.postfixs) > 0 and not any([p in OneShotTransformer.cant_append_h_posts for p in iword.postfixs]) and not not_apply and new_lemma not in self.non_hidden_h_words:
333
+ # if len(iword.postfixs) > 0 and not not_apply and new_lemma in self.vocab and new_lemma not in self.non_hidden_h_words:
334
+ return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos, append_h= True)
335
+ return False
336
+
337
+ def __init__(self, vocab, mapper, verb_to_formal_func, ignore_words, postfix_mapper, isolated_words, non_hidden_h_words):
338
+ self.vocab = vocab
339
+ self.mapper = mapper
340
+ self.verb_to_formal_func = verb_to_formal_func
341
+ self.ignore_words = ignore_words
342
+ self.postfix_mapper = postfix_mapper
343
+ self.isolated_words = isolated_words
344
+ self.non_hidden_h_words = non_hidden_h_words
345
+ self.operators = [self.un_in, self.append_h, self.append_tanvin_hat]
346
+ patt = r'(از|به|باها)(مان|شون|شان|مون|م|تون|تان|ت|ش)(م)?$'
347
+ self.pre_obj_pattern = re.compile(patt)
348
+
349
+ def all_sequence_of_postfixs(self, word, index):
350
+ all_seqs =[]
351
+ for p in OneShotTransformer.posts[index]:
352
+ p_w = p.word
353
+ if word.startswith(p_w):
354
+ w = word[len(p_w):]
355
+ if len(w) == 0:
356
+ all_seqs.append(p)
357
+ else:
358
+ if index < len(OneShotTransformer.posts) -1 :
359
+ resp = self.all_sequence_of_postfixs(w, index+1)
360
+ if len(resp) > 0:
361
+ for item in resp:
362
+ if type(item) == list:
363
+ item.append(p)
364
+ sequence_with_p = item
365
+ else:
366
+ sequence_with_p = [p, item]
367
+ all_seqs.append(sequence_with_p)
368
+ if index < len(OneShotTransformer.posts) - 1:
369
+ resp = self.all_sequence_of_postfixs(word, index + 1)
370
+ all_seqs.extend(resp)
371
+ else:
372
+ return all_seqs
373
+ return all_seqs
374
+
375
+ def combine(self, l1, l2):
376
+ if len(l1) == 0:
377
+ return l2
378
+ elif len(l2) == 0:
379
+ return l1
380
+ return list(itertools.product(l1, l2))
381
+
382
+
383
+ def get_expand(self, iword):
384
+ all_possible_words = []
385
+ for subset_operators in utils.powerset(self.operators):
386
+ new_iword = InformalWord(lemma=iword.lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
387
+ for so in subset_operators:
388
+ so_resp = so(new_iword)
389
+ if so_resp:
390
+ new_iword = so_resp
391
+ all_possible_words.append(new_iword)
392
+ return all_possible_words
393
+
394
+
395
+ def match_postfixs(self, word, pos):
396
+ possible_combinatios = []
397
+ for i in range(len(OneShotTransformer.posts)):
398
+ for p in OneShotTransformer.posts[i]:
399
+ p_word = p.word
400
+ p_indxs = [indx for indx, ch in enumerate(word) if word[indx:indx+len(p_word)] == p_word]
401
+ for p_indx in p_indxs:
402
+ if p_indx != -1:
403
+ lemma = word[:p_indx]
404
+ pp = word[p_indx + len(p_word):]
405
+ if len(pp) ==0:
406
+ iw = InformalWord(lemma=lemma, postfixs=[p], pos=pos)
407
+ possible_combinatios.append(iw)
408
+ continue
409
+ if i < len(OneShotTransformer.posts) -1:
410
+ all_postfix = self.all_sequence_of_postfixs(pp, index=i+1)
411
+ if len(all_postfix) > 0:
412
+ for pfixs in all_postfix:
413
+ if type(pfixs) == list:
414
+ pfixs.append(p)
415
+ else:
416
+ pfixs = [p, pfixs]
417
+ iw = InformalWord(lemma=lemma, postfixs=pfixs, pos=pos)
418
+ possible_combinatios.append(iw)
419
+ elif len(pp) == 0:
420
+ iw = InformalWord(lemma=lemma, postfixs=[p], pos=pos)
421
+ possible_combinatios.append(iw)
422
+
423
+ return possible_combinatios
424
+
425
+ def match_prefixs(self, word, pos):
426
+ possible_combinatios = []
427
+ for i in range(len(OneShotTransformer.pres)):
428
+ for p in OneShotTransformer.pres[i]:
429
+ if word.startswith(p.word):
430
+ lemma = word[len(p.word):]
431
+ prefixs = [p]
432
+ iw = InformalWord(lemma=lemma, prefixs=prefixs, postfixs=[], pos=pos)
433
+ possible_combinatios.append(iw)
434
+ return possible_combinatios
435
+ return []
436
+
437
+ def parse_word(self, iword):
438
+ parsed_resp = []
439
+ prefixed_word = self.match_prefixs(iword.lemma,pos=iword.pos)
440
+ prefixed_word.append(iword)
441
+ parsed_resp.extend(prefixed_word)
442
+ for pw in prefixed_word:
443
+ postfixed_iwords = self.match_postfixs(pw.lemma,pos=iword.pos)
444
+ for piw in postfixed_iwords:
445
+ piw.prefixs = pw.prefixs
446
+ parsed_resp.append(piw)
447
+ return parsed_resp
448
+
449
+ def is_seqs_of_verbs(self, txt):
450
+ words = txt.split()
451
+ if len(words) < 2:
452
+ return False
453
+ for w in words:
454
+ formal_verb = self.verb_to_formal_func(w)
455
+ if formal_verb is None:
456
+ return False
457
+ if words[-1] in ['است', 'هست']:
458
+ return False
459
+ return True
460
+
461
+ def filter_results(self, word_lemmas):
462
+ return list(filter(lambda wl: len(wl[0])>0 and wl[0][-1] != '‌' and not self.is_seqs_of_verbs(wl[0]), word_lemmas))
463
+
464
+ def concatenate_formal_words(self, pre, next):
465
+ """
466
+ خانه +‌ ت -> خانه‌ات
467
+ دیگر + ای -> دیگری
468
+ """
469
+ nim_fasele = '‌'
470
+ not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
471
+ if len(pre) < 1 :
472
+ return next
473
+ if pre[-1] in ['ه'] and next in ['م', 'ت', 'ش']:
474
+ return pre + nim_fasele + 'ا' + next
475
+ if pre[-1] == 'ا'and next.split() and next.split()[0] in ['م', 'ت', 'ش', 'مان', 'تان', 'شان']:
476
+ return pre + nim_fasele + 'ی' + next
477
+ if pre[-1] not in ['ه'] and next in ['ای']:
478
+ return pre + 'ی'
479
+ out = pre + next
480
+ if pre[-1] not in not_connect_chars or next.startswith('ها') or pre[-1] in ['ه'] or pre + nim_fasele + next in self.vocab:
481
+ out = pre + nim_fasele + next
482
+ if self.verb_to_formal_func(next):
483
+ out = pre + ' ' + next
484
+ return out
485
+
486
+ def handle_nim_fasele_words(self, word, pos):
487
+ def extract_lemma_nim_fasele_words(word, pos):
488
+ formal_prefixs = []
489
+ formal_postfixs = []
490
+ prefixs = {'اون': 'آن', 'همون': 'همین'}
491
+ postfixs = self.postfix_mapper
492
+ tokens = word.split('‌')
493
+ index = 0
494
+ for i in range(len(tokens)):
495
+ index = i
496
+ if tokens[i] not in prefixs:
497
+ break
498
+ else:
499
+ formal_prefixs.append(prefixs[tokens[i]])
500
+
501
+ for i in range(len(tokens), index, -1):
502
+ current_tok = '‌'.join(tokens[index:i])
503
+ if current_tok in self.vocab or tokens[i - 1] not in postfixs:
504
+ return formal_prefixs, current_tok, formal_postfixs
505
+ else:
506
+ formal_postfixs.append(postfixs[tokens[i - 1]])
507
+ return formal_prefixs, current_tok, formal_postfixs
508
+ nim_fasele = '‌'
509
+ candidates = []
510
+ formal_word = ''
511
+ verbs = self.verb_to_formal_func(word)
512
+ if verbs:
513
+ return [(v, v) for v in verbs]
514
+ all_candidates = set()
515
+ # lemma
516
+ formal_prefixs, lemma, formal_postfixs = extract_lemma_nim_fasele_words(word, pos)
517
+ word_lemmas = self.transform(lemma, pos, ignore_nim_fasele=True)
518
+ # a lemma that takes postfixes should be a single token (len == 1)
519
+ one_token_words = [wl for wl in word_lemmas if len(wl[0].split()) == 1]
520
+ if formal_postfixs and one_token_words:
521
+ all_formal_lemma_candidates = one_token_words
522
+ else:
523
+ all_formal_lemma_candidates = word_lemmas
524
+ if not all_formal_lemma_candidates:
525
+ if formal_postfixs or formal_prefixs:
526
+ all_formal_lemma_candidates = [(lemma, lemma)]
527
+ else:
528
+ tokens = lemma.split(nim_fasele)
529
+ if all(self.transform(t, None, ignore_nim_fasele=True) for t in tokens):
530
+ w = ' '.join(tokens)
531
+ return [(w, w)]
532
+ else:
533
+ return []
534
+ for cnd_lemma, formal_word_lemma in all_formal_lemma_candidates:
535
+ formal_word = ''
536
+ toks = formal_prefixs + [cnd_lemma] + formal_postfixs
537
+ for index, t in enumerate(toks):
538
+ formal_word = self.concatenate_formal_words(formal_word, t)
539
+ all_candidates.add((formal_word, formal_word_lemma))
540
+ # if t in self.postfix_mapper:
541
+ # formal_t = self.postfix_mapper[t]
542
+ # else:
543
+ # transform_outputs = self.transform(t, pos)
544
+ # if not transform_outputs:
545
+ # formal_t = t
546
+ # else:
547
+ # one_word_outputs = [ft for ft in transform_outputs if len(ft.split()) == 1]
548
+ # if one_word_outputs:
549
+ # if t in one_word_outputs:
550
+ # formal_t = t
551
+ # else:
552
+ # formal_t = one_word_outputs[0]
553
+ # else:
554
+ # formal_t = transform_outputs.pop()
555
+ return all_candidates
556
+
557
+
558
+
559
+ def transform(self, word, pos, ignore_nim_fasele=False):
560
+ """ignore emoji , punctuation, numbers"""
561
+ ignore_chars = '.1234567890!@#$%^&*()_+۱۲۳۴۵۶۷۸۹÷؟×−+?><}،,{":' + string.ascii_lowercase + string.ascii_uppercase
562
+ if any(ic in word for ic in ignore_chars) or utils.if_emoji(word):
563
+ return [(word, word)]
564
+ """handle nim fasele"""
565
+ nim_fasele = '‌'
566
+ if not ignore_nim_fasele and nim_fasele in word:
567
+ return self.handle_nim_fasele_words(word, pos)
568
+ # pass ignore words and accept as correct informal word!
569
+ if word in self.ignore_words and not word in self.mapper:
570
+ return [(word, word)]
571
+ formal_prefix_obj = self.prefix_obj(word)
572
+ if formal_prefix_obj:
573
+ return formal_prefix_obj
574
+ iword = InformalWord(lemma=word, pos=pos)
575
+ expanded_candidates = []
576
+ candidates = self.parse_word(iword)
577
+ #just verbs
578
+ if any(c.is_verb for c in candidates):
579
+ candidates = [c for c in candidates if c.is_verb]
580
+ for cnd in candidates:
581
+ expanded_candidates.extend(self.get_expand(cnd))
582
+ word_iwords = []
583
+ for ec in expanded_candidates:
584
+ word_iwords.extend(self.to_formals(ec))
585
+ if any(f[1] and (f[1].is_mapper or f[1].is_verb) for f in word_iwords if f[1] is not None):
586
+ word_iwords = [f for f in word_iwords if f[1] and (f[1].is_mapper or f[1].is_verb)]
587
+ # else:
588
+ word_lemmas_set = [(w, iword.lemma) for w, iword in word_iwords if iword is not None]
589
+ word_lemmas_set = set(word_lemmas_set)
590
+ out = self.filter_results(word_lemmas_set)
591
+ # if type(out) == str:
592
+ # out = [out]
593
+ # out = set(out)
594
+ return out
595
+
596
+ if __name__ == '__main__':
597
+ transformer = OneShotTransformer(set(), {}, lambda w: None, set(), {}, set(), set())  # minimal stand-in resources; match_postfixs does not use them
598
+ candidates = transformer.match_postfixs('کارامم', pos=None)
599
+ print(candidates)
600
+
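A minimal usage sketch for OneShotTransformer (not part of the committed files): the resources below are hypothetical stand-ins, and it assumes the utils helpers added in this commit (powerset, if_emoji) are importable. In the real app the vocabulary, mapper and word lists come from the downloaded assets.pkl, and verb_to_formal_func is VerbHandler.informal_to_formal.

    # Sketch only: hand-built resources instead of the real assets.pkl contents.
    from OneShotTransformer import OneShotTransformer

    vocab = {'کتاب', 'خانه'}            # hypothetical vocabulary
    mapper = {'اون': ['آن']}             # hypothetical informal -> formal lemma mapper
    no_verb = lambda w: None             # stand-in for VerbHandler.informal_to_formal

    transformer = OneShotTransformer(
        vocab, mapper, no_verb,
        ignore_words=set(), postfix_mapper={},
        isolated_words=set(), non_hidden_h_words=set(),
    )
    # transform() returns (formal candidate, lemma) pairs for a single token.
    print(transformer.transform('کتابم', pos=None))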
VerbHandler.py ADDED
@@ -0,0 +1,350 @@
1
+ import re
2
+ from enum import Enum
3
+ from hazm import Normalizer
4
+ import pandas as pd
5
+
6
+
7
+ Formality = Enum('Formality', 'formal informal')
8
+ VerbTime = Enum('VerbTime', 'past present future')
9
+ Person = Enum('Person', 'Man To An Ma Shoma Anha')
10
+ Number = Enum('Number', 'Mofrad Jam')
11
+ class Verb:
12
+ def __init__(self, root, formality, time, pp, person, number):
13
+ self.root = root
14
+ self.formality = formality
15
+ self.time = time
16
+ self.person = person
17
+ self.number = number
18
+ self.pp = pp
19
+
20
+ class VerbHandler():
21
+ def __init__(self, csv_verb_addr, csv_irregular_verbs_mapper):
22
+ self.posfix_mapper = {'ه': 'د', 'ن': 'ند', 'ین': 'ید'}
23
+ self.objective_pr_mapper = {'شون':'شان', 'تون':'تان', 'مون':'مان'}
24
+ self.init_mapper = {'کنه': 'بکنه', 'کنم':'بکنم', 'کنی':'بکنی', 'کنیم': 'بکنیم', 'کنین':'بکنین', 'کنید':'بکنید', 'کنن':'بکنن', 'کنند':'بکنند'}
25
+ self.out_mapper = {'می‌ایی': 'می‌آیی'}
26
+ self.init_mapper.update({'شم':'بشم', 'شی':'بشی', 'شن':'بشن', 'شین':'بشین' ,'شه':'بشه', 'شیم': 'بشیم'})
27
+ self.bons = self.load_bons(csv_verb_addr)
28
+ self.irregular_verbs = self.load_irregular_mapper(csv_irregular_verbs_mapper)
29
+ self.informal_past_bons = self.get_bons(type=Formality.informal, time=VerbTime.past)
30
+ self.informal_present_bons = self.get_bons(type=Formality.informal, time=VerbTime.present)
31
+
32
+ self.formal_past_bons = self.get_bons(type=Formality.formal, time=VerbTime.past)
33
+ self.formal_present_bons =self.get_bons(type=Formality.formal, time=VerbTime.present) + ['هست']
34
+ self.all_past_bons = self.formal_past_bons + self.informal_past_bons
35
+ self.all_present_bons = self.formal_present_bons + self.informal_present_bons
36
+ self.verb_mapper = {b:{'formal':self.bons[b]['formal']} for b in self.bons if self.bons[b]['type'] == Formality.informal}
37
+ self.solve_alef_issue()
38
+ self.compile_patterns()
39
+
40
+
41
+ def load_irregular_mapper(self, csv_addr):
42
+ df = pd.read_csv(csv_addr)
43
+ mapper = {informal: formal for _, (informal, formal) in df.iterrows()}
44
+ return mapper
45
+
46
+ def load_bons(self, csv_addr):
47
+ normalizer = Normalizer()
48
+ df = pd.read_csv(csv_addr)
49
+ df = df.fillna('')
50
+ bons = {}
51
+ for i, row in df.iterrows():
52
+ if row[2]:
53
+ row[2] = normalizer.normalize(row[2])
54
+ bons[row[2]] = {'type': Formality.formal, 'time': VerbTime.past}
55
+ if row[3]:
56
+ row[3] = normalizer.normalize(row[3])
57
+ bons[row[3]] = {'type': Formality.formal, 'time': VerbTime.present}
58
+ if row[10]:
59
+ bs = row[10].split()
60
+ for b in bs:
61
+ bons[b] = {'type': Formality.informal, 'time': VerbTime.past, 'formal': row[2]}
62
+ if row[11]:
63
+ bs = row[11].split()
64
+ for b in bs:
65
+ bons[b] = {'type': Formality.informal, 'time': VerbTime.present, 'formal': row[3]}
66
+ return bons
67
+
68
+ def get_bons(self, type, time):
69
+ return [b for b in self.bons if self.bons[b]['type'] == type and self.bons[b]['time'] == time]
70
+
71
+ def solve_alef_issue(self):
72
+ replace_alef_y = lambda v : 'ی' + v[1:]
73
+ replace_A_YA = lambda v : 'یا' + v[1:]
74
+ informal_past_start_with_alef = list(map(replace_alef_y, [v for v in self.informal_past_bons if v.startswith('ا') and not v.startswith('ای')]))
75
+ formal_past_start_with_alef = list(map(replace_alef_y, [v for v in self.formal_past_bons if v.startswith('ا') and not v.startswith('ای')]))
76
+ informal_present_start_with_alef = list(map(replace_alef_y, [v for v in self.informal_present_bons if v.startswith('ا') and not v.startswith('ای')]))
77
+ formal_present_start_with_alef = list(map(replace_alef_y, [v for v in self.formal_present_bons if v.startswith('ا') and not v.startswith('ای')]))
78
+ self.alef_mapper = {}
79
+ self.informal_past_start_with_alef = informal_past_start_with_alef + list(
80
+ map(replace_A_YA, [v for v in self.informal_past_bons if v.startswith('آ')]))
81
+ self.informal_present_start_with_alef = informal_present_start_with_alef + list(
82
+ map(replace_A_YA, [v for v in self.informal_present_bons if v.startswith('آ')]))
83
+ self.formal_past_start_with_alef = formal_past_start_with_alef + list(
84
+ map(replace_A_YA, [v for v in self.formal_past_bons if v.startswith('آ')]))
85
+ self.formal_present_start_with_alef = formal_present_start_with_alef + list(
86
+ map(replace_A_YA, [v for v in self.formal_present_bons if v.startswith('آ')]))
87
+ for verb in self.informal_past_start_with_alef + self.informal_present_start_with_alef + self.formal_past_start_with_alef + self.formal_present_start_with_alef:
88
+ if verb[:2] == 'یا':
89
+ origin = 'آ' + verb[2:]
90
+ else:
91
+ origin = 'ا' + verb[1:]
92
+ self.alef_mapper[verb] = origin
93
+ self.alef_mapper['یای'] = 'آی'
94
+ remove_a_hat = lambda w: w.replace('آ', 'ا')
95
+ self.formal_past_bons = list(
96
+ filter(lambda w: w != '', map(remove_a_hat, self.formal_past_bons + self.formal_past_start_with_alef)))
97
+ self.formal_present_bons = list(map(remove_a_hat, self.formal_present_bons + self.formal_present_start_with_alef)) + [
98
+ 'یای'] + ['آی']
99
+ self.informal_past_bons = list(
100
+ filter(lambda w: w != '', map(remove_a_hat, self.informal_past_bons + self.informal_past_start_with_alef)))
101
+ self.informal_present_bons = list(
102
+ map(remove_a_hat, self.informal_present_bons + self.informal_present_start_with_alef)) + [
103
+ 'یای'] + ['آی']
104
+ # sorted by length
105
+ self.formal_present_bons = sorted(self.formal_present_bons, key=lambda w: -len(w))
106
+ self.formal_past_bons = sorted(self.formal_past_bons, key=lambda w: -len(w))
107
+ self.informal_present_bons = sorted(self.informal_present_bons, key=lambda w: -len(w))
108
+ self.informal_past_bons = sorted(self.informal_past_bons, key=lambda w: -len(w))
109
+ verb_v_keys = [word for word in self.verb_mapper if 'آ' in word]
110
+ alef_verb_v_keys = [word for word in self.alef_mapper if 'آ' in word]
111
+ for v in verb_v_keys:
112
+ self.verb_mapper[v.replace('آ', 'ا')] = self.verb_mapper[v]
113
+ for v in alef_verb_v_keys:
114
+ self.alef_mapper[v.replace('آ', 'ا')] = self.alef_mapper[v]
115
+
116
+
117
+ def compile_patterns(self):
118
+ ME_r = '|'.join(['می','می‌'])
119
+ B_r = 'ب'
120
+ not_r = 'ن'
121
+ past_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ید', 'ند', '', 'ن']
122
+ present_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ن', 'ید', 'ند', 'د', '']
123
+ naghli_ends = ['ه‌ام', 'ه‌ای', 'ه', 'ه‌ایم', 'ه‌اید', 'ه‌اند']
124
+ objective_pronouns = ['م', 'ت', 'ش', 'مون', 'تون', 'شون']
125
+
126
+ informal_past_r = '|'.join(self.informal_past_bons)
127
+ formal_past_r = '|'.join(self.formal_past_bons)
128
+ informal_present_r = '|'.join(self.informal_present_bons)
129
+ formal_present_r = '|'.join(self.formal_present_bons)
130
+ verb_postfix_past_r = '|'.join(past_ends)
131
+ verb_postfix__present_r = '|'.join(present_ends)
132
+ objective_pronouns_r = '|'.join(objective_pronouns)
133
+ naghli_ends_r = '|'.join(naghli_ends)
134
+ """
135
+ #گذشته‌ی ساده
136
+ # r1 = past_r + verb_postfix_r + objectiveـpronouns_r
137
+ #گذشته‌ی ناتمام
138
+ # r2 = '(' + ME + ')'+ past_r +verb_postfix_r + objectiveـpronouns_r
139
+
140
+ #گذشته‌ی استمراری
141
+ # r3 = '(' + DASHT + ')'+ past_r + verb_postfix_r +objectiveـpronouns_r
142
+
143
+ #گذشته‌ی نقلی
144
+ # r4 = past_r + '(' + '|'.join(naghli_ends) + ')' +objectiveـpronouns_r
145
+
146
+ #گذشته‌ی پیشین
147
+ # r5 = past_r + verb_postfix_r + '(' + BUD + ')' + verb_postfix_r + objectiveـpronouns_r
148
+
149
+ #حال ساده
150
+ # r6 = present_r + verb_postfix_r
151
+
152
+ #حال ناتمام
153
+ # r7 = '(' + ME + ')'+ present_r + verb_postfix_r + objectiveـpronouns_r
154
+
155
+ #حال استمراری
156
+ # r8 = '( ' + DAR + ')'+ verb_postfix_r + '(' + ME + ')' + present_r + verb_postfix_r+ objectiveـpronouns_r
157
+
158
+ #آینده‌ی ساده
159
+ # r9 = '( ' + KHAH + ')'+ verb_postfix_r + present_r +objectiveـpronouns_r
160
+
161
+ #التزامی - گذشته
162
+ # r10 = present_r + '(ه)'+ '(' + BASH + ')' + verb_postfix_r + objectiveـpronouns_r
163
+
164
+ #التزامی - حال
165
+ # r11 = '(ب)' + present_r + verb_postfix_r +objectiveـpronouns_r
166
+ """
167
+ #
168
+ # + : formal verb stems combined with informal suffixes, plus simple present for certain verbs (hast, kon)
169
+ # formal
170
+ formal_present_pattern_b = '({})({})({})?({})?$'.format(B_r, formal_present_r, verb_postfix__present_r, objective_pronouns_r)
171
+ formal_present_pattern_n_me = '({})?({})({})({})?({})?$'.format(not_r, ME_r, formal_present_r, verb_postfix__present_r, objective_pronouns_r)
172
+ formal_present_pattern_n = '({})?({})({})?({})?$'.format(not_r, formal_present_r, verb_postfix__present_r, objective_pronouns_r)
173
+ formal_past_pattern = '({})?({})?({})({}|{})({})?$'.format(not_r, ME_r, formal_past_r, naghli_ends_r,
174
+ verb_postfix_past_r, objective_pronouns_r)
175
+ self.formal_past_verb_pattern = re.compile(formal_past_pattern)
176
+ self.formal_present_verb_pattern_b = re.compile(formal_present_pattern_b)
177
+ self.formal_present_verb_pattern_n_me = re.compile(formal_present_pattern_n_me)
178
+ self.formal_present_verb_pattern_n = re.compile(formal_present_pattern_n)
179
+
180
+ #informal
181
+ informal_present_pattern_b = '({})({})({})?({})?$'.format(B_r, informal_present_r, verb_postfix__present_r,
182
+ objective_pronouns_r)
183
+ informal_present_pattern_n_me = '({})?({})({})({})?({})?$'.format(not_r, ME_r, informal_present_r,
184
+ verb_postfix__present_r, objective_pronouns_r)
185
+ informal_present_pattern_n = '({})?({})({})?({})?$'.format(not_r, informal_present_r, verb_postfix__present_r,
186
+ objective_pronouns_r)
187
+ informal_past_pattern = '({})?({})?({})({}|{})({})?$'.format(not_r, ME_r, informal_past_r, naghli_ends_r,
188
+ verb_postfix_past_r, objective_pronouns_r)
189
+ self.informal_past_verb_pattern = re.compile(informal_past_pattern)
190
+ self.informal_present_verb_pattern_b = re.compile(informal_present_pattern_b)
191
+ self.informal_present_verb_pattern_n_me = re.compile(informal_present_pattern_n_me)
192
+ self.informal_present_verb_pattern_n = re.compile(informal_present_pattern_n)
193
+
194
+
195
+ def parse(self, token):
196
+ outputs = []
197
+
198
+ match_dict_formal = {'tense': '', 'root': '', 'neg': '', 'postfix': '', 'not_r': '', 'op': '', 'b': '', 'me': '', 'naghli':''}
199
+ match_dict_informal = {'tense': '', 'root': '', 'neg': '', 'postfix': '', 'not_r': '', 'op': '', 'b': '', 'me': '', 'naghli':''}
200
+ formal_past_match = self.formal_past_verb_pattern.match(token)
201
+ informal_past_match = self.informal_past_verb_pattern.match(token)
202
+ formal_present_match_b = self.formal_present_verb_pattern_b.match(token)
203
+ informal_present_match_b = self.informal_present_verb_pattern_b.match(token)
204
+ formal_present_match_n_me = self.formal_present_verb_pattern_n_me.match(token)
205
+ informal_present_match_n_me = self.informal_present_verb_pattern_n_me.match(token)
206
+ formal_present_match_n = self.formal_present_verb_pattern_n.match(token)
207
+ informal_present_match_n = self.informal_present_verb_pattern_n.match(token)
208
+ present_group_to_dict_b = lambda g: {k:g[i] for i,k in enumerate(['b', 'root', 'postfix', 'op'])}
209
+ present_group_to_dict_n_me = lambda g: {k:g[i] for i,k in enumerate(['neg', 'me','root', 'postfix','op'])}
210
+ present_group_to_dict_n = lambda g: {k:g[i] for i,k in enumerate(['neg','root', 'postfix','op'])}
211
+ past_group_to_dict = lambda g: {k:g[i] for i,k in enumerate(['neg', 'me', 'root', 'postfix', 'op'])}
212
+ formal_match = formal_past_match or formal_present_match_b or formal_present_match_n_me or formal_present_match_n
213
+ informal_match = informal_past_match or informal_present_match_b or informal_present_match_n_me or informal_present_match_n
214
+ if formal_match:
215
+ if formal_past_match:
216
+ match_dict_formal = past_group_to_dict(formal_past_match.groups())
217
+ match_dict_formal['tense'] = 'past'
218
+ else:
219
+ if formal_present_match_b:
220
+ match_dict_formal = present_group_to_dict_b(formal_present_match_b.groups())
221
+ elif formal_present_match_n_me:
222
+ match_dict_formal = present_group_to_dict_n_me(formal_present_match_n_me.groups())
223
+ elif formal_present_match_n:
224
+ match_dict_formal = present_group_to_dict_n(formal_present_match_n.groups())
225
+ match_dict_formal['tense'] = 'present'
226
+ outputs.append(match_dict_formal)
227
+ if informal_match:
228
+ if informal_past_match:
229
+ match_dict_informal = past_group_to_dict(informal_past_match.groups())
230
+ match_dict_informal['tense'] = 'past'
231
+ else:
232
+ if informal_present_match_b:
233
+ match_dict_informal = present_group_to_dict_b(informal_present_match_b.groups())
234
+ elif informal_present_match_n_me:
235
+ match_dict_informal = present_group_to_dict_n_me(informal_present_match_n_me.groups())
236
+ elif informal_present_match_n:
237
+ match_dict_informal = present_group_to_dict_n(informal_present_match_n.groups())
238
+ match_dict_informal['tense'] = 'present'
239
+ outputs.append(match_dict_informal)
240
+ for match_dict in outputs:
241
+ for key,val in match_dict.items():
242
+ if val is None:
243
+ match_dict[key] = ''
244
+ # print(match_dict)
245
+ return outputs
246
+
247
+ def formal_concatenate(self, match_dict, should_smooth):
248
+ out_dict = {'بیای': 'بیا', 'نیای': 'نیا'}
249
+ if match_dict['root'] == 'است' and match_dict['neg'] != '':
250
+ return 'نیست' + match_dict['postfix']
251
+ if self.if_simple_present(match_dict) or self.if_only_me(match_dict):
252
+ return None
253
+ if should_smooth:
254
+ if match_dict['prefix'] != '' and match_dict['prefix'][0] == 'م':
255
+ pass
256
+ else:
257
+ match_dict['root'] = 'یا' + match_dict['root'][1:]
258
+ # if len(match_dict['prefix']) == 3:
259
+ # match_dict['prefix'] = 'می'
260
+ if match_dict['prefix'] == 'ب' and match_dict['root'] and match_dict['root'][0] == 'ا':
261
+ match_dict['root'] = 'ی' + match_dict['root'][1:]
262
+ out = match_dict['neg'] + match_dict['prefix'] + match_dict['root'] + match_dict['postfix'] + match_dict['op']
263
+ if out in out_dict:
264
+ out = out_dict[out]
265
+
266
+ return out
267
+
268
+ def _set_match_dict_prefix(self, match_dict):
269
+ match_dict['prefix'] = ''
270
+ if 'me' in match_dict and match_dict['me'] != '':
271
+ if len(match_dict['me']) < 3:
272
+ match_dict['me'] = 'می‌'
273
+ match_dict['prefix'] = match_dict['me']
274
+ elif 'b' in match_dict and match_dict['b'] != '':
275
+ match_dict['prefix'] = match_dict['b']
276
+ return match_dict
277
+
278
+ def if_simple_present(self, match_dict):
279
+ if match_dict['root'] != '' and match_dict['tense'] == 'present' and match_dict['prefix'] == '' and match_dict['neg'] == '':
280
+ if match_dict['root'] not in ['کن', 'هست', 'است', 'دار', 'نیست', 'باش']:
281
+ return True
282
+ return False
283
+
284
+ def if_only_me(self, match_dict):
285
+ if match_dict['root'] != '' and match_dict['tense'] == 'present' and match_dict['prefix'] !='' and match_dict['prefix'][0] == 'م' and match_dict['postfix'] == '':
286
+ return True
287
+ return False
288
+
289
+ def is_masdar(self, match_dict):
290
+ return match_dict['root'] in self.all_past_bons and match_dict['me'] == '' and match_dict['postfix'] =='ن' and match_dict['op'] == ''
291
+
292
+ def informal_to_formal(self, token):
293
+ # irregular verbs checking
294
+ if token in self.irregular_verbs:
295
+ return [self.irregular_verbs[token]]
296
+ if token in self.init_mapper:
297
+ token = self.init_mapper[token]
298
+ outputs = []
299
+ if len(token) < 3:
300
+ return None
301
+ should_smooth = False
302
+ all_match_dicts = self.parse(token)
303
+
304
+ ### بدهدم
305
+ #برد
306
+ if len(all_match_dicts) == 2 :
307
+ if all_match_dicts[1]['root'] in self.verb_mapper and self.verb_mapper[all_match_dicts[1]['root']]['formal'] == all_match_dicts[0]['root'] and all_match_dicts[1]['op'] != '':
308
+ del all_match_dicts[1]
309
+ elif all_match_dicts[1] == {'b': 'ب', 'root': 'ر', 'postfix': 'د', 'op': '', 'tense': 'present'}:
310
+ del all_match_dicts[1]
311
+ ##
312
+ is_masdar = False
313
+ for match_dict in all_match_dicts:
314
+ if self.is_masdar(match_dict):
315
+ is_masdar = True
316
+ #نان بان
317
+ if match_dict['root'] != '' and match_dict['root'][0] == 'ا' and 'me' not in match_dict and ('b' in match_dict or match_dict['neg'] == 'ن'):
318
+ return None
319
+ if match_dict['root'] != '':
320
+ root = match_dict['root']
321
+ objective_pr = match_dict['op']
322
+ postfix = match_dict['postfix']
323
+ if root in self.alef_mapper:
324
+ should_smooth = True
325
+ match_dict['root'] = self.alef_mapper[root]
326
+ if match_dict['root'] in self.verb_mapper:
327
+ match_dict['root'] = self.verb_mapper[ match_dict['root']]['formal']
328
+ if postfix in self.posfix_mapper:
329
+ match_dict['postfix'] = self.posfix_mapper[postfix]
330
+ if match_dict['postfix'] == 'د' and match_dict['tense'] == 'past':
331
+ match_dict['postfix'] = 'ه'
332
+ if objective_pr in self.objective_pr_mapper:
333
+ match_dict['op'] = self.objective_pr_mapper[objective_pr]
334
+ match_dict['prefix'] = ''
335
+ if 'neg' not in match_dict:
336
+ match_dict['neg'] = ''
337
+ match_dict = self._set_match_dict_prefix(match_dict)
338
+ formal_verb = self.formal_concatenate(match_dict, should_smooth)
339
+ outputs.append(formal_verb)
340
+ not_none_outpts = [o for o in outputs if o is not None]
341
+ for index, item in enumerate(not_none_outpts):
342
+ if item in self.out_mapper:
343
+ not_none_outpts[index] = self.out_mapper[item]
344
+ if not_none_outpts:
345
+ # append bon
346
+ if len(not_none_outpts) == 1 and is_masdar:
347
+ masdar = not_none_outpts[0][:-2] + 'ن'
348
+ not_none_outpts.append(masdar)
349
+ return not_none_outpts
350
+ return None
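A short sketch of using VerbHandler on its own (not part of the committed files); it assumes verbs.csv and irregular_verb_mapper.csv from config.yml have already been downloaded into the app's cache directory.

    import os
    from pathlib import Path
    from VerbHandler import VerbHandler

    cache_dir = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
    handler = VerbHandler(
        csv_verb_addr=os.path.join(cache_dir, 'verbs.csv'),
        csv_irregular_verbs_mapper=os.path.join(cache_dir, 'irregular_verb_mapper.csv'),
    )
    # informal_to_formal returns a list of formal candidates, or None if no verb pattern matches.
    print(handler.informal_to_formal('میرم'))   # illustrative; actual output depends on verbs.csv
    print(handler.informal_to_formal('کتاب'))   # likely None for a non-verb token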
app.py ADDED
@@ -0,0 +1,128 @@
1
+ import streamlit as st
2
+ import os
3
+ import itertools
5
+ from pathlib import Path
6
+ import yaml
7
+ from download_utils import download_dataset
8
+ import utils
9
+ from formality_transformer import FormalityTransformer
10
+ from hazm import SentenceTokenizer
11
+
12
+
13
+ def translate_short_sent(model, sent):
14
+ out_dict = {}
15
+ txt = utils.cleanify(sent)
16
+ is_valid = lambda w: model.oneshot_transformer.transform(w, None)
17
+ cnd_tokens = model.informal_tokenizer.tokenize(txt, is_valid)
18
+ for tokens in cnd_tokens:
19
+ tokens = [t for t in tokens if t != '']
20
+ new_tokens = []
21
+ for t in tokens:
22
+ new_tokens.extend(t.split())
23
+ txt = ' '.join(new_tokens)
24
+ tokens = txt.split()
25
+ candidates = []
26
+ for index in range(len(tokens)):
27
+ tok = tokens[index]
28
+ cnd = set()
29
+ pos = None
30
+ if model.verb_handler.informal_to_formal(tok):
31
+ pos = 'VERB'
32
+ f_words_lemma = model.oneshot_transformer.transform(tok, pos)
33
+ f_words_lemma = list(f_words_lemma)
34
+ for index, (word, lemma) in enumerate(f_words_lemma):
35
+ if pos != 'VERB' and tok not in model.mapper and model.should_filtered_by_one_bigram(lemma, word, tok):
36
+ f_words_lemma[index] = (tok, tok)
37
+ else:
38
+ word_toks = word.split()
39
+ word_repr = ''
40
+ for t in word_toks:
41
+ word_repr += ' ' + t
42
+ word_repr = word_repr.strip()
43
+ word_repr = model.repalce_for_gpt2(word_repr)
44
+ f_words_lemma[index] = (word, word_repr)
45
+ if f_words_lemma:
46
+ cnd.update(f_words_lemma)
47
+ else:
48
+ cnd = {(tok, tok)}
49
+ candidates.append(cnd)
50
+ all_combinations = itertools.product(*candidates)
51
+ all_combinations_list = list(all_combinations)
52
+ for id, cnd in enumerate(all_combinations_list):
53
+ normal_seq = ' '.join([c[0] for c in cnd])
54
+ lemma_seq = ' '.join([c[1] for c in cnd])
55
+ lemma_seq = utils.clean_text_for_lm(lemma_seq)
56
+ out_dict[id] = (normal_seq, lemma_seq)
57
+ candidates = [[item[0] for item in candidate_phrases] for candidate_phrases in candidates]
58
+ return model.lm_obj.get_best(candidates)
59
+
60
+
61
+ def translate(model, sentence_tokenizer, txt):
62
+ sents = sentence_tokenizer.tokenize(txt)
63
+ formal_output = ''
64
+ for sentence in sents:
65
+ formal_sentence = translate_short_sent(model, sentence)
66
+ formal_output += ' ' + formal_sentence
67
+ return formal_output
68
+
69
+
70
+ class Informal2Formal:
71
+ def __init__(self) -> None:
72
+ #download or load files
73
+ DEFAULT_CACHE_DIR = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
74
+ config = load_config('dadmatools/informal2formal/config.yml')
75
+ file_urls = config['files'].values()
76
+ download_dataset(file_urls, DEFAULT_CACHE_DIR, filename=None)
77
+
78
+ # set assets files address
79
+ verbs_csv_addr = os.path.join(DEFAULT_CACHE_DIR, 'verbs.csv')
80
+ irregular_verbs_mapper = os.path.join(DEFAULT_CACHE_DIR, 'irregular_verb_mapper.csv')
81
+ lm_addr = os.path.join(DEFAULT_CACHE_DIR,'3gram.bin')
82
+ assets_file_addr = os.path.join(DEFAULT_CACHE_DIR,'assets.pkl')
83
+ self.sentence_tokenizer = SentenceTokenizer()
84
+ self.model = FormalityTransformer(asset_file_addr=assets_file_addr,
85
+ irregular_verbs_mapper_addr=irregular_verbs_mapper, verbs_csv_addr=verbs_csv_addr, lm_addr=lm_addr)
86
+
87
+
88
+ def load_config(config_file):
89
+ with open(config_file, "r") as file:
90
+ config = yaml.safe_load(file)
91
+ return config
92
+
93
+
94
+ # st.cache(suppress_st_warning=True, allow_output_mutation=True)  # no-op as a bare statement; kept disabled like the decorator below
95
+ st.set_page_config(page_title="Persian Informal to formal translator")
96
+
97
+
98
+ # @st.cache(suppress_st_warning=True, allow_output_mutation=True)
99
+ def load_model():
100
+ DEFAULT_CACHE_DIR = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
101
+ config = load_config('config.yml')
102
+ file_urls = config['files'].values()
103
+ download_dataset(file_urls, DEFAULT_CACHE_DIR, filename=None)
104
+ # set assets files address
105
+ verbs_csv_addr = os.path.join(DEFAULT_CACHE_DIR, 'verbs.csv')
106
+ irregular_verbs_mapper = os.path.join(DEFAULT_CACHE_DIR, 'irregular_verb_mapper.csv')
107
+ lm_addr = os.path.join(DEFAULT_CACHE_DIR,'3gram.bin')
108
+ assets_file_addr = os.path.join(DEFAULT_CACHE_DIR,'assets.pkl')
109
+ model = FormalityTransformer(asset_file_addr=assets_file_addr,
110
+ irregular_verbs_mapper_addr=irregular_verbs_mapper, verbs_csv_addr=verbs_csv_addr, lm_addr=lm_addr)
111
+ return model
112
+ st.title("Persian/Farsi Formality Transformer")
113
+ st.write("Translate informal Persian texts to formal")
114
+
115
+
116
+
117
+ user_input: str = st.text_area(
118
+ "Input text",
119
+ height=200,
120
+ max_chars=5120,
121
+ )
122
+
123
+
124
+ if st.button("Run"):
125
+ model = load_model()
126
+ sentence_tokenizer = SentenceTokenizer()
127
+ translated_text = translate(model, sentence_tokenizer, user_input)
128
+ st.success(translated_text)
config.yml ADDED
@@ -0,0 +1,5 @@
1
+ files:
2
+ lm: https://huggingface.co/datasets/Dadmatech/informal2formal/resolve/main/3gram.bin
3
+ assets: https://huggingface.co/datasets/Dadmatech/informal2formal/resolve/main/assets.pkl
4
+ irregular_verb: https://huggingface.co/datasets/Dadmatech/informal2formal/raw/main/irregular_verb_mapper.csv
5
+ verbs: https://huggingface.co/datasets/Dadmatech/informal2formal/raw/main/verbs.csv
download_utils.py ADDED
@@ -0,0 +1,65 @@
1
+ import os
2
+ import sys
3
+
4
+ import requests
5
+ from tqdm import tqdm
6
+ def download_dataset(urls, dest_dir, filename=None):
7
+ # source_code: https://github.com/sirbowen78/lab/blob/master/file_handling/dl_file1.py
8
+ # Adapted from the script above, which originally downloaded a Python installer for macOS.
9
+
10
+ # Home directory of Mac, pathlib.Path module make this easy.
11
+ # home_path = Path.home()
12
+ # This is the sub directory under home directory.
13
+ # sub_path = "tmp"
14
+ # The header of the dl link has a Content-Length which is in bytes.
15
+ # The bytes is in string hence has to convert to integer.
16
+
17
+ os.makedirs(dest_dir, exist_ok=True)
18
+ for url in urls:
19
+ if 'drive.google' in url:
20
+ import gdown
21
+ # import os
22
+ # print('gdown downloadddd output: ', dest_dir )
23
+ # print(dest_dir, filename)
24
+ # dest_dir = os.path.join(dest_dir,'peyma.zip')
25
+ return gdown.download(url, quiet=False, output=filename)
26
+ try:
27
+ filesize = int(requests.head(url).headers["Content-Length"])
28
+ except KeyError:
29
+ print('unknown file length')
30
+ filesize = -1
31
+ # os.path.basename returns python-3.8.5-macosx10.9.pkg,
32
+ # without this module I will have to manually split the url by "/"
33
+ # then get the last index with -1.
34
+ # Example:
35
+ # url.split("/")[-1]
36
+ filename = os.path.basename(url)
37
+
38
+ # make the sub directory, exists_ok=True will not have exception if the sub dir does not exists.
39
+ # the dir will be created if not exists.
40
+ os.makedirs(dest_dir, exist_ok=True)
41
+
42
+ # The absolute path to download the python program to.
43
+ dl_path = os.path.join(dest_dir, filename)
44
+ chunk_size = 1024
45
+ if os.path.exists(dl_path):
46
+ print(f'file {dl_path} already exist')
47
+ return dl_path
48
+ # Use the requests.get with stream enable, with iter_content by chunk size,
49
+ # the contents will be written to the dl_path.
50
+ # tqdm tracks the progress by progress.update(datasize)
51
+ with requests.get(url, stream=True) as r, open(dl_path, "wb") as f, tqdm(
52
+ unit="B", # unit string to be displayed.
53
+ unit_scale=True, # let tqdm to determine the scale in kilo, mega..etc.
54
+ unit_divisor=1024, # is used when unit_scale is true
55
+ total=filesize, # the total iteration.
56
+ file=sys.stdout, # default goes to stderr, this is the display on console.
57
+ desc=filename # prefix to be displayed on progress bar.
58
+ ) as progress:
59
+ for chunk in r.iter_content(chunk_size=chunk_size):
60
+ # download the file chunk by chunk
61
+ datasize = f.write(chunk)
62
+ # on each chunk update the progress bar.
63
+ progress.update(datasize)
64
+
65
+ return True
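A short usage sketch (not part of the committed files) showing how download_dataset is driven by config.yml, mirroring load_model in app.py:

    import os
    from pathlib import Path
    import yaml
    from download_utils import download_dataset

    cache_dir = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
    with open('config.yml') as f:
        config = yaml.safe_load(f)
    # Downloads 3gram.bin, assets.pkl, irregular_verb_mapper.csv and verbs.csv; existing files are skipped.
    download_dataset(config['files'].values(), cache_dir, filename=None)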
formality_transformer.py ADDED
@@ -0,0 +1,40 @@
1
+
2
+ import pickle
3
+ from kenlm_wrapper import Kelm_Wrapper
4
+ from OneShotTransformer import OneShotTransformer
5
+ from VerbHandler import VerbHandler
6
+ import kenlm
7
+ from tokenizer import InformalTokenizer
8
+
9
+
10
+ class FormalityTransformer:
11
+ def __init__(self, asset_file_addr, verbs_csv_addr, irregular_verbs_mapper_addr, lm_addr ):
12
+ assets = pickle.load(open(asset_file_addr, 'rb'))
13
+ self.vocab = assets['vocab']
14
+ self.word_ends_tanvin = assets['word_ends_tanvin']
15
+ self.non_hidden_h_words = assets['non_hidden_h_words']
16
+ self.isolated_words = assets['isolated_words']
17
+ self.ignore_words = assets['ignore_words']
18
+ self.mapper = assets['mapper']
19
+ self.postfix_mapper = assets['postfix_mapper']
20
+ postfixes = assets['postfixes']
21
+
22
+ self.informal_tokenizer = InformalTokenizer(self.vocab, postfixes)
23
+ self.verb_handler = VerbHandler(csv_verb_addr=verbs_csv_addr, csv_irregular_verbs_mapper=irregular_verbs_mapper_addr)
24
+ self.oneshot_transformer = OneShotTransformer(self.vocab, self.mapper, self.verb_handler.informal_to_formal,
25
+ ignore_words=self.ignore_words,
26
+ postfix_mapper=self.postfix_mapper,
27
+ isolated_words=self.isolated_words,
28
+ non_hidden_h_words=self.non_hidden_h_words)
29
+ lm_model = kenlm.Model(lm_addr)
30
+ self.lm_obj = Kelm_Wrapper(lm_model)
31
+
32
+
33
+ def should_filtered_by_one_bigram(self, lemma, word, original_word):
34
+ NIM_FASELE = '‌'
35
+ return original_word in self.vocab and (len(word.split()) > 1 or NIM_FASELE in word)
36
+
37
+ def repalce_for_gpt2(self, word_repr):
38
+ if word_repr in self.word_ends_tanvin:
39
+ return word_repr[:-2] + 'ا'
40
+ return word_repr
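
To make the helper methods above concrete, here is a toy restatement of the rule should_filtered_by_one_bigram encodes; the vocabulary and words are invented for illustration, while the real vocab comes from assets.pkl.

NIM_FASELE = '\u200c'   # zero-width non-joiner
toy_vocab = {'کتاب'}    # stand-in for the vocab loaded from assets.pkl

def should_filter(candidate, original_word):
    # A formal candidate is dropped when the original token is already in the
    # vocabulary and the candidate spans several tokens or contains a ZWNJ.
    return original_word in toy_vocab and (len(candidate.split()) > 1 or NIM_FASELE in candidate)

print(should_filter('کتاب خوب', 'کتاب'))  # True  -> candidate is filtered out
print(should_filter('کتاب', 'کتاب'))      # False -> candidate is kept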
kenlm_wrapper.py ADDED
@@ -0,0 +1,31 @@
1
+
2
+ class Kelm_Wrapper:
3
+ def __init__(self, model):
4
+ self.model = model
5
+ def get_best_candidate_word(self, default_phrases, candidate_phrases, index):
6
+ candidate_texts = [' '.join(default_phrases[:index]) + ' ' + cnd + ' ' + ' '.join(default_phrases[index+1:]) for cnd in candidate_phrases]
7
+ scores = list(map(self.model.score, candidate_texts))
8
+ return scores.index(max(scores))
9
+
10
+
11
+ def get_best_ongram_phrases(self, candidates_list):
12
+ bests = []
13
+ for candidate_phrase in candidates_list:
14
+ scores = list(map(self.model.score, candidate_phrase))
15
+ best_phrase = candidate_phrase[scores.index(max(scores))]
16
+ bests.append(best_phrase)
17
+ return bests
18
+
19
+
20
+ def get_best(self, candidates_list):
21
+ bests = []
22
+ default_phrases = self.get_best_ongram_phrases(candidates_list)
23
+ # print(default_phrases)
24
+ for index in range(len(candidates_list)):
25
+ if len(candidates_list[index]) > 1:
26
+ best_phrase_index = self.get_best_candidate_word(default_phrases, candidates_list[index], index)
27
+ bests.append(candidates_list[index][best_phrase_index])
28
+ else:
29
+ bests.append(candidates_list[index][0])
30
+ return ' '.join(bests)
31
+
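
Kelm_Wrapper only needs an object exposing a score(text) method, so its selection logic can be sanity-checked without the trained 3-gram binary. A minimal sketch with a stub scorer (purely illustrative; not the real KenLM model):

from kenlm_wrapper import Kelm_Wrapper

class StubLM:
    def score(self, text):
        # Toy "log-probability": pretend longer strings are more likely.
        return len(text)

wrapper = Kelm_Wrapper(StubLM())
candidates = [['hello'], ['wrld', 'world'], ['!']]
print(wrapper.get_best(candidates))  # -> 'hello world !'

For each position with more than one candidate, get_best rescores the whole sentence with every alternative substituted in and keeps the highest-scoring one.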
main.py ADDED
@@ -0,0 +1,96 @@
1
+ import itertools
2
+ import os
3
+ from pathlib import Path
4
+ import yaml
5
+ from download_utils import download_dataset
6
+ import utils
7
+ from formality_transformer import FormalityTransformer
8
+ from hazm import SentenceTokenizer
9
+
10
+
11
+
12
+ def translate_short_sent(model, sent):
13
+ out_dict = {}
14
+ txt = utils.cleanify(sent)
15
+ is_valid = lambda w: model.oneshot_transformer.transform(w, None)
16
+ cnd_tokens = model.informal_tokenizer.tokenize(txt, is_valid)
17
+ for tokens in cnd_tokens:
18
+ tokens = [t for t in tokens if t != '']
19
+ new_tokens = []
20
+ for t in tokens:
21
+ new_tokens.extend(t.split())
22
+ txt = ' '.join(new_tokens)
23
+ tokens = txt.split()
24
+ candidates = []
25
+ for index in range(len(tokens)):
26
+ tok = tokens[index]
27
+ cnd = set()
28
+ pos = None
29
+ if model.verb_handler.informal_to_formal(tok):
30
+ pos = 'VERB'
31
+ f_words_lemma = model.oneshot_transformer.transform(tok, pos)
32
+ f_words_lemma = list(f_words_lemma)
33
+ for index, (word, lemma) in enumerate(f_words_lemma):
34
+ if pos != 'VERB' and tok not in model.mapper and model.should_filtered_by_one_bigram(lemma, word, tok):
35
+ f_words_lemma[index] = (tok, tok)
36
+ else:
37
+ word_toks = word.split()
38
+ word_repr = ''
39
+ for t in word_toks:
40
+ word_repr += ' ' + t
41
+ word_repr = word_repr.strip()
42
+ word_repr = model.repalce_for_gpt2(word_repr)
43
+ f_words_lemma[index] = (word, word_repr)
44
+ if f_words_lemma:
45
+ cnd.update(f_words_lemma)
46
+ else:
47
+ cnd = {(tok, tok)}
48
+ candidates.append(cnd)
49
+ all_combinations = itertools.product(*candidates)
50
+ all_combinations_list = list(all_combinations)
51
+ for id, cnd in enumerate(all_combinations_list):
52
+ normal_seq = ' '.join([c[0] for c in cnd])
53
+ lemma_seq = ' '.join([c[1] for c in cnd])
54
+ lemma_seq = utils.clean_text_for_lm(lemma_seq)
55
+ out_dict[id] = (normal_seq, lemma_seq)
56
+ candidates = [[item[0] for item in candidate_phrases] for candidate_phrases in candidates]
57
+ return model.lm_obj.get_best(candidates)
58
+
59
+
60
+ def translate(model, sentence_tokenizer, txt):
61
+ sents = sentence_tokenizer.tokenize(txt)
62
+ formal_output = ''
63
+ for sentence in sents:
64
+ formal_sentence = translate_short_sent(model, sentence)
65
+ formal_output += ' ' + formal_sentence
66
+ return formal_output
67
+
68
+ def load_config(config_file):
69
+ with open(config_file, "r") as file:
70
+ config = yaml.safe_load(file)
71
+ return config
72
+
73
+
74
+
75
+
76
+ if __name__ == '__main__':
77
+
78
+ #download or load files
79
+ DEFAULT_CACHE_DIR = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
80
+ config = load_config('config.yml')
81
+ file_urls = config['files'].values()
82
+ download_dataset(file_urls, DEFAULT_CACHE_DIR, filename=None)
83
+
84
+ # set assets files address
85
+ verbs_csv_addr = os.path.join(DEFAULT_CACHE_DIR, 'verbs.csv')
86
+ irregular_verbs_mapper = os.path.join(DEFAULT_CACHE_DIR, 'irregular_verb_mapper.csv')
87
+ lm_addr = os.path.join(DEFAULT_CACHE_DIR,'3gram.bin')
88
+ assets_file_addr = os.path.join(DEFAULT_CACHE_DIR,'assets.pkl')
89
+
90
+ #test on a sample
91
+ sentence_tokenizer = SentenceTokenizer()
92
+ model = FormalityTransformer(asset_file_addr=assets_file_addr,
93
+ irregular_verbs_mapper_addr=irregular_verbs_mapper, verbs_csv_addr=verbs_csv_addr, lm_addr=lm_addr)
94
+ print(translate(model, sentence_tokenizer, 'اینو میشه واسه تبدیل تموم جملات محاوره استفاده کرد اگه خواستین'))
95
+
96
+
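
config.yml itself is only five lines and is not shown in this view. Judging from how main.py consumes it (config['files'].values() plus the four asset file names above), it presumably holds a single files mapping; the snippet below parses an assumed layout with placeholder URLs purely to illustrate the expected shape.

import yaml

example_config = """
files:
  verbs: https://example.com/verbs.csv
  irregular_verb_mapper: https://example.com/irregular_verb_mapper.csv
  lm: https://example.com/3gram.bin
  assets: https://example.com/assets.pkl
"""
config = yaml.safe_load(example_config)
print(list(config['files'].values()))  # the URL list handed to download_dataset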
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ pandas
2
+ hazm
3
+ datasets
4
+ PyYAML
5
+ # kenlm  # already provided by the git+ line below; listing it twice can make pip fail with a double-requirement error
6
+ streamlit
7
+ git+https://github.com/kpu/kenlm@master#egg=kenlm
tokenizer.py ADDED
@@ -0,0 +1,184 @@
1
+ import itertools
2
+ import utils
3
+ class InformalTokenizer:
4
+ def __init__(self, vocab, postfixes):
5
+ self.vocab = vocab
6
+ self.pres = InformalTokenizer.get_prefixs()
7
+ self.posts = postfixes
8
+
9
+ @staticmethod
10
+ def get_prefixs():
11
+ return ['نا', 'بی', 'هر', 'می']
12
+
13
+ @staticmethod
14
+ def get_postfixs(informal_postfix_addr):
15
+ with open(informal_postfix_addr, 'r') as f:
16
+ ps = f.read().splitlines()
17
+ return ps
18
+
19
+
20
+ def is_pre_post_word(self, w):
21
+ nim_fasele = '‌'
22
+ ws = w.split(nim_fasele)
23
+ pre, pos, v = [0,1,2]
24
+ is_pre_pos = False
25
+ state = pre
26
+ valid_w = ''
27
+ for w in ws:
28
+ if state == pre:
29
+ if w in self.pres:
30
+ valid_w += nim_fasele + w
31
+ is_pre_pos = True
32
+ continue
33
+ elif w in self.posts:
34
+ valid_w += nim_fasele + w
35
+ is_pre_pos = True
36
+ state = pos
37
+ continue
38
+ state = v
39
+ valid_w += nim_fasele + w
40
+ continue
41
+
42
+ if state == pos:
43
+ if w in self.posts:
44
+ valid_w += nim_fasele + w
45
+ continue
46
+ return False
47
+ if state == v:
48
+ if w in self.posts:
49
+ is_pre_pos = True
50
+ state = pos
51
+ valid_w += nim_fasele + w
52
+ continue
53
+ if w in self.vocab:
54
+ valid_w += nim_fasele + w
55
+ if valid_w not in self.vocab:
56
+ return False
57
+ continue
58
+
59
+ return False
60
+ if not is_pre_pos:
61
+ return False
62
+ return True
63
+
64
+
65
+ def get_valid_word(self, words):
66
+ seps = ['', '‌']
67
+ all_seqs = []
68
+ count = len(words)
69
+ lst = list(itertools.product(seps, repeat=count-1))
70
+ for item in lst:
71
+ seq = ''
72
+ for word, sep in zip(words[:-1], item):
73
+ seq += word + sep
74
+ seq += words[-1]
75
+ all_seqs.append(seq)
76
+ return [w for w in all_seqs if w in self.vocab or self.is_pre_post_word(w)]
77
+
78
+ def get_candidates(self, tokens, index=0, current_seq = ' '):
79
+ if index == len(tokens):
80
+ return current_seq
81
+ word = tokens[index]
82
+ next_word, next_next_word = [None, None]
83
+ if index < len(tokens) -1:
84
+ next_word = tokens[index+1]
85
+ if index < len(tokens) -2:
86
+ next_next_word = tokens[index+2]
87
+ cnds = []
88
+ if next_word is not None:
89
+ v_words = self.get_valid_word([word, next_word])
90
+ if v_words:
91
+ for v_w in v_words:
92
+ current_seq1 = current_seq + ' ' + v_w
93
+ cnds2 = self.get_candidates(tokens,index+2, current_seq1)
94
+ if type(cnds2) == str:
95
+ cnds.append(cnds2)
96
+ else:
97
+ cnds.extend(cnds2)
98
+ if next_next_word is not None:
99
+ v_words = self.get_valid_word([word, next_word, next_next_word])
100
+ if v_words:
101
+ for v_w in v_words:
102
+ current_seq2 = current_seq + ' ' + v_w
103
+ cnds3 = self.get_candidates(tokens,index+3, current_seq2)
104
+ if type(cnds3) == str:
105
+ cnds.append(cnds3)
106
+ else:
107
+ cnds.extend(cnds3)
108
+ current_seq = current_seq + ' ' + word
109
+ cnds1 = self.get_candidates(tokens,index+1, current_seq)
110
+ if type(cnds1) == str:
111
+ cnds.append(cnds1)
112
+ else:
113
+ cnds.extend(cnds1)
114
+ return [c.strip() for c in cnds]
115
+
116
+ def seperate_conjs(self, word, validator):
117
+ conjs = ['و', 'در', 'با', 'تا', 'که', 'از', 'تو', 'من', 'شما']
118
+ cnds = utils.split_conj_words(word, conjs)
119
+ valid_cnds = [c for c in cnds if validator(c)]
120
+ if valid_cnds:
121
+ return valid_cnds
122
+ return [word]
123
+
124
+ def tokenize(self, txt, validator):
125
+ tokens = txt.split()
126
+ all_cnds = []
127
+ for t in tokens:
128
+ if not validator(t):
129
+ ws = self.seperate_conjs(t, validator)
130
+ else:
131
+ ws = [t]
132
+ all_cnds.append(ws)
133
+ all_cnd_tokens = itertools.product(*all_cnds)
134
+ txts = list(map(self.get_dense_tokens, all_cnd_tokens))
135
+ return txts
136
+
137
+ def get_dense_tokens(self, tokens):
138
+ PRE, WORD, POST = 0,1,2
139
+ out_tokens = []
140
+ nim_fasele = '‌'
141
+ current_word = ''
142
+ state = WORD
143
+ for i, t in enumerate(tokens):
144
+ if state == WORD:
145
+ if t in self.pres:
146
+ out_tokens.append(current_word)
147
+ current_word = t
148
+ state = PRE
149
+ if t in self.posts:
150
+ current_word += nim_fasele
151
+ current_word += t
152
+ state = POST
153
+ if t not in self.pres and t not in self.posts:
154
+ out_tokens.append(current_word)
155
+ current_word = t
156
+ continue
157
+ if state == PRE:
158
+ if t in self.pres:
159
+ current_word += nim_fasele
160
+ current_word += t
161
+ if t in self.posts:
162
+ out_tokens.append(current_word)
163
+ current_word = t
164
+ state = WORD
165
+ if t not in self.pres and t not in self.posts:
166
+ current_word += nim_fasele
167
+ current_word += t
168
+ state = WORD
169
+ continue
170
+ if state == POST:
171
+ if t in self.pres:
172
+ out_tokens.append(current_word)
173
+ current_word = t
174
+ state = PRE
175
+ if t in self.posts:
176
+ current_word += nim_fasele
177
+ current_word += t
178
+ if t not in self.pres and t not in self.posts:
179
+ out_tokens.append(current_word)
180
+ current_word = t
181
+ state = WORD
182
+ if not out_tokens or out_tokens[-1] != current_word:  # guard against an empty list before comparing
183
+ out_tokens.append(current_word)
184
+ return out_tokens
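
The heart of get_valid_word above is an enumeration of every way to glue consecutive tokens together with either nothing or a zero-width non-joiner, keeping only variants found in the vocabulary. A self-contained sketch of that enumeration, with Latin placeholders so the combinatorics are easy to see:

import itertools

ZWNJ = '\u200c'  # zero-width non-joiner, the same separator used above

def candidate_joins(words):
    # Try every choice of separator ('' or ZWNJ) between consecutive words.
    joins = []
    for combo in itertools.product(['', ZWNJ], repeat=len(words) - 1):
        seq = ''.join(w + sep for w, sep in zip(words[:-1], combo)) + words[-1]
        joins.append(seq)
    return joins

print(candidate_joins(['foo', 'bar']))        # ['foobar', 'foo\u200cbar']
print(len(candidate_joins(['a', 'b', 'c'])))  # 4 combinations for 3 tokens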
utils.py ADDED
@@ -0,0 +1,302 @@
1
+ from functools import reduce
2
+ import itertools
3
+ import json
4
+ import re
5
+ import string
6
+ import pandas as pd
7
+ from hazm import Normalizer, WordTokenizer
8
+
9
+ normalizer = Normalizer()
10
+ tokenizer = WordTokenizer(separate_emoji=True)
11
+
12
+
13
+ def seprate_emoji_string(txt):
14
+ try:
15
+ oRes = re.compile(u'(['
16
+ u'\U0001F300-\U0001F64F'
17
+ u'\U0001F680-\U0001F6FF'
18
+ u'\u2600-\u26FF\u2700-\u27BF]+)',
19
+ re.UNICODE)
20
+ except re.error:
21
+ oRes = re.compile(u'(('
22
+ u'\ud83c[\udf00-\udfff]|'
23
+ u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
24
+ u'[\u2600-\u26FF\u2700-\u27BF])+)',
25
+ re.UNICODE)
26
+
27
+ return oRes.sub(r' \1 ', txt)
28
+
29
+ def cleanify(txt):
30
+ txt = txt.strip()
31
+ txt = re.sub(r'\s+', ' ', txt)
32
+ txt = re.sub('\u200f', '', txt)
33
+ txt = re.sub('‌+', '‌', txt)
34
+ txt = re.sub('‌ ', ' ', txt)
35
+ txt = re.sub(' ‌', ' ', txt)
36
+ txt = normalizer.normalize(txt)
37
+ txt = seprate_emoji_string(txt)
38
+ txt = ' '.join(tokenizer.tokenize(txt))
39
+ return txt
40
+
41
+
42
+
43
+
44
+ def clean_text_for_lm(txt):
45
+ ignore_chars = '.1234567890!@#$%^&*()_+۱۲۳۴۵۶۷۸۹÷؟×−+?><}،,{":' + string.ascii_lowercase + string.ascii_uppercase
46
+ tokens = txt.split()
47
+ clean_tokens = [t for t in tokens if not (any(ic in t for ic in ignore_chars) or if_emoji(t))]
48
+ return ' '.join(clean_tokens)
49
+
50
+
51
+ def add_to_mapper(mapping_list):
52
+ print(len(mapping_list))
53
+ df = pd.read_csv('resources/mapper.csv', delimiter=',', index_col=None)
54
+ print(df.columns)
55
+ for item in mapping_list:
56
+ df = df.append({'formal': item[1], 'informal': item[0]}, ignore_index=True)
57
+ df.to_csv('resources/mapper.csv', index=False)
58
+
59
+
60
+ def extract_non_convertable_words(corpus_addr, tokenizer, normalizer, transformer, output_addr, vocab):
61
+ f = open(corpus_addr)
62
+ non_convertables = {}
63
+ seen_words = set()
64
+ nim_fasele = '‌'
65
+ for i, line in enumerate(f):
66
+ print(i)
67
+ # if i > 500:
68
+ # break
69
+ line = normalizer.normalize(line)
70
+ tokens = tokenizer.tokenize(line)
71
+ for t in tokens:
72
+ # if nim_fasele in t:
73
+ # print(t)
74
+ if t in seen_words:
75
+ if t in non_convertables:
76
+ non_convertables[t] += 1
77
+ else:
78
+ candidates = transformer.transform(t, None)
79
+ # if not candidates and any(t.startswith(pre) for pre in ['از', 'در', 'چند', 'هر', 'هیچ', 'هم', 'با', 'بی', 'تا', 'و']):
80
+ # print(t)
81
+ if not candidates:
82
+ non_convertables[t] = 1
83
+ seen_words.add(t)
84
+ words_count = sorted([(word, count) for word, count in non_convertables.items()], key=lambda item: item[1], reverse=True)
85
+ words_count = [str(word) + ' ########### ' + str(count) for (word, count) in words_count]
86
+ with open(output_addr, 'w+') as f:
87
+ f.write('\n'.join(words_count))
88
+
89
+
90
+ def generate_irrgular_informal_verbs():
91
+ """
92
+ برمیگرده میوفته برمیداره برمیگردونه درمیاره ایستادن نمیومد وامیسته
93
+
94
+ اومد
95
+ نیومد
96
+ اومدی
97
+ نیومدی
98
+ میومدی
99
+ نیومده
100
+ یومد
101
+ میومده
102
+ """
103
+
104
+ mapping_verbs = []
105
+ past_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ید', 'ند', '', 'ن']
106
+ neg = ['ن', '']
107
+ pre = ['می', 'ب']
108
+ pre_verbs = [('بر', 'دار'), ('در', 'یار'), ('وا', 'ست'), ('بر', 'گرد'), ('ور', 'دار'), ('بر', 'گشت')]
109
+ extras = ['ن', 'نمی', 'می']
110
+ mapper = {'ه':'د', 'ن': 'ند', 'ین': 'ید', 'ور': 'بر', 'ست':'ایست', 'وا':'', 'یار':'آور'}
111
+ for item in pre_verbs:
112
+ for pe in past_ends:
113
+ for ex in extras:
114
+ p_end = pe
115
+ item0 = item[0]
116
+ item1 = item[1]
117
+ inf = item0 + ex + item1 + p_end
118
+ inf = inf.replace('یی', 'ی')
119
+ if item0 in mapper:
120
+ item0 = mapper[item0]
121
+ if item1 in mapper:
122
+ item1 = mapper[item1]
123
+ if p_end in mapper:
124
+ p_end = mapper[p_end]
125
+ formal = item0 + ex + item1 + p_end
126
+ formal = formal.replace('می', 'می‌')
127
+ formal = formal.replace('نآ', 'نیا')
128
+ mapping_verbs.append([formal, inf])
129
+ bons = ['یومد', 'یوفت']
130
+ v_mapper = {'یومد': 'یامد', 'یوفت': 'افت'}
131
+ verbs = itertools.product(neg, pre, bons, past_ends)
132
+ for v in verbs:
133
+ if v[0] == 'ن' and v[1] == 'ب' or (v[2] == 'یومد' and v[1] == 'ب'):
134
+ continue
135
+ inf = v[0] + v[1] + v[2] + v[3]
136
+ inf = inf.replace('یی', 'ی')
137
+ pe = v[3]
138
+ if pe in mapper:
139
+ pe = mapper[pe]
140
+ formal = v[0] + v[1] + '‌' + v_mapper[v[2]] + pe
141
+ formal = formal.replace('ی‌ی', 'ی')
142
+ formal = formal.replace('یا', 'ی‌آ')
143
+ formal = formal.replace('دد', 'ده')
144
+ formal = formal.replace('ب‌ا', 'بی')
145
+ mapping_verbs.append([formal, inf])
146
+ add_to_mapper(mapping_verbs)
147
+
148
+
149
+
150
+ def load_vocab(vocab_addr='resources/words.dat'):
151
+ vocab = {}
152
+ with open(vocab_addr, 'r', encoding='utf-8') as f:
153
+ for line in f:
154
+ try:
155
+ word, freq, p_tags = line.strip().split('\t')
156
+ vocab[word] = {'freq': freq, 'tags': p_tags}
157
+ except ValueError:  # line does not have the word<TAB>freq<TAB>tags format
158
+ word = line.strip()
159
+ vocab[word] = {'freq': 1, 'tags': 'NUM'}
160
+ return vocab
161
+
162
+ def if_connect(word1, word2):
163
+ not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
164
+ if any(w =='' for w in [word1, word2]) or word1[-1] in not_connect_chars:
165
+ return True
166
+ return False
167
+ def split_conj_words(word, conjs):
168
+ candidates = set()
169
+ sorted_conjs = sorted(conjs, key=lambda x: len(x), reverse=True)
170
+ for c in sorted_conjs:
171
+ indx = word.find(c)
172
+ if indx != -1 and indx in [0, len(word)-1]:
173
+ pre_w = word[:indx]
174
+ next_w = word[indx+len(c) :]
175
+ if if_connect(pre_w, c) and if_connect(c, next_w):
176
+ cnd = ' '.join([pre_w, c, next_w])
177
+ cnd = cnd.strip()
178
+ candidates.add(cnd)
179
+ return list(candidates)
180
+
181
+
182
+ def is_formal_prefixed(word, vocab):
183
+ not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
184
+ nim_fasele = '‌'
185
+ m1 = re.match('(.+)های(م|ت|ش|مان|تان|شان)?$', word)
186
+ m2 = re.match('(.+[ا|و|ی])ی(م|ت|ش|مان|تان|شان)$', word)
187
+ m3 = re.match('(.+[^ا^و^ی])(م|ت|ش|مان|تان|شان)$', word)
188
+ m4 = re.match('(.+)(ها)$', word)
189
+ m5 = re.match('(.+[ه|ی]‌)(اش|ام|ات)$', word)
190
+ if m3 or m2:
191
+ prefix_word = list(filter(lambda m: m is not None, [m3, m2]))[0].group(1)
192
+ if prefix_word in vocab:
193
+ return True
194
+ m_fired = list(filter(lambda m: m is not None, [m1, m4, m5]))
195
+ if len(m_fired) > 0:
196
+ # print(word, m_fired[0].groups())
197
+ prefix_word = m_fired[0].group(1)
198
+ if prefix_word[-1] != nim_fasele and prefix_word[-1] not in not_connect_chars:
199
+ return False
200
+ if prefix_word[-1] == nim_fasele and not (prefix_word[:-1] in vocab):
201
+ return False
202
+ if prefix_word[-1] != nim_fasele and not (prefix_word in vocab):
203
+ return False
204
+ return True
205
+ return False
206
+
207
+
208
+ def spelling_similairty(word):
209
+ all_possible = []
210
+ possible_repeated = get_possible_repeated_word(word)
211
+ all_possible = possible_repeated
212
+ if word in all_possible:
213
+ all_possible.remove(word)
214
+ return all_possible
215
+
216
+ def add_nim_alef_hat_dictionary(vocab):
217
+ word_with_hat = filter(lambda w: 'آ' in w, vocab)
218
+ word_with_nim = filter(lambda w: '‌' in w, vocab)
219
+ mapper1 = {w.replace('آ', 'ا').replace('‌', ''): w for w in word_with_hat}
220
+ mapper2 = {w.replace('‌', ''): w for w in word_with_nim}
221
+ mapper1.update(mapper2)
222
+ return mapper1
223
+
224
+ def generate_spell_mapper(vocab):
225
+ hat = 'آ'
226
+ tanvin = 'اً'
227
+ nim = '‌'
228
+ hamzeh = 'أ'
229
+ hamzeh_y = 'ئ'
230
+ sp_mapper = {hamzeh_y: ['ی'], hat: ['ا'], tanvin: ['ن', 'ا'], nim:['', ' '], hamzeh:['ا', '']}
231
+ special_chars = [hat, tanvin, nim, hamzeh, hamzeh_y]  # include hamzeh_y so its mapping above is actually applied
232
+ out = {}
233
+ for word in vocab:
234
+ p_words = [word.replace(sp, sp_alt) for sp in special_chars for sp_alt in sp_mapper[sp]]
235
+ spell_errors = []
236
+ p_words = list(set(p_words) - set([word]))
237
+ for pw in p_words:
238
+ if pw in out:
239
+ out[pw].add(word)
240
+ else:
241
+ out[pw] = {word}
242
+ out = {w: list(out[w]) for w in out}
243
+ with open('spell_checker_mapper.json', 'w+', encoding='utf-8') as f:
244
+ json.dump(out, f, ensure_ascii=False, indent=1)
245
+
246
+
247
+
248
+ def create_mapper_tanvin_hamze_hat_nim_fasele():
249
+ mapper = {}
250
+ hats_word = open('resources/spell/words_with_hat.txt').read().splitlines()
251
+ nim_words = open('resources/spell/words_with_nim.txt').read().splitlines()
252
+ tanvin_words = open('resources/spell/words_with_tanvin.txt').read().splitlines()
253
+ hat_ch = 'آ'
254
+ nim_fasele = '‌'
255
+ for w in hats_word:
256
+ w_without_h = w.replace(hat_ch, 'ا')
257
+ mapper[w_without_h] = w
258
+ for w in nim_words:
259
+ w_without_nim = w.replace(nim_fasele, '')  # str has no .remove(); strip the ZWNJ instead
260
+ mapper[w_without_nim] = w
261
+ w_space_instead_nim = w.replace(nim_fasele, ' ')
262
+ mapper[w_space_instead_nim] = w
+ # NOTE: tanvin_words is read above but not mapped yet.
+ return mapper
263
+
264
+ def extract_lemma_nim_fasele_words(word, vocab):
265
+ prefixs = ['اون']
266
+ postfixs = {'ست': 'است', 'هام':'هایم', 'ام':'ام', 'ها':'ها', 'هامون':'هایمان', 'ترین': 'ترین', 'هایشان':'هایشان'}
267
+ tokens = word.split('‌')
268
+ index = 0
269
+ for i in range(len(tokens)):
270
+ index = i
271
+ if tokens[i] not in prefixs:
272
+ break
273
+
274
+ for i in range(len(tokens), 0, -1):
275
+ current_tok = '‌'.join(tokens[index:i])
276
+ if current_tok in vocab or tokens[i-1] not in postfixs:
277
+ return current_tok
278
+
279
+
280
+ def if_emoji(text):
281
+ # Wide UCS-4 build
282
+ try:
283
+ oRes = re.compile(u'(['
284
+ u'\U0001F300-\U0001F64F'
285
+ u'\U0001F680-\U0001F6FF'
286
+ u'\u2600-\u26FF\u2700-\u27BF]+)',
287
+ re.UNICODE)
288
+
289
+ except re.error:
290
+ # Narrow UCS-2 build
291
+ oRes = re.compile(u'(('
292
+ u'\ud83c[\udf00-\udfff]|'
293
+ u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
294
+ u'[\u2600-\u26FF\u2700-\u27BF])+)',
295
+ re.UNICODE)
296
+
297
+ return oRes.findall(text)
298
+
299
+
300
+ def powerset(lst):
301
+ return reduce(lambda result, x: result + [subset + [x] for subset in result],
302
+ lst, [[]])
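
As a quick sanity check of the reduce-based powerset helper that closes the file:

from utils import powerset

print(powerset([1, 2]))      # [[], [1], [2], [1, 2]]
print(len(powerset('abc')))  # 8 subsets for a 3-element iterable

Each new element doubles the number of subsets, which the reduce expresses by appending x to every subset accumulated so far.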