mohamedabdullah committed on
Commit
73f4808
1 Parent(s): 0400c34

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +215 -0
app.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from datasets import load_dataset
3
+ import re
4
+ import numpy as np
5
+
6
# Load the word-list dataset; the vocabulary text is read from the first row
# of the "text" column of the "train" split.
dataset = load_dataset("mohamedabdullah/Arabic-unique-words", data_files="ar_vocab.txt")

# Keep runs of 2-25 characters that are not ASCII letters/digits, whitespace
# or non-word characters.  The pattern is a raw string so that `\s` and `\W`
# reach the regex engine verbatim instead of being parsed as (invalid) string
# escape sequences.
word_l = re.findall(r'[^a-zA-Z0-9\s\W]{2,25}', dataset['train']['text'][0])

# A set gives constant-time membership tests for the spell checker.
vocab = set(word_l)
9
+
10
def delete_letter(word):
    """Return every string obtained by removing one character from ``word``.

    The result holds ``len(word)`` entries ordered by the position of the
    removed character; duplicate strings are kept.
    """
    shortened = []
    for position, _ in enumerate(word):
        shortened.append(word[:position] + word[position + 1:])
    return shortened
12
+
13
def switch_letter(word):
    """Return every string obtained by swapping two adjacent characters.

    Entry ``i`` of the result is ``word`` with the characters at positions
    ``i`` and ``i + 1`` exchanged, so the list has ``len(word) - 1`` entries
    (none for words shorter than two characters).

    The swap is done on the string itself rather than on a regex-extracted
    list of word characters, so characters that ``\\w`` does not match are
    preserved instead of being silently dropped from the candidates.
    """
    return [
        word[:i] + word[i + 1] + word[i] + word[i + 2:]
        for i in range(len(word) - 1)
    ]
26
+
27
def replace_letter(word):
    """Return the sorted, de-duplicated single-substitution variants of ``word``.

    Each position of ``word`` is replaced in turn by every character of the
    Arabic alphabet string below; the unchanged word itself is never part of
    the result.
    """
    alphabet = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'

    variants = {
        word[:pos] + ch + word[pos + 1:]
        for pos in range(len(word))
        for ch in alphabet
    }
    # Substituting a character with itself reproduces the input; drop it.
    variants.discard(word)

    return sorted(variants)
42
+
43
def insert_letter(word):
    """Return every string obtained by inserting one alphabet character.

    Insertion is attempted at each of the ``len(word) + 1`` gaps (including
    before the first and after the last character), trying the alphabet
    characters in order at each gap.  Duplicates are kept.
    """
    alphabet = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'

    return [
        word[:gap] + ch + word[gap:]
        for gap in range(len(word) + 1)
        for ch in alphabet
    ]
53
+
54
def edit_one_letter(word, allow_switches=True):
    """Return the set of strings one edit away from ``word``.

    Edits are single-character deletions, insertions and replacements, plus
    adjacent-character swaps when ``allow_switches`` is true.
    """
    candidates = set(delete_letter(word))
    candidates.update(insert_letter(word))
    candidates.update(replace_letter(word))

    if allow_switches:
        candidates.update(switch_letter(word))

    return candidates
61
+
62
def edit_two_letters(word, allow_switches=True):
    """Return the set of strings at most two edits away from ``word``.

    The result is the union of the one-edit candidates of ``word`` and the
    one-edit candidates of each of those candidates.

    ``allow_switches`` is forwarded to ``edit_one_letter`` for both rounds of
    edits, so passing ``False`` really excludes adjacent-character swaps.
    """
    one_edit = edit_one_letter(word, allow_switches)

    # Start from the one-edit candidates so they are part of the result.
    two_edits = set(one_edit)
    for candidate in one_edit:
        two_edits |= edit_one_letter(candidate, allow_switches)

    return two_edits
70
+
71
def get_corrections(word, vocab):
    """Return the set of vocabulary words that best match ``word``.

    Candidates are tried in order of increasing distance and the first
    non-empty tier wins:

    1. ``word`` itself, if it is in ``vocab``;
    2. vocabulary words one edit away;
    3. vocabulary words up to two edits away;
    4. a one-element set holding a fallback message (Arabic for "no suitable
       suggestions were found for this word").

    Each tier is only computed when the previous one produced nothing, so the
    large two-edit candidate set is not built unless it is actually needed.

    ``vocab`` may be any container supporting ``in``.
    """
    if word in vocab:
        return {word}

    one_edit_matches = {cand for cand in edit_one_letter(word) if cand in vocab}
    if one_edit_matches:
        return one_edit_matches

    two_edit_matches = {cand for cand in edit_two_letters(word) if cand in vocab}
    if two_edit_matches:
        return two_edit_matches

    return {'لم يتم العثور علي إقتراحات مناسبة لهذه الكلمة'}
81
+
82
def min_edit_distance(source, target, ins_cost=1, del_cost=1, rep_cost=2):
    """Return the minimum cost of turning ``source`` into ``target``.

    A dynamic-programming table of shape ``(len(source) + 1, len(target) + 1)``
    is filled in, where cell ``[i, j]`` is the cheapest way to convert the
    first ``i`` characters of ``source`` into the first ``j`` characters of
    ``target`` using insertions (``ins_cost``), deletions (``del_cost``) and
    replacements (``rep_cost``; matching characters cost nothing).

    The value returned is the bottom-right cell of that integer NumPy table.
    """
    rows, cols = len(source), len(target)
    table = np.zeros((rows + 1, cols + 1), dtype=int)

    # First column: delete every character of the source prefix.
    for i in range(1, rows + 1):
        table[i, 0] = table[i - 1, 0] + del_cost

    # First row: insert every character of the target prefix.
    for j in range(1, cols + 1):
        table[0, j] = table[0, j - 1] + ins_cost

    for i, src_char in enumerate(source, start=1):
        for j, tgt_char in enumerate(target, start=1):
            substitution = 0 if src_char == tgt_char else rep_cost
            table[i, j] = np.min([
                table[i - 1, j] + del_cost,
                table[i, j - 1] + ins_cost,
                table[i - 1, j - 1] + substitution,
            ])

    return table[rows, cols]
105
+
106
def get_suggestions(corrections, word):
    """Return ``corrections`` as a list ordered by edit distance from ``word``.

    The distance of each correction is computed with ``min_edit_distance``
    (``word`` as source, the correction as target) and the corrections are
    arranged in the order given by ``np.argsort`` of those distances, closest
    first.
    """
    candidates = list(corrections)
    costs = [min_edit_distance(word, candidate) for candidate in candidates]

    return [candidates[rank] for rank in np.argsort(costs)]
120
+
121
# Inline stylesheet emitted at the top of every HTML report produced by
# ar_spelling_checker.
_RESULT_STYLE = '''<style>
.content{
direction: rtl;
}
.word{
color: #842029;
background-color: #f8d7da;
border-color: #f5c2c7;
padding: 10px 20px;
display: inline-block;
direction: rtl;
font-size: 15px;
font-weight: 500;
margin-bottom: 15px;
box-sizing: border-box;
border: 1px solid transparent;
border-radius: 0.25rem;
}
.suggest{
color: #0f5132;
background-color: #d1e7dd;
border-color: #badbcc;
display: inline-block;
margin-right: 5px;
}
.separator{
height:3px;
background: #CCC;
margin-bottom: 15px;
}
.msg{
color: #0f5132;
background-color: #d1e7dd;
border-color: #badbcc;
border: 1px solid transparent;
border-radius: 0.25rem;
padding: 15px 20px;
direction: rtl;
font-size: 20px;
font-weight: 500;
text-align: center;
}
</style>'''


def ar_spelling_checker(text):
    """Spell-check ``text`` and return the findings as an HTML fragment.

    Every run of three or more word characters in ``text`` is looked up in
    the module-level ``vocab``.  For each distinct word that is missing, the
    corrections from ``get_corrections`` are ordered with ``get_suggestions``
    and rendered below the word.  When no word is flagged, a single message
    (Arabic for "there are no spelling mistakes") is rendered instead.

    Returns:
        A string made of an inline ``<style>`` block followed by one
        ``<div class="content">`` element.
    """
    result = {}

    # Raw string: `\w` must reach the regex engine, not the string parser.
    for word in re.findall(r'\w{3,}', text):
        # A repeated misspelling is reported once, so skip the costly
        # correction search for words that were already handled.
        if word in vocab or word in result:
            continue
        corrections = get_corrections(word, vocab)
        if not corrections:
            continue
        result[word] = get_suggestions(corrections, word)

    parts = [_RESULT_STYLE, '<div class="content">']

    if not result:
        parts.append('<div class="msg">لا توجد أخطاء إملائية 🤗</div>')

    # Words come from a `\w` match, so they cannot contain HTML
    # metacharacters such as `<` or `&`.
    for word, suggestions in result.items():
        parts.append(f'<div class="word">{word}</div><br />')
        parts.extend(
            f'<div class="word suggest">{suggest}</div>' for suggest in suggestions
        )
        parts.append('<div class="separator"></div>')

    parts.append('</div>')

    return ''.join(parts)
192
+
193
# Stylesheet passed to gr.Blocks to restyle the input box, button, title and
# description of the page.
APP_CSS = """
#input{direction: rtl;}
#component-112{height: 30px;}
.gr-form{margin-top: 15px;}
.gr-text-input{font-size: 17px; height:50px; padding: 0.725rem;}
.text-gray-500{font-size: 16px; margin-bottom: 13px;}
.gr-button{color: #084298; background-color: #cfe2ff; border-color: #b6d4fe;
border: 1px solid transparent; border-radius: 0.25rem;
padding: 15px 20px; font-size: 20px; font-weight: 500; font-family: 'IBM Plex Mono';}
.output-html{min-height: 2rem;}
.title{text-align: center;font-size: 25px;margin-top: 13px;position: absolute;width:100%;
line-height: 1.5;font-family: 'IBM Plex Mono';}
.desc{text-align: center; font-size: 17px; font-family: 'IBM Plex Mono'; margin-top: 46px;}"""

# Page layout: title, description, text input, button and an HTML output area.
with gr.Blocks(css=APP_CSS) as demo:

    intro = gr.HTML('<h1 class="title">Arabic Spelling Checker 🤗</h1>')
    description = gr.HTML('<p class="desc">Web-based app to detect spelling mistakes in Arabic words using dynamic programming</p>')
    text = gr.Textbox(label="النص", elem_id="input")
    btn = gr.Button("Spelling Check")
    output = gr.HTML()

    # Clicking the button runs the checker on the textbox content and shows
    # the returned HTML in the output component.
    btn.click(ar_spelling_checker, inputs=[text], outputs=output)

# NOTE(review): launch() is assumed to sit after the `with` block, as is usual
# for Blocks apps — confirm against the original indentation.
demo.launch(inline=False)