# coding: utf-8
import re
from normalize import chuan_hoa_dau_tu_tieng_viet
import numpy as np
from tqdm import tqdm
import textdistance
import json
from copy import copy
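
# Build a syllable-level confusion set for Vietnamese spelling-error noising:
# for every known syllable, collect the valid syllables reachable by swapping
# its initial consonant (phu_am_dau), vowel nucleus (nguyen_am), or final
# consonant (phu_am_cuoi), then write the result to
# ../noising_resources/confusion_set.json.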
with open("common-vietnamese-syllables.txt", "r", encoding="utf-8") as file: | |
vi_syllables = [line.strip("\n") for line in file.readlines()] | |
vi_syllables_new = [] | |
for syllable in vi_syllables: | |
normalized = chuan_hoa_dau_tu_tieng_viet(syllable) | |
vi_syllables_new.append(normalized) | |
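
# Phonetic building blocks: alternations matching single-vowel, two-vowel, and
# three-vowel nuclei, plus the sets of valid initial and final consonants.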
regex_nguyen_am_don = "ộ|ặ|ằ|ụ|ầ|a|ũ|á|ể|ỡ|ủ|y|ở|ế|ẵ|ệ|é|ẹ|â|ề|ê|ọ|ờ|ẳ|ợ|ỷ|ữ|ị|e|u|ò|ẫ|i|ỉ|ẩ|ẽ|õ|ỹ|ô|ỵ|ồ|ú|í|ó|ỗ|ã|ẻ|ù|ă|ơ|ứ|ậ|ử|ừ|à|ĩ|ả|ố|ớ|ự|ắ|o|ý|ỳ|ư|ấ|ễ|ạ|ỏ|ổ|è|ì"
regex_nguyen_am_doi = "uằ|iê|ấu|ượ|ùy|ạy|uỹ|ươ|ỗi|yệ|ụy|ẫy|oà|ái|ói|uồ|uỷ|oỏ|ệu|ue|oi|ậu|oè|uã|ãi|òi|ơi|ựa|ụi|iể|oá|ìa|ĩu|uẹ|ìu|ầu|ỏe|ối|uẳ|ịa|òe|ai|ọe|yể|ày|ỉu|uỵ|uể|óe|ỉa|ũa|ườ|uè|êu|ẹo|uá|ỏi|uấ|ưỡ|ội|au|iề|ửu|ọi|ảu|uẽ|ầy|ẻo|ao|yế|uẻ|uơ|ưở|iế|uở|ịu|ủa|ẫu|uặ|oằ|oò|ạu|uỳ|ạo|oọ|ưa|oẹ|ui|uậ|ủi|áo|óa|ẩu|ảy|oẵ|áu|ựu|uô|ửa|ễu|uâ|oạ|uổ|uê|ùi|ếu|ời|iu|uo|oé|yễ|oẳ|uớ|ay|iễ|ủy|ướ|oó|eo|ũi|oả|ua|ỏa|ấy|uố|èo|oo|úy|ẩy|ồi|yề|ẽo|uẫ|ứu|ãy|ổi|ía|ảo|ué|uờ|ùa|ia|ều|oa|iệ|àu|õa|oắ|uắ|uả|ứa|ởi|ụa|ũy|òa|íu|éo|oã|uă|uộ|ữu|úa|ải|ỡi|ừu|ểu|oe|õi|ọa|ừa|uệ|uý|uó|ào|uà|ây|oă|uạ|ữa|oặ|uy|ợi|uẩ|uỗ|ão|uế|ưu|ửi|ại|âu|ới|uầ|ĩa|úi|oẻ|ôi|ài|uề|yê|ậy|áy"
regex_nguyen_am_ba = "uỷu|uây|ươu|iệu|yếu|yểu|uyế|uyệ|uyề|ưỡi|uôi|ượi|uổi|oay|uào|iễu|oeo|oèo|uỗi|oai|uấy|oái|uỵu|uyể|uồi|oáy|yều|oẹo|uẫy|ưởi|iểu|uầy|iêu|uối|uyễ|ưới|iều|oài|uao|ươi|yêu|ười|uya|oải|ướu|uội|oại|iếu|ượu|uẩy|uyê|uậy"
all_phu_am_dau = {'', 'gh', 'q', 'kh', 'p', 'm', 'qu', 'n', 'b', 'g', 't', 'ch', 'th', 'k', 'đ', 'r', 'ph', 'ngh', 'gi', 'tr', 's', 'l', 'h', 'nh', 'c', 'ng', 'd', 'v', 'x'}
all_phu_am_cuoi = {'', 'ng', 'nh', 't', 'ch', 'c', 'p', 'm', 'k', 'n'}
# The vowel inventories are the same alternations, split into lists.
all_nguyen_am_don = regex_nguyen_am_don.split("|")
all_nguyen_am_doi = regex_nguyen_am_doi.split("|")
all_nguyen_am_ba = regex_nguyen_am_ba.split("|")
confusion_set = dict()
special_list = set()

for syllable in tqdm(vi_syllables_new):
    # print(syllable)
    # Syllables starting with "qu" or "gi" are handled separately below.
    if syllable[0:2] in ["qu", "gi"]:
        special_list.add(syllable)
        # print(f"Ignore {syllable}")
        continue
    confusion_set[syllable] = dict()
    syllable_candidates = confusion_set[syllable]
    syllable_candidates['phu_am_dau'] = set()
    syllable_candidates['nguyen_am'] = set()
    syllable_candidates['phu_am_cuoi'] = set()
    # Extract the vowel nucleus, preferring the longest match
    # (triphthong, then diphthong, then single vowel).
    if len(re.findall(regex_nguyen_am_ba, syllable)) != 0:
        result = re.findall(regex_nguyen_am_ba, syllable)
        nguyen_am = result[0]
    elif len(re.findall(regex_nguyen_am_doi, syllable)) != 0:
        result = re.findall(regex_nguyen_am_doi, syllable)
        nguyen_am = result[0]
    elif len(re.findall(regex_nguyen_am_don, syllable)) != 0:
        result = re.findall(regex_nguyen_am_don, syllable)
        nguyen_am = result[0]
    else:
        raise Exception("No vowel nucleus found")
    # Whatever precedes the nucleus is the initial consonant, whatever follows is the final consonant.
    phu_am_dau, phu_am_cuoi = "", ""
    if len(re.findall(f"(.+){nguyen_am}", syllable)) != 0:
        result = re.findall(f"(.+){nguyen_am}", syllable)
        phu_am_dau = result[0]
    if len(re.findall(f"{nguyen_am}(.+)", syllable)) != 0:
        result = re.findall(f"{nguyen_am}(.+)", syllable)
        phu_am_cuoi = result[0]
    ### Error type: substitute the initial consonant
    for candidate in all_phu_am_dau:
        if "".join([candidate, nguyen_am, phu_am_cuoi]) in vi_syllables_new:
            syllable_candidates['phu_am_dau'].add("".join([candidate, nguyen_am, phu_am_cuoi]))
    ### Error type: substitute the vowel nucleus
    all_nguyen_am = all_nguyen_am_don + all_nguyen_am_doi + all_nguyen_am_ba
    for candidate in all_nguyen_am:
        if "".join([phu_am_dau, candidate, phu_am_cuoi]) in vi_syllables_new:
            syllable_candidates['nguyen_am'].add("".join([phu_am_dau, candidate, phu_am_cuoi]))
    ### Error type: substitute the final consonant
    for candidate in all_phu_am_cuoi:
        if "".join([phu_am_dau, nguyen_am, candidate]) in vi_syllables_new:
            syllable_candidates['phu_am_cuoi'].add("".join([phu_am_dau, nguyen_am, candidate]))
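
# Syllables beginning with "qu" or "gi" need special handling: the written "u"
# and "i" belong to the onset rather than the vowel nucleus, so the onset is
# split off before looking for the nucleus in the remainder.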
for syllable in tqdm(special_list):
    # Decide how much of the prefix is the onset: with more than one written
    # vowel the onset is the full digraph ("qu"/"gi"), otherwise only the first letter.
    if len(re.findall(regex_nguyen_am_don, syllable)) > 1:
        phu_am_dau = syllable[0:2]
        remained = syllable[2:]
    else:
        phu_am_dau = syllable[0]
        remained = syllable[1:]
    confusion_set[syllable] = dict()
    syllable_candidates = confusion_set[syllable]
    syllable_candidates['phu_am_dau'] = set()
    syllable_candidates['nguyen_am'] = set()
    syllable_candidates['phu_am_cuoi'] = set()
    if len(re.findall(regex_nguyen_am_ba, remained)) != 0:
        result = re.findall(regex_nguyen_am_ba, remained)
        nguyen_am = result[0]
    elif len(re.findall(regex_nguyen_am_doi, remained)) != 0:
        result = re.findall(regex_nguyen_am_doi, remained)
        nguyen_am = result[0]
    elif len(re.findall(regex_nguyen_am_don, remained)) != 0:
        result = re.findall(regex_nguyen_am_don, remained)
        nguyen_am = result[0]
    else:
        nguyen_am = ""
    phu_am_cuoi = ""
    if nguyen_am != "" and len(re.findall(f"{nguyen_am}(.+)", remained)) != 0:
        result = re.findall(f"{nguyen_am}(.+)", remained)
        phu_am_cuoi = result[0]
    ### Error type: substitute the initial consonant
    for candidate in all_phu_am_dau:
        if "".join([candidate, nguyen_am, phu_am_cuoi]) in vi_syllables_new:
            syllable_candidates['phu_am_dau'].add("".join([candidate, nguyen_am, phu_am_cuoi]))
    ### Error type: substitute the vowel nucleus
    all_nguyen_am = all_nguyen_am_don + all_nguyen_am_doi + all_nguyen_am_ba
    for candidate in all_nguyen_am:
        if "".join([phu_am_dau, candidate, phu_am_cuoi]) in vi_syllables_new:
            syllable_candidates['nguyen_am'].add("".join([phu_am_dau, candidate, phu_am_cuoi]))
    ### Error type: substitute the final consonant
    for candidate in all_phu_am_cuoi:
        if "".join([phu_am_dau, nguyen_am, candidate]) in vi_syllables_new:
            syllable_candidates['phu_am_cuoi'].add("".join([phu_am_dau, nguyen_am, candidate]))
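
# Post-processing: a syllable must not appear as its own confusion candidate,
# and candidates that differ too much from the source syllable
# (normalized Damerau-Levenshtein similarity below 0.5) are pruned.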
for key in tqdm(confusion_set.keys()):
    for key_2_level in confusion_set[key].keys():
        confusion_set[key][key_2_level].discard(key)
for key in tqdm(confusion_set.keys()):
    for key_2_level in confusion_set[key].keys():
        candidates_to_remove = []
        for candidate in confusion_set[key][key_2_level]:
            similarity = textdistance.damerau_levenshtein.normalized_similarity(key, candidate)
            if similarity < 0.5:
                candidates_to_remove.append(candidate)
        for candidate in candidates_to_remove:
            confusion_set[key][key_2_level].remove(candidate)
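
# Map each vowel to its other tone-mark variants; substituting one of these is
# later treated as a high-probability (accent/typing) error.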
keyboard_neighbor = {'a': 'áàảãạ',
                     'ă': 'ắằẳẵặ',
                     'â': 'ấầẩẫậ',
                     'á': 'aàảãạ',
                     'à': 'aáảãạ',
                     'ả': 'aáàãạ',
                     'ã': 'aáàảạ',
                     'ạ': 'aáàảã',
                     'ắ': 'ăằẳẵặ',
                     'ằ': 'ăắẳẵặ',
                     'ẳ': 'ăắằẵặ',
                     'ặ': 'ăắằẳẵ',
                     'ẵ': 'ăắằẳặ',
                     'ấ': 'âầẩẫậ',
                     'ầ': 'âấẩẫậ',
                     'ẩ': 'âấầẫậ',
                     'ẫ': 'âấầẩậ',
                     'ậ': 'âấầẩẫ',
                     'e': 'èéẻẽẹ',
                     'é': 'eèẻẽẹ',
                     'è': 'eéẻẽẹ',
                     'ẻ': 'eéèẽẹ',
                     'ẽ': 'eéèẻẹ',
                     'ẹ': 'eéèẻẽ',
                     'ê': 'ếềểễệ',
                     'ế': 'êềểễệ',
                     'ề': 'êếểễệ',
                     'ể': 'êếềễệ',
                     'ễ': 'êếềểệ',
                     'ệ': 'êếềểễ',
                     'i': 'íìỉĩị',
                     'í': 'iìỉĩị',
                     'ì': 'iíỉĩị',
                     'ỉ': 'iíìĩị',
                     'ĩ': 'iíìỉị',
                     'ị': 'iíìỉĩ',
                     'o': 'òóỏọõ',
                     'ó': 'oòỏọõ',
                     'ò': 'oóỏọõ',
                     'ỏ': 'oóòọõ',
                     'õ': 'oóòỏọ',
                     'ọ': 'oóòỏõ',
                     'ô': 'ốồổỗộ',
                     'ố': 'ôồổỗộ',
                     'ồ': 'ôốổỗộ',
                     'ổ': 'ôốồỗộ',
                     'ộ': 'ôốồổỗ',
                     'ỗ': 'ôốồổộ',
                     'ơ': 'ớờởợỡ',
                     'ớ': 'ơờởợỡ',
                     'ờ': 'ơớởợỡ',
                     'ở': 'ơớờợỡ',
                     'ợ': 'ơớờởỡ',
                     'ỡ': 'ơớờởợ',
                     'u': 'úùủũụ',
                     'ú': 'uùủũụ',
                     'ù': 'uúủũụ',
                     'ủ': 'uúùũụ',
                     'ũ': 'uúùủụ',
                     'ụ': 'uúùủũ',
                     'ư': 'ứừữửự',
                     'ứ': 'ưừữửự',
                     'ừ': 'ưứữửự',
                     'ử': 'ưứừữự',
                     'ữ': 'ưứừửự',
                     'ự': 'ưứừữử',
                     'y': 'ýỳỷỵỹ',
                     'ý': 'yỳỷỵỹ',
                     'ỳ': 'yýỷỵỹ',
                     'ỷ': 'yýỳỵỹ',
                     'ỵ': 'yýỳỷỹ',
                     'ỹ': 'yýỳỷỵ'}
pattern = "(" + "|".join(keyboard_neighbor.keys()) + "){1}" | |
def make_accent_change_candidates(text): | |
result = re.findall(pattern, text) | |
candidates = [] | |
for candidate in result: | |
[candidates.append(text.replace(candidate, x)) for x in keyboard_neighbor[candidate]] | |
return set(candidates) | |
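
# For example, make_accent_change_candidates("ban") matches the vowel "a" and
# returns its other tone-mark variants: {"bán", "bàn", "bản", "bãn", "bạn"}.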
typo = json.load(open("../noising_resources/typo.json", "r", encoding="utf-8"))
typo_pattern = "(" + "|".join(typo.keys()) + "){1}"
# Telex/VNI tone keystrokes: s, f, r, x, j and the digits 1-5.
accent_pattern = "(s|f|r|x|j|1|2|3|4|5){1}"
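
# convert_to_non_telex rewrites a word into raw keystroke form, assuming
# typo.json maps each accented character to candidate keystroke spellings
# (telex/VNI); the tone keystroke is moved to the end of the word so that edit
# distances between keystroke forms approximate real typing mistakes.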
def convert_to_non_telex(text):
    word = copy(text)
    candidates = re.findall(typo_pattern, text)
    for candidate in candidates:
        replaced = typo[candidate][0]
        # Move the tone keystroke to the end of the word
        if len(re.findall(accent_pattern, replaced)) != 0:
            word = re.sub(candidate, replaced[0:-1], word)
            word += replaced[-1]
        else:
            word = re.sub(candidate, replaced, word)
    return word
def keep_1_distance_candidates(text, nguyen_am_errors: set):
    nguyen_am_errors = list(nguyen_am_errors)
    text = convert_to_non_telex(text)
    distances = [textdistance.damerau_levenshtein(text, convert_to_non_telex(error)) for error in nguyen_am_errors]
    indices_to_keep = np.where(np.array(distances) <= 1)[0]
    return set([nguyen_am_errors[i] for i in indices_to_keep])
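
# Split each syllable's vowel-substitution candidates into two buckets: errors
# reachable by a tone-mark change or by a single keystroke edit are treated as
# high-probability, the rest as lower-probability.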
for key in tqdm(confusion_set.keys()):
    candidates = make_accent_change_candidates(key)
    one_distance_candidates = keep_1_distance_candidates(key, confusion_set[key]['nguyen_am'])
    candidates = candidates.union(one_distance_candidates)
    high_probs_list = candidates.intersection(confusion_set[key]['nguyen_am'])
    lower_probs_list = confusion_set[key]['nguyen_am'].difference(high_probs_list)
    confusion_set[key]['nguyen_am'] = [high_probs_list, lower_probs_list]
# Convert the sets to lists so the structure can be serialized as JSON.
for key in tqdm(confusion_set.keys()):
    confusion_set[key]['nguyen_am'] = [list(confusion_set[key]['nguyen_am'][0]), list(confusion_set[key]['nguyen_am'][1])]
    confusion_set[key]['phu_am_dau'] = list(confusion_set[key]['phu_am_dau'])
    confusion_set[key]['phu_am_cuoi'] = list(confusion_set[key]['phu_am_cuoi'])

with open("../noising_resources/confusion_set.json", "w+", encoding="utf-8") as outfile:
    json.dump(confusion_set, outfile, ensure_ascii=False)
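
# Hypothetical downstream usage sketch: a noising script can reload the
# confusion set with json.load, e.g.
#     with open("../noising_resources/confusion_set.json", encoding="utf-8") as f:
#         confusion_set = json.load(f)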