from typing import List, Dict

from utils import get_full_file_path

# Legacy heuristic splitter, kept for reference: it separated the Lao source
# from the Vietnamese target inside a single whitespace-joined line by
# locating the last "Lao-looking" word.
#
# SENTENCE_STOPPERS = {'!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
#                      ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@',
#                      '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'}
# VIETNAMESE_SPECIAL_CHARACTERS = {'à', 'á', 'ả', 'ã', 'ạ', 'â', 'ầ', 'ấ', 'ẩ',
#                                  'ẫ', 'ậ', 'ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'è',
#                                  'é', 'ẻ', 'ẽ', 'ẹ', 'ê', 'ề', 'ế', 'ể', 'ễ',
#                                  'ệ', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'ò', 'ó', 'ỏ',
#                                  'õ', 'ọ', 'ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'ơ',
#                                  'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ù', 'ú', 'ủ', 'ũ',
#                                  'ụ', 'ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'ỳ', 'ý',
#                                  'ỷ', 'ỹ', 'ỵ'}
#
# def is_Vietnamese_character(char):
#     return char.isalpha() or char in VIETNAMESE_SPECIAL_CHARACTERS
#
# def categorize_word(word: str) -> str:
#     """
#     Categorize a word into 3 types:
#     - "vi": likely Vietnamese.
#     - "lo": likely Lao.
#     - "num": a number.
#     """
#     if any(char.isdigit() for char in word):
#         return "num"
#     for stopper in SENTENCE_STOPPERS:
#         if word.endswith(stopper):
#             word = word[:-1]
#             if len(word) == 0:
#                 break
#     if len(word) > 0 and any(not is_Vietnamese_character(char) for char in word):
#         return "lo"
#     else:
#         return "vi"
#
# def open_dataset(
#     dataset_filename: str,
#     src_lang: str = "lo",
#     tgt_lang: str = "vi"
# ) -> List[Dict[str, Dict[str, str]]]:
#     ds = []
#     file_path = get_full_file_path(dataset_filename)
#     with open(file_path, 'r', encoding='utf-8') as file:
#         lines = file.readlines()
#     for index, line in enumerate(lines):
#         line = line.split(sep=None)
#         lo_positions = [i for i, word in enumerate(line) if categorize_word(word) == "lo"]
#         if len(lo_positions) == 0:
#             # print(line)
#             continue
#         split_index = max(lo_positions)
#         assert split_index is not None, f"Dataset error on line {index+1}."
#         src_text = ' '.join(line[:split_index+1])
#         tgt_text = ' '.join(line[split_index+1:])
#         if index <= 5:
#             print(src_text, tgt_text, sep="\n", end="\n-------")
#         # TODO: post-process tgt_text to split all numbers into single digits.
#         ds.append({'translation': {src_lang: src_text, tgt_lang: tgt_text}})
#     return ds
#
# open_dataset('datasets/dev_clean.dat')


def load_local_dataset(
    dataset_filename: str,
    src_lang: str = "lo",
    tgt_lang: str = "vi"
) -> List[Dict[str, Dict[str, str]]]:
    """Load a tab-separated parallel corpus: one "source<TAB>target" pair per line."""
    ds = []
    file_path = get_full_file_path(dataset_filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    for line in lines:
        # Split on the first tab only, so tabs inside the target survive,
        # and strip the trailing newline so it does not end up in the target.
        src_text, tgt_text = line.rstrip('\n').split(sep='\t', maxsplit=1)
        ds.append({'translation': {src_lang: src_text, tgt_lang: tgt_text}})
    return ds


def load_local_bleu_dataset(
    src_dataset_filename: str,
    tgt_dataset_filename: str,
    src_lang: str = "lo",
    tgt_lang: str = "vi"
) -> List[Dict[str, Dict[str, str]]]:
    """Load two line-aligned monolingual files into translation records."""

    def load_local_monolanguage_dataset(dataset_filename: str) -> List[str]:
        file_path = get_full_file_path(dataset_filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            # splitlines() drops the trailing newline from each sentence.
            return file.read().splitlines()

    src_texts = load_local_monolanguage_dataset(src_dataset_filename)
    tgt_texts = load_local_monolanguage_dataset(tgt_dataset_filename)
    assert len(src_texts) == len(tgt_texts), \
        "Source and target files must have the same number of lines."
    return [
        {'translation': {src_lang: src_text, tgt_lang: tgt_text}}
        for src_text, tgt_text in zip(src_texts, tgt_texts)
    ]
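

# Minimal usage sketch. Assumptions: 'datasets/dev_clean.dat' (the file the
# legacy open_dataset call above pointed at) is tab-separated as
# load_local_dataset expects, and 'datasets/dev.lo' / 'datasets/dev.vi' are
# hypothetical line-aligned monolingual files; adjust both to your repo layout.
if __name__ == "__main__":
    pairs = load_local_dataset('datasets/dev_clean.dat')
    print(f"Loaded {len(pairs)} parallel pairs; first record:")
    print(pairs[0])

    bleu_pairs = load_local_bleu_dataset('datasets/dev.lo', 'datasets/dev.vi')
    print(f"Loaded {len(bleu_pairs)} line-aligned BLEU pairs.")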