Spaces:
Running
on
Zero
Running
on
Zero
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
""" | |
Credits | |
This code is modified from https://github.com/GitYCC/g2pW | |
""" | |
import os | |
import re | |
# Compiled once at import time. No leading '^' is needed because
# ``Pattern.match(text, pos)`` already anchors the match at ``pos``.
_SPACE_RUN = re.compile(r' +')
_ALNUM_RUN = re.compile(r'[a-zA-Z0-9]+')


def wordize_and_map(text: str):
    """Split ``text`` into words and build index maps in both directions.

    A "word" is either a maximal run of ASCII letters/digits or one single
    character of any other kind. Runs of the space character (``' '``) only
    separate words and never become words themselves; other whitespace such
    as tabs or newlines is treated as an ordinary single-character word.

    Args:
        text: The string to split.

    Returns:
        A tuple ``(words, text_to_word, word_to_text)`` where

        * ``words`` is the list of word strings, in order of appearance;
        * ``text_to_word`` has one entry per character of ``text``: the index
          of the word that character belongs to, or ``None`` for a space;
        * ``word_to_text`` has one ``(start, end)`` pair per word, a half-open
          character span into ``text``.
    """
    words = []
    index_map_from_text_to_word = []
    index_map_from_word_to_text = []

    # Walk the string with a cursor instead of repeatedly re-slicing it, so
    # the scan is linear in len(text) rather than quadratic.
    pos = 0
    text_len = len(text)
    while pos < text_len:
        match_space = _SPACE_RUN.match(text, pos)
        if match_space:
            index_map_from_text_to_word += [None] * (match_space.end() - pos)
            pos = match_space.end()
            continue

        match_en = _ALNUM_RUN.match(text, pos)
        # An alphanumeric run forms one word; anything else is consumed one
        # character at a time.
        word_end_pos = match_en.end() if match_en else pos + 1

        index_map_from_word_to_text.append((pos, word_end_pos))
        index_map_from_text_to_word += [len(words)] * (word_end_pos - pos)
        words.append(text[pos:word_end_pos])
        pos = word_end_pos

    return words, index_map_from_text_to_word, index_map_from_word_to_text
def tokenize_and_map(tokenizer, text: str):
    """Tokenize ``text`` word by word and map tokens to character spans.

    The text is first split with ``wordize_and_map``; each word is then passed
    to ``tokenizer.tokenize``. A word that yields no tokens, or exactly
    ``['[UNK]']``, is represented by a single ``'[UNK]'`` token covering the
    whole word. Otherwise each sub-token is given a span whose width is the
    token's length with any leading ``'##'`` continuation marker removed.

    Args:
        tokenizer: Any object exposing ``tokenize(str) -> list of str``.
        text: The string to tokenize.

    Returns:
        A tuple ``(tokens, text_to_token, token_to_text)`` where

        * ``tokens`` is the flat list of token strings;
        * ``text_to_token`` has one entry per character of ``text``: the index
          of the token covering it, or ``None`` for a space;
        * ``token_to_text`` has one half-open ``(start, end)`` character span
          per token.
    """
    words, text2word, word2text = wordize_and_map(text=text)

    tokens = []
    index_map_from_token_to_text = []
    for word, (word_start, word_end) in zip(words, word2text):
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens or word_tokens == ['[UNK]']:
            index_map_from_token_to_text.append((word_start, word_end))
            tokens.append('[UNK]')
            continue

        current_word_start = word_start
        last_token_idx = len(word_tokens) - 1
        for token_idx, word_token in enumerate(word_tokens):
            if token_idx == last_token_idx:
                # The final sub-token always closes the word, so the spans of
                # one word tile [word_start, word_end) exactly even when the
                # token lengths do not add up to the word length.
                token_end = word_end
            else:
                word_token_len = len(re.sub(r'^##', '', word_token))
                # Token strings may differ in length from the source text if
                # the tokenizer rewrites characters; never let a span spill
                # into the next word or past the end of the text.
                token_end = min(current_word_start + word_token_len, word_end)
            index_map_from_token_to_text.append(
                (current_word_start, token_end))
            current_word_start = token_end
            tokens.append(word_token)

    # Start from an all-None map (spaces stay None) rather than reusing the
    # word-index list, so no word index can be mistaken for a token index.
    index_map_from_text_to_token = [None] * len(text2word)
    for i, (token_start, token_end) in enumerate(index_map_from_token_to_text):
        index_map_from_text_to_token[token_start:token_end] = (
            [i] * (token_end - token_start))

    return tokens, index_map_from_text_to_token, index_map_from_token_to_text
def _load_config(config_path: os.PathLike):
    """Execute the Python file at ``config_path`` and return it as a module.

    The file is loaded under the module name ``'__init__'``; this function
    does not register the resulting module in ``sys.modules``.

    Args:
        config_path: Path to a Python source file holding the configuration
            as module-level names.

    Returns:
        The executed module object; configuration values are its attributes.

    Raises:
        ImportError: If no import spec/loader can be created for the path,
            e.g. because its suffix is not a recognised Python source suffix.
    """
    import importlib.util

    spec = importlib.util.spec_from_file_location('__init__', config_path)
    # spec_from_file_location() returns None when it cannot pick a loader for
    # the path; fail with a clear message instead of an AttributeError on None.
    if spec is None or spec.loader is None:
        raise ImportError(
            'cannot load config from {!r}: not a loadable Python source '
            'file'.format(config_path))
    config = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(config)
    return config
# Fallback configuration values. load_config(..., use_default=True) copies any
# top-level entry that the loaded config module lacks, and fills in missing
# keys of the nested dict entries.
default_config_dict = {
    'manual_seed': 1313,
    'model_source': 'bert-base-chinese',
    'window_size': 32,
    'num_workers': 2,
    'use_mask': True,
    'use_char_phoneme': False,
    'use_conditional': True,
    'param_conditional': {
        'affect_location': 'softmax',
        'bias': True,
        'char-linear': True,
        'pos-linear': False,
        'char+pos-second': True,
        'char+pos-second_lowrank': False,
        'lowrank_size': 0,
        'char+pos-second_fm': False,
        'fm_size': 0,
        'fix_mode': None,
        'count_json': 'train.count.json',
    },
    'lr': 5e-5,
    'val_interval': 200,
    'num_iter': 10000,
    'use_focal': False,
    'param_focal': {
        'alpha': 0.0,
        'gamma': 0.7,
    },
    'use_pos': True,
    # NOTE: this key must be exactly 'param_pos' with no surrounding
    # whitespace. Keys are used as attribute names via hasattr()/setattr(),
    # so a stray space would never match an attribute set by a config file.
    'param_pos': {
        'weight': 0.1,
        'pos_joint_training': True,
        'train_pos_path': 'train.pos',
        'valid_pos_path': 'dev.pos',
        'test_pos_path': 'test.pos',
    },
}
def load_config(config_path: os.PathLike, use_default: bool=False):
    """Load a config module, optionally filling gaps from the defaults.

    Args:
        config_path: Path to the Python config file, passed to
            ``_load_config``.
        use_default: When true, every entry of ``default_config_dict`` that
            the loaded config does not define is added to it. For entries
            whose default is a dict and which the config does define, only
            the keys missing from the config's own dict are added; values the
            config already sets are never overwritten.

    Returns:
        The loaded config module, with defaults merged in if requested.
    """
    import copy

    config = _load_config(config_path)
    if not use_default:
        return config

    for attr, val in default_config_dict.items():
        if not hasattr(config, attr):
            # Hand out a private copy so that later edits to this config
            # cannot leak into the shared module-level defaults.
            setattr(config, attr, copy.deepcopy(val))
        elif isinstance(val, dict):
            d = getattr(config, attr)
            for dict_k, dict_v in val.items():
                if dict_k not in d:
                    d[dict_k] = copy.deepcopy(dict_v)
    return config