Spaces:
Runtime error
Runtime error
""" | |
Build vocabulary with given tokenizer | |
""" | |
import sys | |
import os | |
import argparse | |
tencentpretrain_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) | |
sys.path.append(tencentpretrain_dir) | |
from tencentpretrain.utils import * | |
from tencentpretrain.utils.vocab import Vocab | |
if __name__ == '__main__': | |
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
parser.add_argument("--corpus_path", required=True) | |
parser.add_argument("--delimiter", choices=["char", "space"], required=True, | |
help="Tokenizing the corpus in char-level or by the provided spaces.") | |
parser.add_argument("--output_path", required=True, | |
help="The output path to save the vocabulary.") | |
parser.add_argument("--workers_num", type=int, default=1, | |
help="The number of processes to build vocabulary.") | |
parser.add_argument("--min_count", type=int, default=1, | |
help="The minimum count of words retained in the vocabulary.") | |
args = parser.parse_args() | |
# Build tokenizer only for char and space. | |
args.vocab_path, args.spm_model_path = "./models/reserved_vocab.txt", None | |
tokenizer = str2tokenizer[args.delimiter](args) | |
# Build and save vocabulary using CharTokenizer or SpaceTokenizer. | |
vocab = Vocab() | |
vocab.build(args.corpus_path, tokenizer, args.workers_num, args.min_count) | |
vocab.save(args.output_path) | |