VISOR-GPT/train/scripts/build_vocab.py
"""
Build a vocabulary from a corpus with the given tokenizer.
"""
import sys
import os
import argparse
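# Add the repository root to sys.path so the tencentpretrain package can be imported
# when this script is run directly from the scripts/ directory.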
tencentpretrain_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(tencentpretrain_dir)
from tencentpretrain.utils import *
from tencentpretrain.utils.vocab import Vocab

if __name__ == '__main__':
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--corpus_path", required=True)
    parser.add_argument("--delimiter", choices=["char", "space"], required=True,
                        help="Tokenize the corpus at the character level or split on the provided spaces.")
    parser.add_argument("--output_path", required=True,
                        help="The output path to save the vocabulary.")
    parser.add_argument("--workers_num", type=int, default=1,
                        help="The number of processes used to build the vocabulary.")
    parser.add_argument("--min_count", type=int, default=1,
                        help="The minimum count of words retained in the vocabulary.")

    args = parser.parse_args()

    # Build tokenizer only for char and space.
    args.vocab_path, args.spm_model_path = "./models/reserved_vocab.txt", None
    tokenizer = str2tokenizer[args.delimiter](args)

    # Build and save vocabulary using CharTokenizer or SpaceTokenizer.
    vocab = Vocab()
    vocab.build(args.corpus_path, tokenizer, args.workers_num, args.min_count)
    vocab.save(args.output_path)
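
# Example invocation (a sketch only; the corpus and output paths below are hypothetical
# placeholders, not files shipped with the repository):
#
#   python scripts/build_vocab.py \
#       --corpus_path corpora/my_corpus.txt \
#       --delimiter space \
#       --output_path models/my_vocab.txt \
#       --workers_num 8 \
#       --min_count 5
#
# With --delimiter char the corpus is split into individual characters; with
# --delimiter space it is split on whitespace, and words appearing fewer than
# --min_count times are dropped from the resulting vocabulary.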