File size: 1,477 Bytes
7900c16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""
Build vocabulary with given tokenizer
"""
import sys
import os
import argparse

tencentpretrain_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(tencentpretrain_dir)

from tencentpretrain.utils import *
from tencentpretrain.utils.vocab import Vocab


def main():
    """Parse CLI arguments and build a vocabulary from a corpus.

    Tokenizes the corpus either character-by-character or on whitespace
    (per ``--delimiter``), counts tokens across ``--workers_num`` worker
    processes, drops tokens seen fewer than ``--min_count`` times, and
    writes the resulting vocabulary to ``--output_path``.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--corpus_path", required=True)
    parser.add_argument("--delimiter", choices=["char", "space"], required=True,
                        help="Tokenizing the corpus in char-level or by the provided spaces.")
    parser.add_argument("--output_path", required=True,
                        help="The output path to save the vocabulary.")
    parser.add_argument("--workers_num", type=int, default=1,
                        help="The number of processes to build vocabulary.")
    parser.add_argument("--min_count", type=int, default=1,
                        help="The minimum count of words retained in the vocabulary.")

    args = parser.parse_args()

    # Build tokenizer only for char and space. The char/space tokenizers
    # still expect vocab_path/spm_model_path attributes on args, so supply
    # the reserved vocab and no sentencepiece model.
    args.vocab_path, args.spm_model_path = "./models/reserved_vocab.txt", None
    tokenizer = str2tokenizer[args.delimiter](args)

    # Build and save vocabulary using CharTokenizer or SpaceTokenizer.
    vocab = Vocab()
    vocab.build(args.corpus_path, tokenizer, args.workers_num, args.min_count)
    vocab.save(args.output_path)


if __name__ == '__main__':
    main()