# One-time conversion script (kept here commented out for reference): rebuild the
# fast-tokenizer config "tokenizer.json" so that its vocab is taken from the
# plain-text "vocab.vocab" file, where each line holds the token in its first
# tab-separated field and the line index is the token id.
#
# from tokenization_gptpangu import GPTPanguTokenizer
# import json
#
# tokenizer = GPTPanguTokenizer.from_pretrained(".")
# with open("tokenizer.json", encoding="utf-8") as f:
#     config = json.load(f)
#
# vocab_file = "vocab.vocab"
#
# # Collect [token, id] pairs; the id is simply the line number.
# vocab = []
# with open(vocab_file, "r", encoding="utf-8") as f:
#     for idx, line in enumerate(f):
#         token = line.split("\t")[0]
#         vocab.append([token, idx])
#
# config["model"]["vocab"] = vocab
#
# with open("new_tokenizer.json", "w", encoding="utf-8") as w:
#     w.write(json.dumps(config))
#
# print("ok")
from transformers import AutoTokenizer

# Load the converted tokenizer from the current directory.
tokenizer = AutoTokenizer.from_pretrained(".")