# original author: intervitens
import sentencepiece.sentencepiece_model_pb2 as model
# replace 1  -> <|end_of_text|>
# replace 2  -> <|begin_of_text|>
# replace 10 -> <|start_header_id|>
# replace 11 -> <|end_header_id|>
# replace 12 -> <|eot_id|>
m = model.ModelProto()

# load the existing SentencePiece model
with open('./tokenizer.model', 'rb') as f:
    m.ParseFromString(f.read())

# swap the pieces listed above for the special tokens
m.pieces[1].piece = '<|end_of_text|>'
m.pieces[2].piece = '<|begin_of_text|>'
m.pieces[10].piece = '<|start_header_id|>'
m.pieces[11].piece = '<|end_header_id|>'
m.pieces[12].piece = '<|eot_id|>'

# write the modified model back, overwriting the original file in place
with open('tokenizer.model', 'wb') as f:
    f.write(m.SerializeToString())
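
# Optional sanity check (a minimal sketch, not part of the original script):
# re-read the file just written above and print the replaced pieces to
# confirm the edit took effect.
check = model.ModelProto()
with open('tokenizer.model', 'rb') as f:
    check.ParseFromString(f.read())
for idx in (1, 2, 10, 11, 12):
    print(idx, check.pieces[idx].piece)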