# original author: intervitens
# Rename legacy special pieces in a SentencePiece tokenizer to Llama-3-style tokens:
# replace 1  -> <|end_of_text|>
# replace 2  -> <|begin_of_text|>
# replace 10 -> <|start_header_id|>
# replace 11 -> <|end_header_id|>
# replace 12 -> <|eot_id|>
import sentencepiece.sentencepiece_model_pb2 as model

# Parse the existing tokenizer model protobuf.
m = model.ModelProto()
with open('./tokenizer.model', 'rb') as f:
    m.ParseFromString(f.read())

# Overwrite the piece strings at the listed vocabulary indices.
m.pieces[1].piece = '<|end_of_text|>'
m.pieces[2].piece = '<|begin_of_text|>'
m.pieces[10].piece = '<|start_header_id|>'
m.pieces[11].piece = '<|end_header_id|>'
m.pieces[12].piece = '<|eot_id|>'

# Serialize the modified proto back over the original file.
with open('tokenizer.model', 'wb') as f:
    f.write(m.SerializeToString())
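
# A minimal verification sketch, not part of the original script: reload the
# rewritten model through the sentencepiece runtime and print the pieces at
# the indices the script just changed. Assumes the `sentencepiece` package is
# installed and that `tokenizer.model` is the file written above.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file='tokenizer.model')
for idx in (1, 2, 10, 11, 12):
    # Expect the new <|...|> piece names for each rewritten index.
    print(idx, sp.id_to_piece(idx))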