# gemma-2-9b-l3 / gemma-tokenizer-llama3.py
# original author: intervitens
import sentencepiece.sentencepiece_model_pb2 as model
# replace piece 1  -> <|end_of_text|>
# replace piece 2  -> <|begin_of_text|>
# replace piece 10 -> <|start_header_id|>
# replace piece 11 -> <|end_header_id|>
# replace piece 12 -> <|eot_id|>
# Load the original Gemma SentencePiece model.
m = model.ModelProto()
with open('./tokenizer.model', 'rb') as f:
    m.ParseFromString(f.read())

# Rename the chosen pieces to the Llama-3 special-token strings.
m.pieces[1].piece = '<|end_of_text|>'
m.pieces[2].piece = '<|begin_of_text|>'
m.pieces[10].piece = '<|start_header_id|>'
m.pieces[11].piece = '<|end_header_id|>'
m.pieces[12].piece = '<|eot_id|>'
# Write the patched model back in place.
with open('tokenizer.model', 'wb') as f:
    f.write(m.SerializeToString())
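
# --- Optional sanity check (a minimal sketch, not part of the original script) ---
# Reload the patched model and print the renamed pieces so you can confirm the
# replacements took effect. Assumes the `sentencepiece` package is installed
# and the rewritten tokenizer.model sits in the current working directory.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file='tokenizer.model')
for idx in (1, 2, 10, 11, 12):
    # Expected: <|end_of_text|>, <|begin_of_text|>, <|start_header_id|>,
    # <|end_header_id|>, <|eot_id|>
    print(idx, sp.id_to_piece(idx))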