GuacamolSELFIETokenizer / tokenizer.json
haydn-jones's picture
Update tokenizer.json
d8e9dcb
raw
history blame
4.05 kB
{
"version": "1.0",
"truncation": null,
"padding": {
"strategy": "BatchLongest",
"direction": "Right",
"pad_to_multiple_of": null,
"pad_id": 2,
"pad_type_id": 0,
"pad_token": "<PAD>"
},
"added_tokens": [
{
"id": 0,
"content": "<CLS>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<EOS>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<PAD>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<UNK>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Split",
"pattern": {
"String": "]"
},
"behavior": "MergedWithPrevious",
"invert": false
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<CLS>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<EOS>",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<CLS>": {
"id": "<CLS>",
"ids": [
0
],
"tokens": [
"<CLS>"
]
},
"<EOS>": {
"id": "<EOS>",
"ids": [
1
],
"tokens": [
"<EOS>"
]
}
}
},
"decoder": null,
"model": {
"type": "WordLevel",
"vocab": {
"<CLS>": 0,
"<EOS>": 1,
"<PAD>": 2,
"<UNK>": 3,
"[C]": 4,
"[=C]": 5,
"[Ring1]": 6,
"[Branch1]": 7,
"[N]": 8,
"[=Branch1]": 9,
"[O]": 10,
"[=O]": 11,
"[Ring2]": 12,
"[Branch2]": 13,
"[=N]": 14,
"[S]": 15,
"[#Branch1]": 16,
"[=Branch2]": 17,
"[F]": 18,
"[#Branch2]": 19,
"[#C]": 20,
"[Cl]": 21,
"[P]": 22,
"[NH1]": 23,
"[=Ring1]": 24,
"[O-1]": 25,
"[N+1]": 26,
"[Br]": 27,
"[#N]": 28,
"[=Ring2]": 29,
"[=S]": 30,
"[=N+1]": 31,
"[I]": 32,
"[S+1]": 33,
"[B]": 34,
"[Si]": 35,
"[=N-1]": 36,
"[=P]": 37,
"[Se]": 38,
"[H]": 39,
"[N-1]": 40,
"[C-1]": 41,
"[#N+1]": 42,
"[P+1]": 43,
"[OH0]": 44,
"[B-1]": 45,
"[PH1]": 46,
"[S-1]": 47,
"[=O+1]": 48,
"[=S+1]": 49,
"[=Se]": 50,
"[NH3+1]": 51,
"[C+1]": 52,
"[NH1+1]": 53,
"[BH2-1]": 54,
"[NH2+1]": 55,
"[O+1]": 56,
"[SeH1]": 57,
"[SH1]": 58,
"[SiH2]": 59,
"[=SH1]": 60,
"[=Se+1]": 61,
"[=OH1+1]": 62,
"[=PH1]": 63,
"[#C-1]": 64,
"[=NH1+1]": 65,
"[=NH2+1]": 66,
"[BH3-1]": 67,
"[CH1-1]": 68,
"[I+1]": 69,
"[CH1+1]": 70,
"[NH1-1]": 71,
"[BH1-1]": 72,
"[SiH1]": 73,
"[Se+1]": 74,
"[=C-1]": 75,
"[F+1]": 76,
"[=B]": 77,
"[=Si]": 78,
"[BH0]": 79,
"[CH1]": 80,
"[CH2+1]": 81,
"[Cl+1]": 82,
"[NH0]": 83,
"[#O+1]": 84,
"[Br+2]": 85,
"[Br-1]": 86,
"[CH2]": 87,
"[Cl+2]": 88,
"[Cl+3]": 89,
"[Cl-1]": 90,
"[F-1]": 91,
"[I+2]": 92,
"[I+3]": 93,
"[PH2+1]": 94,
"[Se-1]": 95,
"[SeH2]": 96,
"[Si-1]": 97,
"[SiH1-1]": 98
},
"unk_token": "<UNK>"
}
}