nicoboss committed on
Commit
66d1e9e
1 Parent(s): 9629b9e

Updated the tokenizer to latest from mistralai/Mistral-Nemo-Base-2407 to make the model llama.cpp compatible

Browse files
Files changed (2) hide show
  1. tokenizer.json +0 -36
  2. tokenizer_config.json +5 -21
tokenizer.json CHANGED
@@ -9002,24 +9002,6 @@
9002
  "rstrip": false,
9003
  "normalized": false,
9004
  "special": true
9005
- },
9006
- {
9007
- "id": 131072,
9008
- "content": "<|im_end|>",
9009
- "single_word": false,
9010
- "lstrip": false,
9011
- "rstrip": false,
9012
- "normalized": false,
9013
- "special": true
9014
- },
9015
- {
9016
- "id": 131073,
9017
- "content": "<|im_start|>",
9018
- "single_word": false,
9019
- "lstrip": false,
9020
- "rstrip": false,
9021
- "normalized": false,
9022
- "special": false
9023
  }
9024
  ],
9025
  "normalizer": null,
@@ -9056,12 +9038,6 @@
9056
  "id": "A",
9057
  "type_id": 0
9058
  }
9059
- },
9060
- {
9061
- "SpecialToken": {
9062
- "id": "</s>",
9063
- "type_id": 0
9064
- }
9065
  }
9066
  ],
9067
  "pair": [
@@ -9077,12 +9053,6 @@
9077
  "type_id": 0
9078
  }
9079
  },
9080
- {
9081
- "SpecialToken": {
9082
- "id": "</s>",
9083
- "type_id": 0
9084
- }
9085
- },
9086
  {
9087
  "SpecialToken": {
9088
  "id": "<s>",
@@ -9094,12 +9064,6 @@
9094
  "id": "B",
9095
  "type_id": 1
9096
  }
9097
- },
9098
- {
9099
- "SpecialToken": {
9100
- "id": "</s>",
9101
- "type_id": 1
9102
- }
9103
  }
9104
  ],
9105
  "special_tokens": {
 
9002
  "rstrip": false,
9003
  "normalized": false,
9004
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9005
  }
9006
  ],
9007
  "normalizer": null,
 
9038
  "id": "A",
9039
  "type_id": 0
9040
  }
 
 
 
 
 
 
9041
  }
9042
  ],
9043
  "pair": [
 
9053
  "type_id": 0
9054
  }
9055
  },
 
 
 
 
 
 
9056
  {
9057
  "SpecialToken": {
9058
  "id": "<s>",
 
9064
  "id": "B",
9065
  "type_id": 1
9066
  }
 
 
 
 
 
 
9067
  }
9068
  ],
9069
  "special_tokens": {
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
1
  {
 
 
2
  "add_prefix_space": false,
3
  "added_tokens_decoder": {
4
  "0": {
@@ -8000,30 +8002,12 @@
8000
  "rstrip": false,
8001
  "single_word": false,
8002
  "special": true
8003
- },
8004
- "131072": {
8005
- "content": "<|im_end|>",
8006
- "lstrip": false,
8007
- "normalized": false,
8008
- "rstrip": false,
8009
- "single_word": false,
8010
- "special": true
8011
- },
8012
- "131073": {
8013
- "content": "<|im_start|>",
8014
- "lstrip": false,
8015
- "normalized": false,
8016
- "rstrip": false,
8017
- "single_word": false,
8018
- "special": false
8019
  }
8020
  },
8021
  "bos_token": "<s>",
8022
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
8023
- "clean_up_tokenization_spaces": true,
8024
- "eos_token": "<|im_end|>",
8025
  "model_max_length": 1000000000000000019884624838656,
8026
- "pad_token": "<pad>",
8027
- "tokenizer_class": "GPT2Tokenizer",
8028
  "unk_token": "<unk>"
8029
  }
 
1
  {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
  "add_prefix_space": false,
5
  "added_tokens_decoder": {
6
  "0": {
 
8002
  "rstrip": false,
8003
  "single_word": false,
8004
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8005
  }
8006
  },
8007
  "bos_token": "<s>",
8008
+ "clean_up_tokenization_spaces": false,
8009
+ "eos_token": "</s>",
 
8010
  "model_max_length": 1000000000000000019884624838656,
8011
+ "tokenizer_class": "PreTrainedTokenizerFast",
 
8012
  "unk_token": "<unk>"
8013
  }