Updated the tokenizer to the latest version from mistralai/Mistral-Nemo-Base-2407 to make the model llama.cpp-compatible
Browse files
- tokenizer.json +0 -36
- tokenizer_config.json +5 -21
tokenizer.json
CHANGED
@@ -9002,24 +9002,6 @@
|
|
9002 |
"rstrip": false,
|
9003 |
"normalized": false,
|
9004 |
"special": true
|
9005 |
-
},
|
9006 |
-
{
|
9007 |
-
"id": 131072,
|
9008 |
-
"content": "<|im_end|>",
|
9009 |
-
"single_word": false,
|
9010 |
-
"lstrip": false,
|
9011 |
-
"rstrip": false,
|
9012 |
-
"normalized": false,
|
9013 |
-
"special": true
|
9014 |
-
},
|
9015 |
-
{
|
9016 |
-
"id": 131073,
|
9017 |
-
"content": "<|im_start|>",
|
9018 |
-
"single_word": false,
|
9019 |
-
"lstrip": false,
|
9020 |
-
"rstrip": false,
|
9021 |
-
"normalized": false,
|
9022 |
-
"special": false
|
9023 |
}
|
9024 |
],
|
9025 |
"normalizer": null,
|
@@ -9056,12 +9038,6 @@
|
|
9056 |
"id": "A",
|
9057 |
"type_id": 0
|
9058 |
}
|
9059 |
-
},
|
9060 |
-
{
|
9061 |
-
"SpecialToken": {
|
9062 |
-
"id": "</s>",
|
9063 |
-
"type_id": 0
|
9064 |
-
}
|
9065 |
}
|
9066 |
],
|
9067 |
"pair": [
|
@@ -9077,12 +9053,6 @@
|
|
9077 |
"type_id": 0
|
9078 |
}
|
9079 |
},
|
9080 |
-
{
|
9081 |
-
"SpecialToken": {
|
9082 |
-
"id": "</s>",
|
9083 |
-
"type_id": 0
|
9084 |
-
}
|
9085 |
-
},
|
9086 |
{
|
9087 |
"SpecialToken": {
|
9088 |
"id": "<s>",
|
@@ -9094,12 +9064,6 @@
|
|
9094 |
"id": "B",
|
9095 |
"type_id": 1
|
9096 |
}
|
9097 |
-
},
|
9098 |
-
{
|
9099 |
-
"SpecialToken": {
|
9100 |
-
"id": "</s>",
|
9101 |
-
"type_id": 1
|
9102 |
-
}
|
9103 |
}
|
9104 |
],
|
9105 |
"special_tokens": {
|
|
|
9002 |
"rstrip": false,
|
9003 |
"normalized": false,
|
9004 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9005 |
}
|
9006 |
],
|
9007 |
"normalizer": null,
|
|
|
9038 |
"id": "A",
|
9039 |
"type_id": 0
|
9040 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
9041 |
}
|
9042 |
],
|
9043 |
"pair": [
|
|
|
9053 |
"type_id": 0
|
9054 |
}
|
9055 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
9056 |
{
|
9057 |
"SpecialToken": {
|
9058 |
"id": "<s>",
|
|
|
9064 |
"id": "B",
|
9065 |
"type_id": 1
|
9066 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
9067 |
}
|
9068 |
],
|
9069 |
"special_tokens": {
|
tokenizer_config.json
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
{
|
|
|
|
|
2 |
"add_prefix_space": false,
|
3 |
"added_tokens_decoder": {
|
4 |
"0": {
|
@@ -8000,30 +8002,12 @@
|
|
8000 |
"rstrip": false,
|
8001 |
"single_word": false,
|
8002 |
"special": true
|
8003 |
-
},
|
8004 |
-
"131072": {
|
8005 |
-
"content": "<|im_end|>",
|
8006 |
-
"lstrip": false,
|
8007 |
-
"normalized": false,
|
8008 |
-
"rstrip": false,
|
8009 |
-
"single_word": false,
|
8010 |
-
"special": true
|
8011 |
-
},
|
8012 |
-
"131073": {
|
8013 |
-
"content": "<|im_start|>",
|
8014 |
-
"lstrip": false,
|
8015 |
-
"normalized": false,
|
8016 |
-
"rstrip": false,
|
8017 |
-
"single_word": false,
|
8018 |
-
"special": false
|
8019 |
}
|
8020 |
},
|
8021 |
"bos_token": "<s>",
|
8022 |
-
"
|
8023 |
-
"
|
8024 |
-
"eos_token": "<|im_end|>",
|
8025 |
"model_max_length": 1000000000000000019884624838656,
|
8026 |
-
"
|
8027 |
-
"tokenizer_class": "GPT2Tokenizer",
|
8028 |
"unk_token": "<unk>"
|
8029 |
}
|
|
|
1 |
{
|
2 |
+
"add_bos_token": true,
|
3 |
+
"add_eos_token": false,
|
4 |
"add_prefix_space": false,
|
5 |
"added_tokens_decoder": {
|
6 |
"0": {
|
|
|
8002 |
"rstrip": false,
|
8003 |
"single_word": false,
|
8004 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8005 |
}
|
8006 |
},
|
8007 |
"bos_token": "<s>",
|
8008 |
+
"clean_up_tokenization_spaces": false,
|
8009 |
+
"eos_token": "</s>",
|
|
|
8010 |
"model_max_length": 1000000000000000019884624838656,
|
8011 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
|
|
8012 |
"unk_token": "<unk>"
|
8013 |
}
|