Commit c23b9a0 by ybelkada
Parent: ad7da73

Upload tokenizer
Files changed (4):
  1. README.md +4 -4
  2. special_tokens_map.json +2 -9
  3. tokenizer.json +6 -6
  4. tokenizer_config.json +12 -8
README.md CHANGED
@@ -1,12 +1,12 @@
 ---
+base_model: tiiuae/Falcon3-3B-Instruct
 library_name: transformers
+license: other
+license_name: falcon-llm-license
+license_link: https://falconllm.tii.ae/falcon-terms-and-conditions.html
 tags:
 - bitnet
 - falcon3
-base_model: tiiuae/Falcon3-3B-Instruct
-license: other
-license_name: falcon-llm-license
-license_link: https://falconllm.tii.ae/falcon-terms-and-conditions.html
 ---
 
 
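Note that the README change is a pure regrouping: four lines removed, four added, with the front-matter keys coming back in alphabetical order (consistent with how huggingface_hub serializes card metadata on upload). A minimal sketch for reading the resulting metadata; the repo id is an assumption, since the commit page does not name the repository:

```python
from huggingface_hub import ModelCard

# Hypothetical repo id for illustration only.
card = ModelCard.load("tiiuae/Falcon3-3B-Instruct-1.58bit")

print(card.data.base_model)  # tiiuae/Falcon3-3B-Instruct
print(card.data.license)     # other
print(card.data.tags)        # ['bitnet', 'falcon3']
```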
special_tokens_map.json CHANGED
@@ -24,22 +24,15 @@
     ">>PASSWORD<<",
     ">>KEY<<"
   ],
-  "bos_token": {
-    "content": "<|startoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
   "eos_token": {
-    "content": "<|im_end|>",
+    "content": "<|endoftext|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
-    "content": "<|endoftext|>",
+    "content": "<|pad|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
@@ -1139,7 +1139,7 @@
   },
   {
     "id": 126,
-    "content": "<|im_start|>",
+    "content": ">>UNUSED_0<<",
     "single_word": false,
     "lstrip": false,
     "rstrip": false,
@@ -1148,7 +1148,7 @@
   },
   {
     "id": 127,
-    "content": "<|im_end|>",
+    "content": ">>UNUSED_1<<",
     "single_word": false,
     "lstrip": false,
     "rstrip": false,
@@ -18212,7 +18212,7 @@
   },
   {
     "id": 2023,
-    "content": ">>UNUSED_1897<<",
+    "content": "<|pad|>",
     "single_word": false,
     "lstrip": false,
     "rstrip": false,
@@ -18383,8 +18383,8 @@
   ">>POS_97<<": 123,
   ">>POS_98<<": 124,
   ">>POS_99<<": 125,
-  "<|im_start|>": 126,
-  "<|im_end|>": 127,
+  ">>UNUSED_0<<": 126,
+  ">>UNUSED_1<<": 127,
   ">>UNUSED_2<<": 128,
   ">>UNUSED_3<<": 129,
   ">>UNUSED_4<<": 130,
@@ -20280,7 +20280,7 @@
   ">>UNUSED_1894<<": 2020,
   ">>UNUSED_1895<<": 2021,
   ">>UNUSED_1896<<": 2022,
-  ">>UNUSED_1897<<": 2023,
+  "<|pad|>": 2023,
   "!": 2024,
   "\"": 2025,
   "#": 2026,
tokenizer_config.json CHANGED
@@ -1010,7 +1010,7 @@
     "special": true
   },
   "126": {
-    "content": "<|im_start|>",
+    "content": ">>UNUSED_0<<",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
@@ -1018,7 +1018,7 @@
     "special": true
   },
   "127": {
-    "content": "<|im_end|>",
+    "content": ">>UNUSED_1<<",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
@@ -16186,7 +16186,7 @@
     "special": true
   },
   "2023": {
-    "content": ">>UNUSED_1897<<",
+    "content": "<|pad|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
@@ -16219,11 +16219,15 @@
     ">>PASSWORD<<",
     ">>KEY<<"
   ],
-  "bos_token": "<|startoftext|>",
-  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "chat_template": "{% if tools %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + '\nYou are an expert in composing functions. You are given a question and a set of possible functions. \nBased on the question, you will need to make one or more function/tool calls to achieve the purpose. \nIf none of the functions can be used, point it out and refuse to answer. \nIf the given question lacks the parameters required by the function, also point it out.\n\n You have access to the following tools:\n<tools>' + tools|tojson + '</tools>\n\nThe output MUST strictly adhere to the following format, and NO other text MUST be included.\nThe example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make the tool calls an empty list [].\n<tool_call>[\n{\"name\": \"function_name1\", \"arguments\": {\"argument1\": \"value1\", \"argument2\": \"value2\"}},\n... (more tool calls as required)\n]</tool_call>' }}{% elif message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if not loop.last %}{{ '<|assistant|>\n' + message['content'] + eos_token + '\n' }}{% else %}{{ '<|assistant|>\n' + message['content'] + eos_token }}{% endif %}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}{% endfor %}{% else %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + '\n' }}{% elif message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if not loop.last %}{{ '<|assistant|>\n' + message['content'] + eos_token + '\n' }}{% else %}{{ '<|assistant|>\n' + message['content'] + eos_token }}{% endif %}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}{% endfor %}{% endif %}",
   "clean_up_tokenization_spaces": true,
-  "eos_token": "<|im_end|>",
-  "model_max_length": 8192,
-  "pad_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 32768,
+  "pad_token": "<|pad|>",
   "tokenizer_class": "PreTrainedTokenizerFast"
 }
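The replacement chat template drops the ChatML-style <|im_start|>/<|im_end|> markup in favor of <|system|>/<|user|>/<|assistant|> role headers, appends the new eos_token after each completed assistant turn, and adds a tool-calling branch that injects the tool schemas and a <tool_call>[...]</tool_call> output contract when tools is passed; model_max_length also grows from 8192 to 32768. A short usage sketch, once more under the repo-id assumption:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("tiiuae/Falcon3-3B-Instruct-1.58bit")  # hypothetical repo id

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# Per the template above (no tools passed), this renders as:
# <|system|>
# You are a helpful assistant.
# <|user|>
# Hello!
# <|assistant|>
```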