Upload tokenizer

- merges.txt +89 -0
- special_tokens_map.json +3 -13
- tokenizer.json +0 -0
- tokenizer_config.json +6 -44
- vocab.json +0 -0
merges.txt
CHANGED
@@ -24742,3 +24742,92 @@ Harm ac
 Pan ter
 Pan ama
 Ray n
+Ray kee
+Tam er
+ĠBell ion
+ĠBell ringer
+cho Cinco
+As ia
+As alt
+As aulty
+As phyxi
+Cry Baby
+Cry pt
+Cry pto
+Jun ior
+Ly ric
+Ly ssa
+Ly kr
+Mort ar
+Mort is
+Mayhem s
+Pand oras
+Sand storm
+isty k
+ĠLo athe
+Scrap e
+Scrap per
+Scrap pers
+Day a
+Day zee
+Day tona
+Pl ower
+UR His
+Ġla ya
+isk atrix
+isk ated
+ĠDod g
+ors et
+ĠLu cha
+ĠAw kward
+ĠWitch y
+Ali as
+Bell ona
+Bell istic
+On na
+On Ya
+Riot er
+Tar en
+Tar ah
+to es
+ĠMay o
+ĠMay ham
+ĠMay onnaise
+ĠDemon ium
+ĠEd ee
+ĠEd dy
+ĠEd ges
+ĠEd usk
+ĠKnox xx
+Mega Bite
+Ber i
+Ber ri
+Count em
+Count essa
+Fre enia
+Hal a
+Op al
+Sher e
+Sher ri
+Sher man
+Sher yl
+ĠDer rière
+heart less
+ĠStomp Her
+ĠStomp sky
+Ana is
+Short Stack
+opp osition
+ĠSkull en
+ĠSkull Shiner
+ĠSkull Krusher
+Slaughter mel
+Goldie Lock
+Bra un
+Bra kes
+Bra ids
+Can uk
+Can uckle
+Cam reon
+Crack lin
+Crack alack
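
The added lines above are byte-level BPE merge rules appended to merges.txt; the "Ġ" prefix marks a piece that begins with a space. A minimal sketch of exercising them, assuming the uploaded vocab.json and merges.txt are read from the working directory (the repository id itself is not part of this diff):

from tokenizers import ByteLevelBPETokenizer

# Build a byte-level BPE tokenizer directly from the two uploaded files.
tok = ByteLevelBPETokenizer("vocab.json", "merges.txt")

# A merge rule such as "Sand storm" lets those two pieces collapse into a
# single token when they occur next to each other; inspect the segmentation.
print(tok.encode("Sandstorm ShortStack").tokens)
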
special_tokens_map.json
CHANGED
@@ -1,15 +1,5 @@
 {
-  "bos_token": "
-  "
-  "
-  "mask_token": {
-    "content": "[MASK]",
-    "lstrip": true,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": "<pad>",
-  "sep_token": "[SEP]",
-  "unk_token": "<unk>"
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
 }
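
After this change the file is a flat JSON map with three string-valued entries, all pointing at the GPT-2 style end-of-text token. A small sketch of inspecting it, assuming the file is read from the working directory:

import json

# Load the uploaded special-tokens map as plain JSON.
with open("special_tokens_map.json") as f:
    special = json.load(f)

print(special)
# Expected: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}

Note that no pad_token is defined; GPT-2 style tokenizers commonly reuse the end-of-text token for padding when batching.
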
tokenizer.json
CHANGED
The diff for this file is too large to render; see the raw diff.
tokenizer_config.json
CHANGED
@@ -1,57 +1,19 @@
 {
+  "add_prefix_space": false,
   "added_tokens_decoder": {
     "0": {
-      "content": "
+      "content": "<|endoftext|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
-    },
-    "1": {
-      "content": "[SEP]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "3": {
-      "content": "<pad>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "4": {
-      "content": "[MASK]",
-      "lstrip": true,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
     }
   },
-  "bos_token": "
+  "bos_token": "<|endoftext|>",
   "clean_up_tokenization_spaces": false,
-  "
-  "do_lower_case": true,
-  "eos_token": "[SEP]",
-  "keep_accents": false,
-  "mask_token": "[MASK]",
+  "eos_token": "<|endoftext|>",
   "model_max_length": 1000000000000000019884624838656,
-  "
-  "
-  "sep_token": "[SEP]",
-  "tokenizer_class": "AlbertTokenizer",
-  "unk_token": "<unk>"
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
 }
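
The new config drops the ALBERT-specific fields and declares a GPT2Tokenizer whose bos, eos and unk tokens are all <|endoftext|>; the model_max_length value 1000000000000000019884624838656 is int(1e30), the transformers sentinel for "no length limit recorded". A minimal loading sketch, assuming the repository has been downloaded locally (the actual repo id is not shown in this diff, so "./tokenizer-dir" is a placeholder path):

from transformers import AutoTokenizer

# Load from the local copy of the repo; with tokenizer.json present this
# typically resolves to the fast GPT-2 tokenizer declared in tokenizer_config.json.
tok = AutoTokenizer.from_pretrained("./tokenizer-dir")

print(tok.bos_token, tok.eos_token, tok.unk_token)  # all "<|endoftext|>"

# model_max_length is effectively unset; pin it to the target model's context size.
tok.model_max_length = 1024
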
vocab.json
CHANGED
The diff for this file is too large to render; see the raw diff.