ahmedhassan7030
committed on
Commit
•
5a02355
1
Parent(s):
1fab08f
Upload tokenizer
Browse files
- added_tokens.json +3 -0
- merges.txt +177 -0
- special_tokens_map.json +17 -3
- tokenizer.json +0 -0
- tokenizer_config.json +11 -37
- vocab.json +1 -0
added_tokens.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"<|endoftext|>": 437
|
3 |
+
}
|
merges.txt
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#version: 0.2
|
2 |
+
Þ ¦
|
3 |
+
Ġ Þ
|
4 |
+
Þ °
|
5 |
+
Þ ¬
|
6 |
+
Þ ª
|
7 |
+
Þ Ĥ
|
8 |
+
Þ ¨
|
9 |
+
Þ ĩ
|
10 |
+
Þ §
|
11 |
+
Þ ĥ
|
12 |
+
Þ ī
|
13 |
+
Þ Ĩ
|
14 |
+
Þ Ī
|
15 |
+
ĠÞ ĩ
|
16 |
+
Þ İ
|
17 |
+
Þ ©
|
18 |
+
Þ Į
|
19 |
+
Þ Ģ
|
20 |
+
Þ ģ
|
21 |
+
Þ ĭ
|
22 |
+
Þ IJ
|
23 |
+
Þ į
|
24 |
+
Þ Ń
|
25 |
+
Þ ®
|
26 |
+
Þ Ĭ
|
27 |
+
ĠÞ Ģ
|
28 |
+
ĠÞ ī
|
29 |
+
ĠÞ Ĩ
|
30 |
+
Þ Ħ
|
31 |
+
ĠÞ Ħ
|
32 |
+
ĠÞ Ī
|
33 |
+
ĠÞ ĭ
|
34 |
+
ĠÞ Ĭ
|
35 |
+
ĠÞ Ĥ
|
36 |
+
Þ ħ
|
37 |
+
Þ Ķ
|
38 |
+
Þ ij
|
39 |
+
Þ ĵ
|
40 |
+
ĠÞ İ
|
41 |
+
Þ ¯
|
42 |
+
Þ «
|
43 |
+
Þ ĸ
|
44 |
+
ĠÞ Į
|
45 |
+
ĠÞ ĥ
|
46 |
+
ĠÞ IJ
|
47 |
+
ĠÞ į
|
48 |
+
Þ Ĵ
|
49 |
+
ÞĤ Þİ
|
50 |
+
Þ Ŀ
|
51 |
+
ÞĤ Þij
|
52 |
+
Þ ķ
|
53 |
+
ĠÞ ĸ
|
54 |
+
Þ Ĺ
|
55 |
+
Ø Ł
|
56 |
+
ÞĤ Þĭ
|
57 |
+
ĠÞ Ĵ
|
58 |
+
Þ ¢
|
59 |
+
Þ ¤
|
60 |
+
Þ Ļ
|
61 |
+
ĠÞ Ļ
|
62 |
+
ĠÞ ¢
|
63 |
+
ĠÞ Ŀ
|
64 |
+
Þ¦ ØŁ
|
65 |
+
ĠÞ ļ
|
66 |
+
ĠÞ ĵ
|
67 |
+
Þ ŀ
|
68 |
+
ĠÞ Ķ
|
69 |
+
ĠÞ ij
|
70 |
+
ĠÞ ķ
|
71 |
+
ĠÞ ¤
|
72 |
+
Þ ļ
|
73 |
+
ÞĤ ÞĦ
|
74 |
+
ï ·
|
75 |
+
Þ¬ ØŁ
|
76 |
+
Þ ı
|
77 |
+
ï· ²
|
78 |
+
Þ ł
|
79 |
+
Þ° ØŁ
|
80 |
+
ĠÞ ŀ
|
81 |
+
Ø Į
|
82 |
+
Þ Ł
|
83 |
+
Þ° .
|
84 |
+
Þ¬ !
|
85 |
+
Þ ĺ
|
86 |
+
Ġ ï·²
|
87 |
+
Þ© .
|
88 |
+
Þĥ ÞĪ
|
89 |
+
Þĥ Þĵ
|
90 |
+
ĠÞ Ĺ
|
91 |
+
Þ© ØŁ
|
92 |
+
Þ§ ØŁ
|
93 |
+
Þ £
|
94 |
+
Þ¦ Þ¦
|
95 |
+
â Ģ
|
96 |
+
ĠÞ ħ
|
97 |
+
Þ¬ Þ¬
|
98 |
+
ÞĤ Þĩ
|
99 |
+
Þ¨ .
|
100 |
+
Þĥ ÞĨ
|
101 |
+
Þĥ Þİ
|
102 |
+
Þ¯ ØŁ
|
103 |
+
Ġ -
|
104 |
+
Ġ ï·
|
105 |
+
Ġï· º
|
106 |
+
Þ Ľ
|
107 |
+
ĠÞ Ł
|
108 |
+
ĠÞ ł
|
109 |
+
Þ° Þ°
|
110 |
+
Þ° ØĮ
|
111 |
+
Þª Þª
|
112 |
+
Þª ØŁ
|
113 |
+
Þ¨ ØĮ
|
114 |
+
Þĥ ÞĮ
|
115 |
+
Þĥ Þij
|
116 |
+
Þĥ Þķ
|
117 |
+
ÞŃ !
|
118 |
+
âĢ ĺ
|
119 |
+
Þª ØĮ
|
120 |
+
Ġ âĢĺ
|
121 |
+
Þ¦ Þ¨
|
122 |
+
ĠÞ ĺ
|
123 |
+
Þ¬ .
|
124 |
+
Þ¬ Þ°
|
125 |
+
Þĩ Þĩ
|
126 |
+
Þ§ Þ§
|
127 |
+
Þĥ ÞĤ
|
128 |
+
Þĥ ÞĦ
|
129 |
+
Þİ Þĩ
|
130 |
+
âĢ Ļ
|
131 |
+
Þ ¥
|
132 |
+
Þĥ Þī
|
133 |
+
ĠÞ £
|
134 |
+
Þ ¡
|
135 |
+
Ġ ;
|
136 |
+
Ġ ØŁ
|
137 |
+
Þ¦ !
|
138 |
+
Þ¦ Þª
|
139 |
+
Þ¦ Þ§
|
140 |
+
Þ° !
|
141 |
+
Þ° âĢĻ
|
142 |
+
Þª Þ¦
|
143 |
+
ÞĤ ÞĤ
|
144 |
+
ÞĤ ÞĨ
|
145 |
+
ÞĤ ÞĢ
|
146 |
+
ÞĤ Þģ
|
147 |
+
ÞĤ Þĸ
|
148 |
+
Þ¨ -
|
149 |
+
Þ¨ Þ©
|
150 |
+
Þĩ Þĥ
|
151 |
+
Þ§ Þ°
|
152 |
+
Þ§ ØĮ
|
153 |
+
Þĥ ÞIJ
|
154 |
+
Þĥ Þį
|
155 |
+
Þĥ ÞĬ
|
156 |
+
Þĥ ÞĴ
|
157 |
+
Þĥ ÞĹ
|
158 |
+
Þī ÞĪ
|
159 |
+
ÞĪ Þĩ
|
160 |
+
Þİ Þĥ
|
161 |
+
Þİ Þİ
|
162 |
+
Þ© :
|
163 |
+
Þ© ;
|
164 |
+
Þ© ØĮ
|
165 |
+
Þ© âĢĻ
|
166 |
+
ÞĢ Þĩ
|
167 |
+
ÞĢ Þĭ
|
168 |
+
ÞŃ .
|
169 |
+
ÞŃ ØŁ
|
170 |
+
ÞŃ ØĮ
|
171 |
+
Þ® Þ¦
|
172 |
+
ĠÞĤ Þij
|
173 |
+
Þ¯ !
|
174 |
+
Þ¯ .
|
175 |
+
ĠÞIJ Þĵ
|
176 |
+
Þı Þģ
|
177 |
+
ï·² Þİ
|
special_tokens_map.json
CHANGED
@@ -1,8 +1,22 @@
|
|
1 |
{
|
2 |
-
"
|
3 |
-
"content": "
|
4 |
"lstrip": false,
|
5 |
-
"normalized":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
"rstrip": false,
|
7 |
"single_word": false
|
8 |
}
|
|
|
1 |
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<|endoftext|>",
|
4 |
"lstrip": false,
|
5 |
+
"normalized": true,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "<|endoftext|>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": true,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"unk_token": {
|
17 |
+
"content": "<|endoftext|>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": true,
|
20 |
"rstrip": false,
|
21 |
"single_word": false
|
22 |
}
|
tokenizer.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
CHANGED
@@ -1,49 +1,23 @@
|
|
1 |
{
|
|
|
|
|
2 |
"added_tokens_decoder": {
|
3 |
-
"
|
4 |
-
"content": "
|
5 |
"lstrip": false,
|
6 |
-
"normalized":
|
7 |
-
"rstrip": false,
|
8 |
-
"single_word": false,
|
9 |
-
"special": true
|
10 |
-
},
|
11 |
-
"1": {
|
12 |
-
"content": "<s>",
|
13 |
-
"lstrip": false,
|
14 |
-
"normalized": false,
|
15 |
-
"rstrip": false,
|
16 |
-
"single_word": false,
|
17 |
-
"special": true
|
18 |
-
},
|
19 |
-
"2": {
|
20 |
-
"content": "</s>",
|
21 |
-
"lstrip": false,
|
22 |
-
"normalized": false,
|
23 |
-
"rstrip": false,
|
24 |
-
"single_word": false,
|
25 |
-
"special": true
|
26 |
-
},
|
27 |
-
"3": {
|
28 |
-
"content": "<unk>",
|
29 |
-
"lstrip": false,
|
30 |
-
"normalized": false,
|
31 |
-
"rstrip": false,
|
32 |
-
"single_word": false,
|
33 |
-
"special": true
|
34 |
-
},
|
35 |
-
"8000": {
|
36 |
-
"content": "[PAD]",
|
37 |
-
"lstrip": false,
|
38 |
-
"normalized": false,
|
39 |
"rstrip": false,
|
40 |
"single_word": false,
|
41 |
"special": true
|
42 |
}
|
43 |
},
|
|
|
44 |
"clean_up_tokenization_spaces": false,
|
|
|
|
|
45 |
"extra_special_tokens": {},
|
46 |
"model_max_length": 1000000000000000019884624838656,
|
47 |
-
"pad_token":
|
48 |
-
"tokenizer_class": "
|
|
|
49 |
}
|
|
|
1 |
{
|
2 |
+
"add_bos_token": false,
|
3 |
+
"add_prefix_space": false,
|
4 |
"added_tokens_decoder": {
|
5 |
+
"437": {
|
6 |
+
"content": "<|endoftext|>",
|
7 |
"lstrip": false,
|
8 |
+
"normalized": true,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
"rstrip": false,
|
10 |
"single_word": false,
|
11 |
"special": true
|
12 |
}
|
13 |
},
|
14 |
+
"bos_token": "<|endoftext|>",
|
15 |
"clean_up_tokenization_spaces": false,
|
16 |
+
"eos_token": "<|endoftext|>",
|
17 |
+
"errors": "replace",
|
18 |
"extra_special_tokens": {},
|
19 |
"model_max_length": 1000000000000000019884624838656,
|
20 |
+
"pad_token": null,
|
21 |
+
"tokenizer_class": "GPT2Tokenizer",
|
22 |
+
"unk_token": "<|endoftext|>"
|
23 |
}
|
vocab.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"<s>":0,"<pad>":1,"</s>":2,"<unk>":3,"<mask>":4,"!":5,"\"":6,"#":7,"$":8,"%":9,"&":10,"'":11,"(":12,")":13,"*":14,"+":15,",":16,"-":17,".":18,"/":19,"0":20,"1":21,"2":22,"3":23,"4":24,"5":25,"6":26,"7":27,"8":28,"9":29,":":30,";":31,"<":32,"=":33,">":34,"?":35,"@":36,"A":37,"B":38,"C":39,"D":40,"E":41,"F":42,"G":43,"H":44,"I":45,"J":46,"K":47,"L":48,"M":49,"N":50,"O":51,"P":52,"Q":53,"R":54,"S":55,"T":56,"U":57,"V":58,"W":59,"X":60,"Y":61,"Z":62,"[":63,"\\":64,"]":65,"^":66,"_":67,"`":68,"a":69,"b":70,"c":71,"d":72,"e":73,"f":74,"g":75,"h":76,"i":77,"j":78,"k":79,"l":80,"m":81,"n":82,"o":83,"p":84,"q":85,"r":86,"s":87,"t":88,"u":89,"v":90,"w":91,"x":92,"y":93,"z":94,"{":95,"|":96,"}":97,"~":98,"¡":99,"¢":100,"£":101,"¤":102,"¥":103,"¦":104,"§":105,"¨":106,"©":107,"ª":108,"«":109,"¬":110,"®":111,"¯":112,"°":113,"±":114,"²":115,"³":116,"´":117,"µ":118,"¶":119,"·":120,"¸":121,"¹":122,"º":123,"»":124,"¼":125,"½":126,"¾":127,"¿":128,"À":129,"Á":130,"Â":131,"Ã":132,"Ä":133,"Å":134,"Æ":135,"Ç":136,"È":137,"É":138,"Ê":139,"Ë":140,"Ì":141,"Í":142,"Î":143,"Ï":144,"Ð":145,"Ñ":146,"Ò":147,"Ó":148,"Ô":149,"Õ":150,"Ö":151,"×":152,"Ø":153,"Ù":154,"Ú":155,"Û":156,"Ü":157,"Ý":158,"Þ":159,"ß":160,"à":161,"á":162,"â":163,"ã":164,"ä":165,"å":166,"æ":167,"ç":168,"è":169,"é":170,"ê":171,"ë":172,"ì":173,"í":174,"î":175,"ï":176,"ð":177,"ñ":178,"ò":179,"ó":180,"ô":181,"õ":182,"ö":183,"÷":184,"ø":185,"ù":186,"ú":187,"û":188,"ü":189,"ý":190,"þ":191,"ÿ":192,"Ā":193,"ā":194,"Ă":195,"ă":196,"Ą":197,"ą":198,"Ć":199,"ć":200,"Ĉ":201,"ĉ":202,"Ċ":203,"ċ":204,"Č":205,"č":206,"Ď":207,"ď":208,"Đ":209,"đ":210,"Ē":211,"ē":212,"Ĕ":213,"ĕ":214,"Ė":215,"ė":216,"Ę":217,"ę":218,"Ě":219,"ě":220,"Ĝ":221,"ĝ":222,"Ğ":223,"ğ":224,"Ġ":225,"ġ":226,"Ģ":227,"ģ":228,"Ĥ":229,"ĥ":230,"Ħ":231,"ħ":232,"Ĩ":233,"ĩ":234,"Ī":235,"ī":236,"Ĭ":237,"ĭ":238,"Į":239,"į":240,"İ":241,"ı":242,"IJ":243,"ij":244,"Ĵ":245,"ĵ":246,"Ķ":247,"ķ":248,"ĸ":249,"Ĺ":250,"ĺ":251,"Ļ":252,"ļ":253,"Ľ":254,"ľ":255,"Ŀ":256,"ŀ":257,"Ł":258,"ł":259,"Ń":260
,"Þ¦":261,"ĠÞ":262,"Þ°":263,"Þ¬":264,"Þª":265,"ÞĤ":266,"Þ¨":267,"Þĩ":268,"Þ§":269,"Þĥ":270,"Þī":271,"ÞĨ":272,"ÞĪ":273,"ĠÞĩ":274,"Þİ":275,"Þ©":276,"ÞĮ":277,"ÞĢ":278,"Þģ":279,"Þĭ":280,"ÞIJ":281,"Þį":282,"ÞŃ":283,"Þ®":284,"ÞĬ":285,"ĠÞĢ":286,"ĠÞī":287,"ĠÞĨ":288,"ÞĦ":289,"ĠÞĦ":290,"ĠÞĪ":291,"ĠÞĭ":292,"ĠÞĬ":293,"ĠÞĤ":294,"Þħ":295,"ÞĶ":296,"Þij":297,"Þĵ":298,"ĠÞİ":299,"Þ¯":300,"Þ«":301,"Þĸ":302,"ĠÞĮ":303,"ĠÞĥ":304,"ĠÞIJ":305,"ĠÞį":306,"ÞĴ":307,"ÞĤÞİ":308,"ÞĿ":309,"ÞĤÞij":310,"Þķ":311,"ĠÞĸ":312,"ÞĹ":313,"ØŁ":314,"ÞĤÞĭ":315,"ĠÞĴ":316,"Þ¢":317,"Þ¤":318,"ÞĻ":319,"ĠÞĻ":320,"ĠÞ¢":321,"ĠÞĿ":322,"Þ¦ØŁ":323,"ĠÞļ":324,"ĠÞĵ":325,"Þŀ":326,"ĠÞĶ":327,"ĠÞij":328,"ĠÞķ":329,"ĠÞ¤":330,"Þļ":331,"ÞĤÞĦ":332,"ï·":333,"Þ¬ØŁ":334,"Þı":335,"ï·²":336,"Þł":337,"Þ°ØŁ":338,"ĠÞŀ":339,"ØĮ":340,"ÞŁ":341,"Þ°.":342,"Þ¬!":343,"Þĺ":344,"Ġï·²":345,"Þ©.":346,"ÞĥÞĪ":347,"ÞĥÞĵ":348,"ĠÞĹ":349,"Þ©ØŁ":350,"Þ§ØŁ":351,"Þ£":352,"Þ¦Þ¦":353,"âĢ":354,"ĠÞħ":355,"Þ¬Þ¬":356,"ÞĤÞĩ":357,"Þ¨.":358,"ÞĥÞĨ":359,"ÞĥÞİ":360,"Þ¯ØŁ":361,"Ġ-":362,"Ġï·":363,"Ġï·º":364,"ÞĽ":365,"ĠÞŁ":366,"ĠÞł":367,"Þ°Þ°":368,"Þ°ØĮ":369,"ÞªÞª":370,"ÞªØŁ":371,"Þ¨ØĮ":372,"ÞĥÞĮ":373,"ÞĥÞij":374,"ÞĥÞķ":375,"ÞŃ!":376,"âĢĺ":377,"ÞªØĮ":378,"ĠâĢĺ":379,"Þ¦Þ¨":380,"ĠÞĺ":381,"Þ¬.":382,"Þ¬Þ°":383,"ÞĩÞĩ":384,"Þ§Þ§":385,"ÞĥÞĤ":386,"ÞĥÞĦ":387,"ÞİÞĩ":388,"âĢĻ":389,"Þ¥":390,"ÞĥÞī":391,"ĠÞ£":392,"Þ¡":393,"Ġ;":394,"ĠØŁ":395,"Þ¦!":396,"Þ¦Þª":397,"Þ¦Þ§":398,"Þ°!":399,"Þ°âĢĻ":400,"ÞªÞ¦":401,"ÞĤÞĤ":402,"ÞĤÞĨ":403,"ÞĤÞĢ":404,"ÞĤÞģ":405,"ÞĤÞĸ":406,"Þ¨-":407,"Þ¨Þ©":408,"ÞĩÞĥ":409,"Þ§Þ°":410,"Þ§ØĮ":411,"ÞĥÞIJ":412,"ÞĥÞį":413,"ÞĥÞĬ":414,"ÞĥÞĴ":415,"ÞĥÞĹ":416,"ÞīÞĪ":417,"ÞĪÞĩ":418,"ÞİÞĥ":419,"ÞİÞİ":420,"Þ©:":421,"Þ©;":422,"Þ©ØĮ":423,"Þ©âĢĻ":424,"ÞĢÞĩ":425,"ÞĢÞĭ":426,"ÞŃ.":427,"ÞŃØŁ":428,"ÞŃØĮ":429,"Þ®Þ¦":430,"ĠÞĤÞij":431,"Þ¯!":432,"Þ¯.":433,"ĠÞIJÞĵ":434,"ÞıÞģ":435,"ï·²Þİ":436}
|