ahmedhassan7030 committed on
Commit
5a02355
1 Parent(s): 1fab08f

Upload tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 437
3
+ }
merges.txt ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version: 0.2
2
+ Þ ¦
3
+ Ġ Þ
4
+ Þ °
5
+ Þ ¬
6
+ Þ ª
7
+ Þ Ĥ
8
+ Þ ¨
9
+ Þ ĩ
10
+ Þ §
11
+ Þ ĥ
12
+ Þ ī
13
+ Þ Ĩ
14
+ Þ Ī
15
+ ĠÞ ĩ
16
+ Þ İ
17
+ Þ ©
18
+ Þ Į
19
+ Þ Ģ
20
+ Þ ģ
21
+ Þ ĭ
22
+ Þ Ĳ
23
+ Þ į
24
+ Þ Ń
25
+ Þ ®
26
+ Þ Ĭ
27
+ ĠÞ Ģ
28
+ ĠÞ ī
29
+ ĠÞ Ĩ
30
+ Þ Ħ
31
+ ĠÞ Ħ
32
+ ĠÞ Ī
33
+ ĠÞ ĭ
34
+ ĠÞ Ĭ
35
+ ĠÞ Ĥ
36
+ Þ ħ
37
+ Þ Ķ
38
+ Þ ĳ
39
+ Þ ĵ
40
+ ĠÞ İ
41
+ Þ ¯
42
+ Þ «
43
+ Þ ĸ
44
+ ĠÞ Į
45
+ ĠÞ ĥ
46
+ ĠÞ Ĳ
47
+ ĠÞ į
48
+ Þ Ĵ
49
+ ÞĤ Þİ
50
+ Þ Ŀ
51
+ ÞĤ Þĳ
52
+ Þ ķ
53
+ ĠÞ ĸ
54
+ Þ Ĺ
55
+ Ø Ł
56
+ ÞĤ Þĭ
57
+ ĠÞ Ĵ
58
+ Þ ¢
59
+ Þ ¤
60
+ Þ Ļ
61
+ ĠÞ Ļ
62
+ ĠÞ ¢
63
+ ĠÞ Ŀ
64
+ Þ¦ ØŁ
65
+ ĠÞ ļ
66
+ ĠÞ ĵ
67
+ Þ ŀ
68
+ ĠÞ Ķ
69
+ ĠÞ ĳ
70
+ ĠÞ ķ
71
+ ĠÞ ¤
72
+ Þ ļ
73
+ ÞĤ ÞĦ
74
+ ï ·
75
+ Þ¬ ØŁ
76
+ Þ ı
77
+ ï· ²
78
+ Þ ł
79
+ Þ° ØŁ
80
+ ĠÞ ŀ
81
+ Ø Į
82
+ Þ Ł
83
+ Þ° .
84
+ Þ¬ !
85
+ Þ ĺ
86
+ Ġ ï·²
87
+ Þ© .
88
+ Þĥ ÞĪ
89
+ Þĥ Þĵ
90
+ ĠÞ Ĺ
91
+ Þ© ØŁ
92
+ Þ§ ØŁ
93
+ Þ £
94
+ Þ¦ Þ¦
95
+ â Ģ
96
+ ĠÞ ħ
97
+ Þ¬ Þ¬
98
+ ÞĤ Þĩ
99
+ Þ¨ .
100
+ Þĥ ÞĨ
101
+ Þĥ Þİ
102
+ Þ¯ ØŁ
103
+ Ġ -
104
+ Ġ ï·
105
+ Ġï· º
106
+ Þ Ľ
107
+ ĠÞ Ł
108
+ ĠÞ ł
109
+ Þ° Þ°
110
+ Þ° ØĮ
111
+ Þª Þª
112
+ Þª ØŁ
113
+ Þ¨ ØĮ
114
+ Þĥ ÞĮ
115
+ Þĥ Þĳ
116
+ Þĥ Þķ
117
+ ÞŃ !
118
+ âĢ ĺ
119
+ Þª ØĮ
120
+ Ġ âĢĺ
121
+ Þ¦ Þ¨
122
+ ĠÞ ĺ
123
+ Þ¬ .
124
+ Þ¬ Þ°
125
+ Þĩ Þĩ
126
+ Þ§ Þ§
127
+ Þĥ ÞĤ
128
+ Þĥ ÞĦ
129
+ Þİ Þĩ
130
+ âĢ Ļ
131
+ Þ ¥
132
+ Þĥ Þī
133
+ ĠÞ £
134
+ Þ ¡
135
+ Ġ ;
136
+ Ġ ØŁ
137
+ Þ¦ !
138
+ Þ¦ Þª
139
+ Þ¦ Þ§
140
+ Þ° !
141
+ Þ° âĢĻ
142
+ Þª Þ¦
143
+ ÞĤ ÞĤ
144
+ ÞĤ ÞĨ
145
+ ÞĤ ÞĢ
146
+ ÞĤ Þģ
147
+ ÞĤ Þĸ
148
+ Þ¨ -
149
+ Þ¨ Þ©
150
+ Þĩ Þĥ
151
+ Þ§ Þ°
152
+ Þ§ ØĮ
153
+ Þĥ ÞĲ
154
+ Þĥ Þį
155
+ Þĥ ÞĬ
156
+ Þĥ ÞĴ
157
+ Þĥ ÞĹ
158
+ Þī ÞĪ
159
+ ÞĪ Þĩ
160
+ Þİ Þĥ
161
+ Þİ Þİ
162
+ Þ© :
163
+ Þ© ;
164
+ Þ© ØĮ
165
+ Þ© âĢĻ
166
+ ÞĢ Þĩ
167
+ ÞĢ Þĭ
168
+ ÞŃ .
169
+ ÞŃ ØŁ
170
+ ÞŃ ØĮ
171
+ Þ® Þ¦
172
+ ĠÞĤ Þĳ
173
+ Þ¯ !
174
+ Þ¯ .
175
+ ĠÞĲ Þĵ
176
+ Þı Þģ
177
+ ï·² Þİ
special_tokens_map.json CHANGED
@@ -1,8 +1,22 @@
1
  {
2
- "pad_token": {
3
- "content": "[PAD]",
4
  "lstrip": false,
5
- "normalized": false,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  "rstrip": false,
7
  "single_word": false
8
  }
 
1
  {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
  "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
  "rstrip": false,
21
  "single_word": false
22
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,49 +1,23 @@
1
  {
 
 
2
  "added_tokens_decoder": {
3
- "0": {
4
- "content": "<pad>",
5
  "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "<s>",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "</s>",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "<unk>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "8000": {
36
- "content": "[PAD]",
37
- "lstrip": false,
38
- "normalized": false,
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
42
  }
43
  },
 
44
  "clean_up_tokenization_spaces": false,
 
 
45
  "extra_special_tokens": {},
46
  "model_max_length": 1000000000000000019884624838656,
47
- "pad_token": "[PAD]",
48
- "tokenizer_class": "PreTrainedTokenizerFast"
 
49
  }
 
1
  {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
  "added_tokens_decoder": {
5
+ "437": {
6
+ "content": "<|endoftext|>",
7
  "lstrip": false,
8
+ "normalized": true,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  "rstrip": false,
10
  "single_word": false,
11
  "special": true
12
  }
13
  },
14
+ "bos_token": "<|endoftext|>",
15
  "clean_up_tokenization_spaces": false,
16
+ "eos_token": "<|endoftext|>",
17
+ "errors": "replace",
18
  "extra_special_tokens": {},
19
  "model_max_length": 1000000000000000019884624838656,
20
+ "pad_token": null,
21
+ "tokenizer_class": "GPT2Tokenizer",
22
+ "unk_token": "<|endoftext|>"
23
  }
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<s>":0,"<pad>":1,"</s>":2,"<unk>":3,"<mask>":4,"!":5,"\"":6,"#":7,"$":8,"%":9,"&":10,"'":11,"(":12,")":13,"*":14,"+":15,",":16,"-":17,".":18,"/":19,"0":20,"1":21,"2":22,"3":23,"4":24,"5":25,"6":26,"7":27,"8":28,"9":29,":":30,";":31,"<":32,"=":33,">":34,"?":35,"@":36,"A":37,"B":38,"C":39,"D":40,"E":41,"F":42,"G":43,"H":44,"I":45,"J":46,"K":47,"L":48,"M":49,"N":50,"O":51,"P":52,"Q":53,"R":54,"S":55,"T":56,"U":57,"V":58,"W":59,"X":60,"Y":61,"Z":62,"[":63,"\\":64,"]":65,"^":66,"_":67,"`":68,"a":69,"b":70,"c":71,"d":72,"e":73,"f":74,"g":75,"h":76,"i":77,"j":78,"k":79,"l":80,"m":81,"n":82,"o":83,"p":84,"q":85,"r":86,"s":87,"t":88,"u":89,"v":90,"w":91,"x":92,"y":93,"z":94,"{":95,"|":96,"}":97,"~":98,"¡":99,"¢":100,"£":101,"¤":102,"¥":103,"¦":104,"§":105,"¨":106,"©":107,"ª":108,"«":109,"¬":110,"®":111,"¯":112,"°":113,"±":114,"²":115,"³":116,"´":117,"µ":118,"¶":119,"·":120,"¸":121,"¹":122,"º":123,"»":124,"¼":125,"½":126,"¾":127,"¿":128,"À":129,"Á":130,"Â":131,"Ã":132,"Ä":133,"Å":134,"Æ":135,"Ç":136,"È":137,"É":138,"Ê":139,"Ë":140,"Ì":141,"Í":142,"Î":143,"Ï":144,"Ð":145,"Ñ":146,"Ò":147,"Ó":148,"Ô":149,"Õ":150,"Ö":151,"×":152,"Ø":153,"Ù":154,"Ú":155,"Û":156,"Ü":157,"Ý":158,"Þ":159,"ß":160,"à":161,"á":162,"â":163,"ã":164,"ä":165,"å":166,"æ":167,"ç":168,"è":169,"é":170,"ê":171,"ë":172,"ì":173,"í":174,"î":175,"ï":176,"ð":177,"ñ":178,"ò":179,"ó":180,"ô":181,"õ":182,"ö":183,"÷":184,"ø":185,"ù":186,"ú":187,"û":188,"ü":189,"ý":190,"þ":191,"ÿ":192,"Ā":193,"ā":194,"Ă":195,"ă":196,"Ą":197,"ą":198,"Ć":199,"ć":200,"Ĉ":201,"ĉ":202,"Ċ":203,"ċ":204,"Č":205,"č":206,"Ď":207,"ď":208,"Đ":209,"đ":210,"Ē":211,"ē":212,"Ĕ":213,"ĕ":214,"Ė":215,"ė":216,"Ę":217,"ę":218,"Ě":219,"ě":220,"Ĝ":221,"ĝ":222,"Ğ":223,"ğ":224,"Ġ":225,"ġ":226,"Ģ":227,"ģ":228,"Ĥ":229,"ĥ":230,"Ħ":231,"ħ":232,"Ĩ":233,"ĩ":234,"Ī":235,"ī":236,"Ĭ":237,"ĭ":238,"Į":239,"į":240,"İ":241,"ı":242,"Ĳ":243,"ĳ":244,"Ĵ":245,"ĵ":246,"Ķ":247,"ķ":248,"ĸ":249,"Ĺ":250,"ĺ":251,"Ļ":252,"ļ":253,"Ľ":254,"ľ":255,"Ŀ":256,"ŀ":257,"Ł":258,"ł":259,"Ń":260,"Þ¦":261,"ĠÞ":262,"Þ°":263,"Þ¬":264,"Þª":265,"ÞĤ":266,"Þ¨":267,"Þĩ":268,"Þ§":269,"Þĥ":270,"Þī":271,"ÞĨ":272,"ÞĪ":273,"ĠÞĩ":274,"Þİ":275,"Þ©":276,"ÞĮ":277,"ÞĢ":278,"Þģ":279,"Þĭ":280,"ÞĲ":281,"Þį":282,"ÞŃ":283,"Þ®":284,"ÞĬ":285,"ĠÞĢ":286,"ĠÞī":287,"ĠÞĨ":288,"ÞĦ":289,"ĠÞĦ":290,"ĠÞĪ":291,"ĠÞĭ":292,"ĠÞĬ":293,"ĠÞĤ":294,"Þħ":295,"ÞĶ":296,"Þĳ":297,"Þĵ":298,"ĠÞİ":299,"Þ¯":300,"Þ«":301,"Þĸ":302,"ĠÞĮ":303,"ĠÞĥ":304,"ĠÞĲ":305,"ĠÞį":306,"ÞĴ":307,"ÞĤÞİ":308,"ÞĿ":309,"ÞĤÞĳ":310,"Þķ":311,"ĠÞĸ":312,"ÞĹ":313,"ØŁ":314,"ÞĤÞĭ":315,"ĠÞĴ":316,"Þ¢":317,"Þ¤":318,"ÞĻ":319,"ĠÞĻ":320,"ĠÞ¢":321,"ĠÞĿ":322,"Þ¦ØŁ":323,"ĠÞļ":324,"ĠÞĵ":325,"Þŀ":326,"ĠÞĶ":327,"ĠÞĳ":328,"ĠÞķ":329,"ĠÞ¤":330,"Þļ":331,"ÞĤÞĦ":332,"ï·":333,"Þ¬ØŁ":334,"Þı":335,"ï·²":336,"Þł":337,"Þ°ØŁ":338,"ĠÞŀ":339,"ØĮ":340,"ÞŁ":341,"Þ°.":342,"Þ¬!":343,"Þĺ":344,"Ġï·²":345,"Þ©.":346,"ÞĥÞĪ":347,"ÞĥÞĵ":348,"ĠÞĹ":349,"Þ©ØŁ":350,"Þ§ØŁ":351,"Þ£":352,"Þ¦Þ¦":353,"âĢ":354,"ĠÞħ":355,"Þ¬Þ¬":356,"ÞĤÞĩ":357,"Þ¨.":358,"ÞĥÞĨ":359,"ÞĥÞİ":360,"Þ¯ØŁ":361,"Ġ-":362,"Ġï·":363,"Ġï·º":364,"ÞĽ":365,"ĠÞŁ":366,"ĠÞł":367,"Þ°Þ°":368,"Þ°ØĮ":369,"ÞªÞª":370,"ÞªØŁ":371,"Þ¨ØĮ":372,"ÞĥÞĮ":373,"ÞĥÞĳ":374,"ÞĥÞķ":375,"ÞŃ!":376,"âĢĺ":377,"ÞªØĮ":378,"ĠâĢĺ":379,"Þ¦Þ¨":380,"ĠÞĺ":381,"Þ¬.":382,"Þ¬Þ°":383,"ÞĩÞĩ":384,"Þ§Þ§":385,"ÞĥÞĤ":386,"ÞĥÞĦ":387,"ÞİÞĩ":388,"âĢĻ":389,"Þ¥":390,"ÞĥÞī":391,"ĠÞ£":392,"Þ¡":393,"Ġ;":394,"ĠØŁ":395,"Þ¦!":396,"Þ¦Þª":397,"Þ¦Þ§":398,"Þ°!":399,"Þ°âĢĻ":400,"ÞªÞ¦":401,"ÞĤÞĤ":402,"ÞĤÞĨ":403,"ÞĤÞĢ":404,"ÞĤÞģ":405,"ÞĤÞĸ":406,"Þ¨-":407,"Þ¨Þ©":408,"ÞĩÞĥ":409,"Þ§Þ°":410,"Þ§ØĮ":411,"ÞĥÞĲ":412,"ÞĥÞį":413,"ÞĥÞĬ":414,"ÞĥÞĴ":415,"ÞĥÞĹ":416,"ÞīÞĪ":417,"ÞĪÞĩ":418,"ÞİÞĥ":419,"ÞİÞİ":420,"Þ©:":421,"Þ©;":422,"Þ©ØĮ":423,"Þ©âĢĻ":424,"ÞĢÞĩ":425,"ÞĢÞĭ":426,"ÞŃ.":427,"ÞŃØŁ":428,"ÞŃØĮ":429,"Þ®Þ¦":430,"ĠÞĤÞĳ":431,"Þ¯!":432,"Þ¯.":433,"ĠÞĲÞĵ":434,"ÞıÞģ":435,"ï·²Þİ":436}