cydhsieh01 committed
Commit
56df21f
1 Parent(s): be285a6

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,428 @@
+ {
+   "<im_col>": 152067,
+   "<im_end>": 152065,
+   "<im_patch>": 152066,
+   "<im_start>": 152064,
+   "<|endoftext|>": 151643,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image|>": 152068,
+   "|<EXTRA_TOKENS_0>|": 151646,
+   "|<EXTRA_TOKENS_100>|": 151746,
+   "|<EXTRA_TOKENS_101>|": 151747,
+   "|<EXTRA_TOKENS_102>|": 151748,
+   "|<EXTRA_TOKENS_103>|": 151749,
+   "|<EXTRA_TOKENS_104>|": 151750,
+   "|<EXTRA_TOKENS_105>|": 151751,
+   "|<EXTRA_TOKENS_106>|": 151752,
+   "|<EXTRA_TOKENS_107>|": 151753,
+   "|<EXTRA_TOKENS_108>|": 151754,
+   "|<EXTRA_TOKENS_109>|": 151755,
+   "|<EXTRA_TOKENS_10>|": 151656,
+   "|<EXTRA_TOKENS_110>|": 151756,
+   "|<EXTRA_TOKENS_111>|": 151757,
+   "|<EXTRA_TOKENS_112>|": 151758,
+   "|<EXTRA_TOKENS_113>|": 151759,
+   "|<EXTRA_TOKENS_114>|": 151760,
+   "|<EXTRA_TOKENS_115>|": 151761,
+   "|<EXTRA_TOKENS_116>|": 151762,
+   "|<EXTRA_TOKENS_117>|": 151763,
+   "|<EXTRA_TOKENS_118>|": 151764,
+   "|<EXTRA_TOKENS_119>|": 151765,
+   "|<EXTRA_TOKENS_11>|": 151657,
+   "|<EXTRA_TOKENS_120>|": 151766,
+   "|<EXTRA_TOKENS_121>|": 151767,
+   "|<EXTRA_TOKENS_122>|": 151768,
+   "|<EXTRA_TOKENS_123>|": 151769,
+   "|<EXTRA_TOKENS_124>|": 151770,
+   "|<EXTRA_TOKENS_125>|": 151771,
+   "|<EXTRA_TOKENS_126>|": 151772,
+   "|<EXTRA_TOKENS_127>|": 151773,
+   "|<EXTRA_TOKENS_128>|": 151774,
+   "|<EXTRA_TOKENS_129>|": 151775,
+   "|<EXTRA_TOKENS_12>|": 151658,
+   "|<EXTRA_TOKENS_130>|": 151776,
+   "|<EXTRA_TOKENS_131>|": 151777,
+   "|<EXTRA_TOKENS_132>|": 151778,
+   "|<EXTRA_TOKENS_133>|": 151779,
+   "|<EXTRA_TOKENS_134>|": 151780,
+   "|<EXTRA_TOKENS_135>|": 151781,
+   "|<EXTRA_TOKENS_136>|": 151782,
+   "|<EXTRA_TOKENS_137>|": 151783,
+   "|<EXTRA_TOKENS_138>|": 151784,
+   "|<EXTRA_TOKENS_139>|": 151785,
+   "|<EXTRA_TOKENS_13>|": 151659,
+   "|<EXTRA_TOKENS_140>|": 151786,
+   "|<EXTRA_TOKENS_141>|": 151787,
+   "|<EXTRA_TOKENS_142>|": 151788,
+   "|<EXTRA_TOKENS_143>|": 151789,
+   "|<EXTRA_TOKENS_144>|": 151790,
+   "|<EXTRA_TOKENS_145>|": 151791,
+   "|<EXTRA_TOKENS_146>|": 151792,
+   "|<EXTRA_TOKENS_147>|": 151793,
+   "|<EXTRA_TOKENS_148>|": 151794,
+   "|<EXTRA_TOKENS_149>|": 151795,
+   "|<EXTRA_TOKENS_14>|": 151660,
+   "|<EXTRA_TOKENS_150>|": 151796,
+   "|<EXTRA_TOKENS_151>|": 151797,
+   "|<EXTRA_TOKENS_152>|": 151798,
+   "|<EXTRA_TOKENS_153>|": 151799,
+   "|<EXTRA_TOKENS_154>|": 151800,
+   "|<EXTRA_TOKENS_155>|": 151801,
+   "|<EXTRA_TOKENS_156>|": 151802,
+   "|<EXTRA_TOKENS_157>|": 151803,
+   "|<EXTRA_TOKENS_158>|": 151804,
+   "|<EXTRA_TOKENS_159>|": 151805,
+   "|<EXTRA_TOKENS_15>|": 151661,
+   "|<EXTRA_TOKENS_160>|": 151806,
+   "|<EXTRA_TOKENS_161>|": 151807,
+   "|<EXTRA_TOKENS_162>|": 151808,
+   "|<EXTRA_TOKENS_163>|": 151809,
+   "|<EXTRA_TOKENS_164>|": 151810,
+   "|<EXTRA_TOKENS_165>|": 151811,
+   "|<EXTRA_TOKENS_166>|": 151812,
+   "|<EXTRA_TOKENS_167>|": 151813,
+   "|<EXTRA_TOKENS_168>|": 151814,
+   "|<EXTRA_TOKENS_169>|": 151815,
+   "|<EXTRA_TOKENS_16>|": 151662,
+   "|<EXTRA_TOKENS_170>|": 151816,
+   "|<EXTRA_TOKENS_171>|": 151817,
+   "|<EXTRA_TOKENS_172>|": 151818,
+   "|<EXTRA_TOKENS_173>|": 151819,
+   "|<EXTRA_TOKENS_174>|": 151820,
+   "|<EXTRA_TOKENS_175>|": 151821,
+   "|<EXTRA_TOKENS_176>|": 151822,
+   "|<EXTRA_TOKENS_177>|": 151823,
+   "|<EXTRA_TOKENS_178>|": 151824,
+   "|<EXTRA_TOKENS_179>|": 151825,
+   "|<EXTRA_TOKENS_17>|": 151663,
+   "|<EXTRA_TOKENS_180>|": 151826,
+   "|<EXTRA_TOKENS_181>|": 151827,
+   "|<EXTRA_TOKENS_182>|": 151828,
+   "|<EXTRA_TOKENS_183>|": 151829,
+   "|<EXTRA_TOKENS_184>|": 151830,
+   "|<EXTRA_TOKENS_185>|": 151831,
+   "|<EXTRA_TOKENS_186>|": 151832,
+   "|<EXTRA_TOKENS_187>|": 151833,
+   "|<EXTRA_TOKENS_188>|": 151834,
+   "|<EXTRA_TOKENS_189>|": 151835,
+   "|<EXTRA_TOKENS_18>|": 151664,
+   "|<EXTRA_TOKENS_190>|": 151836,
+   "|<EXTRA_TOKENS_191>|": 151837,
+   "|<EXTRA_TOKENS_192>|": 151838,
+   "|<EXTRA_TOKENS_193>|": 151839,
+   "|<EXTRA_TOKENS_194>|": 151840,
+   "|<EXTRA_TOKENS_195>|": 151841,
+   "|<EXTRA_TOKENS_196>|": 151842,
+   "|<EXTRA_TOKENS_197>|": 151843,
+   "|<EXTRA_TOKENS_198>|": 151844,
+   "|<EXTRA_TOKENS_199>|": 151845,
+   "|<EXTRA_TOKENS_19>|": 151665,
+   "|<EXTRA_TOKENS_1>|": 151647,
+   "|<EXTRA_TOKENS_200>|": 151846,
+   "|<EXTRA_TOKENS_201>|": 151847,
+   "|<EXTRA_TOKENS_202>|": 151848,
+   "|<EXTRA_TOKENS_203>|": 151849,
+   "|<EXTRA_TOKENS_204>|": 151850,
+   "|<EXTRA_TOKENS_205>|": 151851,
+   "|<EXTRA_TOKENS_206>|": 151852,
+   "|<EXTRA_TOKENS_207>|": 151853,
+   "|<EXTRA_TOKENS_208>|": 151854,
+   "|<EXTRA_TOKENS_209>|": 151855,
+   "|<EXTRA_TOKENS_20>|": 151666,
+   "|<EXTRA_TOKENS_210>|": 151856,
+   "|<EXTRA_TOKENS_211>|": 151857,
+   "|<EXTRA_TOKENS_212>|": 151858,
+   "|<EXTRA_TOKENS_213>|": 151859,
+   "|<EXTRA_TOKENS_214>|": 151860,
+   "|<EXTRA_TOKENS_215>|": 151861,
+   "|<EXTRA_TOKENS_216>|": 151862,
+   "|<EXTRA_TOKENS_217>|": 151863,
+   "|<EXTRA_TOKENS_218>|": 151864,
+   "|<EXTRA_TOKENS_219>|": 151865,
+   "|<EXTRA_TOKENS_21>|": 151667,
+   "|<EXTRA_TOKENS_220>|": 151866,
+   "|<EXTRA_TOKENS_221>|": 151867,
+   "|<EXTRA_TOKENS_222>|": 151868,
+   "|<EXTRA_TOKENS_223>|": 151869,
+   "|<EXTRA_TOKENS_224>|": 151870,
+   "|<EXTRA_TOKENS_225>|": 151871,
+   "|<EXTRA_TOKENS_226>|": 151872,
+   "|<EXTRA_TOKENS_227>|": 151873,
+   "|<EXTRA_TOKENS_228>|": 151874,
+   "|<EXTRA_TOKENS_229>|": 151875,
+   "|<EXTRA_TOKENS_22>|": 151668,
+   "|<EXTRA_TOKENS_230>|": 151876,
+   "|<EXTRA_TOKENS_231>|": 151877,
+   "|<EXTRA_TOKENS_232>|": 151878,
+   "|<EXTRA_TOKENS_233>|": 151879,
+   "|<EXTRA_TOKENS_234>|": 151880,
+   "|<EXTRA_TOKENS_235>|": 151881,
+   "|<EXTRA_TOKENS_236>|": 151882,
+   "|<EXTRA_TOKENS_237>|": 151883,
+   "|<EXTRA_TOKENS_238>|": 151884,
+   "|<EXTRA_TOKENS_239>|": 151885,
+   "|<EXTRA_TOKENS_23>|": 151669,
+   "|<EXTRA_TOKENS_240>|": 151886,
+   "|<EXTRA_TOKENS_241>|": 151887,
+   "|<EXTRA_TOKENS_242>|": 151888,
+   "|<EXTRA_TOKENS_243>|": 151889,
+   "|<EXTRA_TOKENS_244>|": 151890,
+   "|<EXTRA_TOKENS_245>|": 151891,
+   "|<EXTRA_TOKENS_246>|": 151892,
+   "|<EXTRA_TOKENS_247>|": 151893,
+   "|<EXTRA_TOKENS_248>|": 151894,
+   "|<EXTRA_TOKENS_249>|": 151895,
+   "|<EXTRA_TOKENS_24>|": 151670,
+   "|<EXTRA_TOKENS_250>|": 151896,
+   "|<EXTRA_TOKENS_251>|": 151897,
+   "|<EXTRA_TOKENS_252>|": 151898,
+   "|<EXTRA_TOKENS_253>|": 151899,
+   "|<EXTRA_TOKENS_254>|": 151900,
+   "|<EXTRA_TOKENS_255>|": 151901,
+   "|<EXTRA_TOKENS_256>|": 151902,
+   "|<EXTRA_TOKENS_257>|": 151903,
+   "|<EXTRA_TOKENS_258>|": 151904,
+   "|<EXTRA_TOKENS_259>|": 151905,
+   "|<EXTRA_TOKENS_25>|": 151671,
+   "|<EXTRA_TOKENS_260>|": 151906,
+   "|<EXTRA_TOKENS_261>|": 151907,
+   "|<EXTRA_TOKENS_262>|": 151908,
+   "|<EXTRA_TOKENS_263>|": 151909,
+   "|<EXTRA_TOKENS_264>|": 151910,
+   "|<EXTRA_TOKENS_265>|": 151911,
+   "|<EXTRA_TOKENS_266>|": 151912,
+   "|<EXTRA_TOKENS_267>|": 151913,
+   "|<EXTRA_TOKENS_268>|": 151914,
+   "|<EXTRA_TOKENS_269>|": 151915,
+   "|<EXTRA_TOKENS_26>|": 151672,
+   "|<EXTRA_TOKENS_270>|": 151916,
+   "|<EXTRA_TOKENS_271>|": 151917,
+   "|<EXTRA_TOKENS_272>|": 151918,
+   "|<EXTRA_TOKENS_273>|": 151919,
+   "|<EXTRA_TOKENS_274>|": 151920,
+   "|<EXTRA_TOKENS_275>|": 151921,
+   "|<EXTRA_TOKENS_276>|": 151922,
+   "|<EXTRA_TOKENS_277>|": 151923,
+   "|<EXTRA_TOKENS_278>|": 151924,
+   "|<EXTRA_TOKENS_279>|": 151925,
+   "|<EXTRA_TOKENS_27>|": 151673,
+   "|<EXTRA_TOKENS_280>|": 151926,
+   "|<EXTRA_TOKENS_281>|": 151927,
+   "|<EXTRA_TOKENS_282>|": 151928,
+   "|<EXTRA_TOKENS_283>|": 151929,
+   "|<EXTRA_TOKENS_284>|": 151930,
+   "|<EXTRA_TOKENS_285>|": 151931,
+   "|<EXTRA_TOKENS_286>|": 151932,
+   "|<EXTRA_TOKENS_287>|": 151933,
+   "|<EXTRA_TOKENS_288>|": 151934,
+   "|<EXTRA_TOKENS_289>|": 151935,
+   "|<EXTRA_TOKENS_28>|": 151674,
+   "|<EXTRA_TOKENS_290>|": 151936,
+   "|<EXTRA_TOKENS_291>|": 151937,
+   "|<EXTRA_TOKENS_292>|": 151938,
+   "|<EXTRA_TOKENS_293>|": 151939,
+   "|<EXTRA_TOKENS_294>|": 151940,
+   "|<EXTRA_TOKENS_295>|": 151941,
+   "|<EXTRA_TOKENS_296>|": 151942,
+   "|<EXTRA_TOKENS_297>|": 151943,
+   "|<EXTRA_TOKENS_298>|": 151944,
+   "|<EXTRA_TOKENS_299>|": 151945,
+   "|<EXTRA_TOKENS_29>|": 151675,
+   "|<EXTRA_TOKENS_2>|": 151648,
+   "|<EXTRA_TOKENS_300>|": 151946,
+   "|<EXTRA_TOKENS_301>|": 151947,
+   "|<EXTRA_TOKENS_302>|": 151948,
+   "|<EXTRA_TOKENS_303>|": 151949,
+   "|<EXTRA_TOKENS_304>|": 151950,
+   "|<EXTRA_TOKENS_305>|": 151951,
+   "|<EXTRA_TOKENS_306>|": 151952,
+   "|<EXTRA_TOKENS_307>|": 151953,
+   "|<EXTRA_TOKENS_308>|": 151954,
+   "|<EXTRA_TOKENS_309>|": 151955,
+   "|<EXTRA_TOKENS_30>|": 151676,
+   "|<EXTRA_TOKENS_310>|": 151956,
+   "|<EXTRA_TOKENS_311>|": 151957,
+   "|<EXTRA_TOKENS_312>|": 151958,
+   "|<EXTRA_TOKENS_313>|": 151959,
+   "|<EXTRA_TOKENS_314>|": 151960,
+   "|<EXTRA_TOKENS_315>|": 151961,
+   "|<EXTRA_TOKENS_316>|": 151962,
+   "|<EXTRA_TOKENS_317>|": 151963,
+   "|<EXTRA_TOKENS_318>|": 151964,
+   "|<EXTRA_TOKENS_319>|": 151965,
+   "|<EXTRA_TOKENS_31>|": 151677,
+   "|<EXTRA_TOKENS_320>|": 151966,
+   "|<EXTRA_TOKENS_321>|": 151967,
+   "|<EXTRA_TOKENS_322>|": 151968,
+   "|<EXTRA_TOKENS_323>|": 151969,
+   "|<EXTRA_TOKENS_324>|": 151970,
+   "|<EXTRA_TOKENS_325>|": 151971,
+   "|<EXTRA_TOKENS_326>|": 151972,
+   "|<EXTRA_TOKENS_327>|": 151973,
+   "|<EXTRA_TOKENS_328>|": 151974,
+   "|<EXTRA_TOKENS_329>|": 151975,
+   "|<EXTRA_TOKENS_32>|": 151678,
+   "|<EXTRA_TOKENS_330>|": 151976,
+   "|<EXTRA_TOKENS_331>|": 151977,
+   "|<EXTRA_TOKENS_332>|": 151978,
+   "|<EXTRA_TOKENS_333>|": 151979,
+   "|<EXTRA_TOKENS_334>|": 151980,
+   "|<EXTRA_TOKENS_335>|": 151981,
+   "|<EXTRA_TOKENS_336>|": 151982,
+   "|<EXTRA_TOKENS_337>|": 151983,
+   "|<EXTRA_TOKENS_338>|": 151984,
+   "|<EXTRA_TOKENS_339>|": 151985,
+   "|<EXTRA_TOKENS_33>|": 151679,
+   "|<EXTRA_TOKENS_340>|": 151986,
+   "|<EXTRA_TOKENS_341>|": 151987,
+   "|<EXTRA_TOKENS_342>|": 151988,
+   "|<EXTRA_TOKENS_343>|": 151989,
+   "|<EXTRA_TOKENS_344>|": 151990,
+   "|<EXTRA_TOKENS_345>|": 151991,
+   "|<EXTRA_TOKENS_346>|": 151992,
+   "|<EXTRA_TOKENS_347>|": 151993,
+   "|<EXTRA_TOKENS_348>|": 151994,
+   "|<EXTRA_TOKENS_349>|": 151995,
+   "|<EXTRA_TOKENS_34>|": 151680,
+   "|<EXTRA_TOKENS_350>|": 151996,
+   "|<EXTRA_TOKENS_351>|": 151997,
+   "|<EXTRA_TOKENS_352>|": 151998,
+   "|<EXTRA_TOKENS_353>|": 151999,
+   "|<EXTRA_TOKENS_354>|": 152000,
+   "|<EXTRA_TOKENS_355>|": 152001,
+   "|<EXTRA_TOKENS_356>|": 152002,
+   "|<EXTRA_TOKENS_357>|": 152003,
+   "|<EXTRA_TOKENS_358>|": 152004,
+   "|<EXTRA_TOKENS_359>|": 152005,
+   "|<EXTRA_TOKENS_35>|": 151681,
+   "|<EXTRA_TOKENS_360>|": 152006,
+   "|<EXTRA_TOKENS_361>|": 152007,
+   "|<EXTRA_TOKENS_362>|": 152008,
+   "|<EXTRA_TOKENS_363>|": 152009,
+   "|<EXTRA_TOKENS_364>|": 152010,
+   "|<EXTRA_TOKENS_365>|": 152011,
+   "|<EXTRA_TOKENS_366>|": 152012,
+   "|<EXTRA_TOKENS_367>|": 152013,
+   "|<EXTRA_TOKENS_368>|": 152014,
+   "|<EXTRA_TOKENS_369>|": 152015,
+   "|<EXTRA_TOKENS_36>|": 151682,
+   "|<EXTRA_TOKENS_370>|": 152016,
+   "|<EXTRA_TOKENS_371>|": 152017,
+   "|<EXTRA_TOKENS_372>|": 152018,
+   "|<EXTRA_TOKENS_373>|": 152019,
+   "|<EXTRA_TOKENS_374>|": 152020,
+   "|<EXTRA_TOKENS_375>|": 152021,
+   "|<EXTRA_TOKENS_376>|": 152022,
+   "|<EXTRA_TOKENS_377>|": 152023,
+   "|<EXTRA_TOKENS_378>|": 152024,
+   "|<EXTRA_TOKENS_379>|": 152025,
+   "|<EXTRA_TOKENS_37>|": 151683,
+   "|<EXTRA_TOKENS_380>|": 152026,
+   "|<EXTRA_TOKENS_381>|": 152027,
+   "|<EXTRA_TOKENS_382>|": 152028,
+   "|<EXTRA_TOKENS_383>|": 152029,
+   "|<EXTRA_TOKENS_384>|": 152030,
+   "|<EXTRA_TOKENS_385>|": 152031,
+   "|<EXTRA_TOKENS_386>|": 152032,
+   "|<EXTRA_TOKENS_387>|": 152033,
+   "|<EXTRA_TOKENS_388>|": 152034,
+   "|<EXTRA_TOKENS_389>|": 152035,
+   "|<EXTRA_TOKENS_38>|": 151684,
+   "|<EXTRA_TOKENS_390>|": 152036,
+   "|<EXTRA_TOKENS_391>|": 152037,
+   "|<EXTRA_TOKENS_392>|": 152038,
+   "|<EXTRA_TOKENS_393>|": 152039,
+   "|<EXTRA_TOKENS_394>|": 152040,
+   "|<EXTRA_TOKENS_395>|": 152041,
+   "|<EXTRA_TOKENS_396>|": 152042,
+   "|<EXTRA_TOKENS_397>|": 152043,
+   "|<EXTRA_TOKENS_398>|": 152044,
+   "|<EXTRA_TOKENS_399>|": 152045,
+   "|<EXTRA_TOKENS_39>|": 151685,
+   "|<EXTRA_TOKENS_3>|": 151649,
+   "|<EXTRA_TOKENS_400>|": 152046,
+   "|<EXTRA_TOKENS_401>|": 152047,
+   "|<EXTRA_TOKENS_402>|": 152048,
+   "|<EXTRA_TOKENS_403>|": 152049,
+   "|<EXTRA_TOKENS_404>|": 152050,
+   "|<EXTRA_TOKENS_405>|": 152051,
+   "|<EXTRA_TOKENS_406>|": 152052,
+   "|<EXTRA_TOKENS_407>|": 152053,
+   "|<EXTRA_TOKENS_408>|": 152054,
+   "|<EXTRA_TOKENS_409>|": 152055,
+   "|<EXTRA_TOKENS_40>|": 151686,
+   "|<EXTRA_TOKENS_410>|": 152056,
+   "|<EXTRA_TOKENS_411>|": 152057,
+   "|<EXTRA_TOKENS_412>|": 152058,
+   "|<EXTRA_TOKENS_413>|": 152059,
+   "|<EXTRA_TOKENS_414>|": 152060,
+   "|<EXTRA_TOKENS_415>|": 152061,
+   "|<EXTRA_TOKENS_416>|": 152062,
+   "|<EXTRA_TOKENS_417>|": 152063,
+   "|<EXTRA_TOKENS_41>|": 151687,
+   "|<EXTRA_TOKENS_42>|": 151688,
+   "|<EXTRA_TOKENS_43>|": 151689,
+   "|<EXTRA_TOKENS_44>|": 151690,
+   "|<EXTRA_TOKENS_45>|": 151691,
+   "|<EXTRA_TOKENS_46>|": 151692,
+   "|<EXTRA_TOKENS_47>|": 151693,
+   "|<EXTRA_TOKENS_48>|": 151694,
+   "|<EXTRA_TOKENS_49>|": 151695,
+   "|<EXTRA_TOKENS_4>|": 151650,
+   "|<EXTRA_TOKENS_50>|": 151696,
+   "|<EXTRA_TOKENS_51>|": 151697,
+   "|<EXTRA_TOKENS_52>|": 151698,
+   "|<EXTRA_TOKENS_53>|": 151699,
+   "|<EXTRA_TOKENS_54>|": 151700,
+   "|<EXTRA_TOKENS_55>|": 151701,
+   "|<EXTRA_TOKENS_56>|": 151702,
+   "|<EXTRA_TOKENS_57>|": 151703,
+   "|<EXTRA_TOKENS_58>|": 151704,
+   "|<EXTRA_TOKENS_59>|": 151705,
+   "|<EXTRA_TOKENS_5>|": 151651,
+   "|<EXTRA_TOKENS_60>|": 151706,
+   "|<EXTRA_TOKENS_61>|": 151707,
+   "|<EXTRA_TOKENS_62>|": 151708,
+   "|<EXTRA_TOKENS_63>|": 151709,
+   "|<EXTRA_TOKENS_64>|": 151710,
+   "|<EXTRA_TOKENS_65>|": 151711,
+   "|<EXTRA_TOKENS_66>|": 151712,
+   "|<EXTRA_TOKENS_67>|": 151713,
+   "|<EXTRA_TOKENS_68>|": 151714,
+   "|<EXTRA_TOKENS_69>|": 151715,
+   "|<EXTRA_TOKENS_6>|": 151652,
+   "|<EXTRA_TOKENS_70>|": 151716,
+   "|<EXTRA_TOKENS_71>|": 151717,
+   "|<EXTRA_TOKENS_72>|": 151718,
+   "|<EXTRA_TOKENS_73>|": 151719,
+   "|<EXTRA_TOKENS_74>|": 151720,
+   "|<EXTRA_TOKENS_75>|": 151721,
+   "|<EXTRA_TOKENS_76>|": 151722,
+   "|<EXTRA_TOKENS_77>|": 151723,
+   "|<EXTRA_TOKENS_78>|": 151724,
+   "|<EXTRA_TOKENS_79>|": 151725,
+   "|<EXTRA_TOKENS_7>|": 151653,
+   "|<EXTRA_TOKENS_80>|": 151726,
+   "|<EXTRA_TOKENS_81>|": 151727,
+   "|<EXTRA_TOKENS_82>|": 151728,
+   "|<EXTRA_TOKENS_83>|": 151729,
+   "|<EXTRA_TOKENS_84>|": 151730,
+   "|<EXTRA_TOKENS_85>|": 151731,
+   "|<EXTRA_TOKENS_86>|": 151732,
+   "|<EXTRA_TOKENS_87>|": 151733,
+   "|<EXTRA_TOKENS_88>|": 151734,
+   "|<EXTRA_TOKENS_89>|": 151735,
+   "|<EXTRA_TOKENS_8>|": 151654,
+   "|<EXTRA_TOKENS_90>|": 151736,
+   "|<EXTRA_TOKENS_91>|": 151737,
+   "|<EXTRA_TOKENS_92>|": 151738,
+   "|<EXTRA_TOKENS_93>|": 151739,
+   "|<EXTRA_TOKENS_94>|": 151740,
+   "|<EXTRA_TOKENS_95>|": 151741,
+   "|<EXTRA_TOKENS_96>|": 151742,
+   "|<EXTRA_TOKENS_97>|": 151743,
+   "|<EXTRA_TOKENS_98>|": 151744,
+   "|<EXTRA_TOKENS_99>|": 151745,
+   "|<EXTRA_TOKENS_9>|": 151655
+ }
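The map above extends Qwen2's base vocabulary: ids 152064-152068 are the Molmo image special tokens, and the |<EXTRA_TOKENS_n>| slots follow the pattern id = 151646 + n. A minimal sketch of checking this against the tokenizer shipped in this repo (the "." path assumes a local clone; substitute the Hub repo id):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
print(tok.convert_tokens_to_ids("<im_start>"))           # 152064
print(tok.convert_tokens_to_ids("<im_patch>"))           # 152066
print(tok.convert_tokens_to_ids("|<EXTRA_TOKENS_42>|"))  # 151646 + 42 = 151688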
config.json ADDED
@@ -0,0 +1,55 @@
+ {
+   "activation_type": "silu",
+   "additional_vocab_size": 128,
+   "architectures": [
+     "MolmoForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "attention_type": "sdpa",
+   "auto_map": {
+     "AutoConfig": "config_molmo.MolmoConfig",
+     "AutoModelForCausalLM": "modeling_molmo.MolmoForCausalLM"
+   },
+   "bias_for_layer_norm": false,
+   "clip_qkv": null,
+   "embedding_dropout": 0.0,
+   "embedding_size": 152064,
+   "float32_attention": true,
+   "hidden_size": 3584,
+   "image_feature_dropout": 0.0,
+   "image_padding_embed": "pad_and_partial_pad",
+   "initializer_range": 0.02,
+   "intermediate_size": 37888,
+   "layer_norm_eps": 1e-06,
+   "layer_norm_type": "rms",
+   "max_position_embeddings": 4096,
+   "model_type": "molmo",
+   "moe_num_experts": 0,
+   "moe_top_k": 2,
+   "norm_after": false,
+   "normalize_input_embeds": false,
+   "num_attention_heads": 28,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "qk_layer_norm": false,
+   "qkv_bias": true,
+   "residual_dropout": 0.0,
+   "rope_theta": 1000000.0,
+   "scale_logits": false,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.45.2",
+   "use_cache": true,
+   "use_position_ids": true,
+   "vision_config": {
+     "attention_dropout": 0.0,
+     "initializer_range": 0.02,
+     "model_type": ""
+   },
+   "vit_layers": [
+     -2,
+     -9
+   ],
+   "vocab_size": 152064,
+   "weight_tying": false
+ }
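Because auto_map wires AutoConfig and AutoModelForCausalLM to the custom config_molmo.MolmoConfig and modeling_molmo.MolmoForCausalLM classes uploaded alongside the weights, loading this checkpoint goes through trust_remote_code. A minimal sketch (the repo id is a placeholder for wherever this folder was uploaded):

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "user/repo",                # placeholder: the Hub id of this upload
    trust_remote_code=True,     # required because of auto_map in config.json
    torch_dtype=torch.float32,  # matches "torch_dtype" above
)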
config.yaml ADDED
@@ -0,0 +1,337 @@
+ run_name: multitask_train
+ seed: 6198
+ epoch: null
+ dry_run: false
+ model:
+   d_model: 3584
+   n_heads: 28
+   n_kv_heads: 4
+   qkv_bias: true
+   clip_qkv: null
+   n_layers: 28
+   mlp_ratio: 4
+   mlp_hidden_size: 37888
+   activation_type: swiglu
+   block_type: sequential
+   block_group_size: 1
+   alibi: false
+   alibi_bias_max: 8.0
+   rope: true
+   rope_full_precision: true
+   rope_theta: 1000000.0
+   rope_impl: llama
+   vision_backbone:
+     image_model_type: openai
+     image_default_input_size:
+     - 336
+     - 336
+     image_patch_size: 14
+     image_pos_patch_size: 14
+     image_emb_dim: 1024
+     image_num_heads: 16
+     image_num_key_value_heads: 16
+     image_num_layers: 23
+     image_head_dim: 64
+     image_mlp_dim: 4096
+     image_mlp_activations: quick_gelu
+     image_dropout_rate: 0.0
+     image_num_pos: 577
+     image_norm_eps: 1.0e-05
+     attention_dropout: 0.0
+     residual_dropout: 0.0
+     initializer_range: 0.02
+     fsdp_wrap: false
+     resize_mode: default
+   vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
+   llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
+   low_cpu_fsdp: true
+   attention_type: sdpa
+   float32_attention: true
+   attention_dropout: 0.0
+   response_attention_dropout: 0.0
+   multi_query_attention: null
+   attention_layer_norm: false
+   residual_dropout: 0.0
+   response_residual_dropout: 0.1
+   embedding_dropout: 0.0
+   layer_norm_type: rms
+   layer_norm_with_affine: true
+   layer_norm_eps: 1.0e-06
+   attention_layer_norm_with_affine: true
+   max_sequence_length: 4096
+   max_position_embeddings: null
+   include_bias: false
+   bias_for_layer_norm: null
+   scale_logits: false
+   vocab_size: 152064
+   embedding_size: 152064
+   additional_vocab_size: 128
+   new_embedding_init_range: 0.02
+   weight_tying: false
+   pad_token_id: -1
+   init_device: null
+   init_fn: normal
+   init_std: 0.02
+   init_cutoff_factor: null
+   norm_after: false
+   precision: amp_bf16
+   max_crops: 12
+   crop_mode: overlap-and-resize-c2
+   do_random_scale: false
+   use_col_tokens: true
+   prompt_type: none
+   system_prompt_kind: style_and_length
+   message_formatting: none
+   always_start_with_space: true
+   prompt_override: null
+   default_inference_len: 65
+   overlap_margins:
+   - 4
+   - 4
+   image_padding_embed: pad_and_partial_pad
+   vit_layers:
+   - -2
+   - -9
+   image_pooling_h: 2
+   image_pooling_w: 2
+   image_pooling_2d: attention_meanq
+   image_projector: mlp
+   image_feature_dropout: 0.0
+   use_cls_feature: false
+   fix_image_input_idx: 2
+   unconditioned: false
+   pad_to: null
+   initializer_range: 0.02
+   pad_tokenizer: true
+   normalize_input_embeds: false
+   use_position_ids: true
+   query_pre_attn_scalar: 224
+   attn_logit_softcapping: null
+   final_logit_softcapping: null
+   head_dim: null
+   tokenizer:
+     identifier: mm:hf-Qwen/Qwen2-7B
+     truncate_direction: right
+     tokenizer_adds_space: false
+     tokenizer_dir: null
+     olmo_bos_token_id: null
+     olmo_eos_token_id: null
+   loss_token_weighting: null
+   gin_bindings: null
+ ft_llm: true
+ ft_vit: true
+ ft_connector: true
+ ft_embedding: lm_head
+ optimizer:
+   name: adamw
+   learning_rate: 0.0001
+   weight_decay: 0.01
+   betas:
+   - 0.9
+   - 0.95
+   eps: 1.0e-05
+   connector_learning_rate: 0.0002
+   vit_learning_rate: 6.0e-06
+   llm_learning_rate: 2.0e-05
+   connector_weight_decay: 0.0
+   vit_weight_decay: 0.0
+   llm_weight_decay: 0.0
+   connector_betas:
+   - 0.9
+   - 0.95
+   vit_betas:
+   - 0.9
+   - 0.95
+   llm_betas:
+   - 0.9
+   - 0.95
+   connector_eps: 1.0e-06
+   vit_eps: 1.0e-06
+   llm_eps: 1.0e-06
+   no_decay_norm_and_bias: null
+   decay_norm_and_bias: false
+   decay_embeddings: false
+   metrics_log_interval: 20
+ scheduler:
+   name: multimodal
+   units: steps
+   t_warmup: 100
+   t_max: null
+   alpha_f: 0.1
+   connector_t_warmup: 200
+   vit_t_warmup: 2000
+   llm_t_warmup: 2000
+   grad_clip_warmup_steps: null
+   grad_clip_warmup_factor: null
+   warmup_min_lr: 0.0
+ data:
+   multi_modal: true
+   mixture_or_task_name: cockatoo_and_transcript_712k_sept6
+   paths: null
+   datasets: null
+   label_mask_paths: null
+   pad_direction: right
+   generate_attention_mask: false
+   num_workers: 0
+   drop_last: true
+   pin_memory: false
+   prefetch_factor: null
+   persistent_workers: false
+   timeout: 0
+   seed: null
+   instance_filter: null
+   mixture: null
+   sequence_length: 2304
+   shuffle: true
+   for_inference: false
+   split: train
+   use_memory_cache: false
+   num_epochs: null
+   shuffle_buffer_size: 1000
+   per_node_data_loader: null
+ restore_dataloader: true
+ fast_forward_batches: null
+ evaluators:
+ - label: val
+   type: multi_modal_lm
+   data:
+     multi_modal: true
+     mixture_or_task_name: cockatoo_and_transcript_712k_sept6
+     paths: null
+     datasets: null
+     label_mask_paths: null
+     pad_direction: right
+     generate_attention_mask: false
+     num_workers: 0
+     drop_last: true
+     pin_memory: false
+     prefetch_factor: null
+     persistent_workers: false
+     timeout: 0
+     seed: null
+     instance_filter: null
+     mixture: null
+     sequence_length: 2304
+     shuffle: false
+     for_inference: false
+     split: validation
+     use_memory_cache: false
+     num_epochs: null
+     shuffle_buffer_size: 1000
+     per_node_data_loader: null
+   device_eval_batch_size: null
+   subset_num_batches: 8
+   max_new_tokens: 448
+   mm_evaluator: null
+   save_dir: null
+   save_to_checkpoint_dir: false
+   eval_name: null
+   skip_if_metrics_cached: true
+ - label: caption_val
+   type: multi_modal_lm
+   data:
+     multi_modal: true
+     mixture_or_task_name: cockatoo_476k_gpt_captions
+     paths: null
+     datasets: null
+     label_mask_paths: null
+     pad_direction: right
+     generate_attention_mask: false
+     num_workers: 0
+     drop_last: true
+     pin_memory: false
+     prefetch_factor: null
+     persistent_workers: false
+     timeout: 0
+     seed: null
+     instance_filter: null
+     mixture: null
+     sequence_length: 2304
+     shuffle: false
+     for_inference: false
+     split: validation
+     use_memory_cache: false
+     num_epochs: null
+     shuffle_buffer_size: 1000
+     per_node_data_loader: null
+   device_eval_batch_size: null
+   subset_num_batches: 8
+   max_new_tokens: 448
+   mm_evaluator: null
+   save_dir: null
+   save_to_checkpoint_dir: false
+   eval_name: null
+   skip_if_metrics_cached: true
+ eval_interval: 1000
+ inf_eval_interval: -1
+ inf_evaluators: []
+ save_folder: /weka/oe-training-default/chrisc/cockatoo/models/dense-captioner-v22-qwen2/v2-lr2620
+ remote_save_folder: null
+ canceled_check_interval: 50
+ save_interval: 4000
+ save_interval_unsharded: 22300
+ save_interval_ephemeral: null
+ save_num_checkpoints_to_keep: 1
+ save_num_unsharded_checkpoints_to_keep: -1
+ save_overwrite: true
+ force_save_unsharded: false
+ no_pre_train_checkpoint: true
+ initial_model_checkpoint: null
+ load_model_config: null
+ load_path: null
+ load_path_sharded_checkpointer: null
+ reset_optimizer_state: false
+ reset_trainer_state: false
+ save_dataloader_state: false
+ reset_dataloader_state: false
+ sharded_checkpointer: torch_legacy
+ new_style_checkpoints: null
+ max_duration: 22300
+ global_train_batch_size: 128
+ device_train_batch_size: 2
+ device_train_microbatch_size: 4
+ device_eval_batch_size: 4
+ eval_subset_num_batches: -1
+ eval_on_load: false
+ device_inf_eval_batch_size: 16
+ inf_eval_subset_num_batches: -1
+ device_train_grad_accum: 0
+ max_grad_norm: 1.0
+ batch_divisor: global_batch
+ max_grad_norm_ratio: null
+ precision: amp_bf16
+ wandb:
+   project: cockatoo
+   entity: prior-ai2
+   group: dense-captioner-v22-qwen2
+   name: v2-lr2620
+   tags:
+   - watching
+   log_artifacts: false
+   rank_zero_only: true
+   log_interval: 20
+ speed_monitor:
+   window_size: 20
+   gpu_flops_available: null
+ console_log_interval: 20
+ gen1_gc_interval: 1
+ compile: null
+ fsdp:
+   use_orig_params: true
+   sharding_strategy: FULL_SHARD
+   wrapping_strategy: by_block_and_size
+   precision: float
+   hybrid_sharding_num_model_replicas: null
+ softmax_auxiliary_loss: true
+ softmax_auxiliary_loss_scale: 0.0001
+ time_limit: null
+ extra_steps_after_cancel: 10
+ early_stopping_factor: null
+ save_data_indices: false
+ python_profiling: false
+ torch_profiling: false
+ stop_at: 22300
+ stop_after: null
+ activation_checkpointing: whole_layer
+ fused_loss: null
+ tfds_dir: /weka/oe-training-default/mm-olmo/tensorflow_datasets
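config.yaml records the original training run rather than anything transformers reads at load time; a small sketch of inspecting it with PyYAML (key names are those shown above):

import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["run_name"])                        # multitask_train
print(cfg["model"]["d_model"])                # 3584
print(cfg["optimizer"]["llm_learning_rate"])  # 2e-05
print(cfg["max_duration"])                    # 22300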
config_molmo.py ADDED
@@ -0,0 +1,154 @@
+ from typing import Tuple
+
+ from transformers import PretrainedConfig
+
+
+ class MolmoVisionConfig(PretrainedConfig):
+     def __init__(
+         self,
+         image_default_input_size: Tuple[int, int] = (336, 336),
+         image_patch_size: int = 14,
+         image_pos_patch_size: int = 14,
+         image_emb_dim: int = 1024,
+         image_num_heads: int = 16,
+         image_num_key_value_heads: int = 16,
+         image_num_layers: int = 23,
+         image_head_dim: int = 64,
+         image_mlp_dim: int = 4096,
+         image_mlp_activations: str = "quick_gelu",
+         residual_dropout: float = 0,
+         image_num_pos: int = 577,
+         image_norm_eps: float = 1e-5,
+         float32_attention: bool = True,
+         attention_type: str = "sdpa",
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.image_default_input_size = image_default_input_size
+         self.image_patch_size = image_patch_size
+         self.image_pos_patch_size = image_pos_patch_size
+         self.image_emb_dim = image_emb_dim
+         self.image_num_heads = image_num_heads
+         self.image_num_key_value_heads = image_num_key_value_heads
+         self.image_num_layers = image_num_layers
+         self.image_head_dim = image_head_dim
+         self.image_mlp_dim = image_mlp_dim
+         self.image_mlp_activations = image_mlp_activations
+         self.residual_dropout = residual_dropout
+         self.image_num_pos = image_num_pos
+         self.image_norm_eps = image_norm_eps
+         self.float32_attention = float32_attention
+         self.attention_type = attention_type
+
+     @property
+     def image_num_patch(self):
+         h, w = self.image_default_input_size
+         return h // self.image_patch_size, w // self.image_patch_size
+
+
+ class MolmoConfig(PretrainedConfig):
+     model_type = "molmo"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=50304,
+         embedding_size=50304,
+         hidden_size=4096,
+         intermediate_size=11008,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=None,
+         float32_attention=True,
+         max_position_embeddings=2048,
+         initializer_range=0.02,
+         use_cache=True,
+         layer_norm_eps: float = 1e-5,
+         rope_theta=10000.0,
+         clip_qkv=None,
+         activation_type="silu",
+         qkv_bias: bool = False,
+         weight_tying: bool = False,
+         use_position_ids: bool = True,
+         tie_word_embeddings: bool = True,
+         bias_for_layer_norm: bool = False,
+         qk_layer_norm: bool = False,
+         norm_after: bool = False,
+         layer_norm_type: str = "rms",
+         vision_config: MolmoVisionConfig = None,
+         vit_layers=(-2, -9),
+         residual_dropout: float = 0.0,
+         embedding_dropout: float = 0.0,
+         attention_dropout: float = 0.0,
+         image_feature_dropout: float = 0.0,
+         additional_vocab_size=128,
+         attention_type: str = "sdpa",
+         image_padding_embed="pad_and_partial_pad",
+         moe_num_experts=None,
+         moe_top_k=None,
+         normalize_input_embeds: bool = False,
+         scale_logits: bool = False,
+         **kwargs,
+     ):
+         if isinstance(vision_config, dict):
+             self.vision_config = MolmoVisionConfig(**vision_config)
+         elif vision_config is None:
+             self.vision_config = MolmoVisionConfig()
+         else:
+             self.vision_config = vision_config
+
+         self.vocab_size = vocab_size
+         self.embedding_size = embedding_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.layer_norm_eps = layer_norm_eps
+         self.weight_tying = weight_tying
+         self.use_position_ids = use_position_ids
+         self.qk_layer_norm = qk_layer_norm
+         self.num_key_value_heads = num_key_value_heads
+         self.float32_attention = float32_attention
+         self.initializer_range = initializer_range
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.clip_qkv = clip_qkv
+         self.activation_type = activation_type
+         self.qkv_bias = qkv_bias
+         self.norm_after = norm_after
+         self.tie_word_embeddings = tie_word_embeddings
+         self.layer_norm_type = layer_norm_type
+         self.moe_num_experts = moe_num_experts
+         self.moe_top_k = moe_top_k
+         self.vit_layers = vit_layers
+         self.residual_dropout = residual_dropout
+         self.embedding_dropout = embedding_dropout
+         self.attention_dropout = attention_dropout
+         self.image_feature_dropout = image_feature_dropout
+         self.image_padding_embed = image_padding_embed
+         self.bias_for_layer_norm = bias_for_layer_norm
+         self.additional_vocab_size = additional_vocab_size
+         self.attention_type = attention_type
+         self.normalize_input_embeds = normalize_input_embeds
+         self.scale_logits = scale_logits
+
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+     @property
+     def effective_num_key_value_heads(self) -> int:
+         if self.num_key_value_heads is None:
+             return self.num_attention_heads
+         else:
+             return self.num_key_value_heads
+
+     @property
+     def image_num_patch(self):
+         assert self.vision_config is not None
+         return self.vision_config.image_num_patch
+
+
+ MolmoVisionConfig.register_for_auto_class()
+ MolmoConfig.register_for_auto_class()
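As a quick sanity check, the config class can be instantiated directly with the values recorded in config.json above; a sketch:

config = MolmoConfig(
    vocab_size=152064,
    embedding_size=152064,
    hidden_size=3584,
    intermediate_size=37888,
    num_hidden_layers=28,
    num_attention_heads=28,
    num_key_value_heads=4,
    max_position_embeddings=4096,
    rope_theta=1000000.0,
    qkv_bias=True,
    layer_norm_type="rms",
    additional_vocab_size=128,
    tie_word_embeddings=False,
)
print(config.effective_num_key_value_heads)  # 4 (falls back to num_attention_heads when None)
print(config.image_num_patch)                # (24, 24): the default 336x336 input / 14px patches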
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.45.2"
+ }
image_preprocessing_molmo.py ADDED
@@ -0,0 +1,559 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 FIXME copyright?
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Image processor class for Molmo"""
16
+ import pdb
17
+ from typing import List, Optional, Union, Mapping
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torchvision.transforms
22
+ from torchvision.transforms import InterpolationMode
23
+ from torchvision.transforms.functional import convert_image_dtype
24
+
25
+ from transformers.image_utils import (
26
+ OPENAI_CLIP_MEAN,
27
+ OPENAI_CLIP_STD,
28
+ ImageInput,
29
+ )
30
+ from transformers.processing_utils import ImagesKwargs
31
+ from transformers.image_processing_utils import BaseImageProcessor
32
+ from transformers.utils import logging
33
+
34
+
35
+ logger = logging.get_logger(__name__)
36
+
37
+
38
+ def resize_and_pad(
39
+ image,
40
+ desired_output_size,
41
+ resize_method="torch-bilinear",
42
+ pad_value=0,
43
+ normalize=True,
44
+ image_mean=OPENAI_CLIP_MEAN,
45
+ image_std=OPENAI_CLIP_STD,
46
+ ):
47
+ """Resize an image while padding to preserve uts aspect ratio."""
48
+ desired_height, desired_width = desired_output_size
49
+ height, width = image.shape[:2]
50
+
51
+ # Cast into float32 since the training code did this in float32 and it (very rarely) effects
52
+ # the results after rounding.
53
+ image_scale_y = np.array(desired_height, np.float32) / np.array(height, np.float32)
54
+ image_scale_x = np.array(desired_width, np.float32) / np.array(width, np.float32)
55
+ image_scale = min(image_scale_x, image_scale_y)
56
+ scaled_height = int(np.array(height, np.float32) * image_scale)
57
+ scaled_width = int(np.array(width, np.float32) * image_scale)
58
+
59
+ if resize_method == "tensorflow":
60
+ # This how the original training code did resizing, it can produce slightly different
61
+ # results then using torch resize so we keep it just in case
62
+ import tensorflow as tf
63
+ image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
64
+ image = tf.image.resize(
65
+ image,
66
+ [scaled_height, scaled_width],
67
+ method=tf.image.ResizeMethod.BILINEAR,
68
+ antialias=True,
69
+ )
70
+ image = tf.clip_by_value(image, 0.0, 1.0)
71
+ image = image.numpy()
72
+ elif resize_method == "torch-bilinear":
73
+ image = torch.permute(torch.from_numpy(image), [2, 0, 1])
74
+ image = convert_image_dtype(image) # resize in float32 to match the training code
75
+ image = torchvision.transforms.Resize(
76
+ [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
77
+ )(image)
78
+ image = torch.clip(image, 0.0, 1.0)
79
+ image = torch.permute(image, [1, 2, 0]).numpy()
80
+ else:
81
+ raise NotImplementedError(resize_method)
82
+
83
+ top_pad = (desired_height - scaled_height) // 2
84
+ left_pad = (desired_width - scaled_width) // 2
85
+ padding = [
86
+ [top_pad, desired_height - scaled_height - top_pad],
87
+ [left_pad, desired_width - scaled_width - left_pad],
88
+ [0, 0]
89
+ ]
90
+ image_mask = np.pad(np.ones_like(image[:, :, 0], dtype=bool), padding[:2])
91
+ image = np.pad(image, padding, constant_values=pad_value)
92
+ return image, image_mask
93
+
94
+
95
+ def select_tiling(h, w, patch_size, max_num_crops):
96
+ """Divide in image of size [w, h] in up to max_num_patches of size patch_size"""
97
+ original_size = np.stack([h, w]) # [1, 2]
98
+ original_res = h * w
99
+ tilings = []
100
+ for i in range(1, max_num_crops + 1):
101
+ for j in range(1, max_num_crops + 1):
102
+ if i*j <= max_num_crops:
103
+ tilings.append((i, j))
104
+ # sort so argmin and argmax favour smaller tilings in the event of a tie
105
+ tilings.sort(key=lambda x: (x[0]*x[1], x[0]))
106
+ candidate_tilings = np.array(tilings, dtype=np.int32) # [n_resolutions, 2]
107
+ candidate_resolutions = candidate_tilings * patch_size # [n_resolutions, 2]
108
+
109
+ # How much we would need to scale the image to fit exactly in each tiling
110
+ original_size = np.stack([h, w], dtype=np.float32) # [1, 2]
111
+ required_scale_d = candidate_resolutions.astype(np.float32) / original_size
112
+ required_scale = np.min(required_scale_d, axis=-1, keepdims=True) # [n_resolutions, 1]
113
+ if np.all(required_scale < 1):
114
+ # We are forced to downscale, so try to minimize the amount of downscaling
115
+ ix = np.argmax(required_scale)
116
+ else:
117
+ # Pick the resolution that required the least upscaling so that it most closely fits the image
118
+ required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
119
+ ix = np.argmin(required_scale)
120
+ return candidate_tilings[ix]
121
+
122
+
123
+ def pixels_to_patches(array, patch_size):
124
+ """Reshape an image of [h, w, 3] -> [n_patches, pixels_per_patch]"""
125
+ w, h, c = array.shape
126
+ h_patches = h//patch_size
127
+ w_patches = w//patch_size
128
+ array = np.reshape(array, [h_patches, patch_size, w_patches, patch_size, c])
129
+ array = np.transpose(array, [0, 2, 1, 3, 4])
130
+ array = np.reshape(array, [h_patches*w_patches, patch_size*patch_size*c])
131
+ return array
132
+
133
+
134
+ def batch_pixels_to_patches(array, patch_size):
135
+ """Reshape images of [n_images, h, w, 3] -> [n_images, n_patches, pixels_per_patch]"""
136
+ if len(array.shape) == 3:
137
+ n_crops, w, h = array.shape
138
+ h_patches = h//patch_size
139
+ w_patches = w//patch_size
140
+ array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size])
141
+ array = np.transpose(array, [0, 1, 3, 2, 4])
142
+ array = np.reshape(array, [n_crops, h_patches*w_patches, patch_size*patch_size])
143
+ return array
144
+ else:
145
+ n_crops, w, h, c = array.shape
146
+ h_patches = h//patch_size
147
+ w_patches = w//patch_size
148
+ array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size, c])
149
+ array = np.transpose(array, [0, 1, 3, 2, 4, 5])
150
+ array = np.reshape(array, [n_crops, h_patches*w_patches, patch_size*patch_size*c])
151
+ return array
152
+
153
+
154
+ class MolmoImagesKwargs(ImagesKwargs, total=False):
155
+ max_crops: Optional[int]
156
+ overlap_margins: Optional[List[int]]
157
+ base_image_input_size: Optional[List[int]]
158
+ image_token_length_w: Optional[int]
159
+ image_token_length_h: Optional[int]
160
+ image_patch_size: Optional[int]
161
+ image_padding_mask: Optional[bool]
162
+
163
+
164
+ class MolmoImageProcessor(BaseImageProcessor):
165
+ """Preprocess images and multi-model inputs"""
166
+
167
+ def __init__(
168
+ self,
169
+ max_crops: int = 12,
170
+ overlap_margins: List[int] = (4, 4),
171
+ base_image_input_size: List[int] = (336, 336),
172
+ image_token_length_w: int = 12,
173
+ image_token_length_h: int = 12,
174
+ image_patch_size: int = 14,
175
+ image_padding_mask: bool = True,
176
+ do_normalize: bool = True,
177
+ **kwargs,
178
+ ):
179
+ super().__init__(**kwargs)
180
+ self.max_crops = max_crops
181
+ self.overlap_margins = overlap_margins
182
+ self.base_image_input_size = base_image_input_size
183
+ self.image_token_length_w = image_token_length_w
184
+ self.image_token_length_h = image_token_length_h
185
+ self.image_patch_size = image_patch_size
186
+ self.image_padding_mask = image_padding_mask
187
+ self.do_normalize = do_normalize
188
+
189
+ def _normalize(self, image):
190
+ if self.do_normalize:
191
+ image -= np.array(OPENAI_CLIP_MEAN, dtype=np.float32)[None, None, :]
192
+ image /= np.array(OPENAI_CLIP_STD, dtype=np.float32)[None, None, :]
193
+ return image
194
+
195
+ def image_to_patches_and_tokens(
196
+ self,
197
+ image: ImageInput,
198
+ image_patch_token_id: int,
199
+ image_col_token_id: int,
200
+ image_start_token_id: int,
201
+ image_end_token_id: int,
202
+ max_crops: Optional[int] = None,
203
+ overlap_margins: Optional[List[int]] = None,
204
+ base_image_input_size: Optional[Union[int, List[int]]] = None,
205
+ image_token_length_w: Optional[int] = None,
206
+ image_token_length_h: Optional[int] = None,
207
+ image_patch_size: Optional[int] = None,
208
+ ):
209
+ if isinstance(base_image_input_size, int):
210
+ base_image_input_size = (base_image_input_size, base_image_input_size)
211
+
212
+ base_image_input_d = image_patch_size
213
+ tokens_per_image = image_token_length_w * image_token_length_h
214
+ image_base_patch_w = base_image_input_size[1] // base_image_input_d
215
+ image_base_patch_h = base_image_input_size[0] // base_image_input_d
216
+
217
+ original_image_h, original_image_w = image.shape[:2]
218
+ crop_size = base_image_input_size[0]
219
+
220
+ # Discard this many patches from the (left/top, right/bottom) of crops
221
+ left_margin, right_margin = overlap_margins
222
+ # left_margin, right_margin = 2, 2
223
+ assert left_margin % 2 == 0 # Required for compatibility with 2x2 pooling
224
+ total_margin_pixels = base_image_input_d*(right_margin + left_margin) # pixels removed per dim
225
+ crop_patches = base_image_input_size[0] // base_image_input_d # patches per crop dim
226
+ crop_window_patches = crop_patches - (right_margin + left_margin) # usable patches
227
+ crop_window_size = crop_window_patches * base_image_input_d
228
+
229
+ # Decide how to tile the image, to account for the overlap margins we compute the tiling
230
+ # as if we had an image without the margins and were using a crop size without the margins
231
+ tiling = select_tiling(
232
+ original_image_h - total_margin_pixels,
233
+ original_image_w - total_margin_pixels,
234
+ crop_window_size,
235
+ max_crops
236
+ )
237
+ src, img_mask = resize_and_pad(
238
+ image,
239
+ [tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels]
240
+ )
241
+ src = self._normalize(src)
242
+
243
+ # Now we have to split the image into crops, while keeping track of how each patch in the
244
+ # each crop should be ordered in the global image, this require a lot of tricky booking
245
+ n_crops = tiling[0] * tiling[1]
246
+ patches_arr = []
247
+ mask_arr = []
248
+ patch_ordering_arr = []
249
+
250
+ # We assume 2x2 pooling, but can allow padding the right/bottom with extra
251
+ # patches if the number of patches per side is not even
252
+ assert (crop_patches+1)//2 == image_token_length_h
253
+ assert (crop_patches+1)//2 == image_token_length_w
254
+ on = 0
255
+ on_patch = 0
256
+ for i in range(tiling[0]):
257
+ y0 = i*crop_window_size
258
+ if i == 0:
259
+ crop_y0 = 0
260
+ else:
261
+ crop_y0 = left_margin // 2
262
+
263
+ crop_h = image_base_patch_h - (right_margin + left_margin)
264
+ if i == 0:
265
+ crop_h += left_margin
266
+ if i == (tiling[0]-1):
267
+ crop_h += right_margin
268
+ for j in range(tiling[1]):
269
+ x0 = j*crop_window_size
270
+ if j == 0:
271
+ crop_x0 = 0
272
+ else:
273
+ crop_x0 = left_margin // 2
274
+
275
+ crop_w = image_base_patch_w - (right_margin + left_margin)
276
+ if j == 0:
277
+ crop_w += left_margin
278
+ if j == (tiling[1]-1):
279
+ crop_w += right_margin
280
+
281
+ pooled_w = (crop_w + 1) // 2
282
+ pooled_h = (crop_h + 1) // 2
283
+ after_padding_width = image_token_length_w - pooled_w - crop_x0
284
+ after_padding_height = image_token_length_h - pooled_h - crop_y0
285
+ patch_ordering_arr.append(
286
+ np.pad(
287
+ np.reshape(
288
+ np.arange(on, on+pooled_h*pooled_w, dtype=np.int32),
289
+ (pooled_h, pooled_w)),
290
+ [[crop_y0, after_padding_height], [crop_x0, after_padding_width]],
291
+ constant_values=-1, mode='constant'
292
+ )
293
+ )
294
+ patches_arr.append(src[y0:y0+crop_size, x0:x0+crop_size])
295
+ mask_arr.append(img_mask[y0:y0+crop_size, x0:x0+crop_size])
296
+
297
+ on += pooled_h*pooled_w
298
+ on_patch += 1
299
+ patches = np.stack(patches_arr)
300
+ patch_ordering = np.stack(patch_ordering_arr)
301
+ img_mask = np.stack(mask_arr)
302
+
303
+ # Switch to [n_crops, n_patches, pixels_per_patch] format
304
+ image_layout_impatch_w, image_layout_impatch_h = tiling[0], tiling[1]
305
+
306
+ patches = batch_pixels_to_patches(patches, image_patch_size)
307
+ img_mask = batch_pixels_to_patches(img_mask, image_patch_size)
308
+ img_mask = img_mask.astype(np.float32).mean(axis=-1)
309
+ patch_ordering = np.reshape(patch_ordering, [-1])
310
+ valid = patch_ordering >= 0
311
+
312
+ # Path order numbers the patches crop-by-crop, here we transpose
313
+ # it to get left-to-right order
314
+ patch_ordering_rh = np.reshape(
315
+ patch_ordering,
316
+ [tiling[0], tiling[1], image_token_length_h, image_token_length_w]
317
+ )
318
+ patch_ordering_rh = np.transpose(patch_ordering_rh, [0, 2, 1, 3])
319
+ patch_ordering_rh = np.reshape(patch_ordering_rh, [-1])
320
+
321
+ # The transpose will screw up which patches are masked, project the
322
+ # new order into sparse structure of `patch_ordering` to fix it
323
+ patch_ordering[valid] = patch_ordering_rh[patch_ordering_rh >= 0]
324
+
325
+ # Now build the output tokens
326
+ h = tiling[0] * crop_window_patches + (right_margin+left_margin)
327
+ w = tiling[1] * crop_window_patches + (right_margin+left_margin)
328
+ per_row = np.full(
329
+ ((w+1)//2,),
330
+ image_patch_token_id,
331
+ )
332
+ per_row = np.concatenate([per_row, [image_col_token_id]], 0)
333
+
334
+ joint = np.tile(per_row, [(h+1)//2])
335
+ joint = [
336
+ [image_start_token_id],
337
+ joint,
338
+ [image_end_token_id]
339
+ ]
340
+
341
+ # Finally do the same for the global image
342
+ resized, _ = resize_and_pad(image, base_image_input_size)
343
+ resized = self._normalize(resized)
344
+ resized = pixels_to_patches(resized, image_patch_size)
345
+ patches = np.concatenate([np.expand_dims(resized, 0), patches], 0)
346
+
347
+ # Global image goes first, so the order of patches in previous crops gets increased
348
+ patch_ordering = np.where(
349
+ patch_ordering >= 0,
350
+ patch_ordering + tokens_per_image,
351
+ -1
352
+ )
353
+ patch_ordering = np.concatenate([np.arange(0, tokens_per_image), patch_ordering], 0)
354
+ per_row = np.full(
355
+ (image_token_length_w,),
356
+ image_patch_token_id,
357
+ )
358
+ per_row = np.concatenate([per_row, [image_col_token_id]], 0)
359
+ extra_tokens = np.tile(per_row, [image_token_length_h])
360
+ joint = [
361
+ [image_start_token_id],
362
+ extra_tokens,
363
+ [image_end_token_id],
364
+ ] + joint
365
+
366
+ joint = np.concatenate(joint, 0)
367
+ img_mask = np.pad(img_mask, [[0, 1], [0, 0]], constant_values=-1)
368
+ return patches, joint, patch_ordering, img_mask
369
+
370
+ def build_image_input_idx(
371
+ self,
372
+ image_tokens: np.ndarray,
373
+ patch_order: np.ndarray,
374
+ image_patch_token_id: int,
375
+ image_token_length_w: int,
376
+ image_token_length_h: int,
377
+ ):
378
+ """Converts `patch_order` into a mapping of token_id -> patch_id"""
379
+
380
+ tokens_per_image = image_token_length_w * image_token_length_h
381
+
382
+ # Indices to insert the patches
383
+ image_input_idx = image_tokens == image_patch_token_id
384
+ image_input_idx = np.nonzero(image_input_idx)[0].astype(np.int32)
385
+
386
+ if patch_order is not None:
387
+ n_tokens = image_input_idx.shape[0]
388
+ patch_order = np.reshape(patch_order, [-1])
389
+ n_patches = patch_order.shape[0]
390
+
391
+ valid = patch_order >= 0
392
+ n_valid_patches = valid.sum()
393
+ assert len(image_input_idx) == n_valid_patches
394
+
395
+ sorted_patch_ixs = np.zeros([n_tokens], np.int32)
396
+ sorted_patch_ixs[patch_order[valid]] = np.arange(n_valid_patches, dtype=np.int32)
397
+
398
+ # Project the inverted mapping into same sparse structure
399
+ sorted_patch_ixs_ex = np.full(np.shape(patch_order), -1)
400
+ sorted_patch_ixs_ex[valid] = sorted_patch_ixs
401
+
402
+ # Do the gather and then re-masked outputs that were masked in `sorted_patch_ixs`
403
+ valid = (sorted_patch_ixs_ex >= 0).astype(np.int32)
404
+ image_input_idx = image_input_idx[sorted_patch_ixs_ex*valid]
405
+ image_input_idx = image_input_idx*valid - 100*(1 - valid)
406
+ image_input_idx = np.reshape(image_input_idx, [-1, tokens_per_image])
407
+ return image_input_idx
408
+
409
+ def preprocess(
410
+ self,
411
+ image: np.ndarray,
412
+ image_patch_token_id: int,
413
+ image_col_token_id: int,
414
+ image_start_token_id: int,
415
+ image_end_token_id: int,
416
+ max_crops: Optional[int] = None,
417
+ overlap_margins: Optional[List[int]] = None,
418
+ base_image_input_size: Optional[Union[int, List[int]]] = None,
419
+ image_token_length_w: Optional[int] = None,
420
+ image_token_length_h: Optional[int] = None,
421
+ image_patch_size: Optional[int] = None,
422
+ **kwargs,
423
+ ):
424
+ """Preprocesses a single image
425
+
426
+ Returns:
427
+ crops: (n_crops, n_patches, patch_dim) individual crops, `n_crops` might
428
+ change between images but the other dimension are fixed
429
+ tokens: (n_tokens,) int32 tokens, pad tokens indicate where to insert the
430
+ patch features, might include other special tokens as well
431
+ image_idx: (n_crops, n_patches) index in `tokens` to put the patch features from the
432
+         crops after pooling; negative values indicate patch features to exclude
+         padding_mask: (n_crops, n_patches) what percent of each crop is padding; can be None
+         if the image mask is not being used.
+         """
+
+         max_crops = max_crops or self.max_crops
+         overlap_margins = overlap_margins or self.overlap_margins
+         base_image_input_size = base_image_input_size or self.base_image_input_size
+         image_token_length_w = image_token_length_w or self.image_token_length_w
+         image_token_length_h = image_token_length_h or self.image_token_length_h
+         image_patch_size = image_patch_size or self.image_patch_size
+
+         crops, image_tokens, patch_ordering, img_mask = self.image_to_patches_and_tokens(
+             image,
+             image_patch_token_id,
+             image_col_token_id,
+             image_start_token_id,
+             image_end_token_id,
+             max_crops,
+             overlap_margins,
+             base_image_input_size,
+             image_token_length_w,
+             image_token_length_h,
+             image_patch_size,
+         )
+         patch_idx = self.build_image_input_idx(
+             image_tokens,
+             patch_ordering,
+             image_patch_token_id,
+             image_token_length_w=image_token_length_w,
+             image_token_length_h=image_token_length_h,
+         )
+         return crops, image_tokens, patch_idx, img_mask
+
+     def multimodal_preprocess(
+         self,
+         images: np.ndarray,
+         tokens: List[int],
+         image_idx: np.ndarray,
+         sequence_length: int,
+         image_patch_token_id: int,
+         image_col_token_id: int,
+         image_start_token_id: int,
+         image_end_token_id: int,
+         **kwargs,
+     ):
+         """Merge images and text tokens into multi-modal features for the model.
+
+         :param images: images to use as input
+         :param tokens: input text tokens
+         :param image_idx: where to insert the images into `tokens`
+         :param image_patch_token_id: id to use for tokens that will contain image features
+         :param image_col_token_id: token id for image column special tokens
+         :param image_start_token_id: token id for image start special tokens
+         :param image_end_token_id: token id for image end special tokens
+         :param kwargs: overrides for the preprocessor's default args
+         """
+         if images is None:
+             return {"input_ids": tokens}
+
+         max_total_crops = kwargs.get("max_crops") or self.max_crops
+         image_token_length_w = kwargs.get("image_token_length_w") or self.image_token_length_w
+         image_token_length_h = kwargs.get("image_token_length_h") or self.image_token_length_h
+         image_patch_size = kwargs.get("image_patch_size") or self.image_patch_size
+         base_image_input_size = kwargs.get("base_image_input_size") or self.base_image_input_size
+         image_num_patch = (
+             base_image_input_size[0] // image_patch_size,
+             base_image_input_size[1] // image_patch_size,
+         )
+         image_padding_mask = kwargs.get("image_padding_mask") or self.image_padding_mask
+
+         tokens_per_image = image_token_length_w * image_token_length_h
+         n_pixels = image_patch_size * image_patch_size * 3
+         n_patches = image_num_patch[0] * image_num_patch[1]
+
+         n = len(images)
+         all_crops = []
+         all_image_idx = []
+         out_tokens = []
+         all_crop_masks = []
+
+         for ix in range(n):
+             token_ix = image_idx[ix]
+             crops, image_tokens, patch_idx, img_mask = self.preprocess(
+                 images[ix],
+                 image_patch_token_id,
+                 image_col_token_id,
+                 image_start_token_id,
+                 image_end_token_id,
+                 **kwargs,
+             )
+
+             if token_ix == -1:  # -1 means the image is inserted at the very start
+                 start = 0
+                 token_ix = 0
+                 end = 0
+             else:
+                 start = 0 if ix == 0 else image_idx[ix - 1] + 1
+                 end = token_ix + 1
+
+             all_image_idx.append(patch_idx + token_ix)  # shift patch positions to this image's insertion point
+             all_crops.append(crops)
+             # splice in the text before this image, then the image's own tokens
+             out_tokens.append(tokens[start:token_ix])
+             out_tokens.append(image_tokens)
+             if ix == (n - 1):
+                 out_tokens.append(tokens[end:])
+             if image_padding_mask:
+                 all_crop_masks.append(img_mask)
+
+         input_ids = np.concatenate(out_tokens, 0)
+         images = np.concatenate(all_crops, 0)
+         image_input_idx = np.concatenate(all_image_idx, 0)
+         if image_padding_mask:
+             image_masks = np.concatenate(all_crop_masks, 0)
+         else:
+             image_masks = None
+
+         out = {
+             "input_ids": input_ids,
+             "images": images,
+             "image_input_idx": image_input_idx,
+         }
+         if image_masks is not None:
+             out["image_masks"] = image_masks
+         return out
+
+
+ MolmoImageProcessor.register_for_auto_class()
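
A minimal usage sketch for the preprocessor above (my own, not part of the commit): it assumes MolmoImageProcessor can be constructed with the repo defaults, that a tokenizer object is already in scope, and that the four special tokens are registered under the names shown; all of those are assumptions, not facts taken from this diff.

    import numpy as np

    processor = MolmoImageProcessor()          # assumed: defaults cover crop/patch sizes
    image = np.zeros((480, 640, 3), np.uint8)  # dummy RGB image
    tokens = tokenizer.encode("Describe this image.")  # `tokenizer` assumed in scope
    image_idx = np.array([-1])                 # -1: insert the image at the very start

    batch = processor.multimodal_preprocess(
        images=[image],
        tokens=tokens,
        image_idx=image_idx,
        sequence_length=1536,                  # assumed value
        image_patch_token_id=tokenizer.convert_tokens_to_ids("<im_patch>"),
        image_col_token_id=tokenizer.convert_tokens_to_ids("<im_col>"),
        image_start_token_id=tokenizer.convert_tokens_to_ids("<im_start>"),
        image_end_token_id=tokenizer.convert_tokens_to_ids("<im_end>"),
    )
    # batch["input_ids"] interleaves the text tokens with image special tokens;
    # batch["image_input_idx"] maps pooled crop features to positions in input_ids.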
merges.txt ADDED
The diff for this file is too large to render. See raw diff
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00e280f08b21b532fd24f3612b4ec22a13deaba10f791c941bf60f09d294d276
+ size 4978535216
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4efc71ab1c571f6bc98f92730564d0e6c43ae19bf2064935a59e1578a594885d
+ size 4778633832
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5351111ee31fe0a75e761e821cd450784872b2b227b1d62d635b1557f29ab6a5
+ size 4661160096
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:022f009cc892dbf74d0bdf326d44d9cecc2feee0269ddcba7e8dec8d905b13d0
+ size 4661160112
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c64261b5d186bfba867b39c14d69ecb9025732c9f6a18ef148e382d511e49f7d
+ size 4661160112
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:81b93d05e9e5d33f87859abc5d469eee265da6a4310f2588f2cc09d37cd5d704
+ size 4543686344
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0c98e944773bb67434de633e5e3ba67c77d64108e06dd6641d30fac1b3f91193
+ size 3799841448
model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14dec03b7a8516c7482a938b07665db1ac8330a2c52d8ae2a4f767d09316b248
+ size 32084399338
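
For readers unfamiliar with the pointer files above: Git LFS stores only a version line, a SHA-256 oid, and a byte size in-repo, while the blob itself lives in LFS storage. A small sketch (my own, not from the commit) of verifying a downloaded shard against its pointer, using the oid and size recorded above for model-00001-of-00007.safetensors:

    import hashlib
    import os

    def verify_lfs_pointer(path, expected_oid, expected_size):
        # cheap check first: the byte count from the pointer's `size` line
        if os.path.getsize(path) != expected_size:
            return False
        # then the digest from the pointer's `oid sha256:` line, streamed in 1 MiB chunks
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                h.update(chunk)
        return h.hexdigest() == expected_oid

    assert verify_lfs_pointer(
        "model-00001-of-00007.safetensors",
        "00e280f08b21b532fd24f3612b4ec22a13deaba10f791c941bf60f09d294d276",
        4978535216,
    )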
model.safetensors.index.json ADDED
@@ -0,0 +1,592 @@
+ {
+ "metadata": {
+ "total_size": 32084101120
+ },
+ "weight_map": {
+ "model.transformer.blocks.0.attn.att_proj.bias": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.0.attn.att_proj.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.0.attn.attn_out.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.0.attn_norm.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.0.ff_norm.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.0.mlp.ff_out.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.0.mlp.ff_proj.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.1.attn.att_proj.bias": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.1.attn.att_proj.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.1.attn.attn_out.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.1.attn_norm.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.1.ff_norm.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.1.mlp.ff_out.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.1.mlp.ff_proj.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.10.attn.att_proj.bias": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.10.attn.att_proj.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.10.attn.attn_out.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.10.attn_norm.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.10.ff_norm.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.10.mlp.ff_out.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.10.mlp.ff_proj.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.11.attn.att_proj.bias": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.11.attn.att_proj.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.11.attn.attn_out.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.11.attn_norm.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.11.ff_norm.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.11.mlp.ff_out.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.11.mlp.ff_proj.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.12.attn.att_proj.bias": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.12.attn.att_proj.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.12.attn.attn_out.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.12.attn_norm.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.12.ff_norm.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.12.mlp.ff_out.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.12.mlp.ff_proj.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.13.attn.att_proj.bias": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.13.attn.att_proj.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.13.attn.attn_out.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.13.attn_norm.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.13.ff_norm.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.13.mlp.ff_out.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.13.mlp.ff_proj.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.14.attn.att_proj.bias": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.14.attn.att_proj.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.14.attn.attn_out.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.14.attn_norm.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.14.ff_norm.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.14.mlp.ff_out.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.14.mlp.ff_proj.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.15.attn.att_proj.bias": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.15.attn.att_proj.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.15.attn.attn_out.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.15.attn_norm.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.15.ff_norm.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.15.mlp.ff_out.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.15.mlp.ff_proj.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.16.attn.att_proj.bias": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.16.attn.att_proj.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.16.attn.attn_out.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.16.attn_norm.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.16.ff_norm.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.16.mlp.ff_out.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.16.mlp.ff_proj.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.17.attn.att_proj.bias": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.17.attn.att_proj.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.17.attn.attn_out.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.17.attn_norm.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.17.ff_norm.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.17.mlp.ff_out.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.17.mlp.ff_proj.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.18.attn.att_proj.bias": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.18.attn.att_proj.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.18.attn.attn_out.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.18.attn_norm.weight": "model-00004-of-00007.safetensors",
+ "model.transformer.blocks.18.ff_norm.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.18.mlp.ff_out.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.18.mlp.ff_proj.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.19.attn.att_proj.bias": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.19.attn.att_proj.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.19.attn.attn_out.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.19.attn_norm.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.19.ff_norm.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.19.mlp.ff_out.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.19.mlp.ff_proj.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.2.attn.att_proj.bias": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.2.attn.att_proj.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.2.attn.attn_out.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.2.attn_norm.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.2.ff_norm.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.2.mlp.ff_out.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.2.mlp.ff_proj.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.blocks.20.attn.att_proj.bias": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.20.attn.att_proj.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.20.attn.attn_out.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.20.attn_norm.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.20.ff_norm.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.20.mlp.ff_out.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.20.mlp.ff_proj.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.21.attn.att_proj.bias": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.21.attn.att_proj.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.21.attn.attn_out.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.21.attn_norm.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.21.ff_norm.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.21.mlp.ff_out.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.21.mlp.ff_proj.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.22.attn.att_proj.bias": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.22.attn.att_proj.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.22.attn.attn_out.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.22.attn_norm.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.22.ff_norm.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.22.mlp.ff_out.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.22.mlp.ff_proj.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.23.attn.att_proj.bias": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.23.attn.att_proj.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.23.attn.attn_out.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.23.attn_norm.weight": "model-00005-of-00007.safetensors",
+ "model.transformer.blocks.23.ff_norm.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.23.mlp.ff_out.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.23.mlp.ff_proj.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.24.attn.att_proj.bias": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.24.attn.att_proj.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.24.attn.attn_out.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.24.attn_norm.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.24.ff_norm.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.24.mlp.ff_out.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.24.mlp.ff_proj.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.25.attn.att_proj.bias": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.25.attn.att_proj.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.25.attn.attn_out.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.25.attn_norm.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.25.ff_norm.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.25.mlp.ff_out.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.25.mlp.ff_proj.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.26.attn.att_proj.bias": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.26.attn.att_proj.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.26.attn.attn_out.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.26.attn_norm.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.26.ff_norm.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.26.mlp.ff_out.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.26.mlp.ff_proj.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.27.attn.att_proj.bias": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.27.attn.att_proj.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.27.attn.attn_out.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.27.attn_norm.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.27.ff_norm.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.27.mlp.ff_out.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.27.mlp.ff_proj.weight": "model-00006-of-00007.safetensors",
+ "model.transformer.blocks.3.attn.att_proj.bias": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.3.attn.att_proj.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.3.attn.attn_out.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.3.attn_norm.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.3.ff_norm.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.3.mlp.ff_out.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.3.mlp.ff_proj.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.4.attn.att_proj.bias": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.4.attn.att_proj.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.4.attn.attn_out.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.4.attn_norm.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.4.ff_norm.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.4.mlp.ff_out.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.4.mlp.ff_proj.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.5.attn.att_proj.bias": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.5.attn.att_proj.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.5.attn.attn_out.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.5.attn_norm.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.5.ff_norm.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.5.mlp.ff_out.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.5.mlp.ff_proj.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.6.attn.att_proj.bias": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.6.attn.att_proj.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.6.attn.attn_out.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.6.attn_norm.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.6.ff_norm.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.6.mlp.ff_out.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.6.mlp.ff_proj.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.7.attn.att_proj.bias": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.7.attn.att_proj.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.7.attn.attn_out.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.7.attn_norm.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.7.ff_norm.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.7.mlp.ff_out.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.7.mlp.ff_proj.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.8.attn.att_proj.bias": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.8.attn.att_proj.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.8.attn.attn_out.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.8.attn_norm.weight": "model-00002-of-00007.safetensors",
+ "model.transformer.blocks.8.ff_norm.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.8.mlp.ff_out.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.8.mlp.ff_proj.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.9.attn.att_proj.bias": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.9.attn.att_proj.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.9.attn.attn_out.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.9.attn_norm.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.9.ff_norm.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.9.mlp.ff_out.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.blocks.9.mlp.ff_proj.weight": "model-00003-of-00007.safetensors",
+ "model.transformer.ff_out.weight": "model-00007-of-00007.safetensors",
+ "model.transformer.ln_f.weight": "model-00001-of-00007.safetensors",
+ "model.transformer.wte.embedding": "model-00001-of-00007.safetensors",
+ "model.transformer.wte.new_embedding": "model-00001-of-00007.safetensors",
+ "model.vision_backbone.image_pooling_2d.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_pooling_2d.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_pooling_2d.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_pooling_2d.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_pooling_2d.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_pooling_2d.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_pooling_2d.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_pooling_2d.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_projector.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_projector.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_projector.w3.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.0.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.1.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.10.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.11.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.12.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.13.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.14.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.15.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.16.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.17.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.18.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.19.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.2.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.20.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.21.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.22.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.3.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.4.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.5.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.6.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.7.ffn_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.attention.wk.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.attention.wk.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.attention.wo.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.attention.wo.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.attention.wq.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.attention.wq.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.attention.wv.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.attention.wv.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.attention_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.attention_norm.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.ffn_norm.bias": "model-00007-of-00007.safetensors",
+ "model.vision_backbone.image_vit.blocks.8.ffn_norm.weight": "model-00007-of-00007.safetensors",
569
+ "model.vision_backbone.image_vit.blocks.9.attention.wk.bias": "model-00007-of-00007.safetensors",
570
+ "model.vision_backbone.image_vit.blocks.9.attention.wk.weight": "model-00007-of-00007.safetensors",
571
+ "model.vision_backbone.image_vit.blocks.9.attention.wo.bias": "model-00007-of-00007.safetensors",
572
+ "model.vision_backbone.image_vit.blocks.9.attention.wo.weight": "model-00007-of-00007.safetensors",
573
+ "model.vision_backbone.image_vit.blocks.9.attention.wq.bias": "model-00007-of-00007.safetensors",
574
+ "model.vision_backbone.image_vit.blocks.9.attention.wq.weight": "model-00007-of-00007.safetensors",
575
+ "model.vision_backbone.image_vit.blocks.9.attention.wv.bias": "model-00007-of-00007.safetensors",
576
+ "model.vision_backbone.image_vit.blocks.9.attention.wv.weight": "model-00007-of-00007.safetensors",
577
+ "model.vision_backbone.image_vit.blocks.9.attention_norm.bias": "model-00007-of-00007.safetensors",
578
+ "model.vision_backbone.image_vit.blocks.9.attention_norm.weight": "model-00007-of-00007.safetensors",
579
+ "model.vision_backbone.image_vit.blocks.9.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
580
+ "model.vision_backbone.image_vit.blocks.9.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
581
+ "model.vision_backbone.image_vit.blocks.9.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
582
+ "model.vision_backbone.image_vit.blocks.9.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
583
+ "model.vision_backbone.image_vit.blocks.9.ffn_norm.bias": "model-00007-of-00007.safetensors",
584
+ "model.vision_backbone.image_vit.blocks.9.ffn_norm.weight": "model-00007-of-00007.safetensors",
585
+ "model.vision_backbone.image_vit.class_embedding": "model-00007-of-00007.safetensors",
586
+ "model.vision_backbone.image_vit.patch_embedding.weight": "model-00007-of-00007.safetensors",
587
+ "model.vision_backbone.image_vit.positional_embedding": "model-00007-of-00007.safetensors",
588
+ "model.vision_backbone.image_vit.pre_ln.bias": "model-00007-of-00007.safetensors",
589
+ "model.vision_backbone.image_vit.pre_ln.weight": "model-00007-of-00007.safetensors",
590
+ "model.vision_backbone.pad_embed": "model-00007-of-00007.safetensors"
591
+ }
592
+ }
modeling_molmo.py ADDED
@@ -0,0 +1,1398 @@
1
+ import math
2
+ from copy import deepcopy
3
+ from dataclasses import fields, dataclass, replace
4
+ from enum import Enum
5
+ from typing import List, Optional, Tuple, Union, Dict, Any, Sequence, Callable, cast, MutableMapping
6
+
7
+ import torch
8
+ from transformers import PreTrainedModel, GenerationConfig, add_start_docstrings
9
+ from transformers.activations import ACT2FN
10
+ from transformers.cache_utils import Cache
11
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
12
+ from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput
13
+ from transformers.models.auto import AutoModelForCausalLM
14
+ from torch import nn
15
+ from transformers.utils import logging
16
+
17
+ from .config_molmo import MolmoConfig, MolmoVisionConfig
18
+ from torch.nn import functional as F
19
+
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+
24
+ MOLMO_START_DOCSTRING = r"""
25
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
26
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
27
+ etc.)
28
+
29
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
30
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
31
+ and behavior.
32
+
33
+ Parameters:
34
+ config ([`MolmoConfig`]):
35
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
36
+ load the weights associated with the model, only the configuration. Check out the
37
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
38
+ """
39
+
40
+
41
+ @add_start_docstrings(
42
+ "The bare Molmo Model outputting raw hidden-states without any specific head on top.",
43
+ MOLMO_START_DOCSTRING,
44
+ )
45
+ class MolmoPreTrainedModel(PreTrainedModel):
46
+ config_class = MolmoConfig
47
+ base_model_prefix = "model"
48
+ _no_split_modules = ["MolmoBlock", "MolmoeBlock", "MolmoVisionBlock"]
49
+ _skip_keys_device_placement = "past_key_values"
50
+ _supports_flash_attn_2 = True
51
+ _supports_sdpa = True
52
+ # supports_gradient_checkpointing = True
53
+ # _supports_cache_class = True
54
+ # _supports_static_cache = False
55
+
56
+ def _init_weights(self, module):
57
+ std = self.config.initializer_range
58
+ if isinstance(module, (nn.Linear,)):
59
+ module.weight.data.normal_(mean=0.0, std=std)
60
+ if module.bias is not None:
61
+ module.bias.data.zero_()
62
+ elif isinstance(module, nn.Embedding):
63
+ module.weight.data.normal_(mean=0.0, std=std)
64
+
65
+
66
+ class MolmoRotaryEmbedding(nn.Module):
67
+ """
68
+ [Rotary positional embeddings (RoPE)](https://arxiv.org/abs/2104.09864).
69
+ """
70
+
71
+ def __init__(self, dim, max_position_embeddings=2048, rope_theta=10000, full_precision=True, device=None):
72
+ super().__init__()
73
+ self.dim = dim
74
+ self.rope_theta = rope_theta
75
+ self.full_precision = full_precision
76
+ self.max_position_embeddings = max_position_embeddings
77
+
78
+ # Cache sin/cos embeddings
79
+ dim = self.dim
80
+ inv_freq = 1.0 / (self.rope_theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim))
81
+ seq = torch.arange(self.max_position_embeddings, device=device, dtype=torch.float)
82
+ freqs = torch.einsum("i , j -> i j", seq, inv_freq)
83
+ positions = torch.cat((freqs, freqs), dim=-1)
84
+ pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :]
85
+ self.register_buffer("rope_pos_sin", pos_sin, persistent=False)
86
+ self.register_buffer("rope_pos_cos", pos_cos, persistent=False)
87
+
88
+ def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
89
+ B, nh, T, hs = x.size()
90
+ x = x.view(B, nh, T, 2, hs // 2)
91
+ x1, x2 = x.unbind(dim=-2)
92
+ return torch.cat((-x2, x1), dim=-1)
93
+
94
+ def apply_rotary_pos_emb(self, pos_sin: torch.Tensor, pos_cos: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
95
+ return (t * pos_cos) + (self.rotate_half(t) * pos_sin)
96
+
97
+ def forward(
98
+ self,
99
+ q: torch.Tensor,
100
+ k: torch.Tensor,
101
+ position_ids: Optional[torch.Tensor] = None
102
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
103
+ if self.full_precision:
104
+ q_, k_ = q.float(), k.float()
105
+ else:
106
+ q_, k_ = q, k
107
+
108
+ with torch.autocast(q.device.type, enabled=False):
109
+ batch_size = q_.shape[0]
110
+ query_len, key_len = q_.shape[-2], k_.shape[-2] # could be different if layer_past not None
111
+ if position_ids is not None:
112
+ freqs_cis_len = self.max_position_embeddings
113
+ else:
114
+ freqs_cis_len = key_len
115
+ # self.get_rotary_embedding(freqs_cis_len, q_.device)
116
+ pos_sin = self.rope_pos_sin[:, :, :freqs_cis_len, :].type_as(q_)
117
+ pos_cos = self.rope_pos_cos[:, :, :freqs_cis_len, :].type_as(q_)
118
+ if position_ids is not None:
119
+ assert query_len == key_len, "Query and key lengths must be equal when using position IDs."
120
+ pos_sin = pos_sin[0, 0][position_ids].view(
121
+ (batch_size, 1, key_len, pos_sin.shape[-1])
122
+ )
123
+ pos_cos = pos_cos[0, 0][position_ids].view(
124
+ (batch_size, 1, key_len, pos_cos.shape[-1])
125
+ )
126
+ q_ = self.apply_rotary_pos_emb(
127
+ pos_sin[:, :, key_len - query_len : key_len, :],
128
+ pos_cos[:, :, key_len - query_len : key_len, :],
129
+ q_,
130
+ )
131
+ k_ = self.apply_rotary_pos_emb(pos_sin, pos_cos, k_)
132
+ return q_.type_as(q), k_.type_as(k)
133
+
134
+
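+ # A minimal usage sketch (illustrative, not part of this checkpoint's code): the module
+ # rotates query/key features by position. Shapes below are assumptions.
+ #
+ # rope = MolmoRotaryEmbedding(dim=64, max_position_embeddings=128)
+ # q = torch.randn(2, 8, 16, 64)  # (batch, n_heads, seq_len, head_dim)
+ # k = torch.randn(2, 8, 16, 64)
+ # q_rot, k_rot = rope(q, k)      # positions 0..15 applied; shapes unchanged
+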
135
+ class MolmoAttention(nn.Module):
136
+ def __init__(
137
+ self,
138
+ config: MolmoConfig,
139
+ device=None
140
+ ):
141
+ super().__init__()
142
+ self.config = config
143
+ self.rotary_emb = MolmoRotaryEmbedding(
144
+ config.hidden_size // config.num_attention_heads,
145
+ config.max_position_embeddings,
146
+ config.rope_theta, device=device)
147
+
148
+ self.k_norm: Optional[nn.Module] = None
149
+ self.q_norm: Optional[nn.Module] = None
150
+ self.hidden_size = config.intermediate_size
151
+ if config.qk_layer_norm:
152
+ if config.num_key_value_heads is None:
153
+ config.num_key_value_heads = config.num_attention_heads
154
+ self.q_norm = MolmoRmsLayerNorm(
155
+ config,
156
+ size=config.hidden_size,
157
+ eps=config.layer_norm_eps
158
+ )
159
+ self.k_norm = MolmoRmsLayerNorm(
160
+ config,
161
+ size=config.hidden_size,
162
+ eps=config.layer_norm_eps
163
+ )
164
+
165
+ # Attention output projection.
166
+ input_dim = config.hidden_size
167
+ head_dim = config.hidden_size // config.num_attention_heads
168
+ self.fused_dims = (
169
+ config.hidden_size,
170
+ config.num_key_value_heads * head_dim,
171
+ config.num_key_value_heads * head_dim,
172
+ )
173
+ self.att_proj = nn.Linear(
174
+ config.hidden_size, sum(self.fused_dims),
175
+ bias=config.qkv_bias,
176
+ )
177
+ self.attn_out = nn.Linear(
178
+ input_dim, config.hidden_size,
179
+ bias=False,
180
+ )
181
+
182
+ def attention(self,
183
+ q: torch.Tensor,
184
+ k: torch.Tensor,
185
+ v: torch.Tensor,
186
+ attention_mask: Optional[torch.Tensor] = None,
187
+ position_ids: Optional[torch.Tensor] = None,
188
+ drop_mask: Optional[torch.Tensor] = None,
189
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
190
+ use_cache: bool = False,
191
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
192
+ B, T, C = q.size() # batch size, sequence length, hidden_size
193
+ dtype = k.dtype
194
+
195
+ # Optionally apply layer norm to keys and queries.
196
+ if self.q_norm is not None and self.k_norm is not None:
197
+ q = self.q_norm(q).to(dtype=dtype)
198
+ k = self.k_norm(k).to(dtype=dtype)
199
+
200
+ # Move head forward to be next to the batch dim.
201
+ # shape: (B, nh, T, hs)
202
+ q = q.view(B, T, self.config.num_attention_heads, C // self.config.num_attention_heads).transpose(1, 2)
203
+ # shape: (B, n_kv_h, T, hs)
204
+ k = k.view(B, T, self.config.num_key_value_heads, C // self.config.num_attention_heads).transpose(1, 2)
205
+ # shape: (B, n_kv_h, T, hs)
206
+ v = v.view(B, T, self.config.num_key_value_heads, C // self.config.num_attention_heads).transpose(1, 2)
207
+
208
+ # Apply rotary embeddings
209
+ q, k = self.rotary_emb(q, k, position_ids=position_ids)
210
+
211
+ if layer_past is not None:
212
+ past_key, past_value = layer_past
213
+ k = torch.cat((past_key.to(k.device), k), dim=-2)
214
+ v = torch.cat((past_value.to(v.device), v), dim=-2)
215
+
216
+ present = (k, v) if use_cache else None
217
+ query_len, key_len = q.shape[-2], k.shape[-2] # could be different if layer_past not None
218
+
219
+ if attention_mask is not None:
220
+ attention_mask = attention_mask[:, :, key_len - query_len: key_len, :key_len]
221
+
222
+ # if attention_bias is not None:
223
+ # attention_bias = self._cast_attn_bias(
224
+ # attention_bias[:, :, key_len - query_len : key_len, :key_len], dtype)
225
+
226
+ # Get the attention scores.
227
+ # shape: (B, nh, T, hs)
228
+ att = self._scaled_dot_product_attention(
229
+ q,
230
+ k,
231
+ v,
232
+ attention_mask=attention_mask,
233
+ dropout_p=0.0 if not self.training else self.config.attention_dropout,
234
+ is_causal=attention_mask is None,
235
+ )
236
+
237
+ # Re-assemble all head outputs side-by-side.
238
+ att = att.transpose(1, 2).contiguous().view(B, T, C)
239
+
240
+ # Apply output projection.
241
+ return self.attn_out(att), present
242
+
243
+ def _scaled_dot_product_attention(
244
+ self,
245
+ q: torch.Tensor,
246
+ k: torch.Tensor,
247
+ v: torch.Tensor,
248
+ attention_mask: Optional[torch.Tensor] = None,
249
+ dropout_p: float = 0.0,
250
+ is_causal: bool = False,
251
+ ) -> torch.Tensor:
252
+ if attention_mask is not None:
253
+ attention_mask = attention_mask.to(q.device)
254
+
255
+ if self.config.attention_type == "sdpa":
256
+ assert k.size(1) == v.size(1)
257
+ num_kv_heads = k.size(1)
258
+ num_q_heads = q.size(1)
259
+ if num_q_heads != num_kv_heads:
260
+ assert num_q_heads % num_kv_heads == 0
261
+ k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
262
+ v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
263
+
264
+ return F.scaled_dot_product_attention(
265
+ q,
266
+ k,
267
+ v,
268
+ attn_mask=attention_mask,
269
+ dropout_p=dropout_p,
270
+ is_causal=is_causal,
271
+ )
272
+ elif self.config.attention_type == "flash":
273
+ # Downcast in case we are running with fp32 hidden states
274
+ # Our attention mask is [1, 1, N, N]
275
+ valid_mask = torch.any(attention_mask, dim=-1)[0]  # torch has no reduce_any; reduce over the last dim
276
+ attn_output = _flash_attention_forward(
277
+ q.transpose(1, 2).to(torch.bfloat16),
278
+ k.transpose(1, 2).to(torch.bfloat16),
279
+ v.transpose(1, 2).to(torch.bfloat16),
280
+ attention_mask=valid_mask,
281
+ query_length=q.shape[2],
282
+ is_causal=True,
283
+ )
+ # flash returns (batch, q_len, n_heads, head_dim); transpose so the caller's
+ # transpose(1, 2) + view recovers the same layout as the sdpa branch
+ return attn_output.transpose(1, 2)
284
+ else:
285
+ raise NotImplementedError(self.config.attention_type)
286
+
287
+ def forward(
288
+ self,
289
+ x,
290
+ attention_mask,
291
+ position_ids,
292
+ layer_past,
293
+ use_cache
294
+ ):
295
+ qkv = self.att_proj(x)
296
+
297
+ q, k, v = qkv.split(self.fused_dims, dim=-1)
298
+
299
+ # Get attention scores.
300
+ att, cache = self.attention(
301
+ q, k, v,
302
+ attention_mask,
303
+ position_ids=position_ids,
304
+ layer_past=layer_past,
305
+ use_cache=use_cache
306
+ )
307
+ return att, cache
308
+
309
+
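+ # Sketch (illustrative assumptions, not checkpoint code): how the fused att_proj output is
+ # split for grouped-query attention with e.g. hidden_size=4096, 32 query heads, 8 kv heads:
+ #
+ # head_dim = 4096 // 32                      # 128
+ # fused_dims = (4096, 8 * 128, 8 * 128)      # q, k, v widths -> (4096, 1024, 1024)
+ # qkv = torch.randn(1, 10, sum(fused_dims))
+ # q, k, v = qkv.split(fused_dims, dim=-1)    # k/v are repeated 32//8 = 4x per kv head in sdpa
+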
310
+ class MolmoMlp(nn.Module):
311
+ def __init__(self, input_dim, hidden_size, activation_fn, include_bias=False):
312
+ super().__init__()
313
+ self.ff_proj = nn.Linear(input_dim, hidden_size, bias=include_bias)
314
+ self.ff_out = nn.Linear(hidden_size//2, input_dim, bias=include_bias)
315
+ self.act = ACT2FN[activation_fn]
316
+
317
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
318
+ x = self.ff_proj(x)
319
+ x, gate = x.chunk(2, dim=-1)
320
+ x = self.act(gate) * x
321
+ x = self.ff_out(x)
322
+ return x
323
+
324
+
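+ # Sketch of the gating math above (illustrative sizes; "silu" assumed for activation_fn):
+ # ff_proj doubles the width, half is the value path and half the gate.
+ #
+ # mlp = MolmoMlp(input_dim=8, hidden_size=32, activation_fn="silu")
+ # h = torch.randn(2, 5, 8)
+ # x, gate = mlp.ff_proj(h).chunk(2, dim=-1)           # each (2, 5, 16)
+ # assert torch.allclose(mlp(h), mlp.ff_out(F.silu(gate) * x))
+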
325
+ class MolmoBlock(nn.Module):
326
+ def __init__(self, config: MolmoConfig, device=None):
327
+ super().__init__()
328
+ self.config = config
329
+ self.hidden_size = config.intermediate_size
330
+ self.dropout = nn.Dropout(config.residual_dropout)
331
+ self.attn = MolmoAttention(config)
332
+ self.attn_norm = MolmoRmsLayerNorm(config, size=config.hidden_size, eps=config.layer_norm_eps)
333
+ self.mlp = MolmoMlp(config.hidden_size, config.intermediate_size, config.activation_type)
334
+ self.ff_norm = MolmoRmsLayerNorm(config)
335
+
336
+ def forward(
337
+ self,
338
+ x: torch.Tensor,
339
+ attention_mask: Optional[torch.Tensor] = None,
340
+ position_ids: Optional[torch.Tensor] = None,
341
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
342
+ use_cache: bool = False,
343
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
344
+ if not self.config.norm_after:
345
+ atten_in = self.attn_norm(x)
346
+ else:
347
+ atten_in = x
348
+
349
+ att, cache = self.attn(
350
+ atten_in,
351
+ attention_mask=attention_mask,
352
+ position_ids=position_ids,
353
+ layer_past=layer_past,
354
+ use_cache=use_cache
355
+ )
356
+
357
+ if self.config.norm_after:
358
+ att = self.attn_norm(att)
359
+
360
+ x = x + self.dropout(att)
361
+
362
+ og_x = x
363
+
364
+ if not self.config.norm_after:
365
+ x = self.ff_norm(x)
366
+
367
+ x = self.mlp(x)
368
+
369
+ if self.config.norm_after:
370
+ x = self.ff_norm(x)
371
+
372
+ x = self.dropout(x)
373
+ x = og_x + x
374
+
375
+ return x, cache
376
+
377
+
378
+ class MolmoeMLP(nn.Module):
379
+ def __init__(self, input_dim, hidden_size, activation):
380
+ super().__init__()
381
+ self.gate_proj = nn.Linear(input_dim, hidden_size, bias=False)
382
+ self.up_proj = nn.Linear(input_dim, hidden_size, bias=False)
383
+ self.down_proj = nn.Linear(hidden_size, input_dim, bias=False)
384
+ self.act_fn = ACT2FN[activation]
385
+
386
+ def forward(self, x):
387
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
388
+
389
+
390
+ class MolmoeMlpExpert(nn.Module):
391
+ def __init__(self, config):
392
+ super().__init__()
393
+ self.num_experts = config.moe_num_experts
394
+ self.top_k = config.moe_top_k
395
+ self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
396
+ self.experts = nn.ModuleList([MolmoeMLP(config.hidden_size, config.intermediate_size // 2, config.activation_type)
397
+ for _ in range(self.num_experts)])
398
+
399
+ def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
400
+ # hidden_states = self.ff_norm(hidden_states)
401
+ batch_size, sequence_length, hidden_dim = hidden_states.shape
402
+ hidden_states = hidden_states.view(-1, hidden_dim)
403
+ # router_logits: (batch * sequence_length, n_experts)
404
+ router_logits = self.gate(hidden_states)
405
+
406
+ routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
407
+ routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
408
+
409
+ # we cast back to the input dtype
410
+ routing_weights = routing_weights.to(hidden_states.dtype)
411
+
412
+ final_hidden_states = torch.zeros(
413
+ (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
414
+ )
415
+
416
+ # One hot encode the selected experts to create an expert mask
417
+ # this will be used to easily index which expert is going to be selected
418
+ expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
419
+
420
+ # Loop over all available experts in the model and perform the computation on each expert
421
+ for expert_idx in range(self.num_experts):
422
+ expert_layer = self.experts[expert_idx]
423
+ idx, top_x = torch.where(expert_mask[expert_idx])
424
+
425
+ # Index the correct hidden states and compute the expert hidden state for
426
+ # the current expert. We need to make sure to multiply the output hidden
427
+ # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
428
+ current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
429
+ current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
430
+
431
+ # However `index_add_` only support torch tensors for indexing so we'll use
432
+ # the `top_x` tensor here.
433
+ final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
434
+ final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
435
+ return final_hidden_states, router_logits
436
+
437
+
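+ # Sketch of the routing arithmetic above on toy numbers (assumptions: 4 experts, top_k=2,
+ # one token):
+ #
+ # logits = torch.tensor([[2.0, 0.5, 1.0, -1.0]])
+ # weights = F.softmax(logits, dim=1)                # ~[0.61, 0.14, 0.22, 0.03]
+ # top_w, top_idx = torch.topk(weights, 2, dim=-1)   # experts 0 and 2 selected
+ # # expert_mask + torch.where pick the token rows each expert processes, and
+ # # index_add_ scatters the weighted expert outputs back into position.
+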
438
+ class MolmoeBlock(nn.Module):
439
+ def __init__(self, config: MolmoConfig):
440
+ super().__init__()
441
+ self.attn = MolmoAttention(config)
442
+ self.attn_norm = MolmoRmsLayerNorm(config, size=config.hidden_size, eps=config.layer_norm_eps)
443
+ assert config.moe_num_experts > 0
444
+ self.ff_norm = MolmoRmsLayerNorm(config, size=config.hidden_size, eps=config.layer_norm_eps)
445
+ self.mlp = MolmoeMlpExpert(config)
446
+ self.config = config
447
+ self.hidden_size = config.intermediate_size
448
+ self.dropout = nn.Dropout(config.residual_dropout)
449
+
450
+ def forward(
451
+ self,
452
+ x: torch.Tensor,
453
+ attention_mask: Optional[torch.FloatTensor] = None,
454
+ position_ids: Optional[torch.Tensor] = None,
455
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
456
+ use_cache: bool = False,
457
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
458
+ if not self.config.norm_after:
459
+ atten_in = self.attn_norm(x)
460
+ else:
461
+ atten_in = x
462
+
463
+ att, cache = self.attn(
464
+ atten_in,
465
+ attention_mask=attention_mask,
466
+ position_ids=position_ids,
467
+ layer_past=layer_past,
468
+ use_cache=use_cache
469
+ )
470
+
471
+ if self.config.norm_after:
472
+ att = self.attn_norm(att)
473
+
474
+ x = x + self.dropout(att)
475
+ og_x = x
476
+
477
+ if not self.config.norm_after:
478
+ x = self.ff_norm(x)
479
+
480
+ x, _ = self.mlp(x)
481
+
482
+ if self.config.norm_after:
483
+ x = self.ff_norm(x)
484
+
485
+ x = self.dropout(x)
486
+ x = og_x + x
487
+ return x, cache
488
+
489
+
490
+ class Embedding(nn.Module):
491
+ def __init__(
492
+ self,
493
+ num_embeddings: int,
494
+ num_new_embeddings: int,
495
+ features: int,
496
+ device: Union[str, torch.device] = None,
497
+ initializer_range: float = 0.02,
498
+ new_embed_initializer_range: float = 0.02,
499
+ ):
500
+ super().__init__()
501
+ self.initializer_range = initializer_range
502
+ self.new_embed_initializer_range = new_embed_initializer_range
503
+ self.embedding = nn.Parameter(
504
+ torch.zeros(num_embeddings, features, device=device),
505
+ )
506
+ # We keep the special-token embedding separate from the LM's embedding so we can
508
+ # apply a separate learning rate to it during training
508
+ self.new_embedding = nn.Parameter(torch.zeros(num_new_embeddings, features, device=device))
509
+
510
+ def reset_parameters(self):
511
+ nn.init.normal_(self.embedding, std=self.initializer_range)
512
+ nn.init.normal_(self.new_embedding, std=self.new_embed_initializer_range)
513
+
514
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
515
+ return F.embedding(x, torch.cat([self.embedding, self.new_embedding], dim=0))
516
+
517
+
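+ # Sketch (illustrative sizes): ids >= num_embeddings resolve into new_embedding because the
+ # two tables are concatenated at lookup time.
+ #
+ # emb = Embedding(num_embeddings=100, num_new_embeddings=4, features=16)
+ # ids = torch.tensor([[5, 99, 100, 103]])  # 100..103 hit the special-token table
+ # out = emb(ids)                           # (1, 4, 16)
+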
518
+ def _expand_token(token, batch_size: int):
519
+ return token.view(1, 1, -1).expand(batch_size, -1, -1)
520
+
521
+
522
+ class VisionMlp(nn.Module):
523
+ def __init__(self, dim: int, hidden_dim: int, hidden_act: str, device=None):
524
+ super().__init__()
525
+ self.w1 = nn.Linear(dim, hidden_dim, bias=True, device=device)
526
+ self.act = ACT2FN[hidden_act]
527
+ self.w2 = nn.Linear(hidden_dim, dim, bias=True, device=device)
528
+
529
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
530
+ return self.w2(self.act(self.w1(x)))
531
+
532
+
533
+ class MolmoVisionBlock(nn.Module):
534
+
535
+ def __init__(self, config: MolmoVisionConfig, attention_type, device=None):
536
+ super().__init__()
537
+ self.attention = VisionAttention(config, device=device, attention_type=attention_type)
538
+ self.feed_forward = VisionMlp(
539
+ config.image_emb_dim, config.image_mlp_dim, config.image_mlp_activations, device)
540
+ self.attention_norm = nn.LayerNorm(
541
+ config.image_emb_dim,
542
+ eps=config.image_norm_eps,
543
+ device=device,
544
+ )
545
+ self.ffn_norm = nn.LayerNorm(
546
+ config.image_emb_dim,
547
+ eps=config.image_norm_eps,
548
+ device=device,
549
+ )
550
+
551
+ def reset_parameters(self):
552
+ self.attention.reset_parameters()
553
+ self.feed_forward.reset_parameters()
554
+ self.attention_norm.reset_parameters()
555
+ self.ffn_norm.reset_parameters()
556
+
557
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
558
+ x = x + self.attention(self.attention_norm(x))
559
+ x = x + self.feed_forward(self.ffn_norm(x))
560
+ return x
561
+
562
+
563
+ class VisionPreLayerNorm(nn.LayerNorm):
564
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
565
+ orig_type = x.dtype
566
+ x = F.layer_norm(x.to(torch.float32), self.normalized_shape, self.weight.to(torch.float32),
567
+ self.bias.to(torch.float32), self.eps)
568
+ return x.to(orig_type)
569
+
570
+
571
+ class VisionTransformer(nn.Module):
572
+
573
+ def __init__(self, config: MolmoVisionConfig, attention_type, device=None):
574
+ super().__init__()
575
+ self.config = config
576
+
577
+ # class embeddings and positional embeddings
578
+ self.scale = config.image_emb_dim ** -0.5
579
+ self.class_embedding = nn.Parameter(
580
+ torch.zeros(config.image_emb_dim, device=device))
581
+ self.positional_embedding = nn.Parameter(
582
+ torch.zeros(config.image_num_pos, config.image_emb_dim, device=device))
583
+
584
+ image_patch_size = config.image_patch_size
585
+ self.patch_embedding = nn.Linear(
586
+ image_patch_size * image_patch_size * 3,
587
+ config.image_emb_dim,
588
+ bias=False,
589
+ device=device
590
+ )
591
+
592
+ self.pre_ln = VisionPreLayerNorm(
593
+ config.image_emb_dim,
594
+ eps=config.image_norm_eps,
595
+ )
596
+ self.blocks = nn.ModuleList([
597
+ MolmoVisionBlock(config, attention_type=attention_type, device=device)
598
+ for _ in range(config.image_num_layers)
599
+ ])
600
+
601
+ def add_pos_emb(self, x: torch.Tensor, patch_num: Tuple[int, int]) -> torch.Tensor:
602
+ cls_emb = self.positional_embedding[0:1]
603
+ pos_emb = self.positional_embedding[1:]
604
+
605
+ pos_emb = pos_emb.reshape(
606
+ (int(math.sqrt(pos_emb.shape[0])), int(math.sqrt(pos_emb.shape[0])), pos_emb.shape[1])
607
+ )
608
+
609
+ (patch_num_0, patch_num_1) = patch_num
610
+
611
+ if pos_emb.shape[0] != patch_num_0 or pos_emb.shape[1] != patch_num_1:
612
+ # Derived from https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
613
+ # antialias: default True in jax.image.resize
614
+ pos_emb = pos_emb.unsqueeze(0).permute(0, 3, 1, 2)
615
+ pos_emb = F.interpolate(
616
+ pos_emb, size=(patch_num_0, patch_num_1), mode="bicubic", align_corners=False, antialias=True,
617
+ )
618
+ pos_emb = pos_emb.permute(0, 2, 3, 1).squeeze(0)
619
+
620
+ pos_emb = pos_emb.reshape(-1, pos_emb.shape[-1])
621
+ x = x + torch.cat([cls_emb[None, :, :], pos_emb[None, :, :]], dim=1).to(x.dtype)
622
+ return x
623
+
624
+ def forward(self, x: torch.Tensor, patch_num: Optional[Tuple[int, int]] = None) -> List[torch.Tensor]:
625
+ if patch_num is None:
626
+ patch_num = self.config.image_num_patch
627
+ B, N, D = x.shape
628
+
629
+ x = self.patch_embedding(x)
630
+
631
+ # class embeddings and positional embeddings
632
+ x = torch.cat([_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], dim=1)
633
+ x = self.add_pos_emb(x, patch_num)
634
+
635
+ x = self.pre_ln(x)
636
+
637
+ hidden_states = []
638
+ for r in self.blocks:
639
+ x = r(x)
640
+ hidden_states.append(x)
641
+ return hidden_states
642
+
643
+
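+ # Sketch of the positional-embedding resize in add_pos_emb (illustrative sizes): a learned
+ # (G, G, D) grid is bicubically resampled to the crop's patch grid.
+ #
+ # pos = torch.randn(24 * 24, 64)                                    # G=24, D=64
+ # grid = pos.reshape(24, 24, 64).unsqueeze(0).permute(0, 3, 1, 2)   # (1, D, G, G)
+ # resized = F.interpolate(grid, size=(18, 12), mode="bicubic",
+ #                         align_corners=False, antialias=True)
+ # resized = resized.permute(0, 2, 3, 1).squeeze(0)                  # (18, 12, D)
+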
644
+ class VisionAttention(nn.Module):
645
+ def __init__(self, config: MolmoVisionConfig, use_bias: bool = True,
646
+ embed_dim: Optional[int] = None, device=None, attention_type: str = "sdpa"):
647
+ super().__init__()
648
+ self.config = config
649
+ self.embed_dim = config.image_emb_dim
650
+ self.num_heads = config.image_num_heads
651
+ self.head_dim = config.image_head_dim
652
+ self.num_key_value_heads = config.image_num_key_value_heads
653
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
654
+ self.initializer_range = config.initializer_range
655
+ self.attention_type = attention_type
656
+
657
+ embed_dim = embed_dim if embed_dim else config.image_emb_dim
658
+
659
+ self.wq = nn.Linear(
660
+ embed_dim,
661
+ self.num_heads * self.head_dim,
662
+ bias=use_bias,
663
+ device=device,
664
+ )
665
+ self.wk = nn.Linear(
666
+ embed_dim,
667
+ self.num_key_value_heads * self.head_dim,
668
+ bias=use_bias,
669
+ device=device,
670
+ )
671
+ self.wv = nn.Linear(
672
+ embed_dim,
673
+ self.num_key_value_heads * self.head_dim,
674
+ bias=use_bias,
675
+ device=device,
676
+ )
677
+ self.wo = nn.Linear(
678
+ self.num_heads * self.head_dim,
679
+ self.embed_dim,
680
+ bias=use_bias,
681
+ device=device,
682
+ )
683
+ self.residual_dropout = nn.Dropout(config.residual_dropout)
684
+
685
+ def _split_heads(self, hidden_states, num_heads) -> torch.Tensor:
686
+ return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim))
687
+
688
+ def _merge_heads(self, hidden_states) -> torch.Tensor:
689
+ return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
690
+
691
+ def forward(self, inputs_q: torch.Tensor, inputs_kv: Optional[torch.Tensor] = None) -> torch.Tensor:
692
+ if inputs_kv is not None:
693
+ inputs_k = inputs_kv
694
+ inputs_v = inputs_kv
695
+ else:
696
+ inputs_k = inputs_q
697
+ inputs_v = inputs_q
698
+
699
+ xq, xk, xv = self.wq(inputs_q), self.wk(inputs_k), self.wv(inputs_v)
700
+
701
+ xq = self._split_heads(xq, self.num_heads)
702
+ xk = self._split_heads(xk, self.num_key_value_heads)
703
+ xv = self._split_heads(xv, self.num_key_value_heads)
704
+
705
+ if self.num_heads != self.num_key_value_heads:
706
+ xk = xk.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
707
+ xv = xv.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
708
+
709
+ og_dtype = xq.dtype
710
+
711
+ if self.config.float32_attention:
712
+ xq = xq.to(torch.float)
713
+ xk = xk.to(torch.float)
714
+
715
+ if self.attention_type == "direct":
716
+ attn_weights = torch.einsum("...qhd,...khd->...hqk", xq / math.sqrt(xq.size(-1)), xk)
717
+ attn_weights = F.softmax(attn_weights, dim=-1)
718
+ attn_output = torch.einsum("...hqk,...khd->...qhd", attn_weights.to(xv.dtype), xv)
719
+
720
+ elif self.attention_type == "sdpa":
721
+ if self.config.float32_attention and not torch.is_autocast_enabled():
722
+ xv = xv.to(torch.float32)
723
+ attn_output = F.scaled_dot_product_attention(
724
+ xq.transpose(1, 2).contiguous(),
725
+ xk.transpose(1, 2).contiguous(),
726
+ xv.transpose(1, 2).contiguous(),
727
+ is_causal=False,
728
+ ).transpose(1, 2)
729
+
730
+ elif self.attention_type == "flash":
731
+ assert not self.config.float32_attention
732
+ # Downcast in case we are running with fp32 hidden states
733
+ attn_output = _flash_attention_forward(
734
+ xq.transpose(1, 2).to(torch.bfloat16),
735
+ xk.transpose(1, 2).to(torch.bfloat16),
736
+ xv.transpose(1, 2).to(torch.bfloat16),
737
+ attention_mask=None,
738
+ query_length=inputs_q.shape[1],
739
+ is_causal=False,
740
+ )
741
+ else:
742
+ raise NotImplementedError(self.attention_type)
743
+ attn_output = attn_output.to(og_dtype)
744
+ attn_output = self._merge_heads(attn_output)
745
+ attn_output = self.wo(attn_output)
746
+ attn_output = self.residual_dropout(attn_output)
747
+ return attn_output
748
+
749
+
750
+ class MolmoImageProjector(nn.Module):
751
+ def __init__(self, input_dim: int, hidden_dim, output_dim, act_fn="silu", device=None):
752
+ super().__init__()
753
+ self.w1 = nn.Linear(input_dim, hidden_dim, bias=False, device=device)
754
+ self.w2 = nn.Linear(hidden_dim, output_dim, bias=False, device=device)
755
+ self.w3 = nn.Linear(input_dim, hidden_dim, bias=False, device=device)
756
+ self.act_fn = ACT2FN[act_fn]
757
+
758
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
759
+ return self.w2(self.act_fn(self.w1(x))*self.w3(x))
760
+
761
+
762
+ class OLMoVisionBackbone(nn.Module):
763
+ def __init__(self, config: MolmoConfig):
764
+ super().__init__()
765
+ self.config = config
766
+ self.image_vit = VisionTransformer(config.vision_config, config.attention_type)
767
+
768
+ self.image_pooling_2d = VisionAttention(
769
+ config.vision_config,
770
+ embed_dim=len(config.vit_layers)*config.vision_config.image_emb_dim,
771
+ attention_type=config.attention_type
772
+ )
773
+
774
+ # `MLP` assumes the activation takes two inputs, so it must be a 'llama' version
775
+ if config.activation_type == "swiglu":
776
+ mlp_config = replace(config, activation_type="llama_swiglu")
777
+ elif config.activation_type == "gelu":
778
+ raise NotImplementedError()
779
+ else:
780
+ mlp_config = config
781
+
782
+ self.image_projector = MolmoImageProjector(
783
+ config.vision_config.image_emb_dim,
784
+ config.intermediate_size//2, # //2 since `mlp_hidden_size` counts both the gate and hidden projections
785
+ config.hidden_size,
786
+ act_fn=config.activation_type
787
+ )
788
+ self.image_feature_dropout = nn.Dropout(config.image_feature_dropout)
789
+ self.num_prefix_tokens = 1
790
+
791
+ self.pad_embed = None
792
+ if config.image_padding_embed:
793
+ image_dim = config.vision_config.image_emb_dim*len(self.config.vit_layers)
794
+ if config.image_padding_embed == "pad_and_partial_pad":
795
+ self.pad_embed = nn.Parameter(torch.zeros((2, image_dim)))
796
+ else:
797
+ raise ValueError(config.image_padding_embed)
798
+
799
+ def encode_image(self, images: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
800
+ cfg = self.config
801
+ v_cfg = self.config.vision_config
802
+ B, T, N, D = images.shape
803
+
804
+ mask = ~torch.all(images.view(B * T, N, D) == -1, dim=(1, 2), keepdim=True)
805
+
806
+ # Output all hidden states
807
+ # n_layers x (batch_num_crops, (1+)n_tokens, image_emb_dim)
808
+ images = images.view(B * T, N, D)
809
+ image_features = self.image_vit(images)
810
+
811
+ if cfg.vit_layers is not None:
812
+ features = []
813
+ for layer in cfg.vit_layers:
814
+ features.append(image_features[layer])
815
+ image_features = torch.cat(features, dim=-1)
816
+ else:
817
+ image_features = image_features[-1]
818
+
819
+ cls_embed: Optional[torch.Tensor] = None
820
+ if self.num_prefix_tokens > 0:
821
+ cls_embed = image_features[:, 0]
822
+ image_features = image_features[:, 1:]
823
+
824
+ image_features = image_features * mask
825
+ image_features = image_features.view(B, T, N, -1)
826
+
827
+ cls_embed = cls_embed.view(B, T, -1) if cls_embed is not None else None
828
+
829
+ return image_features, cls_embed
830
+
831
+ def forward(self, images: torch.Tensor, image_masks: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
832
+ cfg = self.config
833
+
834
+ # image_features: (batch_size, num_crops(=num_image), num_patch, n_layers x image_emb_dim)
835
+ batch_size, num_image = images.shape[:2]
836
+ image_features, cls_embed = self.encode_image(images)
837
+
838
+ if cfg.image_padding_embed:
839
+ assert image_masks is not None
840
+ if cfg.image_padding_embed == "pad_embed":
841
+ all_pad = (image_masks == 0).to(dtype=torch.float32)
842
+ pad_embed = self.pad_embed[None, None, None, :]
843
+ image_features = image_features + pad_embed * torch.unsqueeze(all_pad, -1)
844
+ elif cfg.image_padding_embed == "regress":
845
+ pad_embed = self.pad_embed[None, None, None, :]
846
+ image_features = image_features + pad_embed * torch.unsqueeze(torch.maximum(image_masks, torch.zeros_like(image_masks)), -1)
847
+ elif cfg.image_padding_embed == "pad_and_partial_pad":
848
+ pad_embed = self.pad_embed[:, None, None, None, :]
849
+ all_pad = image_masks == 0
850
+ partial_pad = torch.logical_and(image_masks < 1, torch.logical_not(all_pad)).to(dtype=image_features.dtype)
851
+ all_pad = all_pad.to(dtype=image_features.dtype)
852
+ image_features = image_features + pad_embed[0] * torch.unsqueeze(all_pad, -1)
853
+ image_features = image_features + pad_embed[1] * torch.unsqueeze(partial_pad, -1)
854
+ else:
855
+ raise ValueError(cfg.image_padding_embed)
856
+
857
+ image_features = self.image_feature_dropout(image_features)
858
+ if cls_embed is not None:
859
+ cls_embed = self.image_feature_dropout(cls_embed)
860
+
861
+ image_features = image_features.reshape(
862
+ (batch_size, num_image) + cfg.image_num_patch + (-1,))
863
+
864
+ # transpose to get 2x2 feature squares [n_patches, 4, n_features]
865
+ batch, n_crops, h, w, c = image_features.shape
866
+ image_features = torch.reshape(image_features, [batch*n_crops, h//2, 2, w//2, 2, c])
867
+ image_features = torch.permute(image_features, [0, 1, 3, 2, 4, 5])
868
+ image_features = torch.reshape(image_features, [batch*n_crops*h//2*w//2, 2*2, c])
869
+
870
+ query = image_features.mean(-2, keepdim=True)
871
+ image_features = self.image_pooling_2d(query, image_features)
872
+
873
+ h = self.config.vision_config.image_num_patch[0]//2
874
+ w = self.config.vision_config.image_num_patch[1]//2
875
+ image_features = image_features.reshape(batch_size, num_image, h * w, -1)
876
+
877
+ # MLP layer to map the feature.
878
+ image_features = self.image_projector(image_features)
879
+
880
+ # image_features: (batch_size, num_image, num_patch, hidden_size)
881
+ # cls_embed: (batch_size, num_image, hidden_size)
882
+ return image_features, cls_embed
883
+
884
+
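+ # Sketch of the 2x2 pooling regroup in forward above (illustrative sizes): an (h, w) patch
+ # grid becomes h/2 * w/2 groups of 4 neighbouring features, whose mean is the pooling query.
+ #
+ # feats = torch.randn(1, 4, 6, 16)  # (batch*crops, h, w, c)
+ # g = feats.reshape(1, 2, 2, 3, 2, 16).permute(0, 1, 3, 2, 4, 5).reshape(6, 4, 16)
+ # query = g.mean(-2, keepdim=True)  # (n_groups, 1, c), attended over the 4 patches
+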
885
+ def causal_attention_bias(seq_len: int, device: torch.device) -> torch.FloatTensor:
886
+ att_bias = torch.triu(
887
+ torch.ones(seq_len, seq_len, device=device, dtype=torch.float),
888
+ diagonal=1,
889
+ )
890
+ att_bias.masked_fill_(att_bias == 1, torch.finfo(att_bias.dtype).min)
891
+ return att_bias.view(1, 1, seq_len, seq_len) # type: ignore
892
+
893
+
894
+ class MolmoRmsLayerNorm(nn.Module):
895
+ """
896
+ RMS layer norm, a simplified :class:`LayerNorm` implementation
897
+ """
898
+
899
+ def __init__(
900
+ self,
901
+ config: MolmoConfig,
902
+ size: Optional[int] = None,
903
+ elementwise_affine: Optional[bool] = None,
904
+ eps: float = 1e-5,
905
+ ):
906
+ super().__init__()
907
+ self.config = config
908
+ self.eps = self.config.layer_norm_eps or eps
909
+ self.normalized_shape = (size or config.hidden_size,)
910
+ if elementwise_affine or (elementwise_affine is None):
911
+ self.weight = nn.Parameter(torch.ones(self.normalized_shape))
912
+ use_bias = self.config.bias_for_layer_norm
913
+ if use_bias:
914
+ self.bias = nn.Parameter(torch.zeros(self.normalized_shape))
915
+ else:
916
+ self.register_parameter("bias", None)
917
+ else:
918
+ self.register_parameter("bias", None)
919
+ self.register_parameter("weight", None)
920
+
921
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
922
+ with torch.autocast(enabled=False, device_type=x.device.type):
923
+ og_dtype = x.dtype
924
+ x = x.to(torch.float32)
925
+ variance = x.pow(2).mean(-1, keepdim=True)
926
+ x = x * torch.rsqrt(variance + self.eps)
927
+ x = x.to(og_dtype)
928
+
929
+ if self.weight is not None:
930
+ if self.bias is not None:
931
+ return self.weight * x + self.bias
932
+ else:
933
+ return self.weight * x
934
+ else:
935
+ return x
936
+
937
+
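+ # Manual check of the normalization above (illustrative): RMSNorm is x * rsqrt(mean(x^2) + eps),
+ # computed in float32; with weight == 1 and no bias the module output equals:
+ #
+ # x = torch.randn(2, 8)
+ # manual = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5)
+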
938
+ class MolmoModel(MolmoPreTrainedModel):
939
+ def __init__(self, config: MolmoConfig, init_params: bool = True):
940
+ super().__init__(config)
941
+
942
+ if self.config.additional_vocab_size is not None:
943
+ wte = Embedding(
944
+ config.vocab_size,
945
+ config.additional_vocab_size,
946
+ config.hidden_size,
947
+ )
948
+ else:
949
+ wte = nn.Embedding(config.vocab_size, config.hidden_size)
950
+
951
+ self.transformer = nn.ModuleDict(
952
+ dict(
953
+ wte=wte,
954
+ emb_drop=nn.Dropout(config.embedding_dropout),
955
+ ln_f=MolmoRmsLayerNorm(config),
956
+ )
957
+ )
958
+
959
+ if config.moe_num_experts > 0:
960
+ blocks = [MolmoeBlock(config) for i in range(config.num_hidden_layers)]
961
+ else:
962
+ blocks = [MolmoBlock(config) for i in range(config.num_hidden_layers)]
963
+ self.transformer.update({"blocks": nn.ModuleList(blocks)})
964
+
965
+ if not config.weight_tying:
966
+ self.transformer.update(
967
+ {
968
+ "ff_out": nn.Linear(
969
+ config.hidden_size,
970
+ config.vocab_size,
971
+ bias=False,
972
+ )
973
+ }
974
+ )
975
+
976
+ self.vision_backbone: Optional[OLMoVisionBackbone] = None
977
+ if config.vision_config is not None:
978
+ self.vision_backbone = OLMoVisionBackbone(config)
979
+
980
+ def reset_parameters(self):
981
+ if self.vision_backbone is not None:
982
+ self.vision_backbone.reset_parameters()
983
+ self.reset_non_vision_parameters()
984
+
985
+ def reset_non_vision_parameters(self):
986
+ self.transformer.wte.reset_parameters()
987
+ if hasattr(self.transformer.wte, "new_embedding"):
988
+ nn.init.normal_(self.transformer.wte.new_embedding, std=self.config.new_embedding_init_range)
989
+
990
+ if hasattr(self.transformer, "wpe"):
991
+ nn.init.normal_(self.transformer.wpe, mean=0.0, std=1.0)
992
+
993
+ self.transformer.ln_f.reset_parameters() # type: ignore
994
+
995
+ if hasattr(self.transformer, "ff_out"):
996
+ nn.init.normal_(self.transformer.ff_out.weight, mean=0.0, std=0.02)
997
+
998
+ for block in self.transformer.blocks:
999
+ block.reset_parameters()
1000
+
1001
+ def forward(
1002
+ self,
1003
+ input_ids: torch.LongTensor,
1004
+ input_embeddings: Optional[torch.FloatTensor] = None,
1005
+ attention_mask: Optional[torch.Tensor] = None,
1006
+ images: Optional[torch.Tensor] = None,
1007
+ image_masks: Optional[torch.Tensor] = None,
1008
+ image_input_idx: Optional[torch.Tensor] = None,
1009
+ subsegment_ids: Optional[torch.Tensor] = None,
1010
+ position_ids: Optional[torch.Tensor] = None,
1011
+ past_key_values: Optional[Sequence[Tuple[torch.Tensor, torch.Tensor]]] = None,
1012
+ use_cache: bool = False,
1013
+ last_logits_only: bool = False,
1014
+ output_hidden_states: Optional[bool] = None,
1015
+ append_last_valid_logits: Optional[torch.Tensor] = None,
1016
+ ) -> ModelOutput:
1017
+ """
1018
+ :param input_ids: A tensor of shape `(batch_size, seq_len)`.
1019
+ :param input_embeddings: A tensor of shape `(batch_size, seq_len, hidden_size)` with input
1020
+ embeddings. When provided, it is treated as the output of the input embedding layer.
1021
+ :param attention_mask: A tensor of shape `(batch_size, seq_len)` that indicates
1022
+ which input IDs are masked. A `1` value in the mask means that
1023
+ the corresponding input ID should *not* be ignored. A `0` means
1024
+ that the corresponding input ID is masked.
1025
+
1026
+ This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
1027
+ library.
1028
+ :param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
1029
+ `(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
1030
+ to introduce causal or other biases.
1031
+
1032
+ If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
1033
+ indicates that the i-th element in the sequence is allowed to attend to the j-th
1034
+ element in the sequence.
1035
+
1036
+ If the tensor is a float tensor, it will just be added to the attention
1037
+ scores before the softmax.
1038
+
1039
+ The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
1040
+ :param response_mask: A tensor of shape `(batch_size, seq_len)` that indicates
1041
+ the response mask. A `1` value in the mask means that the corresponding token
1042
+ is a response token. A `0` means that the corresponding token is not
1043
+ a response token.
1044
+ :param past_key_values: Pre-computed keys and values for each attention block.
1045
+ Can be used to speed up sequential decoding. The `input_ids` which have
1046
+ their past given to this model should not be passed as `input_ids` as they have already been computed.
1047
+ :param use_cache: If `True`, return key and value tensors for each block.
1048
+ :param last_logits_only: If `True`, only compute the logits for the last token of each sequence.
1049
+ This can speed up decoding when you only care about the next token.
1050
+ """
1051
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else False
1052
+
1053
+ if past_key_values:
1054
+ assert len(past_key_values) == self.config.num_hidden_layers
1055
+
1056
+ has_image = images is not None
1057
+
1058
+ assert not (has_image and input_embeddings is not None), "Cannot provide both images and input embeddings."
1059
+ assert not (has_image and past_key_values is not None), "Cached key and values should not be used with images."
1060
+
1061
+ batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2]
1062
+ if past_key_values is None:
1063
+ past_length = 0
1064
+ else:
1065
+ past_length = past_key_values[0][0].size(-2)
1066
+
1067
+ if attention_mask is None:
1068
+ attention_mask = input_ids != -1
1069
+
1070
+ if subsegment_ids is not None:
1071
+ raise NotImplementedError()
1072
+ else:
1073
+ if position_ids is None:
1074
+ position_ids = torch.clamp(
1075
+ torch.cumsum(attention_mask.to(torch.int32), dim=-1) - 1,
1076
+ min=0,
1077
+ ).broadcast_to((batch_size, attention_mask.shape[-1]))
1078
+
1079
+ # Get embeddings of input.
1080
+ # shape: (batch_size, seq_len, hidden_size)
1081
+ if input_ids is not None:
1082
+ input_ids = input_ids * (input_ids != -1).to(input_ids.dtype)
1083
+ x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings # type: ignore
1084
+
1085
+ num_image: Optional[int] = None
1086
+ if images is not None:
1087
+ # shape: (batch_size, num_image, num_patch, hidden_size)
1088
+ # cls_embed: (batch_size, num_image, hidden_size)
1089
+ image_features, cls_embed = self.vision_backbone(images, image_masks)
1090
+ num_image, num_patch = image_features.shape[1:3]
1091
+ assert image_input_idx.shape == (batch_size, num_image, num_patch)
1092
+
1093
+ # insert the image features into the embedding.
1094
+ image_features = image_features.view(batch_size, num_image * num_patch, -1)
1095
+ image_input_idx = image_input_idx.view(batch_size, num_image * num_patch)
1096
+
1097
+ valid = image_input_idx >= 0
1098
+ batch_idx = torch.arange(batch_size, device=x.device)
1099
+ batch_idx = torch.tile(batch_idx[:, None], [1, image_features.shape[1]])
1100
+
1101
+ # For hf demo/endpoint
1102
+ image_features = image_features.to(x.device)
1103
+
1104
+ x[batch_idx[valid], image_input_idx[valid]] += image_features[valid]
1105
+
1106
+ # Apply embedding dropout (positions are handled by RoPE inside attention).
1107
+ # shape: (batch_size, seq_len, hidden_size)
1108
+ x = self.transformer.emb_drop(x) # type: ignore
1109
+
1110
+ # optionally scale input embeddings by sqrt(hidden_size)
1111
+ if self.config.normalize_input_embeds:
1112
+ x = x * (self.config.hidden_size ** 0.5)
1113
+
1114
+ # Merge attention mask with attention bias.
1115
+ # FIXME we are ignoring the attention mask input parameter
1116
+ if self.config.attention_type == "flash":
1117
+ attention_mask = input_ids != -1
1118
+ elif (
1119
+ attention_mask is not None
1120
+ or past_key_values is not None
1121
+ ):
1122
+ total_len = (past_length + seq_len)
1123
+ attention_mask = torch.tril(torch.ones(total_len, total_len, device=x.device, dtype=torch.bool))
1124
+ attention_mask = attention_mask.view(1, 1, total_len, total_len)
1125
+
1126
+ attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None
1127
+
1128
+ # decoder layers
1129
+ all_hidden_states = []
1130
+
1131
+ # Apply blocks one-by-one.
1132
+ for block_idx, block in enumerate(self.transformer.blocks):
1133
+ if output_hidden_states:
1134
+ # add hidden states
1135
+ all_hidden_states.append(x)
1136
+
1137
+ layer_past = None if past_key_values is None else past_key_values[block_idx]
1138
+ x, cache = block(x, attention_mask=attention_mask, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache)
1139
+
1140
+ if attn_key_values is not None:
1141
+ assert cache is not None
1142
+ attn_key_values.append(cache)
1143
+
1144
+ if last_logits_only:
1145
+ # shape: (batch_size, 1, hidden_size)
1146
+ if append_last_valid_logits is not None:
1147
+ last_valid_output = x[
1148
+ torch.arange(x.shape[0], device=x.device), append_last_valid_logits.to(x.device)]
1149
+ x = last_valid_output.unsqueeze(1)
1150
+ else:
1151
+ x = x[:, -1, :].unsqueeze(1)
1152
+
1153
+ # Apply final layer norm.
1154
+ # shape: (batch_size, seq_len or 1, hidden_size)
1155
+ x = self.transformer.ln_f(x) # type: ignore
1156
+ if output_hidden_states:
1157
+ # add final hidden state post-final-layernorm, following HuggingFace's convention
1158
+ all_hidden_states.append(x)
1159
+
1160
+ # Get logits.
1161
+ # shape: (batch_size, seq_len or 1, vocab_size)
1162
+ if self.config.weight_tying:
1163
+ logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore
1164
+ else:
1165
+ logits = self.transformer.ff_out(x) # type: ignore
1166
+ if self.config.scale_logits:
1167
+ logits.mul_(1 / math.sqrt(self.config.hidden_size))
1168
+
1169
+ if not last_logits_only and append_last_valid_logits is not None:
1170
+ last_valid_logit = logits[
1171
+ torch.arange(logits.shape[0], device=logits.device), append_last_valid_logits]
1172
+ logits = torch.cat([logits[:, :-1], last_valid_logit[:, None]], dim=1)
1173
+
1174
+ return ModelOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type]
1175
+
1176
+
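+ # Sketch of the image-feature insertion in MolmoModel.forward (illustrative sizes):
+ # image_input_idx maps each patch feature to a token position; idx < 0 marks padding.
+ #
+ # x = torch.zeros(1, 6, 4)          # (batch, seq_len, hidden)
+ # feats = torch.ones(1, 3, 4)       # (batch, n_patches, hidden)
+ # idx = torch.tensor([[1, 2, -1]])  # third patch is padding
+ # valid = idx >= 0
+ # b = torch.arange(1)[:, None].expand(1, 3)
+ # x[b[valid], idx[valid]] += feats[valid]  # token rows 1 and 2 receive features
+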
1177
+ class MolmoForCausalLM(MolmoPreTrainedModel):
1178
+
1179
+ def __init__(self, config: MolmoConfig, model: Optional[MolmoModel] = None, init_params: bool = False):
1180
+ super().__init__(config)
1181
+
1182
+ if not model:
1183
+ self.model = MolmoModel(config, init_params=init_params)
1184
+ else:
1185
+ self.model = model
1186
+ self.post_init()
1187
+
1188
+ def get_input_embeddings(self) -> torch.nn.Module:
1189
+ return self.model.transformer.wte
1190
+
1191
+ def get_output_embeddings(self):
1192
+ if self.config.weight_tying:
1193
+ return self.model.transformer.wte
1194
+ else:
1195
+ return self.model.transformer.ff_out
1196
+
1197
+ def forward(
1198
+ self,
1199
+ input_ids: torch.LongTensor = None,
1200
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1201
+ attention_mask: Optional[torch.Tensor] = None,
1202
+ attention_bias: Optional[torch.Tensor] = None,
1203
+ response_mask: Optional[torch.Tensor] = None,
1204
+ images: Optional[torch.Tensor] = None,
1205
+ image_masks: Optional[torch.Tensor] = None,
1206
+ image_input_idx: Optional[torch.Tensor] = None,
1207
+ subsegment_ids: Optional[torch.Tensor] = None,
1208
+ position_ids: Optional[torch.Tensor] = None,
1209
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1210
+ labels: Optional[torch.LongTensor] = None,
1211
+ loss_masks: Optional[torch.Tensor] = None,
1212
+ use_cache: Optional[bool] = None,
1213
+ last_logits_only: Optional[bool] = None,
1214
+ output_attentions: Optional[bool] = None,
1215
+ output_hidden_states: Optional[bool] = None,
1216
+ append_last_valid_logits: Optional[torch.Tensor] = None,
1217
+ return_dict: Optional[bool] = None,
1218
+ cache_position: Optional[
1219
+ Cache
1220
+ ] = None, # This is a hack mitigation of an issue in transformers `4.39.x` https://github.com/huggingface/transformers/issues/29426
1221
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1222
+ if use_cache is None:
1223
+ use_cache = self.config.use_cache
1224
+
1225
+ if output_attentions:
1226
+ raise ValueError("output_attentions is not yet supported in Molmo")
1227
+
1228
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1229
+
1230
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
1231
+ outputs = self.model.forward(
1232
+ input_ids=input_ids,
1233
+ input_embeddings=inputs_embeds,
1234
+ attention_mask=attention_mask,
1235
+ images=images,
1236
+ image_masks=image_masks,
1237
+ image_input_idx=image_input_idx,
1238
+ subsegment_ids=subsegment_ids,
1239
+ position_ids=position_ids,
1240
+ past_key_values=past_key_values,
1241
+ use_cache=use_cache,
1242
+ last_logits_only=last_logits_only,
1243
+ output_hidden_states=output_hidden_states,
1244
+ append_last_valid_logits=append_last_valid_logits,
1245
+ )
1246
+
1247
+ logits = outputs.logits
1248
+ hidden_states = outputs.hidden_states
1249
+
1250
+ loss = None
1251
+ if labels is not None:
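+ # Token-masked loss path: sum over supervised tokens and divide by their count (clamped to >= 1)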
1252
+ if loss_masks is not None:
1253
+ loss_masks = loss_masks * (loss_masks > 0)
1254
+ batch_size_in_tokens = max(loss_masks.sum().item(), 1)
1255
+ labels = labels.long()
1256
+ labels.masked_fill_(~(loss_masks > 0), -100)
1257
+ labels = labels.view(-1)
1258
+ logits_for_loss = logits.to(torch.float32).view(-1, logits.size(-1))
1259
+ loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
1260
+ loss = loss_fct(logits_for_loss, labels)
1261
+ loss = loss.view(input_ids.shape[0], -1)
1262
+ loss = loss * loss_masks
1263
+ loss = loss.sum() / batch_size_in_tokens
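+ # Optional z-loss auxiliary term: scale * logsumexp(logits)**2 keeps the softmax normalizer from drifting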
1264
+ use_zloss = getattr(self.config, "softmax_auxiliary_loss", False)
1265
+ if use_zloss:
1266
+ z_squared = logits_for_loss.logsumexp(-1).pow(2)
1267
+ z_loss = self.config.softmax_auxiliary_loss_scale * z_squared
1268
+ z_loss = z_loss.view(input_ids.shape[0], -1)
1269
+ z_loss = z_loss * loss_masks
1270
+ z_loss = z_loss.sum() / batch_size_in_tokens
1271
+ loss += z_loss
1272
+ else:
1273
+ # Shift so that tokens < n predict n
1274
+ shift_logits = logits[..., :-1, :].contiguous()
1275
+ shift_labels = labels[..., 1:].contiguous()
1276
+ # Flatten the tokens
1277
+ loss_fct = torch.nn.CrossEntropyLoss()
1278
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1279
+ shift_labels = shift_labels.view(-1)
1280
+ # Enable model parallelism
1281
+ shift_labels = shift_labels.to(shift_logits.device)
1282
+ loss = loss_fct(shift_logits, shift_labels)
1283
+
1284
+ if not return_dict:
1285
+ output = (logits,) + outputs[1:]
1286
+ return (loss,) + output if loss is not None else output
1287
+
1288
+ return CausalLMOutputWithPast(
1289
+ loss=loss,
1290
+ logits=logits,
1291
+ past_key_values=outputs.attn_key_values,
1292
+ hidden_states=hidden_states,
1293
+ )
1294
+
1295
+ def can_generate(self) -> bool:
1296
+ return True
1297
+
1298
+ @torch.no_grad()
1299
+ def generate_from_batch(
1300
+ self,
1301
+ batch: Dict[str, Any],
1302
+ generation_config: Optional[GenerationConfig] = None,
1303
+ **kwargs,
1304
+ ):
1305
+ if generation_config is not None:
1306
+ assert generation_config.use_cache
1307
+
1308
+ images = batch.get("images")
1309
+ image_masks = batch.get("image_masks")
1310
+ image_input_idx = batch.get("image_input_idx")
1311
+
1312
+ # Validate inputs.
1313
+ input_ids = batch["input_ids"]
1314
+ batch_size, seq_len = input_ids.shape
1315
+ attention_mask = batch.get("attention_mask", None)
1316
+ max_new_tokens = generation_config.max_new_tokens
1317
+ assert max_new_tokens is not None
1318
+ mask_len = seq_len + max_new_tokens
1319
+ position_ids: Optional[torch.Tensor] = None
1320
+ append_last_valid_logits: Optional[torch.Tensor] = None
1321
+ if attention_mask is None:
1322
+ attention_mask = input_ids != -1
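+ # Derive positions from the mask so padding does not advance the position counter; the clamp keeps leading pads at 0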
1323
+ position_ids = torch.clamp(
1324
+ torch.cumsum(attention_mask.to(torch.int32), dim=-1) - 1,
1325
+ min=0
1326
+ )
1327
+ append_last_valid_logits = attention_mask.long().sum(dim=-1) - 1
1328
+ attention_mask = torch.cat(
1329
+ [attention_mask, attention_mask.new_ones((batch_size, max_new_tokens))],
1330
+ dim=1,
1331
+ )
1332
+ if attention_mask is not None:
1333
+ assert attention_mask.shape == (batch_size, mask_len)
1334
+
1335
+ out = super().generate(
1336
+ batch["input_ids"],
1337
+ generation_config,
1338
+ attention_mask=attention_mask,
1339
+ images=images,
1340
+ image_masks=image_masks,
1341
+ image_input_idx=image_input_idx,
1342
+ position_ids=position_ids,
1343
+ append_last_valid_logits=append_last_valid_logits,
1344
+ **kwargs,
1345
+ )
1346
+
1347
+ return out
1348
+
1349
+ def prepare_inputs_for_generation(
1350
+ self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
1351
+ ):
1352
+ if past_key_values:
1353
+ # With a cache present, only the last generated token needs to be processed.
1354
+ input_ids = input_ids[:, -1:]
1355
+
1356
+ attention_mask = kwargs.get("attention_mask")
1357
+ images = kwargs.get("images")
1358
+ image_masks = kwargs.get("image_masks")
1359
+ image_input_idx = kwargs.get("image_input_idx")
1360
+ position_ids = kwargs.get("position_ids")
1361
+ append_last_valid_logits = kwargs.get("append_last_valid_logits")
1362
+ model_inputs = {
1363
+ "input_ids": input_ids,
1364
+ "attention_mask": attention_mask,
1365
+ "position_ids": position_ids,
1366
+ "past_key_values": past_key_values,
1367
+ "use_cache": True,
1368
+ "last_logits_only": True,
1369
+ }
1370
+ if past_key_values is None:
1371
+ model_inputs["images"] = images
1372
+ model_inputs["image_masks"] = image_masks
1373
+ model_inputs["image_input_idx"] = image_input_idx
1374
+ model_inputs["append_last_valid_logits"] = append_last_valid_logits
1375
+ return model_inputs
1376
+
1377
+ def _update_model_kwargs_for_generation(
1378
+ self,
1379
+ outputs: ModelOutput,
1380
+ model_kwargs: Dict[str, Any],
1381
+ is_encoder_decoder: bool = False,
1382
+ num_new_tokens: int = 1,
1383
+ ) -> Dict[str, Any]:
1384
+ model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
1385
+ if "append_last_valid_logits" in model_kwargs:
1386
+ del model_kwargs["append_last_valid_logits"]
1387
+ if "images" in model_kwargs:
1388
+ del model_kwargs["images"]
1389
+ del model_kwargs["image_masks"]
1390
+ del model_kwargs["image_input_idx"]
1391
+ cache_name, cache = super()._extract_past_from_model_output(outputs)
1392
+ model_kwargs[cache_name] = cache
1393
+ model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
1394
+ return model_kwargs
1395
+
1396
+
1397
+ # Always register for multi-modal features
1398
+ AutoModelForCausalLM.register(MolmoConfig, MolmoForCausalLM)
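Because the class is registered with AutoModelForCausalLM just above, the checkpoint can be loaded through the transformers Auto API with trust_remote_code enabled. A minimal loading sketch; the repo id below is a placeholder and the float32 dtype is an assumption, not something this commit pins down:

# Minimal loading sketch -- repo id is a placeholder, dtype is an assumption.
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

repo = "your-namespace/molmo-checkpoint"  # placeholder repo id
model = AutoModelForCausalLM.from_pretrained(
    repo,
    trust_remote_code=True,  # the Molmo classes live in this repo, not in transformers itself
    torch_dtype=torch.float32,
)
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)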
preprocessing_molmo.py ADDED
@@ -0,0 +1,189 @@
1
+ """
2
+ Processor class for Molmo.
3
+ """
4
+
5
+ from typing import Optional
6
+
7
+ import PIL
8
+ from PIL import ImageOps
9
+ from PIL.Image import Image
10
+
11
+ try:
12
+ from typing import Unpack
13
+ except ImportError:
14
+ from typing_extensions import Unpack
15
+
16
+ import numpy as np
17
+ import torch
18
+
19
+ from transformers.image_utils import ImageInput
20
+ from transformers.processing_utils import (
21
+ TextKwargs,
22
+ ProcessingKwargs,
23
+ ProcessorMixin,
24
+ )
25
+
26
+ from transformers.tokenization_utils_base import TextInput
27
+ from transformers.utils import logging
28
+
29
+ from transformers import AutoTokenizer
30
+ from .image_preprocessing_molmo import MolmoImagesKwargs, MolmoImageProcessor
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+
36
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
37
+ DEFAULT_IM_START_TOKEN = "<im_start>"
38
+ DEFAULT_IM_END_TOKEN = "<im_end>"
39
+ DEFAULT_IM_COL_TOKEN = "<im_col>"
40
+ IMAGE_PROMPT = "<|image|>"
41
+
42
+ EXTRA_TOKENS = (DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_COL_TOKEN, IMAGE_PROMPT)
43
+
44
+
45
+ def get_special_token_ids(tokenizer):
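+ # Each extra token must encode to exactly one id; the assert catches a tokenizer that is missing any of them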
46
+ ids = tokenizer.encode("".join(EXTRA_TOKENS), add_special_tokens=False)
47
+ assert len(ids) == len(EXTRA_TOKENS)
48
+ return {k: i for k, i in zip(EXTRA_TOKENS, ids)}
49
+
50
+
51
+ class MolmoTextKwargs(TextKwargs, total=False):
52
+ style: Optional[str]
53
+ system_prompt: Optional[str]
54
+ message_format: Optional[str]
55
+ always_start_with_space: Optional[bool]
56
+ sequence_length: Optional[int]
57
+
58
+
59
+ class MolmoProcessorKwargs(ProcessingKwargs, total=False):
60
+ text_kwargs: MolmoTextKwargs
61
+ images_kwargs: MolmoImagesKwargs
62
+ _defaults = {
63
+ "images_kwargs": {
64
+ "max_crops": 12,
65
+ "overlap_margins": [4, 4],
66
+ "base_image_input_size": [336, 336],
67
+ "image_token_length_w": 12,
68
+ "image_token_length_h": 12,
69
+ "image_patch_size": 14,
70
+ "image_padding_mask": True,
71
+ },
72
+ "text_kwargs": {
73
+ "style": "long_caption",
74
+ "system_prompt": "none",
75
+ "message_format": "role",
76
+ "always_start_with_space": True,
77
+ "sequence_length": 1536,
78
+ "padding": False,
79
+ },
80
+ }
81
+
82
+
83
+ class MolmoProcessor(ProcessorMixin):
84
+ attributes = ["image_processor", "tokenizer"]
85
+ image_processor_class = "AutoImageProcessor"
86
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
87
+
88
+ def __init__(self, image_processor: MolmoImageProcessor = None, tokenizer: AutoTokenizer = None, **kwargs):
91
+ super().__init__(image_processor, tokenizer)
92
+ self._special_tokens = None
93
+
94
+ @property
95
+ def special_token_ids(self):
96
+ if self._special_tokens is None:
97
+ self._special_tokens = get_special_token_ids(self.tokenizer)
98
+ return self._special_tokens
99
+
100
+ def get_tokens_input(self, prompt, message_format, always_start_with_space):
101
+ if message_format == "none" or message_format is None:
102
+ pass
103
+ elif message_format == "role":
104
+ prompt = "User: " + prompt + " Assistant:"
105
+ else:
106
+ raise NotImplementedError(f"Message format {message_format} not implemented")
107
+
108
+ if always_start_with_space:
109
+ prompt = " " + prompt
110
+
111
+ tokens = self.tokenizer.encode(prompt, add_special_tokens=False)
112
+
113
+ return tokens
114
+
115
+ def process(
116
+ self,
117
+ text: TextInput = None,
118
+ images: ImageInput = None,
119
+ **kwargs: Unpack[MolmoProcessorKwargs],
120
+ ):
121
+ output_kwargs = self._merge_kwargs(
122
+ MolmoProcessorKwargs,
123
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
124
+ **kwargs,
125
+ )
126
+
127
+ tokens = self.get_tokens_input(
128
+ text,
129
+ output_kwargs["text_kwargs"]["message_format"],
130
+ output_kwargs["text_kwargs"]["always_start_with_space"],
131
+ )
132
+
133
+ image_token_id = self.special_token_ids[IMAGE_PROMPT]
134
+
135
+ if images is not None:
136
+ if not isinstance(images, (list, tuple)):
137
+ images = [images]
138
+ image_arrays = []
139
+ for image in images:
140
+ if isinstance(image, Image):
141
+ image = image.convert("RGB")
142
+ # Handle images with EXIF orientation tags, which PIL will ignore by default
143
+ # https://github.com/python-pillow/Pillow/issues/4703
144
+ image = ImageOps.exif_transpose(image)
145
+ image_arrays.append(np.array(image))
146
+ else:
147
+ assert len(image.shape) == 3 and image.shape[-1] == 3
148
+ image_arrays.append(image.astype(np.uint8))
149
+ images = image_arrays
150
+ # For now only support inserting images at the start
151
+ image_idx = [-1]*len(images)
152
+ else:
153
+ image_idx = None
154
+
155
+ sequence_length = output_kwargs["text_kwargs"]["sequence_length"]
156
+
157
+ image_patch_token_id = self.special_token_ids[DEFAULT_IMAGE_PATCH_TOKEN]
158
+ image_col_token_id = self.special_token_ids[DEFAULT_IM_COL_TOKEN]
159
+ image_start_token_id = self.special_token_ids[DEFAULT_IM_START_TOKEN]
160
+ image_end_token_id = self.special_token_ids[DEFAULT_IM_END_TOKEN]
161
+ out = self.image_processor.multimodal_preprocess(
162
+ images=images,
163
+ image_idx=image_idx,
164
+ tokens=np.asarray(tokens).astype(np.int32),
165
+ sequence_length=sequence_length,
166
+ image_patch_token_id=image_patch_token_id,
167
+ image_col_token_id=image_col_token_id,
168
+ image_start_token_id=image_start_token_id,
169
+ image_end_token_id=image_end_token_id,
170
+ **output_kwargs["images_kwargs"]
171
+ )
172
+
173
+ # Prepend BOS
174
+ # Qwen2 and OLMo do not have a BOS, and instead use EOS as a generic separator token.
175
+ bos = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
176
+ decoder_input_tokens = np.pad(out["input_ids"], [[1, 0]], constant_values=bos)
177
+ out["input_ids"] = decoder_input_tokens
178
+ if "image_input_idx" in out:
179
+ # Shift patch mapping up by one since we added BOS
180
+ image_input_idx = out["image_input_idx"]
181
+ out["image_input_idx"] = np.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)
182
+
183
+ for k, v in out.items():
184
+ out[k] = torch.from_numpy(v)
185
+
186
+ return out
187
+
188
+
189
+ MolmoProcessor.register_for_auto_class()
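For reference, a minimal end-to-end sketch of the process method defined above, continuing from the loading sketch after the modeling file; the image path and prompt are placeholders. Note that process returns unbatched CPU tensors, so a batch dimension has to be added before calling generate_from_batch:

# Minimal usage sketch -- continues from the loading sketch; path and prompt are placeholders.
from PIL import Image
from transformers import GenerationConfig

image = Image.open("example.jpg")  # placeholder path
inputs = processor.process(text="Describe this image.", images=[image])
# process() returns per-example tensors with no batch dim; add one and move to the model device
batch = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
out = model.generate_from_batch(
    batch,
    generation_config=GenerationConfig(max_new_tokens=200, use_cache=True),  # use_cache is asserted above
)
new_tokens = out[0, batch["input_ids"].shape[1]:]
print(processor.tokenizer.decode(new_tokens, skip_special_tokens=True))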
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_preprocessing_molmo.MolmoImageProcessor",
4
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
5
+ },
6
+ "base_image_input_size": [
7
+ 336,
8
+ 336
9
+ ],
10
+ "do_normalize": true,
11
+ "image_padding_mask": true,
12
+ "image_patch_size": 14,
13
+ "image_processor_type": "MolmoImageProcessor",
14
+ "image_token_length_h": 12,
15
+ "image_token_length_w": 12,
16
+ "max_crops": 12,
17
+ "overlap_margins": [
18
+ 4,
19
+ 4
20
+ ],
21
+ "processor_class": "MolmoProcessor"
22
+ }
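A quick sanity check on how these fields fit together: a 336x336 base crop cut into 14-pixel patches gives a 24x24 patch grid, while image_token_length_w x image_token_length_h = 12x12 = 144 image tokens per crop, i.e. each image token summarizes a 2x2 block of patches. A rough reading of the config, with the 2x2 pooling being an assumption rather than something stated here:

# Rough arithmetic implied by the config fields above (the 2x2 pooling is an assumption).
base = 336            # base_image_input_size
patch = 14            # image_patch_size
grid = base // patch  # 24 patches per side in each crop
tokens = 12 * 12      # image_token_length_w * image_token_length_h = 144 tokens per crop
pool = grid // 12     # 2 -> each image token would cover a 2x2 patch block
# with max_crops = 12, that caps image tokens at roughly 12 * 144 = 1728 (ignoring overlap margins)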
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
4
+ },
5
+ "processor_class": "MolmoProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,441 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "|<EXTRA_TOKENS_0>|",
4
+ "|<EXTRA_TOKENS_1>|",
5
+ "|<EXTRA_TOKENS_2>|",
6
+ "|<EXTRA_TOKENS_3>|",
7
+ "|<EXTRA_TOKENS_4>|",
8
+ "|<EXTRA_TOKENS_5>|",
9
+ "|<EXTRA_TOKENS_6>|",
10
+ "|<EXTRA_TOKENS_7>|",
11
+ "|<EXTRA_TOKENS_8>|",
12
+ "|<EXTRA_TOKENS_9>|",
13
+ "|<EXTRA_TOKENS_10>|",
14
+ "|<EXTRA_TOKENS_11>|",
15
+ "|<EXTRA_TOKENS_12>|",
16
+ "|<EXTRA_TOKENS_13>|",
17
+ "|<EXTRA_TOKENS_14>|",
18
+ "|<EXTRA_TOKENS_15>|",
19
+ "|<EXTRA_TOKENS_16>|",
20
+ "|<EXTRA_TOKENS_17>|",
21
+ "|<EXTRA_TOKENS_18>|",
22
+ "|<EXTRA_TOKENS_19>|",
23
+ "|<EXTRA_TOKENS_20>|",
24
+ "|<EXTRA_TOKENS_21>|",
25
+ "|<EXTRA_TOKENS_22>|",
26
+ "|<EXTRA_TOKENS_23>|",
27
+ "|<EXTRA_TOKENS_24>|",
28
+ "|<EXTRA_TOKENS_25>|",
29
+ "|<EXTRA_TOKENS_26>|",
30
+ "|<EXTRA_TOKENS_27>|",
31
+ "|<EXTRA_TOKENS_28>|",
32
+ "|<EXTRA_TOKENS_29>|",
33
+ "|<EXTRA_TOKENS_30>|",
34
+ "|<EXTRA_TOKENS_31>|",
35
+ "|<EXTRA_TOKENS_32>|",
36
+ "|<EXTRA_TOKENS_33>|",
37
+ "|<EXTRA_TOKENS_34>|",
38
+ "|<EXTRA_TOKENS_35>|",
39
+ "|<EXTRA_TOKENS_36>|",
40
+ "|<EXTRA_TOKENS_37>|",
41
+ "|<EXTRA_TOKENS_38>|",
42
+ "|<EXTRA_TOKENS_39>|",
43
+ "|<EXTRA_TOKENS_40>|",
44
+ "|<EXTRA_TOKENS_41>|",
45
+ "|<EXTRA_TOKENS_42>|",
46
+ "|<EXTRA_TOKENS_43>|",
47
+ "|<EXTRA_TOKENS_44>|",
48
+ "|<EXTRA_TOKENS_45>|",
49
+ "|<EXTRA_TOKENS_46>|",
50
+ "|<EXTRA_TOKENS_47>|",
51
+ "|<EXTRA_TOKENS_48>|",
52
+ "|<EXTRA_TOKENS_49>|",
53
+ "|<EXTRA_TOKENS_50>|",
54
+ "|<EXTRA_TOKENS_51>|",
55
+ "|<EXTRA_TOKENS_52>|",
56
+ "|<EXTRA_TOKENS_53>|",
57
+ "|<EXTRA_TOKENS_54>|",
58
+ "|<EXTRA_TOKENS_55>|",
59
+ "|<EXTRA_TOKENS_56>|",
60
+ "|<EXTRA_TOKENS_57>|",
61
+ "|<EXTRA_TOKENS_58>|",
62
+ "|<EXTRA_TOKENS_59>|",
63
+ "|<EXTRA_TOKENS_60>|",
64
+ "|<EXTRA_TOKENS_61>|",
65
+ "|<EXTRA_TOKENS_62>|",
66
+ "|<EXTRA_TOKENS_63>|",
67
+ "|<EXTRA_TOKENS_64>|",
68
+ "|<EXTRA_TOKENS_65>|",
69
+ "|<EXTRA_TOKENS_66>|",
70
+ "|<EXTRA_TOKENS_67>|",
71
+ "|<EXTRA_TOKENS_68>|",
72
+ "|<EXTRA_TOKENS_69>|",
73
+ "|<EXTRA_TOKENS_70>|",
74
+ "|<EXTRA_TOKENS_71>|",
75
+ "|<EXTRA_TOKENS_72>|",
76
+ "|<EXTRA_TOKENS_73>|",
77
+ "|<EXTRA_TOKENS_74>|",
78
+ "|<EXTRA_TOKENS_75>|",
79
+ "|<EXTRA_TOKENS_76>|",
80
+ "|<EXTRA_TOKENS_77>|",
81
+ "|<EXTRA_TOKENS_78>|",
82
+ "|<EXTRA_TOKENS_79>|",
83
+ "|<EXTRA_TOKENS_80>|",
84
+ "|<EXTRA_TOKENS_81>|",
85
+ "|<EXTRA_TOKENS_82>|",
86
+ "|<EXTRA_TOKENS_83>|",
87
+ "|<EXTRA_TOKENS_84>|",
88
+ "|<EXTRA_TOKENS_85>|",
89
+ "|<EXTRA_TOKENS_86>|",
90
+ "|<EXTRA_TOKENS_87>|",
91
+ "|<EXTRA_TOKENS_88>|",
92
+ "|<EXTRA_TOKENS_89>|",
93
+ "|<EXTRA_TOKENS_90>|",
94
+ "|<EXTRA_TOKENS_91>|",
95
+ "|<EXTRA_TOKENS_92>|",
96
+ "|<EXTRA_TOKENS_93>|",
97
+ "|<EXTRA_TOKENS_94>|",
98
+ "|<EXTRA_TOKENS_95>|",
99
+ "|<EXTRA_TOKENS_96>|",
100
+ "|<EXTRA_TOKENS_97>|",
101
+ "|<EXTRA_TOKENS_98>|",
102
+ "|<EXTRA_TOKENS_99>|",
103
+ "|<EXTRA_TOKENS_100>|",
104
+ "|<EXTRA_TOKENS_101>|",
105
+ "|<EXTRA_TOKENS_102>|",
106
+ "|<EXTRA_TOKENS_103>|",
107
+ "|<EXTRA_TOKENS_104>|",
108
+ "|<EXTRA_TOKENS_105>|",
109
+ "|<EXTRA_TOKENS_106>|",
110
+ "|<EXTRA_TOKENS_107>|",
111
+ "|<EXTRA_TOKENS_108>|",
112
+ "|<EXTRA_TOKENS_109>|",
113
+ "|<EXTRA_TOKENS_110>|",
114
+ "|<EXTRA_TOKENS_111>|",
115
+ "|<EXTRA_TOKENS_112>|",
116
+ "|<EXTRA_TOKENS_113>|",
117
+ "|<EXTRA_TOKENS_114>|",
118
+ "|<EXTRA_TOKENS_115>|",
119
+ "|<EXTRA_TOKENS_116>|",
120
+ "|<EXTRA_TOKENS_117>|",
121
+ "|<EXTRA_TOKENS_118>|",
122
+ "|<EXTRA_TOKENS_119>|",
123
+ "|<EXTRA_TOKENS_120>|",
124
+ "|<EXTRA_TOKENS_121>|",
125
+ "|<EXTRA_TOKENS_122>|",
126
+ "|<EXTRA_TOKENS_123>|",
127
+ "|<EXTRA_TOKENS_124>|",
128
+ "|<EXTRA_TOKENS_125>|",
129
+ "|<EXTRA_TOKENS_126>|",
130
+ "|<EXTRA_TOKENS_127>|",
131
+ "|<EXTRA_TOKENS_128>|",
132
+ "|<EXTRA_TOKENS_129>|",
133
+ "|<EXTRA_TOKENS_130>|",
134
+ "|<EXTRA_TOKENS_131>|",
135
+ "|<EXTRA_TOKENS_132>|",
136
+ "|<EXTRA_TOKENS_133>|",
137
+ "|<EXTRA_TOKENS_134>|",
138
+ "|<EXTRA_TOKENS_135>|",
139
+ "|<EXTRA_TOKENS_136>|",
140
+ "|<EXTRA_TOKENS_137>|",
141
+ "|<EXTRA_TOKENS_138>|",
142
+ "|<EXTRA_TOKENS_139>|",
143
+ "|<EXTRA_TOKENS_140>|",
144
+ "|<EXTRA_TOKENS_141>|",
145
+ "|<EXTRA_TOKENS_142>|",
146
+ "|<EXTRA_TOKENS_143>|",
147
+ "|<EXTRA_TOKENS_144>|",
148
+ "|<EXTRA_TOKENS_145>|",
149
+ "|<EXTRA_TOKENS_146>|",
150
+ "|<EXTRA_TOKENS_147>|",
151
+ "|<EXTRA_TOKENS_148>|",
152
+ "|<EXTRA_TOKENS_149>|",
153
+ "|<EXTRA_TOKENS_150>|",
154
+ "|<EXTRA_TOKENS_151>|",
155
+ "|<EXTRA_TOKENS_152>|",
156
+ "|<EXTRA_TOKENS_153>|",
157
+ "|<EXTRA_TOKENS_154>|",
158
+ "|<EXTRA_TOKENS_155>|",
159
+ "|<EXTRA_TOKENS_156>|",
160
+ "|<EXTRA_TOKENS_157>|",
161
+ "|<EXTRA_TOKENS_158>|",
162
+ "|<EXTRA_TOKENS_159>|",
163
+ "|<EXTRA_TOKENS_160>|",
164
+ "|<EXTRA_TOKENS_161>|",
165
+ "|<EXTRA_TOKENS_162>|",
166
+ "|<EXTRA_TOKENS_163>|",
167
+ "|<EXTRA_TOKENS_164>|",
168
+ "|<EXTRA_TOKENS_165>|",
169
+ "|<EXTRA_TOKENS_166>|",
170
+ "|<EXTRA_TOKENS_167>|",
171
+ "|<EXTRA_TOKENS_168>|",
172
+ "|<EXTRA_TOKENS_169>|",
173
+ "|<EXTRA_TOKENS_170>|",
174
+ "|<EXTRA_TOKENS_171>|",
175
+ "|<EXTRA_TOKENS_172>|",
176
+ "|<EXTRA_TOKENS_173>|",
177
+ "|<EXTRA_TOKENS_174>|",
178
+ "|<EXTRA_TOKENS_175>|",
179
+ "|<EXTRA_TOKENS_176>|",
180
+ "|<EXTRA_TOKENS_177>|",
181
+ "|<EXTRA_TOKENS_178>|",
182
+ "|<EXTRA_TOKENS_179>|",
183
+ "|<EXTRA_TOKENS_180>|",
184
+ "|<EXTRA_TOKENS_181>|",
185
+ "|<EXTRA_TOKENS_182>|",
186
+ "|<EXTRA_TOKENS_183>|",
187
+ "|<EXTRA_TOKENS_184>|",
188
+ "|<EXTRA_TOKENS_185>|",
189
+ "|<EXTRA_TOKENS_186>|",
190
+ "|<EXTRA_TOKENS_187>|",
191
+ "|<EXTRA_TOKENS_188>|",
192
+ "|<EXTRA_TOKENS_189>|",
193
+ "|<EXTRA_TOKENS_190>|",
194
+ "|<EXTRA_TOKENS_191>|",
195
+ "|<EXTRA_TOKENS_192>|",
196
+ "|<EXTRA_TOKENS_193>|",
197
+ "|<EXTRA_TOKENS_194>|",
198
+ "|<EXTRA_TOKENS_195>|",
199
+ "|<EXTRA_TOKENS_196>|",
200
+ "|<EXTRA_TOKENS_197>|",
201
+ "|<EXTRA_TOKENS_198>|",
202
+ "|<EXTRA_TOKENS_199>|",
203
+ "|<EXTRA_TOKENS_200>|",
204
+ "|<EXTRA_TOKENS_201>|",
205
+ "|<EXTRA_TOKENS_202>|",
206
+ "|<EXTRA_TOKENS_203>|",
207
+ "|<EXTRA_TOKENS_204>|",
208
+ "|<EXTRA_TOKENS_205>|",
209
+ "|<EXTRA_TOKENS_206>|",
210
+ "|<EXTRA_TOKENS_207>|",
211
+ "|<EXTRA_TOKENS_208>|",
212
+ "|<EXTRA_TOKENS_209>|",
213
+ "|<EXTRA_TOKENS_210>|",
214
+ "|<EXTRA_TOKENS_211>|",
215
+ "|<EXTRA_TOKENS_212>|",
216
+ "|<EXTRA_TOKENS_213>|",
217
+ "|<EXTRA_TOKENS_214>|",
218
+ "|<EXTRA_TOKENS_215>|",
219
+ "|<EXTRA_TOKENS_216>|",
220
+ "|<EXTRA_TOKENS_217>|",
221
+ "|<EXTRA_TOKENS_218>|",
222
+ "|<EXTRA_TOKENS_219>|",
223
+ "|<EXTRA_TOKENS_220>|",
224
+ "|<EXTRA_TOKENS_221>|",
225
+ "|<EXTRA_TOKENS_222>|",
226
+ "|<EXTRA_TOKENS_223>|",
227
+ "|<EXTRA_TOKENS_224>|",
228
+ "|<EXTRA_TOKENS_225>|",
229
+ "|<EXTRA_TOKENS_226>|",
230
+ "|<EXTRA_TOKENS_227>|",
231
+ "|<EXTRA_TOKENS_228>|",
232
+ "|<EXTRA_TOKENS_229>|",
233
+ "|<EXTRA_TOKENS_230>|",
234
+ "|<EXTRA_TOKENS_231>|",
235
+ "|<EXTRA_TOKENS_232>|",
236
+ "|<EXTRA_TOKENS_233>|",
237
+ "|<EXTRA_TOKENS_234>|",
238
+ "|<EXTRA_TOKENS_235>|",
239
+ "|<EXTRA_TOKENS_236>|",
240
+ "|<EXTRA_TOKENS_237>|",
241
+ "|<EXTRA_TOKENS_238>|",
242
+ "|<EXTRA_TOKENS_239>|",
243
+ "|<EXTRA_TOKENS_240>|",
244
+ "|<EXTRA_TOKENS_241>|",
245
+ "|<EXTRA_TOKENS_242>|",
246
+ "|<EXTRA_TOKENS_243>|",
247
+ "|<EXTRA_TOKENS_244>|",
248
+ "|<EXTRA_TOKENS_245>|",
249
+ "|<EXTRA_TOKENS_246>|",
250
+ "|<EXTRA_TOKENS_247>|",
251
+ "|<EXTRA_TOKENS_248>|",
252
+ "|<EXTRA_TOKENS_249>|",
253
+ "|<EXTRA_TOKENS_250>|",
254
+ "|<EXTRA_TOKENS_251>|",
255
+ "|<EXTRA_TOKENS_252>|",
256
+ "|<EXTRA_TOKENS_253>|",
257
+ "|<EXTRA_TOKENS_254>|",
258
+ "|<EXTRA_TOKENS_255>|",
259
+ "|<EXTRA_TOKENS_256>|",
260
+ "|<EXTRA_TOKENS_257>|",
261
+ "|<EXTRA_TOKENS_258>|",
262
+ "|<EXTRA_TOKENS_259>|",
263
+ "|<EXTRA_TOKENS_260>|",
264
+ "|<EXTRA_TOKENS_261>|",
265
+ "|<EXTRA_TOKENS_262>|",
266
+ "|<EXTRA_TOKENS_263>|",
267
+ "|<EXTRA_TOKENS_264>|",
268
+ "|<EXTRA_TOKENS_265>|",
269
+ "|<EXTRA_TOKENS_266>|",
270
+ "|<EXTRA_TOKENS_267>|",
271
+ "|<EXTRA_TOKENS_268>|",
272
+ "|<EXTRA_TOKENS_269>|",
273
+ "|<EXTRA_TOKENS_270>|",
274
+ "|<EXTRA_TOKENS_271>|",
275
+ "|<EXTRA_TOKENS_272>|",
276
+ "|<EXTRA_TOKENS_273>|",
277
+ "|<EXTRA_TOKENS_274>|",
278
+ "|<EXTRA_TOKENS_275>|",
279
+ "|<EXTRA_TOKENS_276>|",
280
+ "|<EXTRA_TOKENS_277>|",
281
+ "|<EXTRA_TOKENS_278>|",
282
+ "|<EXTRA_TOKENS_279>|",
283
+ "|<EXTRA_TOKENS_280>|",
284
+ "|<EXTRA_TOKENS_281>|",
285
+ "|<EXTRA_TOKENS_282>|",
286
+ "|<EXTRA_TOKENS_283>|",
287
+ "|<EXTRA_TOKENS_284>|",
288
+ "|<EXTRA_TOKENS_285>|",
289
+ "|<EXTRA_TOKENS_286>|",
290
+ "|<EXTRA_TOKENS_287>|",
291
+ "|<EXTRA_TOKENS_288>|",
292
+ "|<EXTRA_TOKENS_289>|",
293
+ "|<EXTRA_TOKENS_290>|",
294
+ "|<EXTRA_TOKENS_291>|",
295
+ "|<EXTRA_TOKENS_292>|",
296
+ "|<EXTRA_TOKENS_293>|",
297
+ "|<EXTRA_TOKENS_294>|",
298
+ "|<EXTRA_TOKENS_295>|",
299
+ "|<EXTRA_TOKENS_296>|",
300
+ "|<EXTRA_TOKENS_297>|",
301
+ "|<EXTRA_TOKENS_298>|",
302
+ "|<EXTRA_TOKENS_299>|",
303
+ "|<EXTRA_TOKENS_300>|",
304
+ "|<EXTRA_TOKENS_301>|",
305
+ "|<EXTRA_TOKENS_302>|",
306
+ "|<EXTRA_TOKENS_303>|",
307
+ "|<EXTRA_TOKENS_304>|",
308
+ "|<EXTRA_TOKENS_305>|",
309
+ "|<EXTRA_TOKENS_306>|",
310
+ "|<EXTRA_TOKENS_307>|",
311
+ "|<EXTRA_TOKENS_308>|",
312
+ "|<EXTRA_TOKENS_309>|",
313
+ "|<EXTRA_TOKENS_310>|",
314
+ "|<EXTRA_TOKENS_311>|",
315
+ "|<EXTRA_TOKENS_312>|",
316
+ "|<EXTRA_TOKENS_313>|",
317
+ "|<EXTRA_TOKENS_314>|",
318
+ "|<EXTRA_TOKENS_315>|",
319
+ "|<EXTRA_TOKENS_316>|",
320
+ "|<EXTRA_TOKENS_317>|",
321
+ "|<EXTRA_TOKENS_318>|",
322
+ "|<EXTRA_TOKENS_319>|",
323
+ "|<EXTRA_TOKENS_320>|",
324
+ "|<EXTRA_TOKENS_321>|",
325
+ "|<EXTRA_TOKENS_322>|",
326
+ "|<EXTRA_TOKENS_323>|",
327
+ "|<EXTRA_TOKENS_324>|",
328
+ "|<EXTRA_TOKENS_325>|",
329
+ "|<EXTRA_TOKENS_326>|",
330
+ "|<EXTRA_TOKENS_327>|",
331
+ "|<EXTRA_TOKENS_328>|",
332
+ "|<EXTRA_TOKENS_329>|",
333
+ "|<EXTRA_TOKENS_330>|",
334
+ "|<EXTRA_TOKENS_331>|",
335
+ "|<EXTRA_TOKENS_332>|",
336
+ "|<EXTRA_TOKENS_333>|",
337
+ "|<EXTRA_TOKENS_334>|",
338
+ "|<EXTRA_TOKENS_335>|",
339
+ "|<EXTRA_TOKENS_336>|",
340
+ "|<EXTRA_TOKENS_337>|",
341
+ "|<EXTRA_TOKENS_338>|",
342
+ "|<EXTRA_TOKENS_339>|",
343
+ "|<EXTRA_TOKENS_340>|",
344
+ "|<EXTRA_TOKENS_341>|",
345
+ "|<EXTRA_TOKENS_342>|",
346
+ "|<EXTRA_TOKENS_343>|",
347
+ "|<EXTRA_TOKENS_344>|",
348
+ "|<EXTRA_TOKENS_345>|",
349
+ "|<EXTRA_TOKENS_346>|",
350
+ "|<EXTRA_TOKENS_347>|",
351
+ "|<EXTRA_TOKENS_348>|",
352
+ "|<EXTRA_TOKENS_349>|",
353
+ "|<EXTRA_TOKENS_350>|",
354
+ "|<EXTRA_TOKENS_351>|",
355
+ "|<EXTRA_TOKENS_352>|",
356
+ "|<EXTRA_TOKENS_353>|",
357
+ "|<EXTRA_TOKENS_354>|",
358
+ "|<EXTRA_TOKENS_355>|",
359
+ "|<EXTRA_TOKENS_356>|",
360
+ "|<EXTRA_TOKENS_357>|",
361
+ "|<EXTRA_TOKENS_358>|",
362
+ "|<EXTRA_TOKENS_359>|",
363
+ "|<EXTRA_TOKENS_360>|",
364
+ "|<EXTRA_TOKENS_361>|",
365
+ "|<EXTRA_TOKENS_362>|",
366
+ "|<EXTRA_TOKENS_363>|",
367
+ "|<EXTRA_TOKENS_364>|",
368
+ "|<EXTRA_TOKENS_365>|",
369
+ "|<EXTRA_TOKENS_366>|",
370
+ "|<EXTRA_TOKENS_367>|",
371
+ "|<EXTRA_TOKENS_368>|",
372
+ "|<EXTRA_TOKENS_369>|",
373
+ "|<EXTRA_TOKENS_370>|",
374
+ "|<EXTRA_TOKENS_371>|",
375
+ "|<EXTRA_TOKENS_372>|",
376
+ "|<EXTRA_TOKENS_373>|",
377
+ "|<EXTRA_TOKENS_374>|",
378
+ "|<EXTRA_TOKENS_375>|",
379
+ "|<EXTRA_TOKENS_376>|",
380
+ "|<EXTRA_TOKENS_377>|",
381
+ "|<EXTRA_TOKENS_378>|",
382
+ "|<EXTRA_TOKENS_379>|",
383
+ "|<EXTRA_TOKENS_380>|",
384
+ "|<EXTRA_TOKENS_381>|",
385
+ "|<EXTRA_TOKENS_382>|",
386
+ "|<EXTRA_TOKENS_383>|",
387
+ "|<EXTRA_TOKENS_384>|",
388
+ "|<EXTRA_TOKENS_385>|",
389
+ "|<EXTRA_TOKENS_386>|",
390
+ "|<EXTRA_TOKENS_387>|",
391
+ "|<EXTRA_TOKENS_388>|",
392
+ "|<EXTRA_TOKENS_389>|",
393
+ "|<EXTRA_TOKENS_390>|",
394
+ "|<EXTRA_TOKENS_391>|",
395
+ "|<EXTRA_TOKENS_392>|",
396
+ "|<EXTRA_TOKENS_393>|",
397
+ "|<EXTRA_TOKENS_394>|",
398
+ "|<EXTRA_TOKENS_395>|",
399
+ "|<EXTRA_TOKENS_396>|",
400
+ "|<EXTRA_TOKENS_397>|",
401
+ "|<EXTRA_TOKENS_398>|",
402
+ "|<EXTRA_TOKENS_399>|",
403
+ "|<EXTRA_TOKENS_400>|",
404
+ "|<EXTRA_TOKENS_401>|",
405
+ "|<EXTRA_TOKENS_402>|",
406
+ "|<EXTRA_TOKENS_403>|",
407
+ "|<EXTRA_TOKENS_404>|",
408
+ "|<EXTRA_TOKENS_405>|",
409
+ "|<EXTRA_TOKENS_406>|",
410
+ "|<EXTRA_TOKENS_407>|",
411
+ "|<EXTRA_TOKENS_408>|",
412
+ "|<EXTRA_TOKENS_409>|",
413
+ "|<EXTRA_TOKENS_410>|",
414
+ "|<EXTRA_TOKENS_411>|",
415
+ "|<EXTRA_TOKENS_412>|",
416
+ "|<EXTRA_TOKENS_413>|",
417
+ "|<EXTRA_TOKENS_414>|",
418
+ "|<EXTRA_TOKENS_415>|",
419
+ "|<EXTRA_TOKENS_416>|",
420
+ "|<EXTRA_TOKENS_417>|",
421
+ "<im_start>",
422
+ "<im_end>",
423
+ "<im_patch>",
424
+ "<im_col>",
425
+ "<|image|>"
426
+ ],
427
+ "eos_token": {
428
+ "content": "<|endoftext|>",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false
433
+ },
434
+ "pad_token": {
435
+ "content": "<|endoftext|>",
436
+ "lstrip": false,
437
+ "normalized": false,
438
+ "rstrip": false,
439
+ "single_word": false
440
+ }
441
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6248048a83152ce87663c799492fe7e60c8086f3ae51ce7bd255ccc445746fc0
3
+ size 11501432
tokenizer_config.json ADDED
@@ -0,0 +1,3852 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "|<EXTRA_TOKENS_0>|",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "|<EXTRA_TOKENS_1>|",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "|<EXTRA_TOKENS_2>|",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "|<EXTRA_TOKENS_3>|",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "|<EXTRA_TOKENS_4>|",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "|<EXTRA_TOKENS_5>|",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "|<EXTRA_TOKENS_6>|",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "|<EXTRA_TOKENS_7>|",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "|<EXTRA_TOKENS_8>|",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "|<EXTRA_TOKENS_9>|",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "|<EXTRA_TOKENS_10>|",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "151657": {
117
+ "content": "|<EXTRA_TOKENS_11>|",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "151658": {
125
+ "content": "|<EXTRA_TOKENS_12>|",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "151659": {
133
+ "content": "|<EXTRA_TOKENS_13>|",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "151660": {
141
+ "content": "|<EXTRA_TOKENS_14>|",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "151661": {
149
+ "content": "|<EXTRA_TOKENS_15>|",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "151662": {
157
+ "content": "|<EXTRA_TOKENS_16>|",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "151663": {
165
+ "content": "|<EXTRA_TOKENS_17>|",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "151664": {
173
+ "content": "|<EXTRA_TOKENS_18>|",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "151665": {
181
+ "content": "|<EXTRA_TOKENS_19>|",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "151666": {
189
+ "content": "|<EXTRA_TOKENS_20>|",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "151667": {
197
+ "content": "|<EXTRA_TOKENS_21>|",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "151668": {
205
+ "content": "|<EXTRA_TOKENS_22>|",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "151669": {
213
+ "content": "|<EXTRA_TOKENS_23>|",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "151670": {
221
+ "content": "|<EXTRA_TOKENS_24>|",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "151671": {
229
+ "content": "|<EXTRA_TOKENS_25>|",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "151672": {
237
+ "content": "|<EXTRA_TOKENS_26>|",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "151673": {
245
+ "content": "|<EXTRA_TOKENS_27>|",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "151674": {
253
+ "content": "|<EXTRA_TOKENS_28>|",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "151675": {
261
+ "content": "|<EXTRA_TOKENS_29>|",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "151676": {
269
+ "content": "|<EXTRA_TOKENS_30>|",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "151677": {
277
+ "content": "|<EXTRA_TOKENS_31>|",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "151678": {
285
+ "content": "|<EXTRA_TOKENS_32>|",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "151679": {
293
+ "content": "|<EXTRA_TOKENS_33>|",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "151680": {
301
+ "content": "|<EXTRA_TOKENS_34>|",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "151681": {
309
+ "content": "|<EXTRA_TOKENS_35>|",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "151682": {
317
+ "content": "|<EXTRA_TOKENS_36>|",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "151683": {
325
+ "content": "|<EXTRA_TOKENS_37>|",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "151684": {
333
+ "content": "|<EXTRA_TOKENS_38>|",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "151685": {
341
+ "content": "|<EXTRA_TOKENS_39>|",
342
+ "lstrip": false,
343
+ "normalized": false,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "151686": {
349
+ "content": "|<EXTRA_TOKENS_40>|",
350
+ "lstrip": false,
351
+ "normalized": false,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "151687": {
357
+ "content": "|<EXTRA_TOKENS_41>|",
358
+ "lstrip": false,
359
+ "normalized": false,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "151688": {
365
+ "content": "|<EXTRA_TOKENS_42>|",
366
+ "lstrip": false,
367
+ "normalized": false,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "151689": {
373
+ "content": "|<EXTRA_TOKENS_43>|",
374
+ "lstrip": false,
375
+ "normalized": false,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "151690": {
381
+ "content": "|<EXTRA_TOKENS_44>|",
382
+ "lstrip": false,
383
+ "normalized": false,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "151691": {
389
+ "content": "|<EXTRA_TOKENS_45>|",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "151692": {
397
+ "content": "|<EXTRA_TOKENS_46>|",
398
+ "lstrip": false,
399
+ "normalized": false,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "151693": {
405
+ "content": "|<EXTRA_TOKENS_47>|",
406
+ "lstrip": false,
407
+ "normalized": false,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "151694": {
413
+ "content": "|<EXTRA_TOKENS_48>|",
414
+ "lstrip": false,
415
+ "normalized": false,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "151695": {
421
+ "content": "|<EXTRA_TOKENS_49>|",
422
+ "lstrip": false,
423
+ "normalized": false,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "151696": {
429
+ "content": "|<EXTRA_TOKENS_50>|",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "151697": {
437
+ "content": "|<EXTRA_TOKENS_51>|",
438
+ "lstrip": false,
439
+ "normalized": false,
440
+ "rstrip": false,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "151698": {
445
+ "content": "|<EXTRA_TOKENS_52>|",
446
+ "lstrip": false,
447
+ "normalized": false,
448
+ "rstrip": false,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "151699": {
453
+ "content": "|<EXTRA_TOKENS_53>|",
454
+ "lstrip": false,
455
+ "normalized": false,
456
+ "rstrip": false,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "151700": {
461
+ "content": "|<EXTRA_TOKENS_54>|",
462
+ "lstrip": false,
463
+ "normalized": false,
464
+ "rstrip": false,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "151701": {
469
+ "content": "|<EXTRA_TOKENS_55>|",
470
+ "lstrip": false,
471
+ "normalized": false,
472
+ "rstrip": false,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "151702": {
477
+ "content": "|<EXTRA_TOKENS_56>|",
478
+ "lstrip": false,
479
+ "normalized": false,
480
+ "rstrip": false,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "151703": {
485
+ "content": "|<EXTRA_TOKENS_57>|",
486
+ "lstrip": false,
487
+ "normalized": false,
488
+ "rstrip": false,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "151704": {
493
+ "content": "|<EXTRA_TOKENS_58>|",
494
+ "lstrip": false,
495
+ "normalized": false,
496
+ "rstrip": false,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "151705": {
501
+ "content": "|<EXTRA_TOKENS_59>|",
502
+ "lstrip": false,
503
+ "normalized": false,
504
+ "rstrip": false,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "151706": {
509
+ "content": "|<EXTRA_TOKENS_60>|",
510
+ "lstrip": false,
511
+ "normalized": false,
512
+ "rstrip": false,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "151707": {
517
+ "content": "|<EXTRA_TOKENS_61>|",
518
+ "lstrip": false,
519
+ "normalized": false,
520
+ "rstrip": false,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "151708": {
525
+ "content": "|<EXTRA_TOKENS_62>|",
526
+ "lstrip": false,
527
+ "normalized": false,
528
+ "rstrip": false,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "151709": {
533
+ "content": "|<EXTRA_TOKENS_63>|",
534
+ "lstrip": false,
535
+ "normalized": false,
536
+ "rstrip": false,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "151710": {
541
+ "content": "|<EXTRA_TOKENS_64>|",
542
+ "lstrip": false,
543
+ "normalized": false,
544
+ "rstrip": false,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "151711": {
549
+ "content": "|<EXTRA_TOKENS_65>|",
550
+ "lstrip": false,
551
+ "normalized": false,
552
+ "rstrip": false,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "151712": {
557
+ "content": "|<EXTRA_TOKENS_66>|",
558
+ "lstrip": false,
559
+ "normalized": false,
560
+ "rstrip": false,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "151713": {
565
+ "content": "|<EXTRA_TOKENS_67>|",
566
+ "lstrip": false,
567
+ "normalized": false,
568
+ "rstrip": false,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "151714": {
573
+ "content": "|<EXTRA_TOKENS_68>|",
574
+ "lstrip": false,
575
+ "normalized": false,
576
+ "rstrip": false,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "151715": {
581
+ "content": "|<EXTRA_TOKENS_69>|",
582
+ "lstrip": false,
583
+ "normalized": false,
584
+ "rstrip": false,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "151716": {
589
+ "content": "|<EXTRA_TOKENS_70>|",
590
+ "lstrip": false,
591
+ "normalized": false,
592
+ "rstrip": false,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "151717": {
597
+ "content": "|<EXTRA_TOKENS_71>|",
598
+ "lstrip": false,
599
+ "normalized": false,
600
+ "rstrip": false,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "151718": {
605
+ "content": "|<EXTRA_TOKENS_72>|",
606
+ "lstrip": false,
607
+ "normalized": false,
608
+ "rstrip": false,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "151719": {
613
+ "content": "|<EXTRA_TOKENS_73>|",
614
+ "lstrip": false,
615
+ "normalized": false,
616
+ "rstrip": false,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "151720": {
621
+ "content": "|<EXTRA_TOKENS_74>|",
622
+ "lstrip": false,
623
+ "normalized": false,
624
+ "rstrip": false,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "151721": {
629
+ "content": "|<EXTRA_TOKENS_75>|",
630
+ "lstrip": false,
631
+ "normalized": false,
632
+ "rstrip": false,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "151722": {
637
+ "content": "|<EXTRA_TOKENS_76>|",
638
+ "lstrip": false,
639
+ "normalized": false,
640
+ "rstrip": false,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "151723": {
645
+ "content": "|<EXTRA_TOKENS_77>|",
646
+ "lstrip": false,
647
+ "normalized": false,
648
+ "rstrip": false,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "151724": {
653
+ "content": "|<EXTRA_TOKENS_78>|",
654
+ "lstrip": false,
655
+ "normalized": false,
656
+ "rstrip": false,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "151725": {
661
+ "content": "|<EXTRA_TOKENS_79>|",
662
+ "lstrip": false,
663
+ "normalized": false,
664
+ "rstrip": false,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "151726": {
669
+ "content": "|<EXTRA_TOKENS_80>|",
670
+ "lstrip": false,
671
+ "normalized": false,
672
+ "rstrip": false,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "151727": {
677
+ "content": "|<EXTRA_TOKENS_81>|",
678
+ "lstrip": false,
679
+ "normalized": false,
680
+ "rstrip": false,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "151728": {
685
+ "content": "|<EXTRA_TOKENS_82>|",
686
+ "lstrip": false,
687
+ "normalized": false,
688
+ "rstrip": false,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "151729": {
693
+ "content": "|<EXTRA_TOKENS_83>|",
694
+ "lstrip": false,
695
+ "normalized": false,
696
+ "rstrip": false,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "151730": {
701
+ "content": "|<EXTRA_TOKENS_84>|",
702
+ "lstrip": false,
703
+ "normalized": false,
704
+ "rstrip": false,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "151731": {
709
+ "content": "|<EXTRA_TOKENS_85>|",
710
+ "lstrip": false,
711
+ "normalized": false,
712
+ "rstrip": false,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "151732": {
717
+ "content": "|<EXTRA_TOKENS_86>|",
718
+ "lstrip": false,
719
+ "normalized": false,
720
+ "rstrip": false,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "151733": {
725
+ "content": "|<EXTRA_TOKENS_87>|",
726
+ "lstrip": false,
727
+ "normalized": false,
728
+ "rstrip": false,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "151734": {
733
+ "content": "|<EXTRA_TOKENS_88>|",
734
+ "lstrip": false,
735
+ "normalized": false,
736
+ "rstrip": false,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "151735": {
741
+ "content": "|<EXTRA_TOKENS_89>|",
742
+ "lstrip": false,
743
+ "normalized": false,
744
+ "rstrip": false,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "151736": {
749
+ "content": "|<EXTRA_TOKENS_90>|",
750
+ "lstrip": false,
751
+ "normalized": false,
752
+ "rstrip": false,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "151737": {
757
+ "content": "|<EXTRA_TOKENS_91>|",
758
+ "lstrip": false,
759
+ "normalized": false,
760
+ "rstrip": false,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "151738": {
765
+ "content": "|<EXTRA_TOKENS_92>|",
766
+ "lstrip": false,
767
+ "normalized": false,
768
+ "rstrip": false,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "151739": {
773
+ "content": "|<EXTRA_TOKENS_93>|",
774
+ "lstrip": false,
775
+ "normalized": false,
776
+ "rstrip": false,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "151740": {
781
+ "content": "|<EXTRA_TOKENS_94>|",
782
+ "lstrip": false,
783
+ "normalized": false,
784
+ "rstrip": false,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "151741": {
789
+ "content": "|<EXTRA_TOKENS_95>|",
790
+ "lstrip": false,
791
+ "normalized": false,
792
+ "rstrip": false,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "151742": {
797
+ "content": "|<EXTRA_TOKENS_96>|",
798
+ "lstrip": false,
799
+ "normalized": false,
800
+ "rstrip": false,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "151743": {
805
+ "content": "|<EXTRA_TOKENS_97>|",
806
+ "lstrip": false,
807
+ "normalized": false,
808
+ "rstrip": false,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "151744": {
813
+ "content": "|<EXTRA_TOKENS_98>|",
814
+ "lstrip": false,
815
+ "normalized": false,
816
+ "rstrip": false,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "151745": {
821
+ "content": "|<EXTRA_TOKENS_99>|",
822
+ "lstrip": false,
823
+ "normalized": false,
824
+ "rstrip": false,
825
+ "single_word": false,
826
+ "special": true
827
+ },
828
+ "151746": {
829
+ "content": "|<EXTRA_TOKENS_100>|",
830
+ "lstrip": false,
831
+ "normalized": false,
832
+ "rstrip": false,
833
+ "single_word": false,
834
+ "special": true
835
+ },
836
+ "151747": {
837
+ "content": "|<EXTRA_TOKENS_101>|",
838
+ "lstrip": false,
839
+ "normalized": false,
840
+ "rstrip": false,
841
+ "single_word": false,
842
+ "special": true
843
+ },
844
+ "151748": {
845
+ "content": "|<EXTRA_TOKENS_102>|",
846
+ "lstrip": false,
847
+ "normalized": false,
848
+ "rstrip": false,
849
+ "single_word": false,
850
+ "special": true
851
+ },
852
+ "151749": {
853
+ "content": "|<EXTRA_TOKENS_103>|",
854
+ "lstrip": false,
855
+ "normalized": false,
856
+ "rstrip": false,
857
+ "single_word": false,
858
+ "special": true
859
+ },
860
+ "151750": {
861
+ "content": "|<EXTRA_TOKENS_104>|",
862
+ "lstrip": false,
863
+ "normalized": false,
864
+ "rstrip": false,
865
+ "single_word": false,
866
+ "special": true
867
+ },
868
+ "151751": {
869
+ "content": "|<EXTRA_TOKENS_105>|",
870
+ "lstrip": false,
871
+ "normalized": false,
872
+ "rstrip": false,
873
+ "single_word": false,
874
+ "special": true
875
+ },
876
+ "151752": {
877
+ "content": "|<EXTRA_TOKENS_106>|",
878
+ "lstrip": false,
879
+ "normalized": false,
880
+ "rstrip": false,
881
+ "single_word": false,
882
+ "special": true
883
+ },
884
+ "151753": {
885
+ "content": "|<EXTRA_TOKENS_107>|",
886
+ "lstrip": false,
887
+ "normalized": false,
888
+ "rstrip": false,
889
+ "single_word": false,
890
+ "special": true
891
+ },
892
+ "151754": {
893
+ "content": "|<EXTRA_TOKENS_108>|",
894
+ "lstrip": false,
895
+ "normalized": false,
896
+ "rstrip": false,
897
+ "single_word": false,
898
+ "special": true
899
+ },
900
+ "151755": {
901
+ "content": "|<EXTRA_TOKENS_109>|",
902
+ "lstrip": false,
903
+ "normalized": false,
904
+ "rstrip": false,
905
+ "single_word": false,
906
+ "special": true
907
+ },
908
+ "151756": {
909
+ "content": "|<EXTRA_TOKENS_110>|",
910
+ "lstrip": false,
911
+ "normalized": false,
912
+ "rstrip": false,
913
+ "single_word": false,
914
+ "special": true
915
+ },
916
+ "151757": {
917
+ "content": "|<EXTRA_TOKENS_111>|",
918
+ "lstrip": false,
919
+ "normalized": false,
920
+ "rstrip": false,
921
+ "single_word": false,
922
+ "special": true
923
+ },
924
+ "151758": {
925
+ "content": "|<EXTRA_TOKENS_112>|",
926
+ "lstrip": false,
927
+ "normalized": false,
928
+ "rstrip": false,
929
+ "single_word": false,
930
+ "special": true
931
+ },
932
+ "151759": {
933
+ "content": "|<EXTRA_TOKENS_113>|",
934
+ "lstrip": false,
935
+ "normalized": false,
936
+ "rstrip": false,
937
+ "single_word": false,
938
+ "special": true
939
+ },
940
+ "151760": {
941
+ "content": "|<EXTRA_TOKENS_114>|",
942
+ "lstrip": false,
943
+ "normalized": false,
944
+ "rstrip": false,
945
+ "single_word": false,
946
+ "special": true
947
+ },
948
+ "151761": {
949
+ "content": "|<EXTRA_TOKENS_115>|",
950
+ "lstrip": false,
951
+ "normalized": false,
952
+ "rstrip": false,
953
+ "single_word": false,
954
+ "special": true
955
+ },
956
+ "151762": {
957
+ "content": "|<EXTRA_TOKENS_116>|",
958
+ "lstrip": false,
959
+ "normalized": false,
960
+ "rstrip": false,
961
+ "single_word": false,
962
+ "special": true
963
+ },
964
+ "151763": {
965
+ "content": "|<EXTRA_TOKENS_117>|",
966
+ "lstrip": false,
967
+ "normalized": false,
968
+ "rstrip": false,
969
+ "single_word": false,
970
+ "special": true
971
+ },
972
+ "151764": {
973
+ "content": "|<EXTRA_TOKENS_118>|",
974
+ "lstrip": false,
975
+ "normalized": false,
976
+ "rstrip": false,
977
+ "single_word": false,
978
+ "special": true
979
+ },
980
+ "151765": {
981
+ "content": "|<EXTRA_TOKENS_119>|",
982
+ "lstrip": false,
983
+ "normalized": false,
984
+ "rstrip": false,
985
+ "single_word": false,
986
+ "special": true
987
+ },
988
+ "151766": {
989
+ "content": "|<EXTRA_TOKENS_120>|",
990
+ "lstrip": false,
991
+ "normalized": false,
992
+ "rstrip": false,
993
+ "single_word": false,
994
+ "special": true
995
+ },
996
+ "151767": {
997
+ "content": "|<EXTRA_TOKENS_121>|",
998
+ "lstrip": false,
999
+ "normalized": false,
1000
+ "rstrip": false,
1001
+ "single_word": false,
1002
+ "special": true
1003
+ },
1004
+ "151768": {
1005
+ "content": "|<EXTRA_TOKENS_122>|",
1006
+ "lstrip": false,
1007
+ "normalized": false,
1008
+ "rstrip": false,
1009
+ "single_word": false,
1010
+ "special": true
1011
+ },
1012
+ "151769": {
1013
+ "content": "|<EXTRA_TOKENS_123>|",
1014
+ "lstrip": false,
1015
+ "normalized": false,
1016
+ "rstrip": false,
1017
+ "single_word": false,
1018
+ "special": true
1019
+ },
1020
+ "151770": {
1021
+ "content": "|<EXTRA_TOKENS_124>|",
1022
+ "lstrip": false,
1023
+ "normalized": false,
1024
+ "rstrip": false,
1025
+ "single_word": false,
1026
+ "special": true
1027
+ },
1028
+ "151771": {
1029
+ "content": "|<EXTRA_TOKENS_125>|",
1030
+ "lstrip": false,
1031
+ "normalized": false,
1032
+ "rstrip": false,
1033
+ "single_word": false,
1034
+ "special": true
1035
+ },
1036
+ "151772": {
1037
+ "content": "|<EXTRA_TOKENS_126>|",
1038
+ "lstrip": false,
1039
+ "normalized": false,
1040
+ "rstrip": false,
1041
+ "single_word": false,
1042
+ "special": true
1043
+ },
1044
+ "151773": {
1045
+ "content": "|<EXTRA_TOKENS_127>|",
1046
+ "lstrip": false,
1047
+ "normalized": false,
1048
+ "rstrip": false,
1049
+ "single_word": false,
1050
+ "special": true
1051
+ },
1052
+ "151774": {
1053
+ "content": "|<EXTRA_TOKENS_128>|",
1054
+ "lstrip": false,
1055
+ "normalized": false,
1056
+ "rstrip": false,
1057
+ "single_word": false,
1058
+ "special": true
1059
+ },
1060
+ "151775": {
1061
+ "content": "|<EXTRA_TOKENS_129>|",
1062
+ "lstrip": false,
1063
+ "normalized": false,
1064
+ "rstrip": false,
1065
+ "single_word": false,
1066
+ "special": true
1067
+ },
1068
+ "151776": {
1069
+ "content": "|<EXTRA_TOKENS_130>|",
1070
+ "lstrip": false,
1071
+ "normalized": false,
1072
+ "rstrip": false,
1073
+ "single_word": false,
1074
+ "special": true
1075
+ },
1076
+ "151777": {
1077
+ "content": "|<EXTRA_TOKENS_131>|",
1078
+ "lstrip": false,
1079
+ "normalized": false,
1080
+ "rstrip": false,
1081
+ "single_word": false,
1082
+ "special": true
1083
+ },
1084
+ "151778": {
1085
+ "content": "|<EXTRA_TOKENS_132>|",
1086
+ "lstrip": false,
1087
+ "normalized": false,
1088
+ "rstrip": false,
1089
+ "single_word": false,
1090
+ "special": true
1091
+ },
1092
+ "151779": {
1093
+ "content": "|<EXTRA_TOKENS_133>|",
1094
+ "lstrip": false,
1095
+ "normalized": false,
1096
+ "rstrip": false,
1097
+ "single_word": false,
1098
+ "special": true
1099
+ },
1100
+ "151780": {
1101
+ "content": "|<EXTRA_TOKENS_134>|",
1102
+ "lstrip": false,
1103
+ "normalized": false,
1104
+ "rstrip": false,
1105
+ "single_word": false,
1106
+ "special": true
1107
+ },
1108
+ "151781": {
1109
+ "content": "|<EXTRA_TOKENS_135>|",
1110
+ "lstrip": false,
1111
+ "normalized": false,
1112
+ "rstrip": false,
1113
+ "single_word": false,
1114
+ "special": true
1115
+ },
1116
+ "151782": {
1117
+ "content": "|<EXTRA_TOKENS_136>|",
1118
+ "lstrip": false,
1119
+ "normalized": false,
1120
+ "rstrip": false,
1121
+ "single_word": false,
1122
+ "special": true
1123
+ },
1124
+ "151783": {
1125
+ "content": "|<EXTRA_TOKENS_137>|",
1126
+ "lstrip": false,
1127
+ "normalized": false,
1128
+ "rstrip": false,
1129
+ "single_word": false,
1130
+ "special": true
1131
+ },
1132
+ "151784": {
1133
+ "content": "|<EXTRA_TOKENS_138>|",
1134
+ "lstrip": false,
1135
+ "normalized": false,
1136
+ "rstrip": false,
1137
+ "single_word": false,
1138
+ "special": true
1139
+ },
1140
+ "151785": {
1141
+ "content": "|<EXTRA_TOKENS_139>|",
1142
+ "lstrip": false,
1143
+ "normalized": false,
1144
+ "rstrip": false,
1145
+ "single_word": false,
1146
+ "special": true
1147
+ },
1148
+ "151786": {
1149
+ "content": "|<EXTRA_TOKENS_140>|",
1150
+ "lstrip": false,
1151
+ "normalized": false,
1152
+ "rstrip": false,
1153
+ "single_word": false,
1154
+ "special": true
1155
+ },
1156
+ "151787": {
1157
+ "content": "|<EXTRA_TOKENS_141>|",
1158
+ "lstrip": false,
1159
+ "normalized": false,
1160
+ "rstrip": false,
1161
+ "single_word": false,
1162
+ "special": true
1163
+ },
1164
+ "151788": {
1165
+ "content": "|<EXTRA_TOKENS_142>|",
1166
+ "lstrip": false,
1167
+ "normalized": false,
1168
+ "rstrip": false,
1169
+ "single_word": false,
1170
+ "special": true
1171
+ },
1172
+ "151789": {
1173
+ "content": "|<EXTRA_TOKENS_143>|",
1174
+ "lstrip": false,
1175
+ "normalized": false,
1176
+ "rstrip": false,
1177
+ "single_word": false,
1178
+ "special": true
1179
+ },
1180
+ "151790": {
1181
+ "content": "|<EXTRA_TOKENS_144>|",
1182
+ "lstrip": false,
1183
+ "normalized": false,
1184
+ "rstrip": false,
1185
+ "single_word": false,
1186
+ "special": true
1187
+ },
1188
+ "151791": {
1189
+ "content": "|<EXTRA_TOKENS_145>|",
1190
+ "lstrip": false,
1191
+ "normalized": false,
1192
+ "rstrip": false,
1193
+ "single_word": false,
1194
+ "special": true
1195
+ },
1196
+ "151792": {
1197
+ "content": "|<EXTRA_TOKENS_146>|",
1198
+ "lstrip": false,
1199
+ "normalized": false,
1200
+ "rstrip": false,
1201
+ "single_word": false,
1202
+ "special": true
1203
+ },
1204
+ "151793": {
1205
+ "content": "|<EXTRA_TOKENS_147>|",
1206
+ "lstrip": false,
1207
+ "normalized": false,
1208
+ "rstrip": false,
1209
+ "single_word": false,
1210
+ "special": true
1211
+ },
1212
+ "151794": {
1213
+ "content": "|<EXTRA_TOKENS_148>|",
1214
+ "lstrip": false,
1215
+ "normalized": false,
1216
+ "rstrip": false,
1217
+ "single_word": false,
1218
+ "special": true
1219
+ },
1220
+ "151795": {
1221
+ "content": "|<EXTRA_TOKENS_149>|",
1222
+ "lstrip": false,
1223
+ "normalized": false,
1224
+ "rstrip": false,
1225
+ "single_word": false,
1226
+ "special": true
1227
+ },
1228
+ "151796": {
1229
+ "content": "|<EXTRA_TOKENS_150>|",
1230
+ "lstrip": false,
1231
+ "normalized": false,
1232
+ "rstrip": false,
1233
+ "single_word": false,
1234
+ "special": true
1235
+ },
1236
+ "151797": {
1237
+ "content": "|<EXTRA_TOKENS_151>|",
1238
+ "lstrip": false,
1239
+ "normalized": false,
1240
+ "rstrip": false,
1241
+ "single_word": false,
1242
+ "special": true
1243
+ },
1244
+ "151798": {
1245
+ "content": "|<EXTRA_TOKENS_152>|",
1246
+ "lstrip": false,
1247
+ "normalized": false,
1248
+ "rstrip": false,
1249
+ "single_word": false,
1250
+ "special": true
1251
+ },
1252
+ "151799": {
1253
+ "content": "|<EXTRA_TOKENS_153>|",
1254
+ "lstrip": false,
1255
+ "normalized": false,
1256
+ "rstrip": false,
1257
+ "single_word": false,
1258
+ "special": true
1259
+ },
1260
+ "151800": {
1261
+ "content": "|<EXTRA_TOKENS_154>|",
1262
+ "lstrip": false,
1263
+ "normalized": false,
1264
+ "rstrip": false,
1265
+ "single_word": false,
1266
+ "special": true
1267
+ },
1268
+ "151801": {
1269
+ "content": "|<EXTRA_TOKENS_155>|",
1270
+ "lstrip": false,
1271
+ "normalized": false,
1272
+ "rstrip": false,
1273
+ "single_word": false,
1274
+ "special": true
1275
+ },
1276
+ "151802": {
1277
+ "content": "|<EXTRA_TOKENS_156>|",
1278
+ "lstrip": false,
1279
+ "normalized": false,
1280
+ "rstrip": false,
1281
+ "single_word": false,
1282
+ "special": true
1283
+ },
1284
+ "151803": {
1285
+ "content": "|<EXTRA_TOKENS_157>|",
1286
+ "lstrip": false,
1287
+ "normalized": false,
1288
+ "rstrip": false,
1289
+ "single_word": false,
1290
+ "special": true
1291
+ },
1292
+ "151804": {
1293
+ "content": "|<EXTRA_TOKENS_158>|",
1294
+ "lstrip": false,
1295
+ "normalized": false,
1296
+ "rstrip": false,
1297
+ "single_word": false,
1298
+ "special": true
1299
+ },
1300
+ "151805": {
1301
+ "content": "|<EXTRA_TOKENS_159>|",
1302
+ "lstrip": false,
1303
+ "normalized": false,
1304
+ "rstrip": false,
1305
+ "single_word": false,
1306
+ "special": true
1307
+ },
1308
+ "151806": {
1309
+ "content": "|<EXTRA_TOKENS_160>|",
1310
+ "lstrip": false,
1311
+ "normalized": false,
1312
+ "rstrip": false,
1313
+ "single_word": false,
1314
+ "special": true
1315
+ },
1316
+ "151807": {
1317
+ "content": "|<EXTRA_TOKENS_161>|",
1318
+ "lstrip": false,
1319
+ "normalized": false,
1320
+ "rstrip": false,
1321
+ "single_word": false,
1322
+ "special": true
1323
+ },
1324
+ "151808": {
1325
+ "content": "|<EXTRA_TOKENS_162>|",
1326
+ "lstrip": false,
1327
+ "normalized": false,
1328
+ "rstrip": false,
1329
+ "single_word": false,
1330
+ "special": true
1331
+ },
1332
+ "151809": {
1333
+ "content": "|<EXTRA_TOKENS_163>|",
1334
+ "lstrip": false,
1335
+ "normalized": false,
1336
+ "rstrip": false,
1337
+ "single_word": false,
1338
+ "special": true
1339
+ },
1340
+ "151810": {
1341
+ "content": "|<EXTRA_TOKENS_164>|",
1342
+ "lstrip": false,
1343
+ "normalized": false,
1344
+ "rstrip": false,
1345
+ "single_word": false,
1346
+ "special": true
1347
+ },
1348
+ "151811": {
1349
+ "content": "|<EXTRA_TOKENS_165>|",
1350
+ "lstrip": false,
1351
+ "normalized": false,
1352
+ "rstrip": false,
1353
+ "single_word": false,
1354
+ "special": true
1355
+ },
1356
+ "151812": {
1357
+ "content": "|<EXTRA_TOKENS_166>|",
1358
+ "lstrip": false,
1359
+ "normalized": false,
1360
+ "rstrip": false,
1361
+ "single_word": false,
1362
+ "special": true
1363
+ },
1364
+ "151813": {
1365
+ "content": "|<EXTRA_TOKENS_167>|",
1366
+ "lstrip": false,
1367
+ "normalized": false,
1368
+ "rstrip": false,
1369
+ "single_word": false,
1370
+ "special": true
1371
+ },
1372
+ "151814": {
1373
+ "content": "|<EXTRA_TOKENS_168>|",
1374
+ "lstrip": false,
1375
+ "normalized": false,
1376
+ "rstrip": false,
1377
+ "single_word": false,
1378
+ "special": true
1379
+ },
1380
+ "151815": {
1381
+ "content": "|<EXTRA_TOKENS_169>|",
1382
+ "lstrip": false,
1383
+ "normalized": false,
1384
+ "rstrip": false,
1385
+ "single_word": false,
1386
+ "special": true
1387
+ },
1388
+ "151816": {
1389
+ "content": "|<EXTRA_TOKENS_170>|",
1390
+ "lstrip": false,
1391
+ "normalized": false,
1392
+ "rstrip": false,
1393
+ "single_word": false,
1394
+ "special": true
1395
+ },
1396
+ "151817": {
1397
+ "content": "|<EXTRA_TOKENS_171>|",
1398
+ "lstrip": false,
1399
+ "normalized": false,
1400
+ "rstrip": false,
1401
+ "single_word": false,
1402
+ "special": true
1403
+ },
1404
+ "151818": {
1405
+ "content": "|<EXTRA_TOKENS_172>|",
1406
+ "lstrip": false,
1407
+ "normalized": false,
1408
+ "rstrip": false,
1409
+ "single_word": false,
1410
+ "special": true
1411
+ },
1412
+ "151819": {
1413
+ "content": "|<EXTRA_TOKENS_173>|",
1414
+ "lstrip": false,
1415
+ "normalized": false,
1416
+ "rstrip": false,
1417
+ "single_word": false,
1418
+ "special": true
1419
+ },
1420
+ "151820": {
1421
+ "content": "|<EXTRA_TOKENS_174>|",
1422
+ "lstrip": false,
1423
+ "normalized": false,
1424
+ "rstrip": false,
1425
+ "single_word": false,
1426
+ "special": true
1427
+ },
1428
+ "151821": {
1429
+ "content": "|<EXTRA_TOKENS_175>|",
1430
+ "lstrip": false,
1431
+ "normalized": false,
1432
+ "rstrip": false,
1433
+ "single_word": false,
1434
+ "special": true
1435
+ },
1436
+ "151822": {
1437
+ "content": "|<EXTRA_TOKENS_176>|",
1438
+ "lstrip": false,
1439
+ "normalized": false,
1440
+ "rstrip": false,
1441
+ "single_word": false,
1442
+ "special": true
1443
+ },
1444
+ "151823": {
1445
+ "content": "|<EXTRA_TOKENS_177>|",
1446
+ "lstrip": false,
1447
+ "normalized": false,
1448
+ "rstrip": false,
1449
+ "single_word": false,
1450
+ "special": true
1451
+ },
1452
+ "151824": {
1453
+ "content": "|<EXTRA_TOKENS_178>|",
1454
+ "lstrip": false,
1455
+ "normalized": false,
1456
+ "rstrip": false,
1457
+ "single_word": false,
1458
+ "special": true
1459
+ },
1460
+ "151825": {
1461
+ "content": "|<EXTRA_TOKENS_179>|",
1462
+ "lstrip": false,
1463
+ "normalized": false,
1464
+ "rstrip": false,
1465
+ "single_word": false,
1466
+ "special": true
1467
+ },
1468
+ "151826": {
1469
+ "content": "|<EXTRA_TOKENS_180>|",
1470
+ "lstrip": false,
1471
+ "normalized": false,
1472
+ "rstrip": false,
1473
+ "single_word": false,
1474
+ "special": true
1475
+ },
1476
+ "151827": {
1477
+ "content": "|<EXTRA_TOKENS_181>|",
1478
+ "lstrip": false,
1479
+ "normalized": false,
1480
+ "rstrip": false,
1481
+ "single_word": false,
1482
+ "special": true
1483
+ },
1484
+ "151828": {
1485
+ "content": "|<EXTRA_TOKENS_182>|",
1486
+ "lstrip": false,
1487
+ "normalized": false,
1488
+ "rstrip": false,
1489
+ "single_word": false,
1490
+ "special": true
1491
+ },
1492
+ "151829": {
1493
+ "content": "|<EXTRA_TOKENS_183>|",
1494
+ "lstrip": false,
1495
+ "normalized": false,
1496
+ "rstrip": false,
1497
+ "single_word": false,
1498
+ "special": true
1499
+ },
1500
+ "151830": {
1501
+ "content": "|<EXTRA_TOKENS_184>|",
1502
+ "lstrip": false,
1503
+ "normalized": false,
1504
+ "rstrip": false,
1505
+ "single_word": false,
1506
+ "special": true
1507
+ },
1508
+ "151831": {
1509
+ "content": "|<EXTRA_TOKENS_185>|",
1510
+ "lstrip": false,
1511
+ "normalized": false,
1512
+ "rstrip": false,
1513
+ "single_word": false,
1514
+ "special": true
1515
+ },
1516
+ "151832": {
1517
+ "content": "|<EXTRA_TOKENS_186>|",
1518
+ "lstrip": false,
1519
+ "normalized": false,
1520
+ "rstrip": false,
1521
+ "single_word": false,
1522
+ "special": true
1523
+ },
1524
+ "151833": {
1525
+ "content": "|<EXTRA_TOKENS_187>|",
1526
+ "lstrip": false,
1527
+ "normalized": false,
1528
+ "rstrip": false,
1529
+ "single_word": false,
1530
+ "special": true
1531
+ },
1532
+ "151834": {
1533
+ "content": "|<EXTRA_TOKENS_188>|",
1534
+ "lstrip": false,
1535
+ "normalized": false,
1536
+ "rstrip": false,
1537
+ "single_word": false,
1538
+ "special": true
1539
+ },
1540
+ "151835": {
1541
+ "content": "|<EXTRA_TOKENS_189>|",
1542
+ "lstrip": false,
1543
+ "normalized": false,
1544
+ "rstrip": false,
1545
+ "single_word": false,
1546
+ "special": true
1547
+ },
1548
+ "151836": {
1549
+ "content": "|<EXTRA_TOKENS_190>|",
1550
+ "lstrip": false,
1551
+ "normalized": false,
1552
+ "rstrip": false,
1553
+ "single_word": false,
1554
+ "special": true
1555
+ },
1556
+ "151837": {
1557
+ "content": "|<EXTRA_TOKENS_191>|",
1558
+ "lstrip": false,
1559
+ "normalized": false,
1560
+ "rstrip": false,
1561
+ "single_word": false,
1562
+ "special": true
1563
+ },
1564
+ "151838": {
1565
+ "content": "|<EXTRA_TOKENS_192>|",
1566
+ "lstrip": false,
1567
+ "normalized": false,
1568
+ "rstrip": false,
1569
+ "single_word": false,
1570
+ "special": true
1571
+ },
1572
+ "151839": {
1573
+ "content": "|<EXTRA_TOKENS_193>|",
1574
+ "lstrip": false,
1575
+ "normalized": false,
1576
+ "rstrip": false,
1577
+ "single_word": false,
1578
+ "special": true
1579
+ },
1580
+ "151840": {
1581
+ "content": "|<EXTRA_TOKENS_194>|",
1582
+ "lstrip": false,
1583
+ "normalized": false,
1584
+ "rstrip": false,
1585
+ "single_word": false,
1586
+ "special": true
1587
+ },
1588
+ "151841": {
1589
+ "content": "|<EXTRA_TOKENS_195>|",
1590
+ "lstrip": false,
1591
+ "normalized": false,
1592
+ "rstrip": false,
1593
+ "single_word": false,
1594
+ "special": true
1595
+ },
1596
+ "151842": {
1597
+ "content": "|<EXTRA_TOKENS_196>|",
1598
+ "lstrip": false,
1599
+ "normalized": false,
1600
+ "rstrip": false,
1601
+ "single_word": false,
1602
+ "special": true
1603
+ },
1604
+ "151843": {
1605
+ "content": "|<EXTRA_TOKENS_197>|",
1606
+ "lstrip": false,
1607
+ "normalized": false,
1608
+ "rstrip": false,
1609
+ "single_word": false,
1610
+ "special": true
1611
+ },
1612
+ "151844": {
1613
+ "content": "|<EXTRA_TOKENS_198>|",
1614
+ "lstrip": false,
1615
+ "normalized": false,
1616
+ "rstrip": false,
1617
+ "single_word": false,
1618
+ "special": true
1619
+ },
1620
+ "151845": {
1621
+ "content": "|<EXTRA_TOKENS_199>|",
1622
+ "lstrip": false,
1623
+ "normalized": false,
1624
+ "rstrip": false,
1625
+ "single_word": false,
1626
+ "special": true
1627
+ },
1628
+ "151846": {
1629
+ "content": "|<EXTRA_TOKENS_200>|",
1630
+ "lstrip": false,
1631
+ "normalized": false,
1632
+ "rstrip": false,
1633
+ "single_word": false,
1634
+ "special": true
1635
+ },
1636
+ "151847": {
1637
+ "content": "|<EXTRA_TOKENS_201>|",
1638
+ "lstrip": false,
1639
+ "normalized": false,
1640
+ "rstrip": false,
1641
+ "single_word": false,
1642
+ "special": true
1643
+ },
1644
+ "151848": {
1645
+ "content": "|<EXTRA_TOKENS_202>|",
1646
+ "lstrip": false,
1647
+ "normalized": false,
1648
+ "rstrip": false,
1649
+ "single_word": false,
1650
+ "special": true
1651
+ },
1652
+ "151849": {
1653
+ "content": "|<EXTRA_TOKENS_203>|",
1654
+ "lstrip": false,
1655
+ "normalized": false,
1656
+ "rstrip": false,
1657
+ "single_word": false,
1658
+ "special": true
1659
+ },
1660
+ "151850": {
1661
+ "content": "|<EXTRA_TOKENS_204>|",
1662
+ "lstrip": false,
1663
+ "normalized": false,
1664
+ "rstrip": false,
1665
+ "single_word": false,
1666
+ "special": true
1667
+ },
1668
+ "151851": {
1669
+ "content": "|<EXTRA_TOKENS_205>|",
1670
+ "lstrip": false,
1671
+ "normalized": false,
1672
+ "rstrip": false,
1673
+ "single_word": false,
1674
+ "special": true
1675
+ },
1676
+ "151852": {
1677
+ "content": "|<EXTRA_TOKENS_206>|",
1678
+ "lstrip": false,
1679
+ "normalized": false,
1680
+ "rstrip": false,
1681
+ "single_word": false,
1682
+ "special": true
1683
+ },
1684
+ "151853": {
1685
+ "content": "|<EXTRA_TOKENS_207>|",
1686
+ "lstrip": false,
1687
+ "normalized": false,
1688
+ "rstrip": false,
1689
+ "single_word": false,
1690
+ "special": true
1691
+ },
1692
+ "151854": {
1693
+ "content": "|<EXTRA_TOKENS_208>|",
1694
+ "lstrip": false,
1695
+ "normalized": false,
1696
+ "rstrip": false,
1697
+ "single_word": false,
1698
+ "special": true
1699
+ },
1700
+ "151855": {
1701
+ "content": "|<EXTRA_TOKENS_209>|",
1702
+ "lstrip": false,
1703
+ "normalized": false,
1704
+ "rstrip": false,
1705
+ "single_word": false,
1706
+ "special": true
1707
+ },
1708
+ "151856": {
1709
+ "content": "|<EXTRA_TOKENS_210>|",
1710
+ "lstrip": false,
1711
+ "normalized": false,
1712
+ "rstrip": false,
1713
+ "single_word": false,
1714
+ "special": true
1715
+ },
1716
+ "151857": {
1717
+ "content": "|<EXTRA_TOKENS_211>|",
1718
+ "lstrip": false,
1719
+ "normalized": false,
1720
+ "rstrip": false,
1721
+ "single_word": false,
1722
+ "special": true
1723
+ },
1724
+ "151858": {
1725
+ "content": "|<EXTRA_TOKENS_212>|",
1726
+ "lstrip": false,
1727
+ "normalized": false,
1728
+ "rstrip": false,
1729
+ "single_word": false,
1730
+ "special": true
1731
+ },
1732
+ "151859": {
1733
+ "content": "|<EXTRA_TOKENS_213>|",
1734
+ "lstrip": false,
1735
+ "normalized": false,
1736
+ "rstrip": false,
1737
+ "single_word": false,
1738
+ "special": true
1739
+ },
1740
+ "151860": {
1741
+ "content": "|<EXTRA_TOKENS_214>|",
1742
+ "lstrip": false,
1743
+ "normalized": false,
1744
+ "rstrip": false,
1745
+ "single_word": false,
1746
+ "special": true
1747
+ },
1748
+ "151861": {
1749
+ "content": "|<EXTRA_TOKENS_215>|",
1750
+ "lstrip": false,
1751
+ "normalized": false,
1752
+ "rstrip": false,
1753
+ "single_word": false,
1754
+ "special": true
1755
+ },
1756
+ "151862": {
1757
+ "content": "|<EXTRA_TOKENS_216>|",
1758
+ "lstrip": false,
1759
+ "normalized": false,
1760
+ "rstrip": false,
1761
+ "single_word": false,
1762
+ "special": true
1763
+ },
1764
+ "151863": {
1765
+ "content": "|<EXTRA_TOKENS_217>|",
1766
+ "lstrip": false,
1767
+ "normalized": false,
1768
+ "rstrip": false,
1769
+ "single_word": false,
1770
+ "special": true
1771
+ },
1772
+ "151864": {
1773
+ "content": "|<EXTRA_TOKENS_218>|",
1774
+ "lstrip": false,
1775
+ "normalized": false,
1776
+ "rstrip": false,
1777
+ "single_word": false,
1778
+ "special": true
1779
+ },
1780
+ "151865": {
1781
+ "content": "|<EXTRA_TOKENS_219>|",
1782
+ "lstrip": false,
1783
+ "normalized": false,
1784
+ "rstrip": false,
1785
+ "single_word": false,
1786
+ "special": true
1787
+ },
1788
+ "151866": {
1789
+ "content": "|<EXTRA_TOKENS_220>|",
1790
+ "lstrip": false,
1791
+ "normalized": false,
1792
+ "rstrip": false,
1793
+ "single_word": false,
1794
+ "special": true
1795
+ },
1796
+ "151867": {
1797
+ "content": "|<EXTRA_TOKENS_221>|",
1798
+ "lstrip": false,
1799
+ "normalized": false,
1800
+ "rstrip": false,
1801
+ "single_word": false,
1802
+ "special": true
1803
+ },
1804
+ "151868": {
1805
+ "content": "|<EXTRA_TOKENS_222>|",
1806
+ "lstrip": false,
1807
+ "normalized": false,
1808
+ "rstrip": false,
1809
+ "single_word": false,
1810
+ "special": true
1811
+ },
1812
+ "151869": {
1813
+ "content": "|<EXTRA_TOKENS_223>|",
1814
+ "lstrip": false,
1815
+ "normalized": false,
1816
+ "rstrip": false,
1817
+ "single_word": false,
1818
+ "special": true
1819
+ },
1820
+ "151870": {
1821
+ "content": "|<EXTRA_TOKENS_224>|",
1822
+ "lstrip": false,
1823
+ "normalized": false,
1824
+ "rstrip": false,
1825
+ "single_word": false,
1826
+ "special": true
1827
+ },
1828
+ "151871": {
1829
+ "content": "|<EXTRA_TOKENS_225>|",
1830
+ "lstrip": false,
1831
+ "normalized": false,
1832
+ "rstrip": false,
1833
+ "single_word": false,
1834
+ "special": true
1835
+ },
1836
+ "151872": {
1837
+ "content": "|<EXTRA_TOKENS_226>|",
1838
+ "lstrip": false,
1839
+ "normalized": false,
1840
+ "rstrip": false,
1841
+ "single_word": false,
1842
+ "special": true
1843
+ },
1844
+ "151873": {
1845
+ "content": "|<EXTRA_TOKENS_227>|",
1846
+ "lstrip": false,
1847
+ "normalized": false,
1848
+ "rstrip": false,
1849
+ "single_word": false,
1850
+ "special": true
1851
+ },
1852
+ "151874": {
1853
+ "content": "|<EXTRA_TOKENS_228>|",
1854
+ "lstrip": false,
1855
+ "normalized": false,
1856
+ "rstrip": false,
1857
+ "single_word": false,
1858
+ "special": true
1859
+ },
1860
+ "151875": {
1861
+ "content": "|<EXTRA_TOKENS_229>|",
1862
+ "lstrip": false,
1863
+ "normalized": false,
1864
+ "rstrip": false,
1865
+ "single_word": false,
1866
+ "special": true
1867
+ },
1868
+ "151876": {
1869
+ "content": "|<EXTRA_TOKENS_230>|",
1870
+ "lstrip": false,
1871
+ "normalized": false,
1872
+ "rstrip": false,
1873
+ "single_word": false,
1874
+ "special": true
1875
+ },
1876
+ "151877": {
1877
+ "content": "|<EXTRA_TOKENS_231>|",
1878
+ "lstrip": false,
1879
+ "normalized": false,
1880
+ "rstrip": false,
1881
+ "single_word": false,
1882
+ "special": true
1883
+ },
1884
+ "151878": {
1885
+ "content": "|<EXTRA_TOKENS_232>|",
1886
+ "lstrip": false,
1887
+ "normalized": false,
1888
+ "rstrip": false,
1889
+ "single_word": false,
1890
+ "special": true
1891
+ },
1892
+ "151879": {
1893
+ "content": "|<EXTRA_TOKENS_233>|",
1894
+ "lstrip": false,
1895
+ "normalized": false,
1896
+ "rstrip": false,
1897
+ "single_word": false,
1898
+ "special": true
1899
+ },
1900
+ "151880": {
1901
+ "content": "|<EXTRA_TOKENS_234>|",
1902
+ "lstrip": false,
1903
+ "normalized": false,
1904
+ "rstrip": false,
1905
+ "single_word": false,
1906
+ "special": true
1907
+ },
1908
+ "151881": {
1909
+ "content": "|<EXTRA_TOKENS_235>|",
1910
+ "lstrip": false,
1911
+ "normalized": false,
1912
+ "rstrip": false,
1913
+ "single_word": false,
1914
+ "special": true
1915
+ },
1916
+ "151882": {
1917
+ "content": "|<EXTRA_TOKENS_236>|",
1918
+ "lstrip": false,
1919
+ "normalized": false,
1920
+ "rstrip": false,
1921
+ "single_word": false,
1922
+ "special": true
1923
+ },
1924
+ "151883": {
1925
+ "content": "|<EXTRA_TOKENS_237>|",
1926
+ "lstrip": false,
1927
+ "normalized": false,
1928
+ "rstrip": false,
1929
+ "single_word": false,
1930
+ "special": true
1931
+ },
1932
+ "151884": {
1933
+ "content": "|<EXTRA_TOKENS_238>|",
1934
+ "lstrip": false,
1935
+ "normalized": false,
1936
+ "rstrip": false,
1937
+ "single_word": false,
1938
+ "special": true
1939
+ },
1940
+ "151885": {
1941
+ "content": "|<EXTRA_TOKENS_239>|",
1942
+ "lstrip": false,
1943
+ "normalized": false,
1944
+ "rstrip": false,
1945
+ "single_word": false,
1946
+ "special": true
1947
+ },
1948
+ "151886": {
1949
+ "content": "|<EXTRA_TOKENS_240>|",
1950
+ "lstrip": false,
1951
+ "normalized": false,
1952
+ "rstrip": false,
1953
+ "single_word": false,
1954
+ "special": true
1955
+ },
1956
+ "151887": {
1957
+ "content": "|<EXTRA_TOKENS_241>|",
1958
+ "lstrip": false,
1959
+ "normalized": false,
1960
+ "rstrip": false,
1961
+ "single_word": false,
1962
+ "special": true
1963
+ },
1964
+ "151888": {
1965
+ "content": "|<EXTRA_TOKENS_242>|",
1966
+ "lstrip": false,
1967
+ "normalized": false,
1968
+ "rstrip": false,
1969
+ "single_word": false,
1970
+ "special": true
1971
+ },
1972
+ "151889": {
1973
+ "content": "|<EXTRA_TOKENS_243>|",
1974
+ "lstrip": false,
1975
+ "normalized": false,
1976
+ "rstrip": false,
1977
+ "single_word": false,
1978
+ "special": true
1979
+ },
1980
+ "151890": {
1981
+ "content": "|<EXTRA_TOKENS_244>|",
1982
+ "lstrip": false,
1983
+ "normalized": false,
1984
+ "rstrip": false,
1985
+ "single_word": false,
1986
+ "special": true
1987
+ },
1988
+ "151891": {
1989
+ "content": "|<EXTRA_TOKENS_245>|",
1990
+ "lstrip": false,
1991
+ "normalized": false,
1992
+ "rstrip": false,
1993
+ "single_word": false,
1994
+ "special": true
1995
+ },
1996
+ "151892": {
1997
+ "content": "|<EXTRA_TOKENS_246>|",
1998
+ "lstrip": false,
1999
+ "normalized": false,
2000
+ "rstrip": false,
2001
+ "single_word": false,
2002
+ "special": true
2003
+ },
2004
+ "151893": {
2005
+ "content": "|<EXTRA_TOKENS_247>|",
2006
+ "lstrip": false,
2007
+ "normalized": false,
2008
+ "rstrip": false,
2009
+ "single_word": false,
2010
+ "special": true
2011
+ },
2012
+ "151894": {
2013
+ "content": "|<EXTRA_TOKENS_248>|",
2014
+ "lstrip": false,
2015
+ "normalized": false,
2016
+ "rstrip": false,
2017
+ "single_word": false,
2018
+ "special": true
2019
+ },
2020
+ "151895": {
2021
+ "content": "|<EXTRA_TOKENS_249>|",
2022
+ "lstrip": false,
2023
+ "normalized": false,
2024
+ "rstrip": false,
2025
+ "single_word": false,
2026
+ "special": true
2027
+ },
2028
+ "151896": {
2029
+ "content": "|<EXTRA_TOKENS_250>|",
2030
+ "lstrip": false,
2031
+ "normalized": false,
2032
+ "rstrip": false,
2033
+ "single_word": false,
2034
+ "special": true
2035
+ },
2036
+ "151897": {
2037
+ "content": "|<EXTRA_TOKENS_251>|",
2038
+ "lstrip": false,
2039
+ "normalized": false,
2040
+ "rstrip": false,
2041
+ "single_word": false,
2042
+ "special": true
2043
+ },
2044
+ "151898": {
2045
+ "content": "|<EXTRA_TOKENS_252>|",
2046
+ "lstrip": false,
2047
+ "normalized": false,
2048
+ "rstrip": false,
2049
+ "single_word": false,
2050
+ "special": true
2051
+ },
2052
+ "151899": {
2053
+ "content": "|<EXTRA_TOKENS_253>|",
2054
+ "lstrip": false,
2055
+ "normalized": false,
2056
+ "rstrip": false,
2057
+ "single_word": false,
2058
+ "special": true
2059
+ },
2060
+ "151900": {
2061
+ "content": "|<EXTRA_TOKENS_254>|",
2062
+ "lstrip": false,
2063
+ "normalized": false,
2064
+ "rstrip": false,
2065
+ "single_word": false,
2066
+ "special": true
2067
+ },
2068
+ "151901": {
2069
+ "content": "|<EXTRA_TOKENS_255>|",
2070
+ "lstrip": false,
2071
+ "normalized": false,
2072
+ "rstrip": false,
2073
+ "single_word": false,
2074
+ "special": true
2075
+ },
2076
+ "151902": {
2077
+ "content": "|<EXTRA_TOKENS_256>|",
2078
+ "lstrip": false,
2079
+ "normalized": false,
2080
+ "rstrip": false,
2081
+ "single_word": false,
2082
+ "special": true
2083
+ },
2084
+ "151903": {
2085
+ "content": "|<EXTRA_TOKENS_257>|",
2086
+ "lstrip": false,
2087
+ "normalized": false,
2088
+ "rstrip": false,
2089
+ "single_word": false,
2090
+ "special": true
2091
+ },
2092
+ "151904": {
2093
+ "content": "|<EXTRA_TOKENS_258>|",
2094
+ "lstrip": false,
2095
+ "normalized": false,
2096
+ "rstrip": false,
2097
+ "single_word": false,
2098
+ "special": true
2099
+ },
2100
+ "151905": {
2101
+ "content": "|<EXTRA_TOKENS_259>|",
2102
+ "lstrip": false,
2103
+ "normalized": false,
2104
+ "rstrip": false,
2105
+ "single_word": false,
2106
+ "special": true
2107
+ },
2108
+ "151906": {
2109
+ "content": "|<EXTRA_TOKENS_260>|",
2110
+ "lstrip": false,
2111
+ "normalized": false,
2112
+ "rstrip": false,
2113
+ "single_word": false,
2114
+ "special": true
2115
+ },
2116
+ "151907": {
2117
+ "content": "|<EXTRA_TOKENS_261>|",
2118
+ "lstrip": false,
2119
+ "normalized": false,
2120
+ "rstrip": false,
2121
+ "single_word": false,
2122
+ "special": true
2123
+ },
2124
+ "151908": {
2125
+ "content": "|<EXTRA_TOKENS_262>|",
2126
+ "lstrip": false,
2127
+ "normalized": false,
2128
+ "rstrip": false,
2129
+ "single_word": false,
2130
+ "special": true
2131
+ },
2132
+ "151909": {
2133
+ "content": "|<EXTRA_TOKENS_263>|",
2134
+ "lstrip": false,
2135
+ "normalized": false,
2136
+ "rstrip": false,
2137
+ "single_word": false,
2138
+ "special": true
2139
+ },
2140
+ "151910": {
2141
+ "content": "|<EXTRA_TOKENS_264>|",
2142
+ "lstrip": false,
2143
+ "normalized": false,
2144
+ "rstrip": false,
2145
+ "single_word": false,
2146
+ "special": true
2147
+ },
2148
+ "151911": {
2149
+ "content": "|<EXTRA_TOKENS_265>|",
2150
+ "lstrip": false,
2151
+ "normalized": false,
2152
+ "rstrip": false,
2153
+ "single_word": false,
2154
+ "special": true
2155
+ },
2156
+ "151912": {
2157
+ "content": "|<EXTRA_TOKENS_266>|",
2158
+ "lstrip": false,
2159
+ "normalized": false,
2160
+ "rstrip": false,
2161
+ "single_word": false,
2162
+ "special": true
2163
+ },
2164
+ "151913": {
2165
+ "content": "|<EXTRA_TOKENS_267>|",
2166
+ "lstrip": false,
2167
+ "normalized": false,
2168
+ "rstrip": false,
2169
+ "single_word": false,
2170
+ "special": true
2171
+ },
2172
+ "151914": {
2173
+ "content": "|<EXTRA_TOKENS_268>|",
2174
+ "lstrip": false,
2175
+ "normalized": false,
2176
+ "rstrip": false,
2177
+ "single_word": false,
2178
+ "special": true
2179
+ },
2180
+ "151915": {
2181
+ "content": "|<EXTRA_TOKENS_269>|",
2182
+ "lstrip": false,
2183
+ "normalized": false,
2184
+ "rstrip": false,
2185
+ "single_word": false,
2186
+ "special": true
2187
+ },
2188
+ "151916": {
2189
+ "content": "|<EXTRA_TOKENS_270>|",
2190
+ "lstrip": false,
2191
+ "normalized": false,
2192
+ "rstrip": false,
2193
+ "single_word": false,
2194
+ "special": true
2195
+ },
2196
+ "151917": {
2197
+ "content": "|<EXTRA_TOKENS_271>|",
2198
+ "lstrip": false,
2199
+ "normalized": false,
2200
+ "rstrip": false,
2201
+ "single_word": false,
2202
+ "special": true
2203
+ },
2204
+ "151918": {
2205
+ "content": "|<EXTRA_TOKENS_272>|",
2206
+ "lstrip": false,
2207
+ "normalized": false,
2208
+ "rstrip": false,
2209
+ "single_word": false,
2210
+ "special": true
2211
+ },
2212
+ "151919": {
2213
+ "content": "|<EXTRA_TOKENS_273>|",
2214
+ "lstrip": false,
2215
+ "normalized": false,
2216
+ "rstrip": false,
2217
+ "single_word": false,
2218
+ "special": true
2219
+ },
2220
+ "151920": {
2221
+ "content": "|<EXTRA_TOKENS_274>|",
2222
+ "lstrip": false,
2223
+ "normalized": false,
2224
+ "rstrip": false,
2225
+ "single_word": false,
2226
+ "special": true
2227
+ },
2228
+ "151921": {
2229
+ "content": "|<EXTRA_TOKENS_275>|",
2230
+ "lstrip": false,
2231
+ "normalized": false,
2232
+ "rstrip": false,
2233
+ "single_word": false,
2234
+ "special": true
2235
+ },
2236
+ "151922": {
2237
+ "content": "|<EXTRA_TOKENS_276>|",
2238
+ "lstrip": false,
2239
+ "normalized": false,
2240
+ "rstrip": false,
2241
+ "single_word": false,
2242
+ "special": true
2243
+ },
2244
+ "151923": {
2245
+ "content": "|<EXTRA_TOKENS_277>|",
2246
+ "lstrip": false,
2247
+ "normalized": false,
2248
+ "rstrip": false,
2249
+ "single_word": false,
2250
+ "special": true
2251
+ },
2252
+ "151924": {
2253
+ "content": "|<EXTRA_TOKENS_278>|",
2254
+ "lstrip": false,
2255
+ "normalized": false,
2256
+ "rstrip": false,
2257
+ "single_word": false,
2258
+ "special": true
2259
+ },
2260
+ "151925": {
2261
+ "content": "|<EXTRA_TOKENS_279>|",
2262
+ "lstrip": false,
2263
+ "normalized": false,
2264
+ "rstrip": false,
2265
+ "single_word": false,
2266
+ "special": true
2267
+ },
2268
+ "151926": {
2269
+ "content": "|<EXTRA_TOKENS_280>|",
2270
+ "lstrip": false,
2271
+ "normalized": false,
2272
+ "rstrip": false,
2273
+ "single_word": false,
2274
+ "special": true
2275
+ },
2276
+ "151927": {
2277
+ "content": "|<EXTRA_TOKENS_281>|",
2278
+ "lstrip": false,
2279
+ "normalized": false,
2280
+ "rstrip": false,
2281
+ "single_word": false,
2282
+ "special": true
2283
+ },
2284
+ "151928": {
2285
+ "content": "|<EXTRA_TOKENS_282>|",
2286
+ "lstrip": false,
2287
+ "normalized": false,
2288
+ "rstrip": false,
2289
+ "single_word": false,
2290
+ "special": true
2291
+ },
2292
+ "151929": {
2293
+ "content": "|<EXTRA_TOKENS_283>|",
2294
+ "lstrip": false,
2295
+ "normalized": false,
2296
+ "rstrip": false,
2297
+ "single_word": false,
2298
+ "special": true
2299
+ },
2300
+ "151930": {
2301
+ "content": "|<EXTRA_TOKENS_284>|",
2302
+ "lstrip": false,
2303
+ "normalized": false,
2304
+ "rstrip": false,
2305
+ "single_word": false,
2306
+ "special": true
2307
+ },
2308
+ "151931": {
2309
+ "content": "|<EXTRA_TOKENS_285>|",
2310
+ "lstrip": false,
2311
+ "normalized": false,
2312
+ "rstrip": false,
2313
+ "single_word": false,
2314
+ "special": true
2315
+ },
2316
+ "151932": {
2317
+ "content": "|<EXTRA_TOKENS_286>|",
2318
+ "lstrip": false,
2319
+ "normalized": false,
2320
+ "rstrip": false,
2321
+ "single_word": false,
2322
+ "special": true
2323
+ },
2324
+ "151933": {
2325
+ "content": "|<EXTRA_TOKENS_287>|",
2326
+ "lstrip": false,
2327
+ "normalized": false,
2328
+ "rstrip": false,
2329
+ "single_word": false,
2330
+ "special": true
2331
+ },
2332
+ "151934": {
2333
+ "content": "|<EXTRA_TOKENS_288>|",
2334
+ "lstrip": false,
2335
+ "normalized": false,
2336
+ "rstrip": false,
2337
+ "single_word": false,
2338
+ "special": true
2339
+ },
2340
+ "151935": {
2341
+ "content": "|<EXTRA_TOKENS_289>|",
2342
+ "lstrip": false,
2343
+ "normalized": false,
2344
+ "rstrip": false,
2345
+ "single_word": false,
2346
+ "special": true
2347
+ },
2348
+ "151936": {
2349
+ "content": "|<EXTRA_TOKENS_290>|",
2350
+ "lstrip": false,
2351
+ "normalized": false,
2352
+ "rstrip": false,
2353
+ "single_word": false,
2354
+ "special": true
2355
+ },
2356
+ "151937": {
2357
+ "content": "|<EXTRA_TOKENS_291>|",
2358
+ "lstrip": false,
2359
+ "normalized": false,
2360
+ "rstrip": false,
2361
+ "single_word": false,
2362
+ "special": true
2363
+ },
2364
+ "151938": {
2365
+ "content": "|<EXTRA_TOKENS_292>|",
2366
+ "lstrip": false,
2367
+ "normalized": false,
2368
+ "rstrip": false,
2369
+ "single_word": false,
2370
+ "special": true
2371
+ },
2372
+ "151939": {
2373
+ "content": "|<EXTRA_TOKENS_293>|",
2374
+ "lstrip": false,
2375
+ "normalized": false,
2376
+ "rstrip": false,
2377
+ "single_word": false,
2378
+ "special": true
2379
+ },
2380
+ "151940": {
2381
+ "content": "|<EXTRA_TOKENS_294>|",
2382
+ "lstrip": false,
2383
+ "normalized": false,
2384
+ "rstrip": false,
2385
+ "single_word": false,
2386
+ "special": true
2387
+ },
2388
+ "151941": {
2389
+ "content": "|<EXTRA_TOKENS_295>|",
2390
+ "lstrip": false,
2391
+ "normalized": false,
2392
+ "rstrip": false,
2393
+ "single_word": false,
2394
+ "special": true
2395
+ },
2396
+ "151942": {
2397
+ "content": "|<EXTRA_TOKENS_296>|",
2398
+ "lstrip": false,
2399
+ "normalized": false,
2400
+ "rstrip": false,
2401
+ "single_word": false,
2402
+ "special": true
2403
+ },
2404
+ "151943": {
2405
+ "content": "|<EXTRA_TOKENS_297>|",
2406
+ "lstrip": false,
2407
+ "normalized": false,
2408
+ "rstrip": false,
2409
+ "single_word": false,
2410
+ "special": true
2411
+ },
2412
+ "151944": {
2413
+ "content": "|<EXTRA_TOKENS_298>|",
2414
+ "lstrip": false,
2415
+ "normalized": false,
2416
+ "rstrip": false,
2417
+ "single_word": false,
2418
+ "special": true
2419
+ },
2420
+ "151945": {
2421
+ "content": "|<EXTRA_TOKENS_299>|",
2422
+ "lstrip": false,
2423
+ "normalized": false,
2424
+ "rstrip": false,
2425
+ "single_word": false,
2426
+ "special": true
2427
+ },
2428
+ "151946": {
2429
+ "content": "|<EXTRA_TOKENS_300>|",
2430
+ "lstrip": false,
2431
+ "normalized": false,
2432
+ "rstrip": false,
2433
+ "single_word": false,
2434
+ "special": true
2435
+ },
2436
+ "151947": {
2437
+ "content": "|<EXTRA_TOKENS_301>|",
2438
+ "lstrip": false,
2439
+ "normalized": false,
2440
+ "rstrip": false,
2441
+ "single_word": false,
2442
+ "special": true
2443
+ },
2444
+ "151948": {
2445
+ "content": "|<EXTRA_TOKENS_302>|",
2446
+ "lstrip": false,
2447
+ "normalized": false,
2448
+ "rstrip": false,
2449
+ "single_word": false,
2450
+ "special": true
2451
+ },
2452
+ "151949": {
2453
+ "content": "|<EXTRA_TOKENS_303>|",
2454
+ "lstrip": false,
2455
+ "normalized": false,
2456
+ "rstrip": false,
2457
+ "single_word": false,
2458
+ "special": true
2459
+ },
2460
+ "151950": {
2461
+ "content": "|<EXTRA_TOKENS_304>|",
2462
+ "lstrip": false,
2463
+ "normalized": false,
2464
+ "rstrip": false,
2465
+ "single_word": false,
2466
+ "special": true
2467
+ },
2468
+ "151951": {
2469
+ "content": "|<EXTRA_TOKENS_305>|",
2470
+ "lstrip": false,
2471
+ "normalized": false,
2472
+ "rstrip": false,
2473
+ "single_word": false,
2474
+ "special": true
2475
+ },
2476
+ "151952": {
2477
+ "content": "|<EXTRA_TOKENS_306>|",
2478
+ "lstrip": false,
2479
+ "normalized": false,
2480
+ "rstrip": false,
2481
+ "single_word": false,
2482
+ "special": true
2483
+ },
2484
+ "151953": {
2485
+ "content": "|<EXTRA_TOKENS_307>|",
2486
+ "lstrip": false,
2487
+ "normalized": false,
2488
+ "rstrip": false,
2489
+ "single_word": false,
2490
+ "special": true
2491
+ },
2492
+ "151954": {
2493
+ "content": "|<EXTRA_TOKENS_308>|",
2494
+ "lstrip": false,
2495
+ "normalized": false,
2496
+ "rstrip": false,
2497
+ "single_word": false,
2498
+ "special": true
2499
+ },
2500
+ "151955": {
2501
+ "content": "|<EXTRA_TOKENS_309>|",
2502
+ "lstrip": false,
2503
+ "normalized": false,
2504
+ "rstrip": false,
2505
+ "single_word": false,
2506
+ "special": true
2507
+ },
2508
+ "151956": {
2509
+ "content": "|<EXTRA_TOKENS_310>|",
2510
+ "lstrip": false,
2511
+ "normalized": false,
2512
+ "rstrip": false,
2513
+ "single_word": false,
2514
+ "special": true
2515
+ },
2516
+ "151957": {
2517
+ "content": "|<EXTRA_TOKENS_311>|",
2518
+ "lstrip": false,
2519
+ "normalized": false,
2520
+ "rstrip": false,
2521
+ "single_word": false,
2522
+ "special": true
2523
+ },
2524
+ "151958": {
2525
+ "content": "|<EXTRA_TOKENS_312>|",
2526
+ "lstrip": false,
2527
+ "normalized": false,
2528
+ "rstrip": false,
2529
+ "single_word": false,
2530
+ "special": true
2531
+ },
2532
+ "151959": {
2533
+ "content": "|<EXTRA_TOKENS_313>|",
2534
+ "lstrip": false,
2535
+ "normalized": false,
2536
+ "rstrip": false,
2537
+ "single_word": false,
2538
+ "special": true
2539
+ },
2540
+ "151960": {
2541
+ "content": "|<EXTRA_TOKENS_314>|",
2542
+ "lstrip": false,
2543
+ "normalized": false,
2544
+ "rstrip": false,
2545
+ "single_word": false,
2546
+ "special": true
2547
+ },
2548
+ "151961": {
2549
+ "content": "|<EXTRA_TOKENS_315>|",
2550
+ "lstrip": false,
2551
+ "normalized": false,
2552
+ "rstrip": false,
2553
+ "single_word": false,
2554
+ "special": true
2555
+ },
2556
+ "151962": {
2557
+ "content": "|<EXTRA_TOKENS_316>|",
2558
+ "lstrip": false,
2559
+ "normalized": false,
2560
+ "rstrip": false,
2561
+ "single_word": false,
2562
+ "special": true
2563
+ },
2564
+ "151963": {
2565
+ "content": "|<EXTRA_TOKENS_317>|",
2566
+ "lstrip": false,
2567
+ "normalized": false,
2568
+ "rstrip": false,
2569
+ "single_word": false,
2570
+ "special": true
2571
+ },
2572
+ "151964": {
2573
+ "content": "|<EXTRA_TOKENS_318>|",
2574
+ "lstrip": false,
2575
+ "normalized": false,
2576
+ "rstrip": false,
2577
+ "single_word": false,
2578
+ "special": true
2579
+ },
2580
+ "151965": {
2581
+ "content": "|<EXTRA_TOKENS_319>|",
2582
+ "lstrip": false,
2583
+ "normalized": false,
2584
+ "rstrip": false,
2585
+ "single_word": false,
2586
+ "special": true
2587
+ },
2588
+ "151966": {
2589
+ "content": "|<EXTRA_TOKENS_320>|",
2590
+ "lstrip": false,
2591
+ "normalized": false,
2592
+ "rstrip": false,
2593
+ "single_word": false,
2594
+ "special": true
2595
+ },
2596
+ "151967": {
2597
+ "content": "|<EXTRA_TOKENS_321>|",
2598
+ "lstrip": false,
2599
+ "normalized": false,
2600
+ "rstrip": false,
2601
+ "single_word": false,
2602
+ "special": true
2603
+ },
2604
+ "151968": {
2605
+ "content": "|<EXTRA_TOKENS_322>|",
2606
+ "lstrip": false,
2607
+ "normalized": false,
2608
+ "rstrip": false,
2609
+ "single_word": false,
2610
+ "special": true
2611
+ },
2612
+ "151969": {
2613
+ "content": "|<EXTRA_TOKENS_323>|",
2614
+ "lstrip": false,
2615
+ "normalized": false,
2616
+ "rstrip": false,
2617
+ "single_word": false,
2618
+ "special": true
2619
+ },
2620
+ "151970": {
2621
+ "content": "|<EXTRA_TOKENS_324>|",
2622
+ "lstrip": false,
2623
+ "normalized": false,
2624
+ "rstrip": false,
2625
+ "single_word": false,
2626
+ "special": true
2627
+ },
2628
+ "151971": {
2629
+ "content": "|<EXTRA_TOKENS_325>|",
2630
+ "lstrip": false,
2631
+ "normalized": false,
2632
+ "rstrip": false,
2633
+ "single_word": false,
2634
+ "special": true
2635
+ },
2636
+ "151972": {
2637
+ "content": "|<EXTRA_TOKENS_326>|",
2638
+ "lstrip": false,
2639
+ "normalized": false,
2640
+ "rstrip": false,
2641
+ "single_word": false,
2642
+ "special": true
2643
+ },
2644
+ "151973": {
2645
+ "content": "|<EXTRA_TOKENS_327>|",
2646
+ "lstrip": false,
2647
+ "normalized": false,
2648
+ "rstrip": false,
2649
+ "single_word": false,
2650
+ "special": true
2651
+ },
2652
+ "151974": {
2653
+ "content": "|<EXTRA_TOKENS_328>|",
2654
+ "lstrip": false,
2655
+ "normalized": false,
2656
+ "rstrip": false,
2657
+ "single_word": false,
2658
+ "special": true
2659
+ },
2660
+ "151975": {
2661
+ "content": "|<EXTRA_TOKENS_329>|",
2662
+ "lstrip": false,
2663
+ "normalized": false,
2664
+ "rstrip": false,
2665
+ "single_word": false,
2666
+ "special": true
2667
+ },
2668
+ "151976": {
2669
+ "content": "|<EXTRA_TOKENS_330>|",
2670
+ "lstrip": false,
2671
+ "normalized": false,
2672
+ "rstrip": false,
2673
+ "single_word": false,
2674
+ "special": true
2675
+ },
2676
+ "151977": {
2677
+ "content": "|<EXTRA_TOKENS_331>|",
2678
+ "lstrip": false,
2679
+ "normalized": false,
2680
+ "rstrip": false,
2681
+ "single_word": false,
2682
+ "special": true
2683
+ },
2684
+ "151978": {
2685
+ "content": "|<EXTRA_TOKENS_332>|",
2686
+ "lstrip": false,
2687
+ "normalized": false,
2688
+ "rstrip": false,
2689
+ "single_word": false,
2690
+ "special": true
2691
+ },
2692
+ "151979": {
2693
+ "content": "|<EXTRA_TOKENS_333>|",
2694
+ "lstrip": false,
2695
+ "normalized": false,
2696
+ "rstrip": false,
2697
+ "single_word": false,
2698
+ "special": true
2699
+ },
2700
+ "151980": {
2701
+ "content": "|<EXTRA_TOKENS_334>|",
2702
+ "lstrip": false,
2703
+ "normalized": false,
2704
+ "rstrip": false,
2705
+ "single_word": false,
2706
+ "special": true
2707
+ },
2708
+ "151981": {
2709
+ "content": "|<EXTRA_TOKENS_335>|",
2710
+ "lstrip": false,
2711
+ "normalized": false,
2712
+ "rstrip": false,
2713
+ "single_word": false,
2714
+ "special": true
2715
+ },
2716
+ "151982": {
2717
+ "content": "|<EXTRA_TOKENS_336>|",
2718
+ "lstrip": false,
2719
+ "normalized": false,
2720
+ "rstrip": false,
2721
+ "single_word": false,
2722
+ "special": true
2723
+ },
2724
+ "151983": {
2725
+ "content": "|<EXTRA_TOKENS_337>|",
2726
+ "lstrip": false,
2727
+ "normalized": false,
2728
+ "rstrip": false,
2729
+ "single_word": false,
2730
+ "special": true
2731
+ },
2732
+ "151984": {
2733
+ "content": "|<EXTRA_TOKENS_338>|",
2734
+ "lstrip": false,
2735
+ "normalized": false,
2736
+ "rstrip": false,
2737
+ "single_word": false,
2738
+ "special": true
2739
+ },
2740
+ "151985": {
2741
+ "content": "|<EXTRA_TOKENS_339>|",
2742
+ "lstrip": false,
2743
+ "normalized": false,
2744
+ "rstrip": false,
2745
+ "single_word": false,
2746
+ "special": true
2747
+ },
2748
+ "151986": {
2749
+ "content": "|<EXTRA_TOKENS_340>|",
2750
+ "lstrip": false,
2751
+ "normalized": false,
2752
+ "rstrip": false,
2753
+ "single_word": false,
2754
+ "special": true
2755
+ },
2756
+ "151987": {
2757
+ "content": "|<EXTRA_TOKENS_341>|",
2758
+ "lstrip": false,
2759
+ "normalized": false,
2760
+ "rstrip": false,
2761
+ "single_word": false,
2762
+ "special": true
2763
+ },
2764
+ "151988": {
2765
+ "content": "|<EXTRA_TOKENS_342>|",
2766
+ "lstrip": false,
2767
+ "normalized": false,
2768
+ "rstrip": false,
2769
+ "single_word": false,
2770
+ "special": true
2771
+ },
2772
+ "151989": {
2773
+ "content": "|<EXTRA_TOKENS_343>|",
2774
+ "lstrip": false,
2775
+ "normalized": false,
2776
+ "rstrip": false,
2777
+ "single_word": false,
2778
+ "special": true
2779
+ },
2780
+ "151990": {
2781
+ "content": "|<EXTRA_TOKENS_344>|",
2782
+ "lstrip": false,
2783
+ "normalized": false,
2784
+ "rstrip": false,
2785
+ "single_word": false,
2786
+ "special": true
2787
+ },
2788
+ "151991": {
2789
+ "content": "|<EXTRA_TOKENS_345>|",
2790
+ "lstrip": false,
2791
+ "normalized": false,
2792
+ "rstrip": false,
2793
+ "single_word": false,
2794
+ "special": true
2795
+ },
2796
+ "151992": {
2797
+ "content": "|<EXTRA_TOKENS_346>|",
2798
+ "lstrip": false,
2799
+ "normalized": false,
2800
+ "rstrip": false,
2801
+ "single_word": false,
2802
+ "special": true
2803
+ },
2804
+ "151993": {
2805
+ "content": "|<EXTRA_TOKENS_347>|",
2806
+ "lstrip": false,
2807
+ "normalized": false,
2808
+ "rstrip": false,
2809
+ "single_word": false,
2810
+ "special": true
2811
+ },
2812
+ "151994": {
2813
+ "content": "|<EXTRA_TOKENS_348>|",
2814
+ "lstrip": false,
2815
+ "normalized": false,
2816
+ "rstrip": false,
2817
+ "single_word": false,
2818
+ "special": true
2819
+ },
2820
+ "151995": {
2821
+ "content": "|<EXTRA_TOKENS_349>|",
2822
+ "lstrip": false,
2823
+ "normalized": false,
2824
+ "rstrip": false,
2825
+ "single_word": false,
2826
+ "special": true
2827
+ },
2828
+ "151996": {
2829
+ "content": "|<EXTRA_TOKENS_350>|",
2830
+ "lstrip": false,
2831
+ "normalized": false,
2832
+ "rstrip": false,
2833
+ "single_word": false,
2834
+ "special": true
2835
+ },
2836
+ "151997": {
2837
+ "content": "|<EXTRA_TOKENS_351>|",
2838
+ "lstrip": false,
2839
+ "normalized": false,
2840
+ "rstrip": false,
2841
+ "single_word": false,
2842
+ "special": true
2843
+ },
2844
+ "151998": {
2845
+ "content": "|<EXTRA_TOKENS_352>|",
2846
+ "lstrip": false,
2847
+ "normalized": false,
2848
+ "rstrip": false,
2849
+ "single_word": false,
2850
+ "special": true
2851
+ },
2852
+ "151999": {
2853
+ "content": "|<EXTRA_TOKENS_353>|",
2854
+ "lstrip": false,
2855
+ "normalized": false,
2856
+ "rstrip": false,
2857
+ "single_word": false,
2858
+ "special": true
2859
+ },
2860
+ "152000": {
2861
+ "content": "|<EXTRA_TOKENS_354>|",
2862
+ "lstrip": false,
2863
+ "normalized": false,
2864
+ "rstrip": false,
2865
+ "single_word": false,
2866
+ "special": true
2867
+ },
2868
+ "152001": {
2869
+ "content": "|<EXTRA_TOKENS_355>|",
2870
+ "lstrip": false,
2871
+ "normalized": false,
2872
+ "rstrip": false,
2873
+ "single_word": false,
2874
+ "special": true
2875
+ },
2876
+ "152002": {
2877
+ "content": "|<EXTRA_TOKENS_356>|",
2878
+ "lstrip": false,
2879
+ "normalized": false,
2880
+ "rstrip": false,
2881
+ "single_word": false,
2882
+ "special": true
2883
+ },
2884
+ "152003": {
2885
+ "content": "|<EXTRA_TOKENS_357>|",
2886
+ "lstrip": false,
2887
+ "normalized": false,
2888
+ "rstrip": false,
2889
+ "single_word": false,
2890
+ "special": true
2891
+ },
2892
+ "152004": {
2893
+ "content": "|<EXTRA_TOKENS_358>|",
2894
+ "lstrip": false,
2895
+ "normalized": false,
2896
+ "rstrip": false,
2897
+ "single_word": false,
2898
+ "special": true
2899
+ },
2900
+ "152005": {
2901
+ "content": "|<EXTRA_TOKENS_359>|",
2902
+ "lstrip": false,
2903
+ "normalized": false,
2904
+ "rstrip": false,
2905
+ "single_word": false,
2906
+ "special": true
2907
+ },
2908
+ "152006": {
2909
+ "content": "|<EXTRA_TOKENS_360>|",
2910
+ "lstrip": false,
2911
+ "normalized": false,
2912
+ "rstrip": false,
2913
+ "single_word": false,
2914
+ "special": true
2915
+ },
2916
+ "152007": {
2917
+ "content": "|<EXTRA_TOKENS_361>|",
2918
+ "lstrip": false,
2919
+ "normalized": false,
2920
+ "rstrip": false,
2921
+ "single_word": false,
2922
+ "special": true
2923
+ },
2924
+ "152008": {
2925
+ "content": "|<EXTRA_TOKENS_362>|",
2926
+ "lstrip": false,
2927
+ "normalized": false,
2928
+ "rstrip": false,
2929
+ "single_word": false,
2930
+ "special": true
2931
+ },
2932
+ "152009": {
2933
+ "content": "|<EXTRA_TOKENS_363>|",
2934
+ "lstrip": false,
2935
+ "normalized": false,
2936
+ "rstrip": false,
2937
+ "single_word": false,
2938
+ "special": true
2939
+ },
2940
+ "152010": {
2941
+ "content": "|<EXTRA_TOKENS_364>|",
2942
+ "lstrip": false,
2943
+ "normalized": false,
2944
+ "rstrip": false,
2945
+ "single_word": false,
2946
+ "special": true
2947
+ },
2948
+ "152011": {
2949
+ "content": "|<EXTRA_TOKENS_365>|",
2950
+ "lstrip": false,
2951
+ "normalized": false,
2952
+ "rstrip": false,
2953
+ "single_word": false,
2954
+ "special": true
2955
+ },
2956
+ "152012": {
2957
+ "content": "|<EXTRA_TOKENS_366>|",
2958
+ "lstrip": false,
2959
+ "normalized": false,
2960
+ "rstrip": false,
2961
+ "single_word": false,
2962
+ "special": true
2963
+ },
2964
+ "152013": {
2965
+ "content": "|<EXTRA_TOKENS_367>|",
2966
+ "lstrip": false,
2967
+ "normalized": false,
2968
+ "rstrip": false,
2969
+ "single_word": false,
2970
+ "special": true
2971
+ },
2972
+ "152014": {
2973
+ "content": "|<EXTRA_TOKENS_368>|",
2974
+ "lstrip": false,
2975
+ "normalized": false,
2976
+ "rstrip": false,
2977
+ "single_word": false,
2978
+ "special": true
2979
+ },
2980
+ "152015": {
2981
+ "content": "|<EXTRA_TOKENS_369>|",
2982
+ "lstrip": false,
2983
+ "normalized": false,
2984
+ "rstrip": false,
2985
+ "single_word": false,
2986
+ "special": true
2987
+ },
2988
+ "152016": {
2989
+ "content": "|<EXTRA_TOKENS_370>|",
2990
+ "lstrip": false,
2991
+ "normalized": false,
2992
+ "rstrip": false,
2993
+ "single_word": false,
2994
+ "special": true
2995
+ },
2996
+ "152017": {
2997
+ "content": "|<EXTRA_TOKENS_371>|",
2998
+ "lstrip": false,
2999
+ "normalized": false,
3000
+ "rstrip": false,
3001
+ "single_word": false,
3002
+ "special": true
3003
+ },
3004
+ "152018": {
3005
+ "content": "|<EXTRA_TOKENS_372>|",
3006
+ "lstrip": false,
3007
+ "normalized": false,
3008
+ "rstrip": false,
3009
+ "single_word": false,
3010
+ "special": true
3011
+ },
3012
+ "152019": {
3013
+ "content": "|<EXTRA_TOKENS_373>|",
3014
+ "lstrip": false,
3015
+ "normalized": false,
3016
+ "rstrip": false,
3017
+ "single_word": false,
3018
+ "special": true
3019
+ },
3020
+ "152020": {
3021
+ "content": "|<EXTRA_TOKENS_374>|",
3022
+ "lstrip": false,
3023
+ "normalized": false,
3024
+ "rstrip": false,
3025
+ "single_word": false,
3026
+ "special": true
3027
+ },
3028
+ "152021": {
3029
+ "content": "|<EXTRA_TOKENS_375>|",
3030
+ "lstrip": false,
3031
+ "normalized": false,
3032
+ "rstrip": false,
3033
+ "single_word": false,
3034
+ "special": true
3035
+ },
3036
+ "152022": {
3037
+ "content": "|<EXTRA_TOKENS_376>|",
3038
+ "lstrip": false,
3039
+ "normalized": false,
3040
+ "rstrip": false,
3041
+ "single_word": false,
3042
+ "special": true
3043
+ },
3044
+ "152023": {
3045
+ "content": "|<EXTRA_TOKENS_377>|",
3046
+ "lstrip": false,
3047
+ "normalized": false,
3048
+ "rstrip": false,
3049
+ "single_word": false,
3050
+ "special": true
3051
+ },
3052
+ "152024": {
3053
+ "content": "|<EXTRA_TOKENS_378>|",
3054
+ "lstrip": false,
3055
+ "normalized": false,
3056
+ "rstrip": false,
3057
+ "single_word": false,
3058
+ "special": true
3059
+ },
3060
+ "152025": {
3061
+ "content": "|<EXTRA_TOKENS_379>|",
3062
+ "lstrip": false,
3063
+ "normalized": false,
3064
+ "rstrip": false,
3065
+ "single_word": false,
3066
+ "special": true
3067
+ },
3068
+ "152026": {
3069
+ "content": "|<EXTRA_TOKENS_380>|",
3070
+ "lstrip": false,
3071
+ "normalized": false,
3072
+ "rstrip": false,
3073
+ "single_word": false,
3074
+ "special": true
3075
+ },
3076
+ "152027": {
3077
+ "content": "|<EXTRA_TOKENS_381>|",
3078
+ "lstrip": false,
3079
+ "normalized": false,
3080
+ "rstrip": false,
3081
+ "single_word": false,
3082
+ "special": true
3083
+ },
3084
+ "152028": {
3085
+ "content": "|<EXTRA_TOKENS_382>|",
3086
+ "lstrip": false,
3087
+ "normalized": false,
3088
+ "rstrip": false,
3089
+ "single_word": false,
3090
+ "special": true
3091
+ },
3092
+ "152029": {
3093
+ "content": "|<EXTRA_TOKENS_383>|",
3094
+ "lstrip": false,
3095
+ "normalized": false,
3096
+ "rstrip": false,
3097
+ "single_word": false,
3098
+ "special": true
3099
+ },
3100
+ "152030": {
3101
+ "content": "|<EXTRA_TOKENS_384>|",
3102
+ "lstrip": false,
3103
+ "normalized": false,
3104
+ "rstrip": false,
3105
+ "single_word": false,
3106
+ "special": true
3107
+ },
3108
+ "152031": {
3109
+ "content": "|<EXTRA_TOKENS_385>|",
3110
+ "lstrip": false,
3111
+ "normalized": false,
3112
+ "rstrip": false,
3113
+ "single_word": false,
3114
+ "special": true
3115
+ },
3116
+ "152032": {
3117
+ "content": "|<EXTRA_TOKENS_386>|",
3118
+ "lstrip": false,
3119
+ "normalized": false,
3120
+ "rstrip": false,
3121
+ "single_word": false,
3122
+ "special": true
3123
+ },
3124
+ "152033": {
3125
+ "content": "|<EXTRA_TOKENS_387>|",
3126
+ "lstrip": false,
3127
+ "normalized": false,
3128
+ "rstrip": false,
3129
+ "single_word": false,
3130
+ "special": true
3131
+ },
3132
+ "152034": {
3133
+ "content": "|<EXTRA_TOKENS_388>|",
3134
+ "lstrip": false,
3135
+ "normalized": false,
3136
+ "rstrip": false,
3137
+ "single_word": false,
3138
+ "special": true
3139
+ },
3140
+ "152035": {
3141
+ "content": "|<EXTRA_TOKENS_389>|",
3142
+ "lstrip": false,
3143
+ "normalized": false,
3144
+ "rstrip": false,
3145
+ "single_word": false,
3146
+ "special": true
3147
+ },
3148
+ "152036": {
3149
+ "content": "|<EXTRA_TOKENS_390>|",
3150
+ "lstrip": false,
3151
+ "normalized": false,
3152
+ "rstrip": false,
3153
+ "single_word": false,
3154
+ "special": true
3155
+ },
3156
+ "152037": {
3157
+ "content": "|<EXTRA_TOKENS_391>|",
3158
+ "lstrip": false,
3159
+ "normalized": false,
3160
+ "rstrip": false,
3161
+ "single_word": false,
3162
+ "special": true
3163
+ },
3164
+ "152038": {
3165
+ "content": "|<EXTRA_TOKENS_392>|",
3166
+ "lstrip": false,
3167
+ "normalized": false,
3168
+ "rstrip": false,
3169
+ "single_word": false,
3170
+ "special": true
3171
+ },
3172
+ "152039": {
3173
+ "content": "|<EXTRA_TOKENS_393>|",
3174
+ "lstrip": false,
3175
+ "normalized": false,
3176
+ "rstrip": false,
3177
+ "single_word": false,
3178
+ "special": true
3179
+ },
3180
+ "152040": {
3181
+ "content": "|<EXTRA_TOKENS_394>|",
3182
+ "lstrip": false,
3183
+ "normalized": false,
3184
+ "rstrip": false,
3185
+ "single_word": false,
3186
+ "special": true
3187
+ },
3188
+ "152041": {
3189
+ "content": "|<EXTRA_TOKENS_395>|",
3190
+ "lstrip": false,
3191
+ "normalized": false,
3192
+ "rstrip": false,
3193
+ "single_word": false,
3194
+ "special": true
3195
+ },
3196
+ "152042": {
3197
+ "content": "|<EXTRA_TOKENS_396>|",
3198
+ "lstrip": false,
3199
+ "normalized": false,
3200
+ "rstrip": false,
3201
+ "single_word": false,
3202
+ "special": true
3203
+ },
3204
+ "152043": {
3205
+ "content": "|<EXTRA_TOKENS_397>|",
3206
+ "lstrip": false,
3207
+ "normalized": false,
3208
+ "rstrip": false,
3209
+ "single_word": false,
3210
+ "special": true
3211
+ },
3212
+ "152044": {
3213
+ "content": "|<EXTRA_TOKENS_398>|",
3214
+ "lstrip": false,
3215
+ "normalized": false,
3216
+ "rstrip": false,
3217
+ "single_word": false,
3218
+ "special": true
3219
+ },
3220
+ "152045": {
3221
+ "content": "|<EXTRA_TOKENS_399>|",
3222
+ "lstrip": false,
3223
+ "normalized": false,
3224
+ "rstrip": false,
3225
+ "single_word": false,
3226
+ "special": true
3227
+ },
3228
+ "152046": {
3229
+ "content": "|<EXTRA_TOKENS_400>|",
3230
+ "lstrip": false,
3231
+ "normalized": false,
3232
+ "rstrip": false,
3233
+ "single_word": false,
3234
+ "special": true
3235
+ },
3236
+ "152047": {
3237
+ "content": "|<EXTRA_TOKENS_401>|",
3238
+ "lstrip": false,
3239
+ "normalized": false,
3240
+ "rstrip": false,
3241
+ "single_word": false,
3242
+ "special": true
3243
+ },
3244
+ "152048": {
3245
+ "content": "|<EXTRA_TOKENS_402>|",
3246
+ "lstrip": false,
3247
+ "normalized": false,
3248
+ "rstrip": false,
3249
+ "single_word": false,
3250
+ "special": true
3251
+ },
3252
+ "152049": {
3253
+ "content": "|<EXTRA_TOKENS_403>|",
3254
+ "lstrip": false,
3255
+ "normalized": false,
3256
+ "rstrip": false,
3257
+ "single_word": false,
3258
+ "special": true
3259
+ },
3260
+ "152050": {
3261
+ "content": "|<EXTRA_TOKENS_404>|",
3262
+ "lstrip": false,
3263
+ "normalized": false,
3264
+ "rstrip": false,
3265
+ "single_word": false,
3266
+ "special": true
3267
+ },
3268
+ "152051": {
3269
+ "content": "|<EXTRA_TOKENS_405>|",
3270
+ "lstrip": false,
3271
+ "normalized": false,
3272
+ "rstrip": false,
3273
+ "single_word": false,
3274
+ "special": true
3275
+ },
3276
+ "152052": {
3277
+ "content": "|<EXTRA_TOKENS_406>|",
3278
+ "lstrip": false,
3279
+ "normalized": false,
3280
+ "rstrip": false,
3281
+ "single_word": false,
3282
+ "special": true
3283
+ },
3284
+ "152053": {
3285
+ "content": "|<EXTRA_TOKENS_407>|",
3286
+ "lstrip": false,
3287
+ "normalized": false,
3288
+ "rstrip": false,
3289
+ "single_word": false,
3290
+ "special": true
3291
+ },
3292
+ "152054": {
3293
+ "content": "|<EXTRA_TOKENS_408>|",
3294
+ "lstrip": false,
3295
+ "normalized": false,
3296
+ "rstrip": false,
3297
+ "single_word": false,
3298
+ "special": true
3299
+ },
3300
+ "152055": {
3301
+ "content": "|<EXTRA_TOKENS_409>|",
3302
+ "lstrip": false,
3303
+ "normalized": false,
3304
+ "rstrip": false,
3305
+ "single_word": false,
3306
+ "special": true
3307
+ },
3308
+ "152056": {
3309
+ "content": "|<EXTRA_TOKENS_410>|",
3310
+ "lstrip": false,
3311
+ "normalized": false,
3312
+ "rstrip": false,
3313
+ "single_word": false,
3314
+ "special": true
3315
+ },
3316
+ "152057": {
3317
+ "content": "|<EXTRA_TOKENS_411>|",
3318
+ "lstrip": false,
3319
+ "normalized": false,
3320
+ "rstrip": false,
3321
+ "single_word": false,
3322
+ "special": true
3323
+ },
3324
+ "152058": {
3325
+ "content": "|<EXTRA_TOKENS_412>|",
3326
+ "lstrip": false,
3327
+ "normalized": false,
3328
+ "rstrip": false,
3329
+ "single_word": false,
3330
+ "special": true
3331
+ },
3332
+ "152059": {
3333
+ "content": "|<EXTRA_TOKENS_413>|",
3334
+ "lstrip": false,
3335
+ "normalized": false,
3336
+ "rstrip": false,
3337
+ "single_word": false,
3338
+ "special": true
3339
+ },
3340
+ "152060": {
3341
+ "content": "|<EXTRA_TOKENS_414>|",
3342
+ "lstrip": false,
3343
+ "normalized": false,
3344
+ "rstrip": false,
3345
+ "single_word": false,
3346
+ "special": true
3347
+ },
3348
+ "152061": {
3349
+ "content": "|<EXTRA_TOKENS_415>|",
3350
+ "lstrip": false,
3351
+ "normalized": false,
3352
+ "rstrip": false,
3353
+ "single_word": false,
3354
+ "special": true
3355
+ },
3356
+ "152062": {
3357
+ "content": "|<EXTRA_TOKENS_416>|",
3358
+ "lstrip": false,
3359
+ "normalized": false,
3360
+ "rstrip": false,
3361
+ "single_word": false,
3362
+ "special": true
3363
+ },
3364
+ "152063": {
3365
+ "content": "|<EXTRA_TOKENS_417>|",
3366
+ "lstrip": false,
3367
+ "normalized": false,
3368
+ "rstrip": false,
3369
+ "single_word": false,
3370
+ "special": true
3371
+ },
3372
+ "152064": {
3373
+ "content": "<im_start>",
3374
+ "lstrip": false,
3375
+ "normalized": false,
3376
+ "rstrip": false,
3377
+ "single_word": false,
3378
+ "special": true
3379
+ },
3380
+ "152065": {
3381
+ "content": "<im_end>",
3382
+ "lstrip": false,
3383
+ "normalized": false,
3384
+ "rstrip": false,
3385
+ "single_word": false,
3386
+ "special": true
3387
+ },
3388
+ "152066": {
3389
+ "content": "<im_patch>",
3390
+ "lstrip": false,
3391
+ "normalized": false,
3392
+ "rstrip": false,
3393
+ "single_word": false,
3394
+ "special": true
3395
+ },
3396
+ "152067": {
3397
+ "content": "<im_col>",
3398
+ "lstrip": false,
3399
+ "normalized": false,
3400
+ "rstrip": false,
3401
+ "single_word": false,
3402
+ "special": true
3403
+ },
3404
+ "152068": {
3405
+ "content": "<|image|>",
3406
+ "lstrip": false,
3407
+ "normalized": false,
3408
+ "rstrip": false,
3409
+ "single_word": false,
3410
+ "special": true
3411
+ }
3412
+ },
3413
+ "additional_special_tokens": [
3414
+ "|<EXTRA_TOKENS_0>|",
3415
+ "|<EXTRA_TOKENS_1>|",
3416
+ "|<EXTRA_TOKENS_2>|",
3417
+ "|<EXTRA_TOKENS_3>|",
3418
+ "|<EXTRA_TOKENS_4>|",
3419
+ "|<EXTRA_TOKENS_5>|",
3420
+ "|<EXTRA_TOKENS_6>|",
3421
+ "|<EXTRA_TOKENS_7>|",
3422
+ "|<EXTRA_TOKENS_8>|",
3423
+ "|<EXTRA_TOKENS_9>|",
3424
+ "|<EXTRA_TOKENS_10>|",
3425
+ "|<EXTRA_TOKENS_11>|",
3426
+ "|<EXTRA_TOKENS_12>|",
3427
+ "|<EXTRA_TOKENS_13>|",
3428
+ "|<EXTRA_TOKENS_14>|",
3429
+ "|<EXTRA_TOKENS_15>|",
3430
+ "|<EXTRA_TOKENS_16>|",
3431
+ "|<EXTRA_TOKENS_17>|",
3432
+ "|<EXTRA_TOKENS_18>|",
3433
+ "|<EXTRA_TOKENS_19>|",
3434
+ "|<EXTRA_TOKENS_20>|",
3435
+ "|<EXTRA_TOKENS_21>|",
3436
+ "|<EXTRA_TOKENS_22>|",
3437
+ "|<EXTRA_TOKENS_23>|",
3438
+ "|<EXTRA_TOKENS_24>|",
3439
+ "|<EXTRA_TOKENS_25>|",
3440
+ "|<EXTRA_TOKENS_26>|",
3441
+ "|<EXTRA_TOKENS_27>|",
3442
+ "|<EXTRA_TOKENS_28>|",
3443
+ "|<EXTRA_TOKENS_29>|",
3444
+ "|<EXTRA_TOKENS_30>|",
3445
+ "|<EXTRA_TOKENS_31>|",
3446
+ "|<EXTRA_TOKENS_32>|",
3447
+ "|<EXTRA_TOKENS_33>|",
3448
+ "|<EXTRA_TOKENS_34>|",
3449
+ "|<EXTRA_TOKENS_35>|",
3450
+ "|<EXTRA_TOKENS_36>|",
3451
+ "|<EXTRA_TOKENS_37>|",
3452
+ "|<EXTRA_TOKENS_38>|",
3453
+ "|<EXTRA_TOKENS_39>|",
3454
+ "|<EXTRA_TOKENS_40>|",
3455
+ "|<EXTRA_TOKENS_41>|",
3456
+ "|<EXTRA_TOKENS_42>|",
3457
+ "|<EXTRA_TOKENS_43>|",
3458
+ "|<EXTRA_TOKENS_44>|",
3459
+ "|<EXTRA_TOKENS_45>|",
3460
+ "|<EXTRA_TOKENS_46>|",
3461
+ "|<EXTRA_TOKENS_47>|",
3462
+ "|<EXTRA_TOKENS_48>|",
3463
+ "|<EXTRA_TOKENS_49>|",
3464
+ "|<EXTRA_TOKENS_50>|",
3465
+ "|<EXTRA_TOKENS_51>|",
3466
+ "|<EXTRA_TOKENS_52>|",
3467
+ "|<EXTRA_TOKENS_53>|",
3468
+ "|<EXTRA_TOKENS_54>|",
3469
+ "|<EXTRA_TOKENS_55>|",
3470
+ "|<EXTRA_TOKENS_56>|",
3471
+ "|<EXTRA_TOKENS_57>|",
3472
+ "|<EXTRA_TOKENS_58>|",
3473
+ "|<EXTRA_TOKENS_59>|",
3474
+ "|<EXTRA_TOKENS_60>|",
3475
+ "|<EXTRA_TOKENS_61>|",
3476
+ "|<EXTRA_TOKENS_62>|",
3477
+ "|<EXTRA_TOKENS_63>|",
3478
+ "|<EXTRA_TOKENS_64>|",
3479
+ "|<EXTRA_TOKENS_65>|",
3480
+ "|<EXTRA_TOKENS_66>|",
3481
+ "|<EXTRA_TOKENS_67>|",
3482
+ "|<EXTRA_TOKENS_68>|",
3483
+ "|<EXTRA_TOKENS_69>|",
3484
+ "|<EXTRA_TOKENS_70>|",
3485
+ "|<EXTRA_TOKENS_71>|",
3486
+ "|<EXTRA_TOKENS_72>|",
3487
+ "|<EXTRA_TOKENS_73>|",
3488
+ "|<EXTRA_TOKENS_74>|",
3489
+ "|<EXTRA_TOKENS_75>|",
3490
+ "|<EXTRA_TOKENS_76>|",
3491
+ "|<EXTRA_TOKENS_77>|",
3492
+ "|<EXTRA_TOKENS_78>|",
3493
+ "|<EXTRA_TOKENS_79>|",
3494
+ "|<EXTRA_TOKENS_80>|",
3495
+ "|<EXTRA_TOKENS_81>|",
3496
+ "|<EXTRA_TOKENS_82>|",
3497
+ "|<EXTRA_TOKENS_83>|",
3498
+ "|<EXTRA_TOKENS_84>|",
3499
+ "|<EXTRA_TOKENS_85>|",
3500
+ "|<EXTRA_TOKENS_86>|",
3501
+ "|<EXTRA_TOKENS_87>|",
3502
+ "|<EXTRA_TOKENS_88>|",
3503
+ "|<EXTRA_TOKENS_89>|",
3504
+ "|<EXTRA_TOKENS_90>|",
3505
+ "|<EXTRA_TOKENS_91>|",
3506
+ "|<EXTRA_TOKENS_92>|",
3507
+ "|<EXTRA_TOKENS_93>|",
3508
+ "|<EXTRA_TOKENS_94>|",
3509
+ "|<EXTRA_TOKENS_95>|",
3510
+ "|<EXTRA_TOKENS_96>|",
3511
+ "|<EXTRA_TOKENS_97>|",
3512
+ "|<EXTRA_TOKENS_98>|",
3513
+ "|<EXTRA_TOKENS_99>|",
3514
+ "|<EXTRA_TOKENS_100>|",
3515
+ "|<EXTRA_TOKENS_101>|",
3516
+ "|<EXTRA_TOKENS_102>|",
3517
+ "|<EXTRA_TOKENS_103>|",
3518
+ "|<EXTRA_TOKENS_104>|",
3519
+ "|<EXTRA_TOKENS_105>|",
3520
+ "|<EXTRA_TOKENS_106>|",
3521
+ "|<EXTRA_TOKENS_107>|",
3522
+ "|<EXTRA_TOKENS_108>|",
3523
+ "|<EXTRA_TOKENS_109>|",
3524
+ "|<EXTRA_TOKENS_110>|",
3525
+ "|<EXTRA_TOKENS_111>|",
3526
+ "|<EXTRA_TOKENS_112>|",
3527
+ "|<EXTRA_TOKENS_113>|",
3528
+ "|<EXTRA_TOKENS_114>|",
3529
+ "|<EXTRA_TOKENS_115>|",
3530
+ "|<EXTRA_TOKENS_116>|",
3531
+ "|<EXTRA_TOKENS_117>|",
3532
+ "|<EXTRA_TOKENS_118>|",
3533
+ "|<EXTRA_TOKENS_119>|",
3534
+ "|<EXTRA_TOKENS_120>|",
3535
+ "|<EXTRA_TOKENS_121>|",
3536
+ "|<EXTRA_TOKENS_122>|",
3537
+ "|<EXTRA_TOKENS_123>|",
3538
+ "|<EXTRA_TOKENS_124>|",
3539
+ "|<EXTRA_TOKENS_125>|",
3540
+ "|<EXTRA_TOKENS_126>|",
3541
+ "|<EXTRA_TOKENS_127>|",
3542
+ "|<EXTRA_TOKENS_128>|",
3543
+ "|<EXTRA_TOKENS_129>|",
3544
+ "|<EXTRA_TOKENS_130>|",
3545
+ "|<EXTRA_TOKENS_131>|",
3546
+ "|<EXTRA_TOKENS_132>|",
3547
+ "|<EXTRA_TOKENS_133>|",
3548
+ "|<EXTRA_TOKENS_134>|",
3549
+ "|<EXTRA_TOKENS_135>|",
3550
+ "|<EXTRA_TOKENS_136>|",
3551
+ "|<EXTRA_TOKENS_137>|",
3552
+ "|<EXTRA_TOKENS_138>|",
3553
+ "|<EXTRA_TOKENS_139>|",
3554
+ "|<EXTRA_TOKENS_140>|",
3555
+ "|<EXTRA_TOKENS_141>|",
3556
+ "|<EXTRA_TOKENS_142>|",
3557
+ "|<EXTRA_TOKENS_143>|",
3558
+ "|<EXTRA_TOKENS_144>|",
3559
+ "|<EXTRA_TOKENS_145>|",
3560
+ "|<EXTRA_TOKENS_146>|",
3561
+ "|<EXTRA_TOKENS_147>|",
3562
+ "|<EXTRA_TOKENS_148>|",
3563
+ "|<EXTRA_TOKENS_149>|",
3564
+ "|<EXTRA_TOKENS_150>|",
3565
+ "|<EXTRA_TOKENS_151>|",
3566
+ "|<EXTRA_TOKENS_152>|",
3567
+ "|<EXTRA_TOKENS_153>|",
3568
+ "|<EXTRA_TOKENS_154>|",
3569
+ "|<EXTRA_TOKENS_155>|",
3570
+ "|<EXTRA_TOKENS_156>|",
3571
+ "|<EXTRA_TOKENS_157>|",
3572
+ "|<EXTRA_TOKENS_158>|",
3573
+ "|<EXTRA_TOKENS_159>|",
3574
+ "|<EXTRA_TOKENS_160>|",
3575
+ "|<EXTRA_TOKENS_161>|",
3576
+ "|<EXTRA_TOKENS_162>|",
3577
+ "|<EXTRA_TOKENS_163>|",
3578
+ "|<EXTRA_TOKENS_164>|",
3579
+ "|<EXTRA_TOKENS_165>|",
3580
+ "|<EXTRA_TOKENS_166>|",
3581
+ "|<EXTRA_TOKENS_167>|",
3582
+ "|<EXTRA_TOKENS_168>|",
3583
+ "|<EXTRA_TOKENS_169>|",
3584
+ "|<EXTRA_TOKENS_170>|",
3585
+ "|<EXTRA_TOKENS_171>|",
3586
+ "|<EXTRA_TOKENS_172>|",
3587
+ "|<EXTRA_TOKENS_173>|",
3588
+ "|<EXTRA_TOKENS_174>|",
3589
+ "|<EXTRA_TOKENS_175>|",
3590
+ "|<EXTRA_TOKENS_176>|",
3591
+ "|<EXTRA_TOKENS_177>|",
3592
+ "|<EXTRA_TOKENS_178>|",
3593
+ "|<EXTRA_TOKENS_179>|",
3594
+ "|<EXTRA_TOKENS_180>|",
3595
+ "|<EXTRA_TOKENS_181>|",
3596
+ "|<EXTRA_TOKENS_182>|",
3597
+ "|<EXTRA_TOKENS_183>|",
3598
+ "|<EXTRA_TOKENS_184>|",
3599
+ "|<EXTRA_TOKENS_185>|",
3600
+ "|<EXTRA_TOKENS_186>|",
3601
+ "|<EXTRA_TOKENS_187>|",
3602
+ "|<EXTRA_TOKENS_188>|",
3603
+ "|<EXTRA_TOKENS_189>|",
3604
+ "|<EXTRA_TOKENS_190>|",
3605
+ "|<EXTRA_TOKENS_191>|",
3606
+ "|<EXTRA_TOKENS_192>|",
3607
+ "|<EXTRA_TOKENS_193>|",
3608
+ "|<EXTRA_TOKENS_194>|",
3609
+ "|<EXTRA_TOKENS_195>|",
3610
+ "|<EXTRA_TOKENS_196>|",
3611
+ "|<EXTRA_TOKENS_197>|",
3612
+ "|<EXTRA_TOKENS_198>|",
3613
+ "|<EXTRA_TOKENS_199>|",
3614
+ "|<EXTRA_TOKENS_200>|",
3615
+ "|<EXTRA_TOKENS_201>|",
3616
+ "|<EXTRA_TOKENS_202>|",
3617
+ "|<EXTRA_TOKENS_203>|",
3618
+ "|<EXTRA_TOKENS_204>|",
3619
+ "|<EXTRA_TOKENS_205>|",
3620
+ "|<EXTRA_TOKENS_206>|",
3621
+ "|<EXTRA_TOKENS_207>|",
3622
+ "|<EXTRA_TOKENS_208>|",
3623
+ "|<EXTRA_TOKENS_209>|",
3624
+ "|<EXTRA_TOKENS_210>|",
3625
+ "|<EXTRA_TOKENS_211>|",
3626
+ "|<EXTRA_TOKENS_212>|",
3627
+ "|<EXTRA_TOKENS_213>|",
3628
+ "|<EXTRA_TOKENS_214>|",
3629
+ "|<EXTRA_TOKENS_215>|",
3630
+ "|<EXTRA_TOKENS_216>|",
3631
+ "|<EXTRA_TOKENS_217>|",
3632
+ "|<EXTRA_TOKENS_218>|",
3633
+ "|<EXTRA_TOKENS_219>|",
3634
+ "|<EXTRA_TOKENS_220>|",
3635
+ "|<EXTRA_TOKENS_221>|",
3636
+ "|<EXTRA_TOKENS_222>|",
3637
+ "|<EXTRA_TOKENS_223>|",
3638
+ "|<EXTRA_TOKENS_224>|",
3639
+ "|<EXTRA_TOKENS_225>|",
3640
+ "|<EXTRA_TOKENS_226>|",
3641
+ "|<EXTRA_TOKENS_227>|",
3642
+ "|<EXTRA_TOKENS_228>|",
3643
+ "|<EXTRA_TOKENS_229>|",
3644
+ "|<EXTRA_TOKENS_230>|",
3645
+ "|<EXTRA_TOKENS_231>|",
3646
+ "|<EXTRA_TOKENS_232>|",
3647
+ "|<EXTRA_TOKENS_233>|",
3648
+ "|<EXTRA_TOKENS_234>|",
3649
+ "|<EXTRA_TOKENS_235>|",
3650
+ "|<EXTRA_TOKENS_236>|",
3651
+ "|<EXTRA_TOKENS_237>|",
3652
+ "|<EXTRA_TOKENS_238>|",
3653
+ "|<EXTRA_TOKENS_239>|",
3654
+ "|<EXTRA_TOKENS_240>|",
3655
+ "|<EXTRA_TOKENS_241>|",
3656
+ "|<EXTRA_TOKENS_242>|",
3657
+ "|<EXTRA_TOKENS_243>|",
3658
+ "|<EXTRA_TOKENS_244>|",
3659
+ "|<EXTRA_TOKENS_245>|",
3660
+ "|<EXTRA_TOKENS_246>|",
3661
+ "|<EXTRA_TOKENS_247>|",
3662
+ "|<EXTRA_TOKENS_248>|",
3663
+ "|<EXTRA_TOKENS_249>|",
3664
+ "|<EXTRA_TOKENS_250>|",
3665
+ "|<EXTRA_TOKENS_251>|",
3666
+ "|<EXTRA_TOKENS_252>|",
3667
+ "|<EXTRA_TOKENS_253>|",
3668
+ "|<EXTRA_TOKENS_254>|",
3669
+ "|<EXTRA_TOKENS_255>|",
3670
+ "|<EXTRA_TOKENS_256>|",
3671
+ "|<EXTRA_TOKENS_257>|",
3672
+ "|<EXTRA_TOKENS_258>|",
3673
+ "|<EXTRA_TOKENS_259>|",
3674
+ "|<EXTRA_TOKENS_260>|",
3675
+ "|<EXTRA_TOKENS_261>|",
3676
+ "|<EXTRA_TOKENS_262>|",
3677
+ "|<EXTRA_TOKENS_263>|",
3678
+ "|<EXTRA_TOKENS_264>|",
3679
+ "|<EXTRA_TOKENS_265>|",
3680
+ "|<EXTRA_TOKENS_266>|",
3681
+ "|<EXTRA_TOKENS_267>|",
3682
+ "|<EXTRA_TOKENS_268>|",
3683
+ "|<EXTRA_TOKENS_269>|",
3684
+ "|<EXTRA_TOKENS_270>|",
3685
+ "|<EXTRA_TOKENS_271>|",
3686
+ "|<EXTRA_TOKENS_272>|",
3687
+ "|<EXTRA_TOKENS_273>|",
3688
+ "|<EXTRA_TOKENS_274>|",
3689
+ "|<EXTRA_TOKENS_275>|",
3690
+ "|<EXTRA_TOKENS_276>|",
3691
+ "|<EXTRA_TOKENS_277>|",
3692
+ "|<EXTRA_TOKENS_278>|",
3693
+ "|<EXTRA_TOKENS_279>|",
3694
+ "|<EXTRA_TOKENS_280>|",
3695
+ "|<EXTRA_TOKENS_281>|",
3696
+ "|<EXTRA_TOKENS_282>|",
3697
+ "|<EXTRA_TOKENS_283>|",
3698
+ "|<EXTRA_TOKENS_284>|",
3699
+ "|<EXTRA_TOKENS_285>|",
3700
+ "|<EXTRA_TOKENS_286>|",
3701
+ "|<EXTRA_TOKENS_287>|",
3702
+ "|<EXTRA_TOKENS_288>|",
3703
+ "|<EXTRA_TOKENS_289>|",
3704
+ "|<EXTRA_TOKENS_290>|",
3705
+ "|<EXTRA_TOKENS_291>|",
3706
+ "|<EXTRA_TOKENS_292>|",
3707
+ "|<EXTRA_TOKENS_293>|",
3708
+ "|<EXTRA_TOKENS_294>|",
3709
+ "|<EXTRA_TOKENS_295>|",
3710
+ "|<EXTRA_TOKENS_296>|",
3711
+ "|<EXTRA_TOKENS_297>|",
3712
+ "|<EXTRA_TOKENS_298>|",
3713
+ "|<EXTRA_TOKENS_299>|",
3714
+ "|<EXTRA_TOKENS_300>|",
3715
+ "|<EXTRA_TOKENS_301>|",
3716
+ "|<EXTRA_TOKENS_302>|",
3717
+ "|<EXTRA_TOKENS_303>|",
3718
+ "|<EXTRA_TOKENS_304>|",
3719
+ "|<EXTRA_TOKENS_305>|",
3720
+ "|<EXTRA_TOKENS_306>|",
3721
+ "|<EXTRA_TOKENS_307>|",
3722
+ "|<EXTRA_TOKENS_308>|",
3723
+ "|<EXTRA_TOKENS_309>|",
3724
+ "|<EXTRA_TOKENS_310>|",
3725
+ "|<EXTRA_TOKENS_311>|",
3726
+ "|<EXTRA_TOKENS_312>|",
3727
+ "|<EXTRA_TOKENS_313>|",
3728
+ "|<EXTRA_TOKENS_314>|",
3729
+ "|<EXTRA_TOKENS_315>|",
3730
+ "|<EXTRA_TOKENS_316>|",
3731
+ "|<EXTRA_TOKENS_317>|",
3732
+ "|<EXTRA_TOKENS_318>|",
3733
+ "|<EXTRA_TOKENS_319>|",
3734
+ "|<EXTRA_TOKENS_320>|",
3735
+ "|<EXTRA_TOKENS_321>|",
3736
+ "|<EXTRA_TOKENS_322>|",
3737
+ "|<EXTRA_TOKENS_323>|",
3738
+ "|<EXTRA_TOKENS_324>|",
3739
+ "|<EXTRA_TOKENS_325>|",
3740
+ "|<EXTRA_TOKENS_326>|",
3741
+ "|<EXTRA_TOKENS_327>|",
3742
+ "|<EXTRA_TOKENS_328>|",
3743
+ "|<EXTRA_TOKENS_329>|",
3744
+ "|<EXTRA_TOKENS_330>|",
3745
+ "|<EXTRA_TOKENS_331>|",
3746
+ "|<EXTRA_TOKENS_332>|",
3747
+ "|<EXTRA_TOKENS_333>|",
3748
+ "|<EXTRA_TOKENS_334>|",
3749
+ "|<EXTRA_TOKENS_335>|",
3750
+ "|<EXTRA_TOKENS_336>|",
3751
+ "|<EXTRA_TOKENS_337>|",
3752
+ "|<EXTRA_TOKENS_338>|",
3753
+ "|<EXTRA_TOKENS_339>|",
3754
+ "|<EXTRA_TOKENS_340>|",
3755
+ "|<EXTRA_TOKENS_341>|",
3756
+ "|<EXTRA_TOKENS_342>|",
3757
+ "|<EXTRA_TOKENS_343>|",
3758
+ "|<EXTRA_TOKENS_344>|",
3759
+ "|<EXTRA_TOKENS_345>|",
3760
+ "|<EXTRA_TOKENS_346>|",
3761
+ "|<EXTRA_TOKENS_347>|",
3762
+ "|<EXTRA_TOKENS_348>|",
3763
+ "|<EXTRA_TOKENS_349>|",
3764
+ "|<EXTRA_TOKENS_350>|",
3765
+ "|<EXTRA_TOKENS_351>|",
3766
+ "|<EXTRA_TOKENS_352>|",
3767
+ "|<EXTRA_TOKENS_353>|",
3768
+ "|<EXTRA_TOKENS_354>|",
3769
+ "|<EXTRA_TOKENS_355>|",
3770
+ "|<EXTRA_TOKENS_356>|",
3771
+ "|<EXTRA_TOKENS_357>|",
3772
+ "|<EXTRA_TOKENS_358>|",
3773
+ "|<EXTRA_TOKENS_359>|",
3774
+ "|<EXTRA_TOKENS_360>|",
3775
+ "|<EXTRA_TOKENS_361>|",
3776
+ "|<EXTRA_TOKENS_362>|",
3777
+ "|<EXTRA_TOKENS_363>|",
3778
+ "|<EXTRA_TOKENS_364>|",
3779
+ "|<EXTRA_TOKENS_365>|",
3780
+ "|<EXTRA_TOKENS_366>|",
3781
+ "|<EXTRA_TOKENS_367>|",
3782
+ "|<EXTRA_TOKENS_368>|",
3783
+ "|<EXTRA_TOKENS_369>|",
3784
+ "|<EXTRA_TOKENS_370>|",
3785
+ "|<EXTRA_TOKENS_371>|",
3786
+ "|<EXTRA_TOKENS_372>|",
3787
+ "|<EXTRA_TOKENS_373>|",
3788
+ "|<EXTRA_TOKENS_374>|",
3789
+ "|<EXTRA_TOKENS_375>|",
3790
+ "|<EXTRA_TOKENS_376>|",
3791
+ "|<EXTRA_TOKENS_377>|",
3792
+ "|<EXTRA_TOKENS_378>|",
3793
+ "|<EXTRA_TOKENS_379>|",
3794
+ "|<EXTRA_TOKENS_380>|",
3795
+ "|<EXTRA_TOKENS_381>|",
3796
+ "|<EXTRA_TOKENS_382>|",
3797
+ "|<EXTRA_TOKENS_383>|",
3798
+ "|<EXTRA_TOKENS_384>|",
3799
+ "|<EXTRA_TOKENS_385>|",
3800
+ "|<EXTRA_TOKENS_386>|",
3801
+ "|<EXTRA_TOKENS_387>|",
3802
+ "|<EXTRA_TOKENS_388>|",
3803
+ "|<EXTRA_TOKENS_389>|",
3804
+ "|<EXTRA_TOKENS_390>|",
3805
+ "|<EXTRA_TOKENS_391>|",
3806
+ "|<EXTRA_TOKENS_392>|",
3807
+ "|<EXTRA_TOKENS_393>|",
3808
+ "|<EXTRA_TOKENS_394>|",
3809
+ "|<EXTRA_TOKENS_395>|",
3810
+ "|<EXTRA_TOKENS_396>|",
3811
+ "|<EXTRA_TOKENS_397>|",
3812
+ "|<EXTRA_TOKENS_398>|",
3813
+ "|<EXTRA_TOKENS_399>|",
3814
+ "|<EXTRA_TOKENS_400>|",
3815
+ "|<EXTRA_TOKENS_401>|",
3816
+ "|<EXTRA_TOKENS_402>|",
3817
+ "|<EXTRA_TOKENS_403>|",
3818
+ "|<EXTRA_TOKENS_404>|",
3819
+ "|<EXTRA_TOKENS_405>|",
3820
+ "|<EXTRA_TOKENS_406>|",
3821
+ "|<EXTRA_TOKENS_407>|",
3822
+ "|<EXTRA_TOKENS_408>|",
3823
+ "|<EXTRA_TOKENS_409>|",
3824
+ "|<EXTRA_TOKENS_410>|",
3825
+ "|<EXTRA_TOKENS_411>|",
3826
+ "|<EXTRA_TOKENS_412>|",
3827
+ "|<EXTRA_TOKENS_413>|",
3828
+ "|<EXTRA_TOKENS_414>|",
3829
+ "|<EXTRA_TOKENS_415>|",
3830
+ "|<EXTRA_TOKENS_416>|",
3831
+ "|<EXTRA_TOKENS_417>|",
3832
+ "<im_start>",
3833
+ "<im_end>",
3834
+ "<im_patch>",
3835
+ "<im_col>",
3836
+ "<|image|>"
3837
+ ],
3838
+ "auto_map": {
3839
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
3840
+ },
3841
+ "bos_token": null,
3842
+ "chat_template": "{% for message in messages -%}\n {%- if (loop.index % 2 == 1 and message['role'] != 'user') or \n (loop.index % 2 == 0 and message['role'].lower() != 'assistant') -%}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif -%}\n {{ message['role'].capitalize() + ': ' + message['content'] }}\n {%- if not loop.last -%}\n {{ ' ' }}\n {%- endif %}\n {%- endfor -%}\n {%- if add_generation_prompt -%}\n {{ ' Assistant:' }}\n {%- endif %}",
3843
+ "clean_up_tokenization_spaces": false,
3844
+ "eos_token": "<|endoftext|>",
3845
+ "errors": "replace",
3846
+ "model_max_length": 32768,
3847
+ "pad_token": "<|endoftext|>",
3848
+ "processor_class": "MolmoProcessor",
3849
+ "split_special_tokens": false,
3850
+ "tokenizer_class": "Qwen2Tokenizer",
3851
+ "unk_token": null
3852
+ }
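
Note: the "chat_template" entry above enforces strictly alternating user/assistant turns (raising an exception otherwise) and renders the conversation as plain "User: ... Assistant: ..." text, appending a trailing " Assistant:" when a generation prompt is requested. A minimal sketch of that behavior using transformers' apply_chat_template; the repo id below is a placeholder for illustration, not part of this commit:

from transformers import AutoTokenizer

# Placeholder model path -- substitute this repository's actual id.
tok = AutoTokenizer.from_pretrained("cydhsieh01/placeholder-molmo-repo")

messages = [{"role": "user", "content": "Describe this image."}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # expected: "User: Describe this image. Assistant:"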
vocab.json ADDED
The diff for this file is too large to render. See raw diff
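
Note: per the tokenizer config above, this commit registers 423 additional special tokens on top of the Qwen2Tokenizer vocabulary: 418 |<EXTRA_TOKENS_n>| placeholders plus the five multimodal markers at ids 152064-152068 (<im_start>, <im_end>, <im_patch>, <im_col>, <|image|>). A quick sketch to verify the id-to-token mapping from a local checkout (the "./" path is illustrative):

from transformers import AutoTokenizer

# Load from a local clone of this repository; adjust the path as needed.
tok = AutoTokenizer.from_pretrained("./")

for tid in (152064, 152065, 152066, 152067, 152068):
    print(tid, tok.convert_ids_to_tokens(tid))
# expected: <im_start>, <im_end>, <im_patch>, <im_col>, <|image|>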