{ "student_ckpt": "/root/bitnet1/ckpt/v29_50M_50K_last.pt", "student_config": { "vocab_size": 128, "d_model": 768, "n_layers": 10, "n_heads": 12, "d_ff": 1280, "seq_len": 256 }, "student_step": 50000, "student_val_bpc": 1.2349195671077682, "timestamp": "2026-04-22 17:05:59", "layer_ablation": { "baseline_bpc": 1.2372383790923875, "per_layer": [ { "layer": 0, "baseline_bpc": 1.2372383790923875, "ablated_bpc": 2.658024141629142, "delta_bpc": 1.4207857625367544 }, { "layer": 1, "baseline_bpc": 1.2372383790923875, "ablated_bpc": 1.791177114375058, "delta_bpc": 0.5539387352826706 }, { "layer": 2, "baseline_bpc": 1.2372383790923875, "ablated_bpc": 1.40893652335544, "delta_bpc": 0.17169814426305252 }, { "layer": 3, "baseline_bpc": 1.2372383790923875, "ablated_bpc": 1.3546692810857077, "delta_bpc": 0.11743090199332018 }, { "layer": 4, "baseline_bpc": 1.2372383790923875, "ablated_bpc": 1.4076403331115497, "delta_bpc": 0.17040195401916214 }, { "layer": 5, "baseline_bpc": 1.2372383790923875, "ablated_bpc": 1.3853021601250528, "delta_bpc": 0.14806378103266526 }, { "layer": 6, "baseline_bpc": 1.2372383790923875, "ablated_bpc": 1.4080221517947302, "delta_bpc": 0.17078377270234268 }, { "layer": 7, "baseline_bpc": 1.2372383790923875, "ablated_bpc": 1.5111466640773288, "delta_bpc": 0.27390828498494124 }, { "layer": 8, "baseline_bpc": 1.2372383790923875, "ablated_bpc": 1.5338596560531164, "delta_bpc": 0.2966212769607288 }, { "layer": 9, "baseline_bpc": 1.2372383790923875, "ablated_bpc": 1.6496860416056456, "delta_bpc": 0.41244766251325804 } ] }, "weight_saturation": [ { "name": "out_codebook", "shape": [ 128, 768 ], "n": 98304, "mean": 0.32484379410743713, "median": 0.26955974102020264, "q10": 0.022471295669674873, "q90": 0.6936785578727722, "q99": 1.0008821487426758, "frac_below_0.01": 0.077606201171875, "frac_below_0.05": 0.1428019255399704, "frac_above_0.5": 0.2465413510799408, "max": 2.1756932735443115 }, { "name": "embed.weight", "shape": [ 128, 768 ], "n": 98304, "mean": 0.025270771235227585, "median": 0.01747666671872139, "q10": 0.0027019940316677094, "q90": 0.06000998988747597, "q99": 0.1124815121293068, "frac_below_0.01": 0.322998046875, "frac_below_0.05": 0.8575338125228882, "frac_above_0.5": 0.0, "max": 0.20307880640029907 }, { "name": "blocks.0.attn.q_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.020984457805752754, "median": 0.01630321331322193, "q10": 0.002880966058000922, "q90": 0.04540819674730301, "q99": 0.08269572257995605, "frac_below_0.01": 0.327667236328125, "frac_below_0.05": 0.9232143759727478, "frac_above_0.5": 0.0, "max": 0.17969276010990143 }, { "name": "blocks.0.attn.k_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.021982185542583466, "median": 0.016868969425559044, "q10": 0.0029649189673364162, "q90": 0.04804832115769386, "q99": 0.08712130039930344, "frac_below_0.01": 0.318261057138443, "frac_below_0.05": 0.9096696972846985, "frac_above_0.5": 0.0, "max": 0.17866352200508118 }, { "name": "blocks.0.attn.v_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.02966231107711792, "median": 0.02414115145802498, "q10": 0.0041543589904904366, "q90": 0.0629270151257515, "q99": 0.10377822816371918, "frac_below_0.01": 0.2300957590341568, "frac_below_0.05": 0.8167402744293213, "frac_above_0.5": 0.0, "max": 0.21888598799705505 }, { "name": "blocks.0.attn.o_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.03943490982055664, "median": 0.024108367040753365, "q10": 0.0036507798358798027, "q90": 0.09731841832399368, "q99": 0.2101808488368988, "frac_below_0.01": 0.2511308491230011, "frac_below_0.05": 0.7431420087814331, "frac_above_0.5": 3.3908420391526306e-06, "max": 0.5122478604316711 }, { "name": "blocks.0.ffn.gate.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.029497429728507996, "median": 0.024814119562506676, "q10": 0.004512307234108448, "q90": 0.06109948456287384, "q99": 0.09630865603685379, "frac_below_0.01": 0.21697694063186646, "frac_below_0.05": 0.8223521113395691, "frac_above_0.5": 0.0, "max": 0.17949974536895752 }, { "name": "blocks.0.ffn.up.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.029491588473320007, "median": 0.024759948253631592, "q10": 0.004544392228126526, "q90": 0.06112118810415268, "q99": 0.09623175859451294, "frac_below_0.01": 0.21637777984142303, "frac_below_0.05": 0.8225098252296448, "frac_above_0.5": 0.0, "max": 0.17936795949935913 }, { "name": "blocks.0.ffn.down.raw.weight", "shape": [ 768, 1280 ], "n": 983040, "mean": 0.030015649273991585, "median": 0.025115979835391045, "q10": 0.004507269244641066, "q90": 0.06252331286668777, "q99": 0.09870631247758865, "frac_below_0.01": 0.21663717925548553, "frac_below_0.05": 0.8136902451515198, "frac_above_0.5": 0.0, "max": 0.19937986135482788 }, { "name": "blocks.1.attn.q_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.021738752722740173, "median": 0.016651637852191925, "q10": 0.0030065574683248997, "q90": 0.04737270250916481, "q99": 0.0873987078666687, "frac_below_0.01": 0.3206566870212555, "frac_below_0.05": 0.9128333330154419, "frac_above_0.5": 0.0, "max": 0.20024967193603516 }, { "name": "blocks.1.attn.k_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.022124914452433586, "median": 0.016716480255126953, "q10": 0.0029788485262542963, "q90": 0.0486590750515461, "q99": 0.09064910560846329, "frac_below_0.01": 0.3203616738319397, "frac_below_0.05": 0.9063364863395691, "frac_above_0.5": 0.0, "max": 0.21088646352291107 }, { "name": "blocks.1.attn.v_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.032271161675453186, "median": 0.02542409859597683, "q10": 0.004067442379891872, "q90": 0.07015787810087204, "q99": 0.11874839663505554, "frac_below_0.01": 0.2300516813993454, "frac_below_0.05": 0.7786475419998169, "frac_above_0.5": 0.0, "max": 0.29738032817840576 }, { "name": "blocks.1.attn.o_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.0365101583302021, "median": 0.026378890499472618, "q10": 0.004064246546477079, "q90": 0.08268115669488907, "q99": 0.15872563421726227, "frac_below_0.01": 0.2282664030790329, "frac_below_0.05": 0.7426249384880066, "frac_above_0.5": 0.0, "max": 0.32939204573631287 }, { "name": "blocks.1.ffn.gate.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.030397042632102966, "median": 0.0249449722468853, "q10": 0.0042805117554962635, "q90": 0.06440477073192596, "q99": 0.10299010574817657, "frac_below_0.01": 0.22425639629364014, "frac_below_0.05": 0.8044993281364441, "frac_above_0.5": 0.0, "max": 0.20056255161762238 }, { "name": "blocks.1.ffn.up.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.03043045848608017, "median": 0.02498842217028141, "q10": 0.00427834689617157, "q90": 0.06437431275844574, "q99": 0.1029980257153511, "frac_below_0.01": 0.22370098531246185, "frac_below_0.05": 0.8039296865463257, "frac_above_0.5": 0.0, "max": 0.20558297634124756 }, { "name": "blocks.1.ffn.down.raw.weight", "shape": [ 768, 1280 ], "n": 983040, "mean": 0.030961716547608376, "median": 0.025217225775122643, "q10": 0.004261382389813662, "q90": 0.06597873568534851, "q99": 0.10618423670530319, "frac_below_0.01": 0.22413738071918488, "frac_below_0.05": 0.7958465814590454, "frac_above_0.5": 0.0, "max": 0.20832186937332153 }, { "name": "blocks.2.attn.q_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.021832674741744995, "median": 0.016725117340683937, "q10": 0.003007261548191309, "q90": 0.04768085852265358, "q99": 0.08757057785987854, "frac_below_0.01": 0.31890869140625, "frac_below_0.05": 0.9113871455192566, "frac_above_0.5": 0.0, "max": 0.19008490443229675 }, { "name": "blocks.2.attn.k_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.02204788476228714, "median": 0.01679176464676857, "q10": 0.003010682761669159, "q90": 0.04842023551464081, "q99": 0.08911668509244919, "frac_below_0.01": 0.3183034360408783, "frac_below_0.05": 0.907684326171875, "frac_above_0.5": 0.0, "max": 0.19714821875095367 }, { "name": "blocks.2.attn.v_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.03241870179772377, "median": 0.025510499253869057, "q10": 0.0039276741445064545, "q90": 0.07076237350702286, "q99": 0.11824272572994232, "frac_below_0.01": 0.2332882434129715, "frac_below_0.05": 0.7741004228591919, "frac_above_0.5": 0.0, "max": 0.25875869393348694 }, { "name": "blocks.2.attn.o_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.0360061414539814, "median": 0.026566587388515472, "q10": 0.003982395865023136, "q90": 0.08094409108161926, "q99": 0.15047675371170044, "frac_below_0.01": 0.2296006977558136, "frac_below_0.05": 0.7429623007774353, "frac_above_0.5": 0.0, "max": 0.3236503005027771 }, { "name": "blocks.2.ffn.gate.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.030454641208052635, "median": 0.024827929213643074, "q10": 0.004183255601674318, "q90": 0.06483904272317886, "q99": 0.10405371338129044, "frac_below_0.01": 0.2272491604089737, "frac_below_0.05": 0.8031179308891296, "frac_above_0.5": 0.0, "max": 0.21293406188488007 }, { "name": "blocks.2.ffn.up.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.030473940074443817, "median": 0.024836573749780655, "q10": 0.0041793277487158775, "q90": 0.0648508369922638, "q99": 0.10394246876239777, "frac_below_0.01": 0.22719117999076843, "frac_below_0.05": 0.8021494746208191, "frac_above_0.5": 0.0, "max": 0.20337601006031036 }, { "name": "blocks.2.ffn.down.raw.weight", "shape": [ 768, 1280 ], "n": 983040, "mean": 0.03145662695169449, "median": 0.025447478517889977, "q10": 0.004232366569340229, "q90": 0.0673917829990387, "q99": 0.10850941389799118, "frac_below_0.01": 0.22418315708637238, "frac_below_0.05": 0.7883524894714355, "frac_above_0.5": 0.0, "max": 0.2242489606142044 }, { "name": "blocks.3.attn.q_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.021820461377501488, "median": 0.016704333946108818, "q10": 0.0030075714457780123, "q90": 0.047725576907396317, "q99": 0.08704143017530441, "frac_below_0.01": 0.3202582597732544, "frac_below_0.05": 0.9108836054801941, "frac_above_0.5": 0.0, "max": 0.1996636688709259 }, { "name": "blocks.3.attn.k_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.02202785760164261, "median": 0.016765017062425613, "q10": 0.003018269781023264, "q90": 0.04830968379974365, "q99": 0.08893266320228577, "frac_below_0.01": 0.319023996591568, "frac_below_0.05": 0.9081844687461853, "frac_above_0.5": 0.0, "max": 0.2032247930765152 }, { "name": "blocks.3.attn.v_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.03259618952870369, "median": 0.02546798251569271, "q10": 0.0037475833669304848, "q90": 0.07166170328855515, "q99": 0.1207074448466301, "frac_below_0.01": 0.2383185476064682, "frac_below_0.05": 0.7702789306640625, "frac_above_0.5": 0.0, "max": 0.308125764131546 }, { "name": "blocks.3.attn.o_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.036724548786878586, "median": 0.026713047176599503, "q10": 0.0038133268244564533, "q90": 0.08330091834068298, "q99": 0.1569368839263916, "frac_below_0.01": 0.233211949467659, "frac_below_0.05": 0.7358195185661316, "frac_above_0.5": 0.0, "max": 0.35014036297798157 }, { "name": "blocks.3.ffn.gate.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.03041911870241165, "median": 0.024767549708485603, "q10": 0.004097540397197008, "q90": 0.06491811573505402, "q99": 0.10423436015844345, "frac_below_0.01": 0.23008118569850922, "frac_below_0.05": 0.8022247552871704, "frac_above_0.5": 0.0, "max": 0.2128787487745285 }, { "name": "blocks.3.ffn.up.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.030419766902923584, "median": 0.024740705266594887, "q10": 0.004079720471054316, "q90": 0.06493699550628662, "q99": 0.10483106225728989, "frac_below_0.01": 0.2303914576768875, "frac_below_0.05": 0.802680492401123, "frac_above_0.5": 0.0, "max": 0.21313220262527466 }, { "name": "blocks.3.ffn.down.raw.weight", "shape": [ 768, 1280 ], "n": 983040, "mean": 0.031688056886196136, "median": 0.025530001148581505, "q10": 0.004116586875170469, "q90": 0.06815119832754135, "q99": 0.11011636257171631, "frac_below_0.01": 0.22733664512634277, "frac_below_0.05": 0.7837514877319336, "frac_above_0.5": 0.0, "max": 0.22840875387191772 }, { "name": "blocks.4.attn.q_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.021806135773658752, "median": 0.016604429110884666, "q10": 0.0029621128924191, "q90": 0.047807518392801285, "q99": 0.0883331298828125, "frac_below_0.01": 0.3223758339881897, "frac_below_0.05": 0.9105224609375, "frac_above_0.5": 0.0, "max": 0.19498515129089355 }, { "name": "blocks.4.attn.k_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.0219953004270792, "median": 0.01666717790067196, "q10": 0.002971069421619177, "q90": 0.04836598038673401, "q99": 0.08983262628316879, "frac_below_0.01": 0.320641428232193, "frac_below_0.05": 0.9077809453010559, "frac_above_0.5": 0.0, "max": 0.19498971104621887 }, { "name": "blocks.4.attn.v_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.03282567113637924, "median": 0.025203978642821312, "q10": 0.0033933515660464764, "q90": 0.07318802922964096, "q99": 0.12469246238470078, "frac_below_0.01": 0.2480655312538147, "frac_below_0.05": 0.7664320468902588, "frac_above_0.5": 0.0, "max": 0.28078585863113403 }, { "name": "blocks.4.attn.o_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.03867517039179802, "median": 0.026929715648293495, "q10": 0.0034530421253293753, "q90": 0.09060992300510406, "q99": 0.1718837320804596, "frac_below_0.01": 0.2434438019990921, "frac_below_0.05": 0.7186652421951294, "frac_above_0.5": 0.0, "max": 0.3775966465473175 }, { "name": "blocks.4.ffn.gate.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.030496973544359207, "median": 0.024575216695666313, "q10": 0.003859907388687134, "q90": 0.06571313738822937, "q99": 0.10632941871881485, "frac_below_0.01": 0.23670147359371185, "frac_below_0.05": 0.7992218136787415, "frac_above_0.5": 0.0, "max": 0.236494243144989 }, { "name": "blocks.4.ffn.up.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.03050924651324749, "median": 0.02466735430061817, "q10": 0.003866195445880294, "q90": 0.06563732028007507, "q99": 0.10621653497219086, "frac_below_0.01": 0.23652751743793488, "frac_below_0.05": 0.799262523651123, "frac_above_0.5": 0.0, "max": 0.22307677567005157 }, { "name": "blocks.4.ffn.down.raw.weight", "shape": [ 768, 1280 ], "n": 983040, "mean": 0.03216877207159996, "median": 0.025634581223130226, "q10": 0.003866716753691435, "q90": 0.07008880376815796, "q99": 0.11355020850896835, "frac_below_0.01": 0.23405660688877106, "frac_below_0.05": 0.7747518420219421, "frac_above_0.5": 0.0, "max": 0.2466614544391632 }, { "name": "blocks.5.attn.q_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.021769991144537926, "median": 0.016515638679265976, "q10": 0.0029437255579978228, "q90": 0.04784300550818443, "q99": 0.08883869647979736, "frac_below_0.01": 0.3243001401424408, "frac_below_0.05": 0.9103173017501831, "frac_above_0.5": 0.0, "max": 0.1770746260881424 }, { "name": "blocks.5.attn.k_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.02204853482544422, "median": 0.016590725630521774, "q10": 0.002919677644968033, "q90": 0.04862433299422264, "q99": 0.09129194915294647, "frac_below_0.01": 0.3242102861404419, "frac_below_0.05": 0.9064585566520691, "frac_above_0.5": 0.0, "max": 0.2087617963552475 }, { "name": "blocks.5.attn.v_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.0333058126270771, "median": 0.025002961978316307, "q10": 0.002964397193863988, "q90": 0.07551417499780655, "q99": 0.13100847601890564, "frac_below_0.01": 0.2606692910194397, "frac_below_0.05": 0.7589297890663147, "frac_above_0.5": 0.0, "max": 0.3152570426464081 }, { "name": "blocks.5.attn.o_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.041803035885095596, "median": 0.02722357213497162, "q10": 0.0029212224762886763, "q90": 0.10118373483419418, "q99": 0.20479606091976166, "frac_below_0.01": 0.2576039731502533, "frac_below_0.05": 0.6999596357345581, "frac_above_0.5": 5.086263172415784e-06, "max": 0.5692521929740906 }, { "name": "blocks.5.ffn.gate.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.030752409249544144, "median": 0.0244681965559721, "q10": 0.0035621533170342445, "q90": 0.06708699464797974, "q99": 0.10938585549592972, "frac_below_0.01": 0.2462412714958191, "frac_below_0.05": 0.7924683094024658, "frac_above_0.5": 0.0, "max": 0.2319944053888321 }, { "name": "blocks.5.ffn.up.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.030752165243029594, "median": 0.024411072954535484, "q10": 0.0035458123311400414, "q90": 0.06726153194904327, "q99": 0.10948219895362854, "frac_below_0.01": 0.24592998623847961, "frac_below_0.05": 0.7923950552940369, "frac_above_0.5": 0.0, "max": 0.228058323264122 }, { "name": "blocks.5.ffn.down.raw.weight", "shape": [ 768, 1280 ], "n": 983040, "mean": 0.03299623355269432, "median": 0.02572372555732727, "q10": 0.003507574088871479, "q90": 0.07318581640720367, "q99": 0.12024357914924622, "frac_below_0.01": 0.24329835176467896, "frac_below_0.05": 0.7614848017692566, "frac_above_0.5": 0.0, "max": 0.27965444326400757 }, { "name": "blocks.6.attn.q_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.0215331818908453, "median": 0.016295911744236946, "q10": 0.0028458735905587673, "q90": 0.04729600250720978, "q99": 0.08885350078344345, "frac_below_0.01": 0.329857736825943, "frac_below_0.05": 0.9127739667892456, "frac_above_0.5": 0.0, "max": 0.19640463590621948 }, { "name": "blocks.6.attn.k_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.02174237370491028, "median": 0.01627686247229576, "q10": 0.0028480947948992252, "q90": 0.047940269112586975, "q99": 0.0914376974105835, "frac_below_0.01": 0.3300510048866272, "frac_below_0.05": 0.9095560908317566, "frac_above_0.5": 0.0, "max": 0.21430690586566925 }, { "name": "blocks.6.attn.v_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.03459187224507332, "median": 0.02503044530749321, "q10": 0.0024408106692135334, "q90": 0.08047246932983398, "q99": 0.14331185817718506, "frac_below_0.01": 0.2751024067401886, "frac_below_0.05": 0.7422892451286316, "frac_above_0.5": 0.0, "max": 0.3117149770259857 }, { "name": "blocks.6.attn.o_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.04699027165770531, "median": 0.02844177559018135, "q10": 0.0022870029788464308, "q90": 0.1200011819601059, "q99": 0.23704038560390472, "frac_below_0.01": 0.2714368999004364, "frac_below_0.05": 0.6671345829963684, "frac_above_0.5": 3.3908420391526306e-06, "max": 0.518464207649231 }, { "name": "blocks.6.ffn.gate.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.03131778910756111, "median": 0.02425358071923256, "q10": 0.0030630475375801325, "q90": 0.07003403455018997, "q99": 0.11553935706615448, "frac_below_0.01": 0.2605092525482178, "frac_below_0.05": 0.7811055779457092, "frac_above_0.5": 0.0, "max": 0.24002909660339355 }, { "name": "blocks.6.ffn.up.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.031340092420578, "median": 0.024272354319691658, "q10": 0.0030343779362738132, "q90": 0.07006758451461792, "q99": 0.11566709727048874, "frac_below_0.01": 0.2608998715877533, "frac_below_0.05": 0.780102550983429, "frac_above_0.5": 0.0, "max": 0.2660861313343048 }, { "name": "blocks.6.ffn.down.raw.weight", "shape": [ 768, 1280 ], "n": 983040, "mean": 0.034312065690755844, "median": 0.025947941467165947, "q10": 0.0029694088734686375, "q90": 0.07810769975185394, "q99": 0.13014021515846252, "frac_below_0.01": 0.2571146786212921, "frac_below_0.05": 0.7417826652526855, "frac_above_0.5": 0.0, "max": 0.26336076855659485 }, { "name": "blocks.7.attn.q_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.022099530324339867, "median": 0.016269685700535774, "q10": 0.0027676254976540804, "q90": 0.04940343275666237, "q99": 0.0952984169125557, "frac_below_0.01": 0.3332875669002533, "frac_below_0.05": 0.9026760458946228, "frac_above_0.5": 0.0, "max": 0.21799291670322418 }, { "name": "blocks.7.attn.k_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.022439096122980118, "median": 0.016338741406798363, "q10": 0.002755739027634263, "q90": 0.050511594861745834, "q99": 0.09869782626628876, "frac_below_0.01": 0.3318294882774353, "frac_below_0.05": 0.8977118730545044, "frac_above_0.5": 0.0, "max": 0.219570130109787 }, { "name": "blocks.7.attn.v_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.03680150955915451, "median": 0.024955160915851593, "q10": 0.0015987252118065953, "q90": 0.08944220095872879, "q99": 0.16461941599845886, "frac_below_0.01": 0.3004574179649353, "frac_below_0.05": 0.7197520136833191, "frac_above_0.5": 0.0, "max": 0.3584654927253723 }, { "name": "blocks.7.attn.o_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.05452443286776543, "median": 0.029793892055749893, "q10": 0.0014873046893626451, "q90": 0.14702990651130676, "q99": 0.28597983717918396, "frac_below_0.01": 0.2919921875, "frac_below_0.05": 0.6334109902381897, "frac_above_0.5": 8.646646892884746e-05, "max": 0.7674525380134583 }, { "name": "blocks.7.ffn.gate.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.03266124054789543, "median": 0.024235490709543228, "q10": 0.0022881708573549986, "q90": 0.07565723359584808, "q99": 0.1269170194864273, "frac_below_0.01": 0.2822510004043579, "frac_below_0.05": 0.757544994354248, "frac_above_0.5": 0.0, "max": 0.28104186058044434 }, { "name": "blocks.7.ffn.up.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.032623130828142166, "median": 0.0241586584597826, "q10": 0.002281878376379609, "q90": 0.07564595341682434, "q99": 0.1268070936203003, "frac_below_0.01": 0.281890869140625, "frac_below_0.05": 0.7584859728813171, "frac_above_0.5": 0.0, "max": 0.267304927110672 }, { "name": "blocks.7.ffn.down.raw.weight", "shape": [ 768, 1280 ], "n": 983040, "mean": 0.036399319767951965, "median": 0.026400091126561165, "q10": 0.0022754990495741367, "q90": 0.08571262657642365, "q99": 0.14497342705726624, "frac_below_0.01": 0.27541911602020264, "frac_below_0.05": 0.7168355584144592, "frac_above_0.5": 0.0, "max": 0.30715522170066833 }, { "name": "blocks.8.attn.q_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.02204364724457264, "median": 0.01618419960141182, "q10": 0.0027065007016062737, "q90": 0.04928082227706909, "q99": 0.09612085670232773, "frac_below_0.01": 0.3353847861289978, "frac_below_0.05": 0.9031490683555603, "frac_above_0.5": 0.0, "max": 0.2505124807357788 }, { "name": "blocks.8.attn.k_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.022502297535538673, "median": 0.01621812768280506, "q10": 0.0026969979517161846, "q90": 0.05086328089237213, "q99": 0.1007092148065567, "frac_below_0.01": 0.3353763222694397, "frac_below_0.05": 0.8963436484336853, "frac_above_0.5": 0.0, "max": 0.23622968792915344 }, { "name": "blocks.8.attn.v_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.03888942673802376, "median": 0.0260167196393013, "q10": 0.001105984323658049, "q90": 0.0957922413945198, "q99": 0.17475828528404236, "frac_below_0.01": 0.3079240620136261, "frac_below_0.05": 0.6965315341949463, "frac_above_0.5": 0.0, "max": 0.49975377321243286 }, { "name": "blocks.8.attn.o_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.058232564479112625, "median": 0.03141973167657852, "q10": 0.0010921643115580082, "q90": 0.15505672991275787, "q99": 0.33220720291137695, "frac_below_0.01": 0.2909308671951294, "frac_below_0.05": 0.6194356083869934, "frac_above_0.5": 0.0006086561479605734, "max": 0.8799290657043457 }, { "name": "blocks.8.ffn.gate.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.03446163237094879, "median": 0.024365149438381195, "q10": 0.001547358464449644, "q90": 0.08275697380304337, "q99": 0.14115624129772186, "frac_below_0.01": 0.30199891328811646, "frac_below_0.05": 0.733826756477356, "frac_above_0.5": 0.0, "max": 0.2949505150318146 }, { "name": "blocks.8.ffn.up.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.03450019657611847, "median": 0.024399472400546074, "q10": 0.0015466592740267515, "q90": 0.08281046152114868, "q99": 0.14151407778263092, "frac_below_0.01": 0.3011057674884796, "frac_below_0.05": 0.733771800994873, "frac_above_0.5": 0.0, "max": 0.279220312833786 }, { "name": "blocks.8.ffn.down.raw.weight", "shape": [ 768, 1280 ], "n": 983040, "mean": 0.03970775753259659, "median": 0.027399426326155663, "q10": 0.0016542956000193954, "q90": 0.0966249480843544, "q99": 0.16643178462982178, "frac_below_0.01": 0.28803712129592896, "frac_below_0.05": 0.6849355697631836, "frac_above_0.5": 0.0, "max": 0.3608091473579407 }, { "name": "blocks.9.attn.q_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.022359108552336693, "median": 0.016284987330436707, "q10": 0.00270974263548851, "q90": 0.05029057338833809, "q99": 0.09790308773517609, "frac_below_0.01": 0.3334842324256897, "frac_below_0.05": 0.8987799882888794, "frac_above_0.5": 0.0, "max": 0.2203265279531479 }, { "name": "blocks.9.attn.k_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.02273663878440857, "median": 0.016311366111040115, "q10": 0.0026683020405471325, "q90": 0.05166053771972656, "q99": 0.10231450200080872, "frac_below_0.01": 0.3345811665058136, "frac_below_0.05": 0.8929036259651184, "frac_above_0.5": 0.0, "max": 0.24579688906669617 }, { "name": "blocks.9.attn.v_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.042768146842718124, "median": 0.028541607782244682, "q10": 0.0006766233709640801, "q90": 0.10649372637271881, "q99": 0.18854376673698425, "frac_below_0.01": 0.303490549325943, "frac_below_0.05": 0.6605818271636963, "frac_above_0.5": 0.0, "max": 0.4518583118915558 }, { "name": "blocks.9.attn.o_proj.raw.weight", "shape": [ 768, 768 ], "n": 589824, "mean": 0.0570216067135334, "median": 0.03520055115222931, "q10": 0.0008812142186798155, "q90": 0.14035534858703613, "q99": 0.3157142400741577, "frac_below_0.01": 0.2710367739200592, "frac_below_0.05": 0.59759521484375, "frac_above_0.5": 0.0011274550342932343, "max": 1.018288016319275 }, { "name": "blocks.9.ffn.gate.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.03622754290699959, "median": 0.024802587926387787, "q10": 0.0009529480012133718, "q90": 0.08900798112154007, "q99": 0.15433676540851593, "frac_below_0.01": 0.31352946162223816, "frac_below_0.05": 0.7148427963256836, "frac_above_0.5": 0.0, "max": 0.3419005572795868 }, { "name": "blocks.9.ffn.up.raw.weight", "shape": [ 1280, 768 ], "n": 983040, "mean": 0.03622148931026459, "median": 0.024851901456713676, "q10": 0.000944304745644331, "q90": 0.08894885331392288, "q99": 0.15408946573734283, "frac_below_0.01": 0.31365662813186646, "frac_below_0.05": 0.7145477533340454, "frac_above_0.5": 0.0, "max": 0.3313311040401459 }, { "name": "blocks.9.ffn.down.raw.weight", "shape": [ 768, 1280 ], "n": 983040, "mean": 0.0442914180457592, "median": 0.02870362251996994, "q10": 0.0009061374003067613, "q90": 0.11138416081666946, "q99": 0.1992027461528778, "frac_below_0.01": 0.3033294975757599, "frac_below_0.05": 0.6538391709327698, "frac_above_0.5": 0.0, "max": 0.44191890954971313 } ], "attention_entropy": [ { "layer": 0, "entropy_per_head": [ 0.31201067566871643, 0.25528043508529663, 0.17442357540130615, 0.08394858986139297, 0.03744904324412346, 0.004117322154343128, 6.864327682654103e-12, 0.0, 0.0, 0.0, 0.0, 0.0 ], "max_prob_per_head": [ 0.8789216876029968, 0.8898435831069946, 0.9274678230285645, 0.9629272222518921, 0.9853159785270691, 0.9986785054206848, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ], "mean_entropy": 0.07226914167404175, "mean_max_prob": 0.9702628254890442 }, { "layer": 1, "entropy_per_head": [ 0.2243039309978485, 0.1652330458164215, 0.11748333275318146, 0.055319301784038544, 0.026457656174898148, 0.005040682852268219, 1.5577959115806866e-14, 1.0707181436059495e-40, 0.0, 0.0, 0.0, 0.0 ], "max_prob_per_head": [ 0.9096110463142395, 0.9270795583724976, 0.9465883374214172, 0.9738114476203918, 0.9879803657531738, 0.9976929426193237, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ], "mean_entropy": 0.049486491829156876, "mean_max_prob": 0.9785636067390442 }, { "layer": 2, "entropy_per_head": [ 0.2058943510055542, 0.16393162310123444, 0.10231588780879974, 0.06609746068716049, 0.023581046611070633, 0.009287012740969658, 7.763824034622696e-14, 4.0588610019168326e-41, 0.0, 0.0, 0.0, 0.0 ], "max_prob_per_head": [ 0.9163063168525696, 0.9273470640182495, 0.9537574052810669, 0.9697834849357605, 0.9889707565307617, 0.9955309629440308, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ], "mean_entropy": 0.04759228229522705, "mean_max_prob": 0.9793079495429993 }, { "layer": 3, "entropy_per_head": [ 0.209762841463089, 0.1171560063958168, 0.09794622659683228, 0.06588968634605408, 0.026721913367509842, 0.006907845847308636, 3.580622567379583e-15, 9.686195374952433e-41, 0.0, 0.0, 0.0, 0.0 ], "max_prob_per_head": [ 0.9159234166145325, 0.94795161485672, 0.9551283717155457, 0.9698406457901001, 0.9875055551528931, 0.9967637062072754, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ], "mean_entropy": 0.04369870945811272, "mean_max_prob": 0.9810927510261536 }, { "layer": 4, "entropy_per_head": [ 0.21481843292713165, 0.15817253291606903, 0.10903926193714142, 0.06867136061191559, 0.03494655713438988, 0.01739848032593727, 5.683253889200712e-13, 9.1925179259708e-43, 0.0, 0.0, 0.0, 0.0 ], "max_prob_per_head": [ 0.9135462045669556, 0.9296945333480835, 0.9507255554199219, 0.9684354662895203, 0.9833245277404785, 0.9917436838150024, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ], "mean_entropy": 0.050253886729478836, "mean_max_prob": 0.9781224727630615 }, { "layer": 5, "entropy_per_head": [ 0.20507557690143585, 0.1427854597568512, 0.10421045869588852, 0.04479587450623512, 0.01711101457476616, 0.005781891755759716, 7.97702019456753e-13, 2.397621672459762e-42, 0.0, 0.0, 0.0, 0.0 ], "max_prob_per_head": [ 0.917660117149353, 0.9356474876403809, 0.9525343179702759, 0.9798707962036133, 0.9924705624580383, 0.997445285320282, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ], "mean_entropy": 0.04331335797905922, "mean_max_prob": 0.9813024401664734 }, { "layer": 6, "entropy_per_head": [ 0.1552124321460724, 0.10944928973913193, 0.08590570837259293, 0.03441280126571655, 0.01842230185866356, 0.007083357311785221, 1.0530790649221355e-13, 1.4939242928166875e-41, 0.0, 0.0, 0.0, 0.0 ], "max_prob_per_head": [ 0.9378371238708496, 0.9510696530342102, 0.9609823226928711, 0.983575165271759, 0.9912484288215637, 0.9965624809265137, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ], "mean_entropy": 0.03420715779066086, "mean_max_prob": 0.9851062297821045 }, { "layer": 7, "entropy_per_head": [ 0.20514321327209473, 0.12170679867267609, 0.0744284838438034, 0.051210127770900726, 0.019812947139143944, 0.017020035535097122, 6.806287304594871e-13, 5.125829270832249e-39, 0.0, 0.0, 0.0, 0.0 ], "max_prob_per_head": [ 0.9193431735038757, 0.9457699060440063, 0.9668407440185547, 0.976292610168457, 0.9907869100570679, 0.9917880892753601, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ], "mean_entropy": 0.040776800364255905, "mean_max_prob": 0.9825684428215027 }, { "layer": 8, "entropy_per_head": [ 0.17893101274967194, 0.07504458725452423, 0.059000205248594284, 0.031063487753272057, 0.025076285004615784, 0.007854106836020947, 1.886881761947734e-10, 3.0688436368713494e-43, 0.0, 0.0, 0.0, 0.0 ], "max_prob_per_head": [ 0.9290326833724976, 0.9654616117477417, 0.9727820158004761, 0.9861835241317749, 0.9882169961929321, 0.9964586496353149, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ], "mean_entropy": 0.03141414001584053, "mean_max_prob": 0.9865112900733948 }, { "layer": 9, "entropy_per_head": [ 0.22920922935009003, 0.13090595602989197, 0.07475412636995316, 0.05009441822767258, 0.02543170377612114, 0.006821715272963047, 7.358213973025773e-13, 1.4680002712266784e-41, 0.0, 0.0, 0.0, 0.0 ], "max_prob_per_head": [ 0.909065842628479, 0.9412500262260437, 0.9665333032608032, 0.9774504899978638, 0.9881967306137085, 0.9967927932739258, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ], "mean_entropy": 0.04310142993927002, "mean_max_prob": 0.9816074371337891 } ], "error_breakdown": { "overall_accuracy": 0.727154541015625, "per_class": { "space": { "accuracy": 0.933965844402277, "n": 31620 }, "newline": { "accuracy": 0.5960154952960708, "n": 1807 }, "lowercase": { "accuracy": 0.699845422511801, "n": 120328 }, "uppercase": { "accuracy": 0.34219190968955787, "n": 4252 }, "digit": { "accuracy": 0.0, "n": 9 }, "punct": { "accuracy": 0.49141483516483514, "n": 5824 } } }, "teacher_ckpt": "/root/bitnet1/ckpt/fp32_ref_50M_last.pt", "teacher_val_bpc": 0.8601819578579681, "student_teacher_similarity": [ { "layer": 0, "sign_agreement": 0.577347195148468 }, { "layer": 1, "sign_agreement": 0.5173985481262207 }, { "layer": 2, "sign_agreement": 0.5035130262374878 }, { "layer": 3, "sign_agreement": 0.5001080393791199 }, { "layer": 4, "sign_agreement": 0.5003906428813935 }, { "layer": 5, "sign_agreement": 0.5008003354072571 }, { "layer": 6, "sign_agreement": 0.5006855249404907 }, { "layer": 7, "sign_agreement": 0.5008371591567993 }, { "layer": 8, "sign_agreement": 0.49889018535614016 }, { "layer": 9, "sign_agreement": 0.49474666118621824 } ] }