| { |
| "student_ckpt": "/root/bitnet1/ckpt/v29_50M_50K_last.pt", |
| "student_config": { |
| "vocab_size": 128, |
| "d_model": 768, |
| "n_layers": 10, |
| "n_heads": 12, |
| "d_ff": 1280, |
| "seq_len": 256 |
| }, |
| "student_step": 50000, |
| "student_val_bpc": 1.2349195671077682, |
| "timestamp": "2026-04-22 17:05:59", |
| "layer_ablation": { |
| "baseline_bpc": 1.2372383790923875, |
| "per_layer": [ |
| { |
| "layer": 0, |
| "baseline_bpc": 1.2372383790923875, |
| "ablated_bpc": 2.658024141629142, |
| "delta_bpc": 1.4207857625367544 |
| }, |
| { |
| "layer": 1, |
| "baseline_bpc": 1.2372383790923875, |
| "ablated_bpc": 1.791177114375058, |
| "delta_bpc": 0.5539387352826706 |
| }, |
| { |
| "layer": 2, |
| "baseline_bpc": 1.2372383790923875, |
| "ablated_bpc": 1.40893652335544, |
| "delta_bpc": 0.17169814426305252 |
| }, |
| { |
| "layer": 3, |
| "baseline_bpc": 1.2372383790923875, |
| "ablated_bpc": 1.3546692810857077, |
| "delta_bpc": 0.11743090199332018 |
| }, |
| { |
| "layer": 4, |
| "baseline_bpc": 1.2372383790923875, |
| "ablated_bpc": 1.4076403331115497, |
| "delta_bpc": 0.17040195401916214 |
| }, |
| { |
| "layer": 5, |
| "baseline_bpc": 1.2372383790923875, |
| "ablated_bpc": 1.3853021601250528, |
| "delta_bpc": 0.14806378103266526 |
| }, |
| { |
| "layer": 6, |
| "baseline_bpc": 1.2372383790923875, |
| "ablated_bpc": 1.4080221517947302, |
| "delta_bpc": 0.17078377270234268 |
| }, |
| { |
| "layer": 7, |
| "baseline_bpc": 1.2372383790923875, |
| "ablated_bpc": 1.5111466640773288, |
| "delta_bpc": 0.27390828498494124 |
| }, |
| { |
| "layer": 8, |
| "baseline_bpc": 1.2372383790923875, |
| "ablated_bpc": 1.5338596560531164, |
| "delta_bpc": 0.2966212769607288 |
| }, |
| { |
| "layer": 9, |
| "baseline_bpc": 1.2372383790923875, |
| "ablated_bpc": 1.6496860416056456, |
| "delta_bpc": 0.41244766251325804 |
| } |
| ] |
| }, |
| "weight_saturation": [ |
| { |
| "name": "out_codebook", |
| "shape": [ |
| 128, |
| 768 |
| ], |
| "n": 98304, |
| "mean": 0.32484379410743713, |
| "median": 0.26955974102020264, |
| "q10": 0.022471295669674873, |
| "q90": 0.6936785578727722, |
| "q99": 1.0008821487426758, |
| "frac_below_0.01": 0.077606201171875, |
| "frac_below_0.05": 0.1428019255399704, |
| "frac_above_0.5": 0.2465413510799408, |
| "max": 2.1756932735443115 |
| }, |
| { |
| "name": "embed.weight", |
| "shape": [ |
| 128, |
| 768 |
| ], |
| "n": 98304, |
| "mean": 0.025270771235227585, |
| "median": 0.01747666671872139, |
| "q10": 0.0027019940316677094, |
| "q90": 0.06000998988747597, |
| "q99": 0.1124815121293068, |
| "frac_below_0.01": 0.322998046875, |
| "frac_below_0.05": 0.8575338125228882, |
| "frac_above_0.5": 0.0, |
| "max": 0.20307880640029907 |
| }, |
| { |
| "name": "blocks.0.attn.q_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.020984457805752754, |
| "median": 0.01630321331322193, |
| "q10": 0.002880966058000922, |
| "q90": 0.04540819674730301, |
| "q99": 0.08269572257995605, |
| "frac_below_0.01": 0.327667236328125, |
| "frac_below_0.05": 0.9232143759727478, |
| "frac_above_0.5": 0.0, |
| "max": 0.17969276010990143 |
| }, |
| { |
| "name": "blocks.0.attn.k_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.021982185542583466, |
| "median": 0.016868969425559044, |
| "q10": 0.0029649189673364162, |
| "q90": 0.04804832115769386, |
| "q99": 0.08712130039930344, |
| "frac_below_0.01": 0.318261057138443, |
| "frac_below_0.05": 0.9096696972846985, |
| "frac_above_0.5": 0.0, |
| "max": 0.17866352200508118 |
| }, |
| { |
| "name": "blocks.0.attn.v_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.02966231107711792, |
| "median": 0.02414115145802498, |
| "q10": 0.0041543589904904366, |
| "q90": 0.0629270151257515, |
| "q99": 0.10377822816371918, |
| "frac_below_0.01": 0.2300957590341568, |
| "frac_below_0.05": 0.8167402744293213, |
| "frac_above_0.5": 0.0, |
| "max": 0.21888598799705505 |
| }, |
| { |
| "name": "blocks.0.attn.o_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.03943490982055664, |
| "median": 0.024108367040753365, |
| "q10": 0.0036507798358798027, |
| "q90": 0.09731841832399368, |
| "q99": 0.2101808488368988, |
| "frac_below_0.01": 0.2511308491230011, |
| "frac_below_0.05": 0.7431420087814331, |
| "frac_above_0.5": 3.3908420391526306e-06, |
| "max": 0.5122478604316711 |
| }, |
| { |
| "name": "blocks.0.ffn.gate.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.029497429728507996, |
| "median": 0.024814119562506676, |
| "q10": 0.004512307234108448, |
| "q90": 0.06109948456287384, |
| "q99": 0.09630865603685379, |
| "frac_below_0.01": 0.21697694063186646, |
| "frac_below_0.05": 0.8223521113395691, |
| "frac_above_0.5": 0.0, |
| "max": 0.17949974536895752 |
| }, |
| { |
| "name": "blocks.0.ffn.up.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.029491588473320007, |
| "median": 0.024759948253631592, |
| "q10": 0.004544392228126526, |
| "q90": 0.06112118810415268, |
| "q99": 0.09623175859451294, |
| "frac_below_0.01": 0.21637777984142303, |
| "frac_below_0.05": 0.8225098252296448, |
| "frac_above_0.5": 0.0, |
| "max": 0.17936795949935913 |
| }, |
| { |
| "name": "blocks.0.ffn.down.raw.weight", |
| "shape": [ |
| 768, |
| 1280 |
| ], |
| "n": 983040, |
| "mean": 0.030015649273991585, |
| "median": 0.025115979835391045, |
| "q10": 0.004507269244641066, |
| "q90": 0.06252331286668777, |
| "q99": 0.09870631247758865, |
| "frac_below_0.01": 0.21663717925548553, |
| "frac_below_0.05": 0.8136902451515198, |
| "frac_above_0.5": 0.0, |
| "max": 0.19937986135482788 |
| }, |
| { |
| "name": "blocks.1.attn.q_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.021738752722740173, |
| "median": 0.016651637852191925, |
| "q10": 0.0030065574683248997, |
| "q90": 0.04737270250916481, |
| "q99": 0.0873987078666687, |
| "frac_below_0.01": 0.3206566870212555, |
| "frac_below_0.05": 0.9128333330154419, |
| "frac_above_0.5": 0.0, |
| "max": 0.20024967193603516 |
| }, |
| { |
| "name": "blocks.1.attn.k_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.022124914452433586, |
| "median": 0.016716480255126953, |
| "q10": 0.0029788485262542963, |
| "q90": 0.0486590750515461, |
| "q99": 0.09064910560846329, |
| "frac_below_0.01": 0.3203616738319397, |
| "frac_below_0.05": 0.9063364863395691, |
| "frac_above_0.5": 0.0, |
| "max": 0.21088646352291107 |
| }, |
| { |
| "name": "blocks.1.attn.v_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.032271161675453186, |
| "median": 0.02542409859597683, |
| "q10": 0.004067442379891872, |
| "q90": 0.07015787810087204, |
| "q99": 0.11874839663505554, |
| "frac_below_0.01": 0.2300516813993454, |
| "frac_below_0.05": 0.7786475419998169, |
| "frac_above_0.5": 0.0, |
| "max": 0.29738032817840576 |
| }, |
| { |
| "name": "blocks.1.attn.o_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.0365101583302021, |
| "median": 0.026378890499472618, |
| "q10": 0.004064246546477079, |
| "q90": 0.08268115669488907, |
| "q99": 0.15872563421726227, |
| "frac_below_0.01": 0.2282664030790329, |
| "frac_below_0.05": 0.7426249384880066, |
| "frac_above_0.5": 0.0, |
| "max": 0.32939204573631287 |
| }, |
| { |
| "name": "blocks.1.ffn.gate.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.030397042632102966, |
| "median": 0.0249449722468853, |
| "q10": 0.0042805117554962635, |
| "q90": 0.06440477073192596, |
| "q99": 0.10299010574817657, |
| "frac_below_0.01": 0.22425639629364014, |
| "frac_below_0.05": 0.8044993281364441, |
| "frac_above_0.5": 0.0, |
| "max": 0.20056255161762238 |
| }, |
| { |
| "name": "blocks.1.ffn.up.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.03043045848608017, |
| "median": 0.02498842217028141, |
| "q10": 0.00427834689617157, |
| "q90": 0.06437431275844574, |
| "q99": 0.1029980257153511, |
| "frac_below_0.01": 0.22370098531246185, |
| "frac_below_0.05": 0.8039296865463257, |
| "frac_above_0.5": 0.0, |
| "max": 0.20558297634124756 |
| }, |
| { |
| "name": "blocks.1.ffn.down.raw.weight", |
| "shape": [ |
| 768, |
| 1280 |
| ], |
| "n": 983040, |
| "mean": 0.030961716547608376, |
| "median": 0.025217225775122643, |
| "q10": 0.004261382389813662, |
| "q90": 0.06597873568534851, |
| "q99": 0.10618423670530319, |
| "frac_below_0.01": 0.22413738071918488, |
| "frac_below_0.05": 0.7958465814590454, |
| "frac_above_0.5": 0.0, |
| "max": 0.20832186937332153 |
| }, |
| { |
| "name": "blocks.2.attn.q_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.021832674741744995, |
| "median": 0.016725117340683937, |
| "q10": 0.003007261548191309, |
| "q90": 0.04768085852265358, |
| "q99": 0.08757057785987854, |
| "frac_below_0.01": 0.31890869140625, |
| "frac_below_0.05": 0.9113871455192566, |
| "frac_above_0.5": 0.0, |
| "max": 0.19008490443229675 |
| }, |
| { |
| "name": "blocks.2.attn.k_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.02204788476228714, |
| "median": 0.01679176464676857, |
| "q10": 0.003010682761669159, |
| "q90": 0.04842023551464081, |
| "q99": 0.08911668509244919, |
| "frac_below_0.01": 0.3183034360408783, |
| "frac_below_0.05": 0.907684326171875, |
| "frac_above_0.5": 0.0, |
| "max": 0.19714821875095367 |
| }, |
| { |
| "name": "blocks.2.attn.v_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.03241870179772377, |
| "median": 0.025510499253869057, |
| "q10": 0.0039276741445064545, |
| "q90": 0.07076237350702286, |
| "q99": 0.11824272572994232, |
| "frac_below_0.01": 0.2332882434129715, |
| "frac_below_0.05": 0.7741004228591919, |
| "frac_above_0.5": 0.0, |
| "max": 0.25875869393348694 |
| }, |
| { |
| "name": "blocks.2.attn.o_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.0360061414539814, |
| "median": 0.026566587388515472, |
| "q10": 0.003982395865023136, |
| "q90": 0.08094409108161926, |
| "q99": 0.15047675371170044, |
| "frac_below_0.01": 0.2296006977558136, |
| "frac_below_0.05": 0.7429623007774353, |
| "frac_above_0.5": 0.0, |
| "max": 0.3236503005027771 |
| }, |
| { |
| "name": "blocks.2.ffn.gate.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.030454641208052635, |
| "median": 0.024827929213643074, |
| "q10": 0.004183255601674318, |
| "q90": 0.06483904272317886, |
| "q99": 0.10405371338129044, |
| "frac_below_0.01": 0.2272491604089737, |
| "frac_below_0.05": 0.8031179308891296, |
| "frac_above_0.5": 0.0, |
| "max": 0.21293406188488007 |
| }, |
| { |
| "name": "blocks.2.ffn.up.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.030473940074443817, |
| "median": 0.024836573749780655, |
| "q10": 0.0041793277487158775, |
| "q90": 0.0648508369922638, |
| "q99": 0.10394246876239777, |
| "frac_below_0.01": 0.22719117999076843, |
| "frac_below_0.05": 0.8021494746208191, |
| "frac_above_0.5": 0.0, |
| "max": 0.20337601006031036 |
| }, |
| { |
| "name": "blocks.2.ffn.down.raw.weight", |
| "shape": [ |
| 768, |
| 1280 |
| ], |
| "n": 983040, |
| "mean": 0.03145662695169449, |
| "median": 0.025447478517889977, |
| "q10": 0.004232366569340229, |
| "q90": 0.0673917829990387, |
| "q99": 0.10850941389799118, |
| "frac_below_0.01": 0.22418315708637238, |
| "frac_below_0.05": 0.7883524894714355, |
| "frac_above_0.5": 0.0, |
| "max": 0.2242489606142044 |
| }, |
| { |
| "name": "blocks.3.attn.q_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.021820461377501488, |
| "median": 0.016704333946108818, |
| "q10": 0.0030075714457780123, |
| "q90": 0.047725576907396317, |
| "q99": 0.08704143017530441, |
| "frac_below_0.01": 0.3202582597732544, |
| "frac_below_0.05": 0.9108836054801941, |
| "frac_above_0.5": 0.0, |
| "max": 0.1996636688709259 |
| }, |
| { |
| "name": "blocks.3.attn.k_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.02202785760164261, |
| "median": 0.016765017062425613, |
| "q10": 0.003018269781023264, |
| "q90": 0.04830968379974365, |
| "q99": 0.08893266320228577, |
| "frac_below_0.01": 0.319023996591568, |
| "frac_below_0.05": 0.9081844687461853, |
| "frac_above_0.5": 0.0, |
| "max": 0.2032247930765152 |
| }, |
| { |
| "name": "blocks.3.attn.v_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.03259618952870369, |
| "median": 0.02546798251569271, |
| "q10": 0.0037475833669304848, |
| "q90": 0.07166170328855515, |
| "q99": 0.1207074448466301, |
| "frac_below_0.01": 0.2383185476064682, |
| "frac_below_0.05": 0.7702789306640625, |
| "frac_above_0.5": 0.0, |
| "max": 0.308125764131546 |
| }, |
| { |
| "name": "blocks.3.attn.o_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.036724548786878586, |
| "median": 0.026713047176599503, |
| "q10": 0.0038133268244564533, |
| "q90": 0.08330091834068298, |
| "q99": 0.1569368839263916, |
| "frac_below_0.01": 0.233211949467659, |
| "frac_below_0.05": 0.7358195185661316, |
| "frac_above_0.5": 0.0, |
| "max": 0.35014036297798157 |
| }, |
| { |
| "name": "blocks.3.ffn.gate.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.03041911870241165, |
| "median": 0.024767549708485603, |
| "q10": 0.004097540397197008, |
| "q90": 0.06491811573505402, |
| "q99": 0.10423436015844345, |
| "frac_below_0.01": 0.23008118569850922, |
| "frac_below_0.05": 0.8022247552871704, |
| "frac_above_0.5": 0.0, |
| "max": 0.2128787487745285 |
| }, |
| { |
| "name": "blocks.3.ffn.up.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.030419766902923584, |
| "median": 0.024740705266594887, |
| "q10": 0.004079720471054316, |
| "q90": 0.06493699550628662, |
| "q99": 0.10483106225728989, |
| "frac_below_0.01": 0.2303914576768875, |
| "frac_below_0.05": 0.802680492401123, |
| "frac_above_0.5": 0.0, |
| "max": 0.21313220262527466 |
| }, |
| { |
| "name": "blocks.3.ffn.down.raw.weight", |
| "shape": [ |
| 768, |
| 1280 |
| ], |
| "n": 983040, |
| "mean": 0.031688056886196136, |
| "median": 0.025530001148581505, |
| "q10": 0.004116586875170469, |
| "q90": 0.06815119832754135, |
| "q99": 0.11011636257171631, |
| "frac_below_0.01": 0.22733664512634277, |
| "frac_below_0.05": 0.7837514877319336, |
| "frac_above_0.5": 0.0, |
| "max": 0.22840875387191772 |
| }, |
| { |
| "name": "blocks.4.attn.q_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.021806135773658752, |
| "median": 0.016604429110884666, |
| "q10": 0.0029621128924191, |
| "q90": 0.047807518392801285, |
| "q99": 0.0883331298828125, |
| "frac_below_0.01": 0.3223758339881897, |
| "frac_below_0.05": 0.9105224609375, |
| "frac_above_0.5": 0.0, |
| "max": 0.19498515129089355 |
| }, |
| { |
| "name": "blocks.4.attn.k_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.0219953004270792, |
| "median": 0.01666717790067196, |
| "q10": 0.002971069421619177, |
| "q90": 0.04836598038673401, |
| "q99": 0.08983262628316879, |
| "frac_below_0.01": 0.320641428232193, |
| "frac_below_0.05": 0.9077809453010559, |
| "frac_above_0.5": 0.0, |
| "max": 0.19498971104621887 |
| }, |
| { |
| "name": "blocks.4.attn.v_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.03282567113637924, |
| "median": 0.025203978642821312, |
| "q10": 0.0033933515660464764, |
| "q90": 0.07318802922964096, |
| "q99": 0.12469246238470078, |
| "frac_below_0.01": 0.2480655312538147, |
| "frac_below_0.05": 0.7664320468902588, |
| "frac_above_0.5": 0.0, |
| "max": 0.28078585863113403 |
| }, |
| { |
| "name": "blocks.4.attn.o_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.03867517039179802, |
| "median": 0.026929715648293495, |
| "q10": 0.0034530421253293753, |
| "q90": 0.09060992300510406, |
| "q99": 0.1718837320804596, |
| "frac_below_0.01": 0.2434438019990921, |
| "frac_below_0.05": 0.7186652421951294, |
| "frac_above_0.5": 0.0, |
| "max": 0.3775966465473175 |
| }, |
| { |
| "name": "blocks.4.ffn.gate.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.030496973544359207, |
| "median": 0.024575216695666313, |
| "q10": 0.003859907388687134, |
| "q90": 0.06571313738822937, |
| "q99": 0.10632941871881485, |
| "frac_below_0.01": 0.23670147359371185, |
| "frac_below_0.05": 0.7992218136787415, |
| "frac_above_0.5": 0.0, |
| "max": 0.236494243144989 |
| }, |
| { |
| "name": "blocks.4.ffn.up.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.03050924651324749, |
| "median": 0.02466735430061817, |
| "q10": 0.003866195445880294, |
| "q90": 0.06563732028007507, |
| "q99": 0.10621653497219086, |
| "frac_below_0.01": 0.23652751743793488, |
| "frac_below_0.05": 0.799262523651123, |
| "frac_above_0.5": 0.0, |
| "max": 0.22307677567005157 |
| }, |
| { |
| "name": "blocks.4.ffn.down.raw.weight", |
| "shape": [ |
| 768, |
| 1280 |
| ], |
| "n": 983040, |
| "mean": 0.03216877207159996, |
| "median": 0.025634581223130226, |
| "q10": 0.003866716753691435, |
| "q90": 0.07008880376815796, |
| "q99": 0.11355020850896835, |
| "frac_below_0.01": 0.23405660688877106, |
| "frac_below_0.05": 0.7747518420219421, |
| "frac_above_0.5": 0.0, |
| "max": 0.2466614544391632 |
| }, |
| { |
| "name": "blocks.5.attn.q_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.021769991144537926, |
| "median": 0.016515638679265976, |
| "q10": 0.0029437255579978228, |
| "q90": 0.04784300550818443, |
| "q99": 0.08883869647979736, |
| "frac_below_0.01": 0.3243001401424408, |
| "frac_below_0.05": 0.9103173017501831, |
| "frac_above_0.5": 0.0, |
| "max": 0.1770746260881424 |
| }, |
| { |
| "name": "blocks.5.attn.k_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.02204853482544422, |
| "median": 0.016590725630521774, |
| "q10": 0.002919677644968033, |
| "q90": 0.04862433299422264, |
| "q99": 0.09129194915294647, |
| "frac_below_0.01": 0.3242102861404419, |
| "frac_below_0.05": 0.9064585566520691, |
| "frac_above_0.5": 0.0, |
| "max": 0.2087617963552475 |
| }, |
| { |
| "name": "blocks.5.attn.v_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.0333058126270771, |
| "median": 0.025002961978316307, |
| "q10": 0.002964397193863988, |
| "q90": 0.07551417499780655, |
| "q99": 0.13100847601890564, |
| "frac_below_0.01": 0.2606692910194397, |
| "frac_below_0.05": 0.7589297890663147, |
| "frac_above_0.5": 0.0, |
| "max": 0.3152570426464081 |
| }, |
| { |
| "name": "blocks.5.attn.o_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.041803035885095596, |
| "median": 0.02722357213497162, |
| "q10": 0.0029212224762886763, |
| "q90": 0.10118373483419418, |
| "q99": 0.20479606091976166, |
| "frac_below_0.01": 0.2576039731502533, |
| "frac_below_0.05": 0.6999596357345581, |
| "frac_above_0.5": 5.086263172415784e-06, |
| "max": 0.5692521929740906 |
| }, |
| { |
| "name": "blocks.5.ffn.gate.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.030752409249544144, |
| "median": 0.0244681965559721, |
| "q10": 0.0035621533170342445, |
| "q90": 0.06708699464797974, |
| "q99": 0.10938585549592972, |
| "frac_below_0.01": 0.2462412714958191, |
| "frac_below_0.05": 0.7924683094024658, |
| "frac_above_0.5": 0.0, |
| "max": 0.2319944053888321 |
| }, |
| { |
| "name": "blocks.5.ffn.up.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.030752165243029594, |
| "median": 0.024411072954535484, |
| "q10": 0.0035458123311400414, |
| "q90": 0.06726153194904327, |
| "q99": 0.10948219895362854, |
| "frac_below_0.01": 0.24592998623847961, |
| "frac_below_0.05": 0.7923950552940369, |
| "frac_above_0.5": 0.0, |
| "max": 0.228058323264122 |
| }, |
| { |
| "name": "blocks.5.ffn.down.raw.weight", |
| "shape": [ |
| 768, |
| 1280 |
| ], |
| "n": 983040, |
| "mean": 0.03299623355269432, |
| "median": 0.02572372555732727, |
| "q10": 0.003507574088871479, |
| "q90": 0.07318581640720367, |
| "q99": 0.12024357914924622, |
| "frac_below_0.01": 0.24329835176467896, |
| "frac_below_0.05": 0.7614848017692566, |
| "frac_above_0.5": 0.0, |
| "max": 0.27965444326400757 |
| }, |
| { |
| "name": "blocks.6.attn.q_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.0215331818908453, |
| "median": 0.016295911744236946, |
| "q10": 0.0028458735905587673, |
| "q90": 0.04729600250720978, |
| "q99": 0.08885350078344345, |
| "frac_below_0.01": 0.329857736825943, |
| "frac_below_0.05": 0.9127739667892456, |
| "frac_above_0.5": 0.0, |
| "max": 0.19640463590621948 |
| }, |
| { |
| "name": "blocks.6.attn.k_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.02174237370491028, |
| "median": 0.01627686247229576, |
| "q10": 0.0028480947948992252, |
| "q90": 0.047940269112586975, |
| "q99": 0.0914376974105835, |
| "frac_below_0.01": 0.3300510048866272, |
| "frac_below_0.05": 0.9095560908317566, |
| "frac_above_0.5": 0.0, |
| "max": 0.21430690586566925 |
| }, |
| { |
| "name": "blocks.6.attn.v_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.03459187224507332, |
| "median": 0.02503044530749321, |
| "q10": 0.0024408106692135334, |
| "q90": 0.08047246932983398, |
| "q99": 0.14331185817718506, |
| "frac_below_0.01": 0.2751024067401886, |
| "frac_below_0.05": 0.7422892451286316, |
| "frac_above_0.5": 0.0, |
| "max": 0.3117149770259857 |
| }, |
| { |
| "name": "blocks.6.attn.o_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.04699027165770531, |
| "median": 0.02844177559018135, |
| "q10": 0.0022870029788464308, |
| "q90": 0.1200011819601059, |
| "q99": 0.23704038560390472, |
| "frac_below_0.01": 0.2714368999004364, |
| "frac_below_0.05": 0.6671345829963684, |
| "frac_above_0.5": 3.3908420391526306e-06, |
| "max": 0.518464207649231 |
| }, |
| { |
| "name": "blocks.6.ffn.gate.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.03131778910756111, |
| "median": 0.02425358071923256, |
| "q10": 0.0030630475375801325, |
| "q90": 0.07003403455018997, |
| "q99": 0.11553935706615448, |
| "frac_below_0.01": 0.2605092525482178, |
| "frac_below_0.05": 0.7811055779457092, |
| "frac_above_0.5": 0.0, |
| "max": 0.24002909660339355 |
| }, |
| { |
| "name": "blocks.6.ffn.up.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.031340092420578, |
| "median": 0.024272354319691658, |
| "q10": 0.0030343779362738132, |
| "q90": 0.07006758451461792, |
| "q99": 0.11566709727048874, |
| "frac_below_0.01": 0.2608998715877533, |
| "frac_below_0.05": 0.780102550983429, |
| "frac_above_0.5": 0.0, |
| "max": 0.2660861313343048 |
| }, |
| { |
| "name": "blocks.6.ffn.down.raw.weight", |
| "shape": [ |
| 768, |
| 1280 |
| ], |
| "n": 983040, |
| "mean": 0.034312065690755844, |
| "median": 0.025947941467165947, |
| "q10": 0.0029694088734686375, |
| "q90": 0.07810769975185394, |
| "q99": 0.13014021515846252, |
| "frac_below_0.01": 0.2571146786212921, |
| "frac_below_0.05": 0.7417826652526855, |
| "frac_above_0.5": 0.0, |
| "max": 0.26336076855659485 |
| }, |
| { |
| "name": "blocks.7.attn.q_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.022099530324339867, |
| "median": 0.016269685700535774, |
| "q10": 0.0027676254976540804, |
| "q90": 0.04940343275666237, |
| "q99": 0.0952984169125557, |
| "frac_below_0.01": 0.3332875669002533, |
| "frac_below_0.05": 0.9026760458946228, |
| "frac_above_0.5": 0.0, |
| "max": 0.21799291670322418 |
| }, |
| { |
| "name": "blocks.7.attn.k_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.022439096122980118, |
| "median": 0.016338741406798363, |
| "q10": 0.002755739027634263, |
| "q90": 0.050511594861745834, |
| "q99": 0.09869782626628876, |
| "frac_below_0.01": 0.3318294882774353, |
| "frac_below_0.05": 0.8977118730545044, |
| "frac_above_0.5": 0.0, |
| "max": 0.219570130109787 |
| }, |
| { |
| "name": "blocks.7.attn.v_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.03680150955915451, |
| "median": 0.024955160915851593, |
| "q10": 0.0015987252118065953, |
| "q90": 0.08944220095872879, |
| "q99": 0.16461941599845886, |
| "frac_below_0.01": 0.3004574179649353, |
| "frac_below_0.05": 0.7197520136833191, |
| "frac_above_0.5": 0.0, |
| "max": 0.3584654927253723 |
| }, |
| { |
| "name": "blocks.7.attn.o_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.05452443286776543, |
| "median": 0.029793892055749893, |
| "q10": 0.0014873046893626451, |
| "q90": 0.14702990651130676, |
| "q99": 0.28597983717918396, |
| "frac_below_0.01": 0.2919921875, |
| "frac_below_0.05": 0.6334109902381897, |
| "frac_above_0.5": 8.646646892884746e-05, |
| "max": 0.7674525380134583 |
| }, |
| { |
| "name": "blocks.7.ffn.gate.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.03266124054789543, |
| "median": 0.024235490709543228, |
| "q10": 0.0022881708573549986, |
| "q90": 0.07565723359584808, |
| "q99": 0.1269170194864273, |
| "frac_below_0.01": 0.2822510004043579, |
| "frac_below_0.05": 0.757544994354248, |
| "frac_above_0.5": 0.0, |
| "max": 0.28104186058044434 |
| }, |
| { |
| "name": "blocks.7.ffn.up.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.032623130828142166, |
| "median": 0.0241586584597826, |
| "q10": 0.002281878376379609, |
| "q90": 0.07564595341682434, |
| "q99": 0.1268070936203003, |
| "frac_below_0.01": 0.281890869140625, |
| "frac_below_0.05": 0.7584859728813171, |
| "frac_above_0.5": 0.0, |
| "max": 0.267304927110672 |
| }, |
| { |
| "name": "blocks.7.ffn.down.raw.weight", |
| "shape": [ |
| 768, |
| 1280 |
| ], |
| "n": 983040, |
| "mean": 0.036399319767951965, |
| "median": 0.026400091126561165, |
| "q10": 0.0022754990495741367, |
| "q90": 0.08571262657642365, |
| "q99": 0.14497342705726624, |
| "frac_below_0.01": 0.27541911602020264, |
| "frac_below_0.05": 0.7168355584144592, |
| "frac_above_0.5": 0.0, |
| "max": 0.30715522170066833 |
| }, |
| { |
| "name": "blocks.8.attn.q_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.02204364724457264, |
| "median": 0.01618419960141182, |
| "q10": 0.0027065007016062737, |
| "q90": 0.04928082227706909, |
| "q99": 0.09612085670232773, |
| "frac_below_0.01": 0.3353847861289978, |
| "frac_below_0.05": 0.9031490683555603, |
| "frac_above_0.5": 0.0, |
| "max": 0.2505124807357788 |
| }, |
| { |
| "name": "blocks.8.attn.k_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.022502297535538673, |
| "median": 0.01621812768280506, |
| "q10": 0.0026969979517161846, |
| "q90": 0.05086328089237213, |
| "q99": 0.1007092148065567, |
| "frac_below_0.01": 0.3353763222694397, |
| "frac_below_0.05": 0.8963436484336853, |
| "frac_above_0.5": 0.0, |
| "max": 0.23622968792915344 |
| }, |
| { |
| "name": "blocks.8.attn.v_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.03888942673802376, |
| "median": 0.0260167196393013, |
| "q10": 0.001105984323658049, |
| "q90": 0.0957922413945198, |
| "q99": 0.17475828528404236, |
| "frac_below_0.01": 0.3079240620136261, |
| "frac_below_0.05": 0.6965315341949463, |
| "frac_above_0.5": 0.0, |
| "max": 0.49975377321243286 |
| }, |
| { |
| "name": "blocks.8.attn.o_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.058232564479112625, |
| "median": 0.03141973167657852, |
| "q10": 0.0010921643115580082, |
| "q90": 0.15505672991275787, |
| "q99": 0.33220720291137695, |
| "frac_below_0.01": 0.2909308671951294, |
| "frac_below_0.05": 0.6194356083869934, |
| "frac_above_0.5": 0.0006086561479605734, |
| "max": 0.8799290657043457 |
| }, |
| { |
| "name": "blocks.8.ffn.gate.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.03446163237094879, |
| "median": 0.024365149438381195, |
| "q10": 0.001547358464449644, |
| "q90": 0.08275697380304337, |
| "q99": 0.14115624129772186, |
| "frac_below_0.01": 0.30199891328811646, |
| "frac_below_0.05": 0.733826756477356, |
| "frac_above_0.5": 0.0, |
| "max": 0.2949505150318146 |
| }, |
| { |
| "name": "blocks.8.ffn.up.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.03450019657611847, |
| "median": 0.024399472400546074, |
| "q10": 0.0015466592740267515, |
| "q90": 0.08281046152114868, |
| "q99": 0.14151407778263092, |
| "frac_below_0.01": 0.3011057674884796, |
| "frac_below_0.05": 0.733771800994873, |
| "frac_above_0.5": 0.0, |
| "max": 0.279220312833786 |
| }, |
| { |
| "name": "blocks.8.ffn.down.raw.weight", |
| "shape": [ |
| 768, |
| 1280 |
| ], |
| "n": 983040, |
| "mean": 0.03970775753259659, |
| "median": 0.027399426326155663, |
| "q10": 0.0016542956000193954, |
| "q90": 0.0966249480843544, |
| "q99": 0.16643178462982178, |
| "frac_below_0.01": 0.28803712129592896, |
| "frac_below_0.05": 0.6849355697631836, |
| "frac_above_0.5": 0.0, |
| "max": 0.3608091473579407 |
| }, |
| { |
| "name": "blocks.9.attn.q_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.022359108552336693, |
| "median": 0.016284987330436707, |
| "q10": 0.00270974263548851, |
| "q90": 0.05029057338833809, |
| "q99": 0.09790308773517609, |
| "frac_below_0.01": 0.3334842324256897, |
| "frac_below_0.05": 0.8987799882888794, |
| "frac_above_0.5": 0.0, |
| "max": 0.2203265279531479 |
| }, |
| { |
| "name": "blocks.9.attn.k_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.02273663878440857, |
| "median": 0.016311366111040115, |
| "q10": 0.0026683020405471325, |
| "q90": 0.05166053771972656, |
| "q99": 0.10231450200080872, |
| "frac_below_0.01": 0.3345811665058136, |
| "frac_below_0.05": 0.8929036259651184, |
| "frac_above_0.5": 0.0, |
| "max": 0.24579688906669617 |
| }, |
| { |
| "name": "blocks.9.attn.v_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.042768146842718124, |
| "median": 0.028541607782244682, |
| "q10": 0.0006766233709640801, |
| "q90": 0.10649372637271881, |
| "q99": 0.18854376673698425, |
| "frac_below_0.01": 0.303490549325943, |
| "frac_below_0.05": 0.6605818271636963, |
| "frac_above_0.5": 0.0, |
| "max": 0.4518583118915558 |
| }, |
| { |
| "name": "blocks.9.attn.o_proj.raw.weight", |
| "shape": [ |
| 768, |
| 768 |
| ], |
| "n": 589824, |
| "mean": 0.0570216067135334, |
| "median": 0.03520055115222931, |
| "q10": 0.0008812142186798155, |
| "q90": 0.14035534858703613, |
| "q99": 0.3157142400741577, |
| "frac_below_0.01": 0.2710367739200592, |
| "frac_below_0.05": 0.59759521484375, |
| "frac_above_0.5": 0.0011274550342932343, |
| "max": 1.018288016319275 |
| }, |
| { |
| "name": "blocks.9.ffn.gate.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.03622754290699959, |
| "median": 0.024802587926387787, |
| "q10": 0.0009529480012133718, |
| "q90": 0.08900798112154007, |
| "q99": 0.15433676540851593, |
| "frac_below_0.01": 0.31352946162223816, |
| "frac_below_0.05": 0.7148427963256836, |
| "frac_above_0.5": 0.0, |
| "max": 0.3419005572795868 |
| }, |
| { |
| "name": "blocks.9.ffn.up.raw.weight", |
| "shape": [ |
| 1280, |
| 768 |
| ], |
| "n": 983040, |
| "mean": 0.03622148931026459, |
| "median": 0.024851901456713676, |
| "q10": 0.000944304745644331, |
| "q90": 0.08894885331392288, |
| "q99": 0.15408946573734283, |
| "frac_below_0.01": 0.31365662813186646, |
| "frac_below_0.05": 0.7145477533340454, |
| "frac_above_0.5": 0.0, |
| "max": 0.3313311040401459 |
| }, |
| { |
| "name": "blocks.9.ffn.down.raw.weight", |
| "shape": [ |
| 768, |
| 1280 |
| ], |
| "n": 983040, |
| "mean": 0.0442914180457592, |
| "median": 0.02870362251996994, |
| "q10": 0.0009061374003067613, |
| "q90": 0.11138416081666946, |
| "q99": 0.1992027461528778, |
| "frac_below_0.01": 0.3033294975757599, |
| "frac_below_0.05": 0.6538391709327698, |
| "frac_above_0.5": 0.0, |
| "max": 0.44191890954971313 |
| } |
| ], |
| "attention_entropy": [ |
| { |
| "layer": 0, |
| "entropy_per_head": [ |
| 0.31201067566871643, |
| 0.25528043508529663, |
| 0.17442357540130615, |
| 0.08394858986139297, |
| 0.03744904324412346, |
| 0.004117322154343128, |
| 6.864327682654103e-12, |
| 0.0, |
| 0.0, |
| 0.0, |
| 0.0, |
| 0.0 |
| ], |
| "max_prob_per_head": [ |
| 0.8789216876029968, |
| 0.8898435831069946, |
| 0.9274678230285645, |
| 0.9629272222518921, |
| 0.9853159785270691, |
| 0.9986785054206848, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0 |
| ], |
| "mean_entropy": 0.07226914167404175, |
| "mean_max_prob": 0.9702628254890442 |
| }, |
| { |
| "layer": 1, |
| "entropy_per_head": [ |
| 0.2243039309978485, |
| 0.1652330458164215, |
| 0.11748333275318146, |
| 0.055319301784038544, |
| 0.026457656174898148, |
| 0.005040682852268219, |
| 1.5577959115806866e-14, |
| 1.0707181436059495e-40, |
| 0.0, |
| 0.0, |
| 0.0, |
| 0.0 |
| ], |
| "max_prob_per_head": [ |
| 0.9096110463142395, |
| 0.9270795583724976, |
| 0.9465883374214172, |
| 0.9738114476203918, |
| 0.9879803657531738, |
| 0.9976929426193237, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0 |
| ], |
| "mean_entropy": 0.049486491829156876, |
| "mean_max_prob": 0.9785636067390442 |
| }, |
| { |
| "layer": 2, |
| "entropy_per_head": [ |
| 0.2058943510055542, |
| 0.16393162310123444, |
| 0.10231588780879974, |
| 0.06609746068716049, |
| 0.023581046611070633, |
| 0.009287012740969658, |
| 7.763824034622696e-14, |
| 4.0588610019168326e-41, |
| 0.0, |
| 0.0, |
| 0.0, |
| 0.0 |
| ], |
| "max_prob_per_head": [ |
| 0.9163063168525696, |
| 0.9273470640182495, |
| 0.9537574052810669, |
| 0.9697834849357605, |
| 0.9889707565307617, |
| 0.9955309629440308, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0 |
| ], |
| "mean_entropy": 0.04759228229522705, |
| "mean_max_prob": 0.9793079495429993 |
| }, |
| { |
| "layer": 3, |
| "entropy_per_head": [ |
| 0.209762841463089, |
| 0.1171560063958168, |
| 0.09794622659683228, |
| 0.06588968634605408, |
| 0.026721913367509842, |
| 0.006907845847308636, |
| 3.580622567379583e-15, |
| 9.686195374952433e-41, |
| 0.0, |
| 0.0, |
| 0.0, |
| 0.0 |
| ], |
| "max_prob_per_head": [ |
| 0.9159234166145325, |
| 0.94795161485672, |
| 0.9551283717155457, |
| 0.9698406457901001, |
| 0.9875055551528931, |
| 0.9967637062072754, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0 |
| ], |
| "mean_entropy": 0.04369870945811272, |
| "mean_max_prob": 0.9810927510261536 |
| }, |
| { |
| "layer": 4, |
| "entropy_per_head": [ |
| 0.21481843292713165, |
| 0.15817253291606903, |
| 0.10903926193714142, |
| 0.06867136061191559, |
| 0.03494655713438988, |
| 0.01739848032593727, |
| 5.683253889200712e-13, |
| 9.1925179259708e-43, |
| 0.0, |
| 0.0, |
| 0.0, |
| 0.0 |
| ], |
| "max_prob_per_head": [ |
| 0.9135462045669556, |
| 0.9296945333480835, |
| 0.9507255554199219, |
| 0.9684354662895203, |
| 0.9833245277404785, |
| 0.9917436838150024, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0 |
| ], |
| "mean_entropy": 0.050253886729478836, |
| "mean_max_prob": 0.9781224727630615 |
| }, |
| { |
| "layer": 5, |
| "entropy_per_head": [ |
| 0.20507557690143585, |
| 0.1427854597568512, |
| 0.10421045869588852, |
| 0.04479587450623512, |
| 0.01711101457476616, |
| 0.005781891755759716, |
| 7.97702019456753e-13, |
| 2.397621672459762e-42, |
| 0.0, |
| 0.0, |
| 0.0, |
| 0.0 |
| ], |
| "max_prob_per_head": [ |
| 0.917660117149353, |
| 0.9356474876403809, |
| 0.9525343179702759, |
| 0.9798707962036133, |
| 0.9924705624580383, |
| 0.997445285320282, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0 |
| ], |
| "mean_entropy": 0.04331335797905922, |
| "mean_max_prob": 0.9813024401664734 |
| }, |
| { |
| "layer": 6, |
| "entropy_per_head": [ |
| 0.1552124321460724, |
| 0.10944928973913193, |
| 0.08590570837259293, |
| 0.03441280126571655, |
| 0.01842230185866356, |
| 0.007083357311785221, |
| 1.0530790649221355e-13, |
| 1.4939242928166875e-41, |
| 0.0, |
| 0.0, |
| 0.0, |
| 0.0 |
| ], |
| "max_prob_per_head": [ |
| 0.9378371238708496, |
| 0.9510696530342102, |
| 0.9609823226928711, |
| 0.983575165271759, |
| 0.9912484288215637, |
| 0.9965624809265137, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0 |
| ], |
| "mean_entropy": 0.03420715779066086, |
| "mean_max_prob": 0.9851062297821045 |
| }, |
| { |
| "layer": 7, |
| "entropy_per_head": [ |
| 0.20514321327209473, |
| 0.12170679867267609, |
| 0.0744284838438034, |
| 0.051210127770900726, |
| 0.019812947139143944, |
| 0.017020035535097122, |
| 6.806287304594871e-13, |
| 5.125829270832249e-39, |
| 0.0, |
| 0.0, |
| 0.0, |
| 0.0 |
| ], |
| "max_prob_per_head": [ |
| 0.9193431735038757, |
| 0.9457699060440063, |
| 0.9668407440185547, |
| 0.976292610168457, |
| 0.9907869100570679, |
| 0.9917880892753601, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0 |
| ], |
| "mean_entropy": 0.040776800364255905, |
| "mean_max_prob": 0.9825684428215027 |
| }, |
| { |
| "layer": 8, |
| "entropy_per_head": [ |
| 0.17893101274967194, |
| 0.07504458725452423, |
| 0.059000205248594284, |
| 0.031063487753272057, |
| 0.025076285004615784, |
| 0.007854106836020947, |
| 1.886881761947734e-10, |
| 3.0688436368713494e-43, |
| 0.0, |
| 0.0, |
| 0.0, |
| 0.0 |
| ], |
| "max_prob_per_head": [ |
| 0.9290326833724976, |
| 0.9654616117477417, |
| 0.9727820158004761, |
| 0.9861835241317749, |
| 0.9882169961929321, |
| 0.9964586496353149, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0 |
| ], |
| "mean_entropy": 0.03141414001584053, |
| "mean_max_prob": 0.9865112900733948 |
| }, |
| { |
| "layer": 9, |
| "entropy_per_head": [ |
| 0.22920922935009003, |
| 0.13090595602989197, |
| 0.07475412636995316, |
| 0.05009441822767258, |
| 0.02543170377612114, |
| 0.006821715272963047, |
| 7.358213973025773e-13, |
| 1.4680002712266784e-41, |
| 0.0, |
| 0.0, |
| 0.0, |
| 0.0 |
| ], |
| "max_prob_per_head": [ |
| 0.909065842628479, |
| 0.9412500262260437, |
| 0.9665333032608032, |
| 0.9774504899978638, |
| 0.9881967306137085, |
| 0.9967927932739258, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0 |
| ], |
| "mean_entropy": 0.04310142993927002, |
| "mean_max_prob": 0.9816074371337891 |
| } |
| ], |
| "error_breakdown": { |
| "overall_accuracy": 0.727154541015625, |
| "per_class": { |
| "space": { |
| "accuracy": 0.933965844402277, |
| "n": 31620 |
| }, |
| "newline": { |
| "accuracy": 0.5960154952960708, |
| "n": 1807 |
| }, |
| "lowercase": { |
| "accuracy": 0.699845422511801, |
| "n": 120328 |
| }, |
| "uppercase": { |
| "accuracy": 0.34219190968955787, |
| "n": 4252 |
| }, |
| "digit": { |
| "accuracy": 0.0, |
| "n": 9 |
| }, |
| "punct": { |
| "accuracy": 0.49141483516483514, |
| "n": 5824 |
| } |
| } |
| }, |
| "teacher_ckpt": "/root/bitnet1/ckpt/fp32_ref_50M_last.pt", |
| "teacher_val_bpc": 0.8601819578579681, |
| "student_teacher_similarity": [ |
| { |
| "layer": 0, |
| "sign_agreement": 0.577347195148468 |
| }, |
| { |
| "layer": 1, |
| "sign_agreement": 0.5173985481262207 |
| }, |
| { |
| "layer": 2, |
| "sign_agreement": 0.5035130262374878 |
| }, |
| { |
| "layer": 3, |
| "sign_agreement": 0.5001080393791199 |
| }, |
| { |
| "layer": 4, |
| "sign_agreement": 0.5003906428813935 |
| }, |
| { |
| "layer": 5, |
| "sign_agreement": 0.5008003354072571 |
| }, |
| { |
| "layer": 6, |
| "sign_agreement": 0.5006855249404907 |
| }, |
| { |
| "layer": 7, |
| "sign_agreement": 0.5008371591567993 |
| }, |
| { |
| "layer": 8, |
| "sign_agreement": 0.49889018535614016 |
| }, |
| { |
| "layer": 9, |
| "sign_agreement": 0.49474666118621824 |
| } |
| ] |
| } |