bitnet-1bitllm / analysis_v29.json
hidude562's picture
1bitllm code (checkpoints to follow)
4754707 verified
{
"student_ckpt": "/root/bitnet1/ckpt/v29_50M_50K_last.pt",
"student_config": {
"vocab_size": 128,
"d_model": 768,
"n_layers": 10,
"n_heads": 12,
"d_ff": 1280,
"seq_len": 256
},
"student_step": 50000,
"student_val_bpc": 1.2349195671077682,
"timestamp": "2026-04-22 17:05:59",
"layer_ablation": {
"baseline_bpc": 1.2372383790923875,
"per_layer": [
{
"layer": 0,
"baseline_bpc": 1.2372383790923875,
"ablated_bpc": 2.658024141629142,
"delta_bpc": 1.4207857625367544
},
{
"layer": 1,
"baseline_bpc": 1.2372383790923875,
"ablated_bpc": 1.791177114375058,
"delta_bpc": 0.5539387352826706
},
{
"layer": 2,
"baseline_bpc": 1.2372383790923875,
"ablated_bpc": 1.40893652335544,
"delta_bpc": 0.17169814426305252
},
{
"layer": 3,
"baseline_bpc": 1.2372383790923875,
"ablated_bpc": 1.3546692810857077,
"delta_bpc": 0.11743090199332018
},
{
"layer": 4,
"baseline_bpc": 1.2372383790923875,
"ablated_bpc": 1.4076403331115497,
"delta_bpc": 0.17040195401916214
},
{
"layer": 5,
"baseline_bpc": 1.2372383790923875,
"ablated_bpc": 1.3853021601250528,
"delta_bpc": 0.14806378103266526
},
{
"layer": 6,
"baseline_bpc": 1.2372383790923875,
"ablated_bpc": 1.4080221517947302,
"delta_bpc": 0.17078377270234268
},
{
"layer": 7,
"baseline_bpc": 1.2372383790923875,
"ablated_bpc": 1.5111466640773288,
"delta_bpc": 0.27390828498494124
},
{
"layer": 8,
"baseline_bpc": 1.2372383790923875,
"ablated_bpc": 1.5338596560531164,
"delta_bpc": 0.2966212769607288
},
{
"layer": 9,
"baseline_bpc": 1.2372383790923875,
"ablated_bpc": 1.6496860416056456,
"delta_bpc": 0.41244766251325804
}
]
},
"weight_saturation": [
{
"name": "out_codebook",
"shape": [
128,
768
],
"n": 98304,
"mean": 0.32484379410743713,
"median": 0.26955974102020264,
"q10": 0.022471295669674873,
"q90": 0.6936785578727722,
"q99": 1.0008821487426758,
"frac_below_0.01": 0.077606201171875,
"frac_below_0.05": 0.1428019255399704,
"frac_above_0.5": 0.2465413510799408,
"max": 2.1756932735443115
},
{
"name": "embed.weight",
"shape": [
128,
768
],
"n": 98304,
"mean": 0.025270771235227585,
"median": 0.01747666671872139,
"q10": 0.0027019940316677094,
"q90": 0.06000998988747597,
"q99": 0.1124815121293068,
"frac_below_0.01": 0.322998046875,
"frac_below_0.05": 0.8575338125228882,
"frac_above_0.5": 0.0,
"max": 0.20307880640029907
},
{
"name": "blocks.0.attn.q_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.020984457805752754,
"median": 0.01630321331322193,
"q10": 0.002880966058000922,
"q90": 0.04540819674730301,
"q99": 0.08269572257995605,
"frac_below_0.01": 0.327667236328125,
"frac_below_0.05": 0.9232143759727478,
"frac_above_0.5": 0.0,
"max": 0.17969276010990143
},
{
"name": "blocks.0.attn.k_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.021982185542583466,
"median": 0.016868969425559044,
"q10": 0.0029649189673364162,
"q90": 0.04804832115769386,
"q99": 0.08712130039930344,
"frac_below_0.01": 0.318261057138443,
"frac_below_0.05": 0.9096696972846985,
"frac_above_0.5": 0.0,
"max": 0.17866352200508118
},
{
"name": "blocks.0.attn.v_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.02966231107711792,
"median": 0.02414115145802498,
"q10": 0.0041543589904904366,
"q90": 0.0629270151257515,
"q99": 0.10377822816371918,
"frac_below_0.01": 0.2300957590341568,
"frac_below_0.05": 0.8167402744293213,
"frac_above_0.5": 0.0,
"max": 0.21888598799705505
},
{
"name": "blocks.0.attn.o_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.03943490982055664,
"median": 0.024108367040753365,
"q10": 0.0036507798358798027,
"q90": 0.09731841832399368,
"q99": 0.2101808488368988,
"frac_below_0.01": 0.2511308491230011,
"frac_below_0.05": 0.7431420087814331,
"frac_above_0.5": 3.3908420391526306e-06,
"max": 0.5122478604316711
},
{
"name": "blocks.0.ffn.gate.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.029497429728507996,
"median": 0.024814119562506676,
"q10": 0.004512307234108448,
"q90": 0.06109948456287384,
"q99": 0.09630865603685379,
"frac_below_0.01": 0.21697694063186646,
"frac_below_0.05": 0.8223521113395691,
"frac_above_0.5": 0.0,
"max": 0.17949974536895752
},
{
"name": "blocks.0.ffn.up.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.029491588473320007,
"median": 0.024759948253631592,
"q10": 0.004544392228126526,
"q90": 0.06112118810415268,
"q99": 0.09623175859451294,
"frac_below_0.01": 0.21637777984142303,
"frac_below_0.05": 0.8225098252296448,
"frac_above_0.5": 0.0,
"max": 0.17936795949935913
},
{
"name": "blocks.0.ffn.down.raw.weight",
"shape": [
768,
1280
],
"n": 983040,
"mean": 0.030015649273991585,
"median": 0.025115979835391045,
"q10": 0.004507269244641066,
"q90": 0.06252331286668777,
"q99": 0.09870631247758865,
"frac_below_0.01": 0.21663717925548553,
"frac_below_0.05": 0.8136902451515198,
"frac_above_0.5": 0.0,
"max": 0.19937986135482788
},
{
"name": "blocks.1.attn.q_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.021738752722740173,
"median": 0.016651637852191925,
"q10": 0.0030065574683248997,
"q90": 0.04737270250916481,
"q99": 0.0873987078666687,
"frac_below_0.01": 0.3206566870212555,
"frac_below_0.05": 0.9128333330154419,
"frac_above_0.5": 0.0,
"max": 0.20024967193603516
},
{
"name": "blocks.1.attn.k_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.022124914452433586,
"median": 0.016716480255126953,
"q10": 0.0029788485262542963,
"q90": 0.0486590750515461,
"q99": 0.09064910560846329,
"frac_below_0.01": 0.3203616738319397,
"frac_below_0.05": 0.9063364863395691,
"frac_above_0.5": 0.0,
"max": 0.21088646352291107
},
{
"name": "blocks.1.attn.v_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.032271161675453186,
"median": 0.02542409859597683,
"q10": 0.004067442379891872,
"q90": 0.07015787810087204,
"q99": 0.11874839663505554,
"frac_below_0.01": 0.2300516813993454,
"frac_below_0.05": 0.7786475419998169,
"frac_above_0.5": 0.0,
"max": 0.29738032817840576
},
{
"name": "blocks.1.attn.o_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.0365101583302021,
"median": 0.026378890499472618,
"q10": 0.004064246546477079,
"q90": 0.08268115669488907,
"q99": 0.15872563421726227,
"frac_below_0.01": 0.2282664030790329,
"frac_below_0.05": 0.7426249384880066,
"frac_above_0.5": 0.0,
"max": 0.32939204573631287
},
{
"name": "blocks.1.ffn.gate.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.030397042632102966,
"median": 0.0249449722468853,
"q10": 0.0042805117554962635,
"q90": 0.06440477073192596,
"q99": 0.10299010574817657,
"frac_below_0.01": 0.22425639629364014,
"frac_below_0.05": 0.8044993281364441,
"frac_above_0.5": 0.0,
"max": 0.20056255161762238
},
{
"name": "blocks.1.ffn.up.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.03043045848608017,
"median": 0.02498842217028141,
"q10": 0.00427834689617157,
"q90": 0.06437431275844574,
"q99": 0.1029980257153511,
"frac_below_0.01": 0.22370098531246185,
"frac_below_0.05": 0.8039296865463257,
"frac_above_0.5": 0.0,
"max": 0.20558297634124756
},
{
"name": "blocks.1.ffn.down.raw.weight",
"shape": [
768,
1280
],
"n": 983040,
"mean": 0.030961716547608376,
"median": 0.025217225775122643,
"q10": 0.004261382389813662,
"q90": 0.06597873568534851,
"q99": 0.10618423670530319,
"frac_below_0.01": 0.22413738071918488,
"frac_below_0.05": 0.7958465814590454,
"frac_above_0.5": 0.0,
"max": 0.20832186937332153
},
{
"name": "blocks.2.attn.q_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.021832674741744995,
"median": 0.016725117340683937,
"q10": 0.003007261548191309,
"q90": 0.04768085852265358,
"q99": 0.08757057785987854,
"frac_below_0.01": 0.31890869140625,
"frac_below_0.05": 0.9113871455192566,
"frac_above_0.5": 0.0,
"max": 0.19008490443229675
},
{
"name": "blocks.2.attn.k_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.02204788476228714,
"median": 0.01679176464676857,
"q10": 0.003010682761669159,
"q90": 0.04842023551464081,
"q99": 0.08911668509244919,
"frac_below_0.01": 0.3183034360408783,
"frac_below_0.05": 0.907684326171875,
"frac_above_0.5": 0.0,
"max": 0.19714821875095367
},
{
"name": "blocks.2.attn.v_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.03241870179772377,
"median": 0.025510499253869057,
"q10": 0.0039276741445064545,
"q90": 0.07076237350702286,
"q99": 0.11824272572994232,
"frac_below_0.01": 0.2332882434129715,
"frac_below_0.05": 0.7741004228591919,
"frac_above_0.5": 0.0,
"max": 0.25875869393348694
},
{
"name": "blocks.2.attn.o_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.0360061414539814,
"median": 0.026566587388515472,
"q10": 0.003982395865023136,
"q90": 0.08094409108161926,
"q99": 0.15047675371170044,
"frac_below_0.01": 0.2296006977558136,
"frac_below_0.05": 0.7429623007774353,
"frac_above_0.5": 0.0,
"max": 0.3236503005027771
},
{
"name": "blocks.2.ffn.gate.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.030454641208052635,
"median": 0.024827929213643074,
"q10": 0.004183255601674318,
"q90": 0.06483904272317886,
"q99": 0.10405371338129044,
"frac_below_0.01": 0.2272491604089737,
"frac_below_0.05": 0.8031179308891296,
"frac_above_0.5": 0.0,
"max": 0.21293406188488007
},
{
"name": "blocks.2.ffn.up.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.030473940074443817,
"median": 0.024836573749780655,
"q10": 0.0041793277487158775,
"q90": 0.0648508369922638,
"q99": 0.10394246876239777,
"frac_below_0.01": 0.22719117999076843,
"frac_below_0.05": 0.8021494746208191,
"frac_above_0.5": 0.0,
"max": 0.20337601006031036
},
{
"name": "blocks.2.ffn.down.raw.weight",
"shape": [
768,
1280
],
"n": 983040,
"mean": 0.03145662695169449,
"median": 0.025447478517889977,
"q10": 0.004232366569340229,
"q90": 0.0673917829990387,
"q99": 0.10850941389799118,
"frac_below_0.01": 0.22418315708637238,
"frac_below_0.05": 0.7883524894714355,
"frac_above_0.5": 0.0,
"max": 0.2242489606142044
},
{
"name": "blocks.3.attn.q_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.021820461377501488,
"median": 0.016704333946108818,
"q10": 0.0030075714457780123,
"q90": 0.047725576907396317,
"q99": 0.08704143017530441,
"frac_below_0.01": 0.3202582597732544,
"frac_below_0.05": 0.9108836054801941,
"frac_above_0.5": 0.0,
"max": 0.1996636688709259
},
{
"name": "blocks.3.attn.k_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.02202785760164261,
"median": 0.016765017062425613,
"q10": 0.003018269781023264,
"q90": 0.04830968379974365,
"q99": 0.08893266320228577,
"frac_below_0.01": 0.319023996591568,
"frac_below_0.05": 0.9081844687461853,
"frac_above_0.5": 0.0,
"max": 0.2032247930765152
},
{
"name": "blocks.3.attn.v_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.03259618952870369,
"median": 0.02546798251569271,
"q10": 0.0037475833669304848,
"q90": 0.07166170328855515,
"q99": 0.1207074448466301,
"frac_below_0.01": 0.2383185476064682,
"frac_below_0.05": 0.7702789306640625,
"frac_above_0.5": 0.0,
"max": 0.308125764131546
},
{
"name": "blocks.3.attn.o_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.036724548786878586,
"median": 0.026713047176599503,
"q10": 0.0038133268244564533,
"q90": 0.08330091834068298,
"q99": 0.1569368839263916,
"frac_below_0.01": 0.233211949467659,
"frac_below_0.05": 0.7358195185661316,
"frac_above_0.5": 0.0,
"max": 0.35014036297798157
},
{
"name": "blocks.3.ffn.gate.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.03041911870241165,
"median": 0.024767549708485603,
"q10": 0.004097540397197008,
"q90": 0.06491811573505402,
"q99": 0.10423436015844345,
"frac_below_0.01": 0.23008118569850922,
"frac_below_0.05": 0.8022247552871704,
"frac_above_0.5": 0.0,
"max": 0.2128787487745285
},
{
"name": "blocks.3.ffn.up.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.030419766902923584,
"median": 0.024740705266594887,
"q10": 0.004079720471054316,
"q90": 0.06493699550628662,
"q99": 0.10483106225728989,
"frac_below_0.01": 0.2303914576768875,
"frac_below_0.05": 0.802680492401123,
"frac_above_0.5": 0.0,
"max": 0.21313220262527466
},
{
"name": "blocks.3.ffn.down.raw.weight",
"shape": [
768,
1280
],
"n": 983040,
"mean": 0.031688056886196136,
"median": 0.025530001148581505,
"q10": 0.004116586875170469,
"q90": 0.06815119832754135,
"q99": 0.11011636257171631,
"frac_below_0.01": 0.22733664512634277,
"frac_below_0.05": 0.7837514877319336,
"frac_above_0.5": 0.0,
"max": 0.22840875387191772
},
{
"name": "blocks.4.attn.q_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.021806135773658752,
"median": 0.016604429110884666,
"q10": 0.0029621128924191,
"q90": 0.047807518392801285,
"q99": 0.0883331298828125,
"frac_below_0.01": 0.3223758339881897,
"frac_below_0.05": 0.9105224609375,
"frac_above_0.5": 0.0,
"max": 0.19498515129089355
},
{
"name": "blocks.4.attn.k_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.0219953004270792,
"median": 0.01666717790067196,
"q10": 0.002971069421619177,
"q90": 0.04836598038673401,
"q99": 0.08983262628316879,
"frac_below_0.01": 0.320641428232193,
"frac_below_0.05": 0.9077809453010559,
"frac_above_0.5": 0.0,
"max": 0.19498971104621887
},
{
"name": "blocks.4.attn.v_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.03282567113637924,
"median": 0.025203978642821312,
"q10": 0.0033933515660464764,
"q90": 0.07318802922964096,
"q99": 0.12469246238470078,
"frac_below_0.01": 0.2480655312538147,
"frac_below_0.05": 0.7664320468902588,
"frac_above_0.5": 0.0,
"max": 0.28078585863113403
},
{
"name": "blocks.4.attn.o_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.03867517039179802,
"median": 0.026929715648293495,
"q10": 0.0034530421253293753,
"q90": 0.09060992300510406,
"q99": 0.1718837320804596,
"frac_below_0.01": 0.2434438019990921,
"frac_below_0.05": 0.7186652421951294,
"frac_above_0.5": 0.0,
"max": 0.3775966465473175
},
{
"name": "blocks.4.ffn.gate.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.030496973544359207,
"median": 0.024575216695666313,
"q10": 0.003859907388687134,
"q90": 0.06571313738822937,
"q99": 0.10632941871881485,
"frac_below_0.01": 0.23670147359371185,
"frac_below_0.05": 0.7992218136787415,
"frac_above_0.5": 0.0,
"max": 0.236494243144989
},
{
"name": "blocks.4.ffn.up.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.03050924651324749,
"median": 0.02466735430061817,
"q10": 0.003866195445880294,
"q90": 0.06563732028007507,
"q99": 0.10621653497219086,
"frac_below_0.01": 0.23652751743793488,
"frac_below_0.05": 0.799262523651123,
"frac_above_0.5": 0.0,
"max": 0.22307677567005157
},
{
"name": "blocks.4.ffn.down.raw.weight",
"shape": [
768,
1280
],
"n": 983040,
"mean": 0.03216877207159996,
"median": 0.025634581223130226,
"q10": 0.003866716753691435,
"q90": 0.07008880376815796,
"q99": 0.11355020850896835,
"frac_below_0.01": 0.23405660688877106,
"frac_below_0.05": 0.7747518420219421,
"frac_above_0.5": 0.0,
"max": 0.2466614544391632
},
{
"name": "blocks.5.attn.q_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.021769991144537926,
"median": 0.016515638679265976,
"q10": 0.0029437255579978228,
"q90": 0.04784300550818443,
"q99": 0.08883869647979736,
"frac_below_0.01": 0.3243001401424408,
"frac_below_0.05": 0.9103173017501831,
"frac_above_0.5": 0.0,
"max": 0.1770746260881424
},
{
"name": "blocks.5.attn.k_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.02204853482544422,
"median": 0.016590725630521774,
"q10": 0.002919677644968033,
"q90": 0.04862433299422264,
"q99": 0.09129194915294647,
"frac_below_0.01": 0.3242102861404419,
"frac_below_0.05": 0.9064585566520691,
"frac_above_0.5": 0.0,
"max": 0.2087617963552475
},
{
"name": "blocks.5.attn.v_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.0333058126270771,
"median": 0.025002961978316307,
"q10": 0.002964397193863988,
"q90": 0.07551417499780655,
"q99": 0.13100847601890564,
"frac_below_0.01": 0.2606692910194397,
"frac_below_0.05": 0.7589297890663147,
"frac_above_0.5": 0.0,
"max": 0.3152570426464081
},
{
"name": "blocks.5.attn.o_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.041803035885095596,
"median": 0.02722357213497162,
"q10": 0.0029212224762886763,
"q90": 0.10118373483419418,
"q99": 0.20479606091976166,
"frac_below_0.01": 0.2576039731502533,
"frac_below_0.05": 0.6999596357345581,
"frac_above_0.5": 5.086263172415784e-06,
"max": 0.5692521929740906
},
{
"name": "blocks.5.ffn.gate.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.030752409249544144,
"median": 0.0244681965559721,
"q10": 0.0035621533170342445,
"q90": 0.06708699464797974,
"q99": 0.10938585549592972,
"frac_below_0.01": 0.2462412714958191,
"frac_below_0.05": 0.7924683094024658,
"frac_above_0.5": 0.0,
"max": 0.2319944053888321
},
{
"name": "blocks.5.ffn.up.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.030752165243029594,
"median": 0.024411072954535484,
"q10": 0.0035458123311400414,
"q90": 0.06726153194904327,
"q99": 0.10948219895362854,
"frac_below_0.01": 0.24592998623847961,
"frac_below_0.05": 0.7923950552940369,
"frac_above_0.5": 0.0,
"max": 0.228058323264122
},
{
"name": "blocks.5.ffn.down.raw.weight",
"shape": [
768,
1280
],
"n": 983040,
"mean": 0.03299623355269432,
"median": 0.02572372555732727,
"q10": 0.003507574088871479,
"q90": 0.07318581640720367,
"q99": 0.12024357914924622,
"frac_below_0.01": 0.24329835176467896,
"frac_below_0.05": 0.7614848017692566,
"frac_above_0.5": 0.0,
"max": 0.27965444326400757
},
{
"name": "blocks.6.attn.q_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.0215331818908453,
"median": 0.016295911744236946,
"q10": 0.0028458735905587673,
"q90": 0.04729600250720978,
"q99": 0.08885350078344345,
"frac_below_0.01": 0.329857736825943,
"frac_below_0.05": 0.9127739667892456,
"frac_above_0.5": 0.0,
"max": 0.19640463590621948
},
{
"name": "blocks.6.attn.k_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.02174237370491028,
"median": 0.01627686247229576,
"q10": 0.0028480947948992252,
"q90": 0.047940269112586975,
"q99": 0.0914376974105835,
"frac_below_0.01": 0.3300510048866272,
"frac_below_0.05": 0.9095560908317566,
"frac_above_0.5": 0.0,
"max": 0.21430690586566925
},
{
"name": "blocks.6.attn.v_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.03459187224507332,
"median": 0.02503044530749321,
"q10": 0.0024408106692135334,
"q90": 0.08047246932983398,
"q99": 0.14331185817718506,
"frac_below_0.01": 0.2751024067401886,
"frac_below_0.05": 0.7422892451286316,
"frac_above_0.5": 0.0,
"max": 0.3117149770259857
},
{
"name": "blocks.6.attn.o_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.04699027165770531,
"median": 0.02844177559018135,
"q10": 0.0022870029788464308,
"q90": 0.1200011819601059,
"q99": 0.23704038560390472,
"frac_below_0.01": 0.2714368999004364,
"frac_below_0.05": 0.6671345829963684,
"frac_above_0.5": 3.3908420391526306e-06,
"max": 0.518464207649231
},
{
"name": "blocks.6.ffn.gate.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.03131778910756111,
"median": 0.02425358071923256,
"q10": 0.0030630475375801325,
"q90": 0.07003403455018997,
"q99": 0.11553935706615448,
"frac_below_0.01": 0.2605092525482178,
"frac_below_0.05": 0.7811055779457092,
"frac_above_0.5": 0.0,
"max": 0.24002909660339355
},
{
"name": "blocks.6.ffn.up.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.031340092420578,
"median": 0.024272354319691658,
"q10": 0.0030343779362738132,
"q90": 0.07006758451461792,
"q99": 0.11566709727048874,
"frac_below_0.01": 0.2608998715877533,
"frac_below_0.05": 0.780102550983429,
"frac_above_0.5": 0.0,
"max": 0.2660861313343048
},
{
"name": "blocks.6.ffn.down.raw.weight",
"shape": [
768,
1280
],
"n": 983040,
"mean": 0.034312065690755844,
"median": 0.025947941467165947,
"q10": 0.0029694088734686375,
"q90": 0.07810769975185394,
"q99": 0.13014021515846252,
"frac_below_0.01": 0.2571146786212921,
"frac_below_0.05": 0.7417826652526855,
"frac_above_0.5": 0.0,
"max": 0.26336076855659485
},
{
"name": "blocks.7.attn.q_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.022099530324339867,
"median": 0.016269685700535774,
"q10": 0.0027676254976540804,
"q90": 0.04940343275666237,
"q99": 0.0952984169125557,
"frac_below_0.01": 0.3332875669002533,
"frac_below_0.05": 0.9026760458946228,
"frac_above_0.5": 0.0,
"max": 0.21799291670322418
},
{
"name": "blocks.7.attn.k_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.022439096122980118,
"median": 0.016338741406798363,
"q10": 0.002755739027634263,
"q90": 0.050511594861745834,
"q99": 0.09869782626628876,
"frac_below_0.01": 0.3318294882774353,
"frac_below_0.05": 0.8977118730545044,
"frac_above_0.5": 0.0,
"max": 0.219570130109787
},
{
"name": "blocks.7.attn.v_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.03680150955915451,
"median": 0.024955160915851593,
"q10": 0.0015987252118065953,
"q90": 0.08944220095872879,
"q99": 0.16461941599845886,
"frac_below_0.01": 0.3004574179649353,
"frac_below_0.05": 0.7197520136833191,
"frac_above_0.5": 0.0,
"max": 0.3584654927253723
},
{
"name": "blocks.7.attn.o_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.05452443286776543,
"median": 0.029793892055749893,
"q10": 0.0014873046893626451,
"q90": 0.14702990651130676,
"q99": 0.28597983717918396,
"frac_below_0.01": 0.2919921875,
"frac_below_0.05": 0.6334109902381897,
"frac_above_0.5": 8.646646892884746e-05,
"max": 0.7674525380134583
},
{
"name": "blocks.7.ffn.gate.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.03266124054789543,
"median": 0.024235490709543228,
"q10": 0.0022881708573549986,
"q90": 0.07565723359584808,
"q99": 0.1269170194864273,
"frac_below_0.01": 0.2822510004043579,
"frac_below_0.05": 0.757544994354248,
"frac_above_0.5": 0.0,
"max": 0.28104186058044434
},
{
"name": "blocks.7.ffn.up.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.032623130828142166,
"median": 0.0241586584597826,
"q10": 0.002281878376379609,
"q90": 0.07564595341682434,
"q99": 0.1268070936203003,
"frac_below_0.01": 0.281890869140625,
"frac_below_0.05": 0.7584859728813171,
"frac_above_0.5": 0.0,
"max": 0.267304927110672
},
{
"name": "blocks.7.ffn.down.raw.weight",
"shape": [
768,
1280
],
"n": 983040,
"mean": 0.036399319767951965,
"median": 0.026400091126561165,
"q10": 0.0022754990495741367,
"q90": 0.08571262657642365,
"q99": 0.14497342705726624,
"frac_below_0.01": 0.27541911602020264,
"frac_below_0.05": 0.7168355584144592,
"frac_above_0.5": 0.0,
"max": 0.30715522170066833
},
{
"name": "blocks.8.attn.q_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.02204364724457264,
"median": 0.01618419960141182,
"q10": 0.0027065007016062737,
"q90": 0.04928082227706909,
"q99": 0.09612085670232773,
"frac_below_0.01": 0.3353847861289978,
"frac_below_0.05": 0.9031490683555603,
"frac_above_0.5": 0.0,
"max": 0.2505124807357788
},
{
"name": "blocks.8.attn.k_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.022502297535538673,
"median": 0.01621812768280506,
"q10": 0.0026969979517161846,
"q90": 0.05086328089237213,
"q99": 0.1007092148065567,
"frac_below_0.01": 0.3353763222694397,
"frac_below_0.05": 0.8963436484336853,
"frac_above_0.5": 0.0,
"max": 0.23622968792915344
},
{
"name": "blocks.8.attn.v_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.03888942673802376,
"median": 0.0260167196393013,
"q10": 0.001105984323658049,
"q90": 0.0957922413945198,
"q99": 0.17475828528404236,
"frac_below_0.01": 0.3079240620136261,
"frac_below_0.05": 0.6965315341949463,
"frac_above_0.5": 0.0,
"max": 0.49975377321243286
},
{
"name": "blocks.8.attn.o_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.058232564479112625,
"median": 0.03141973167657852,
"q10": 0.0010921643115580082,
"q90": 0.15505672991275787,
"q99": 0.33220720291137695,
"frac_below_0.01": 0.2909308671951294,
"frac_below_0.05": 0.6194356083869934,
"frac_above_0.5": 0.0006086561479605734,
"max": 0.8799290657043457
},
{
"name": "blocks.8.ffn.gate.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.03446163237094879,
"median": 0.024365149438381195,
"q10": 0.001547358464449644,
"q90": 0.08275697380304337,
"q99": 0.14115624129772186,
"frac_below_0.01": 0.30199891328811646,
"frac_below_0.05": 0.733826756477356,
"frac_above_0.5": 0.0,
"max": 0.2949505150318146
},
{
"name": "blocks.8.ffn.up.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.03450019657611847,
"median": 0.024399472400546074,
"q10": 0.0015466592740267515,
"q90": 0.08281046152114868,
"q99": 0.14151407778263092,
"frac_below_0.01": 0.3011057674884796,
"frac_below_0.05": 0.733771800994873,
"frac_above_0.5": 0.0,
"max": 0.279220312833786
},
{
"name": "blocks.8.ffn.down.raw.weight",
"shape": [
768,
1280
],
"n": 983040,
"mean": 0.03970775753259659,
"median": 0.027399426326155663,
"q10": 0.0016542956000193954,
"q90": 0.0966249480843544,
"q99": 0.16643178462982178,
"frac_below_0.01": 0.28803712129592896,
"frac_below_0.05": 0.6849355697631836,
"frac_above_0.5": 0.0,
"max": 0.3608091473579407
},
{
"name": "blocks.9.attn.q_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.022359108552336693,
"median": 0.016284987330436707,
"q10": 0.00270974263548851,
"q90": 0.05029057338833809,
"q99": 0.09790308773517609,
"frac_below_0.01": 0.3334842324256897,
"frac_below_0.05": 0.8987799882888794,
"frac_above_0.5": 0.0,
"max": 0.2203265279531479
},
{
"name": "blocks.9.attn.k_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.02273663878440857,
"median": 0.016311366111040115,
"q10": 0.0026683020405471325,
"q90": 0.05166053771972656,
"q99": 0.10231450200080872,
"frac_below_0.01": 0.3345811665058136,
"frac_below_0.05": 0.8929036259651184,
"frac_above_0.5": 0.0,
"max": 0.24579688906669617
},
{
"name": "blocks.9.attn.v_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.042768146842718124,
"median": 0.028541607782244682,
"q10": 0.0006766233709640801,
"q90": 0.10649372637271881,
"q99": 0.18854376673698425,
"frac_below_0.01": 0.303490549325943,
"frac_below_0.05": 0.6605818271636963,
"frac_above_0.5": 0.0,
"max": 0.4518583118915558
},
{
"name": "blocks.9.attn.o_proj.raw.weight",
"shape": [
768,
768
],
"n": 589824,
"mean": 0.0570216067135334,
"median": 0.03520055115222931,
"q10": 0.0008812142186798155,
"q90": 0.14035534858703613,
"q99": 0.3157142400741577,
"frac_below_0.01": 0.2710367739200592,
"frac_below_0.05": 0.59759521484375,
"frac_above_0.5": 0.0011274550342932343,
"max": 1.018288016319275
},
{
"name": "blocks.9.ffn.gate.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.03622754290699959,
"median": 0.024802587926387787,
"q10": 0.0009529480012133718,
"q90": 0.08900798112154007,
"q99": 0.15433676540851593,
"frac_below_0.01": 0.31352946162223816,
"frac_below_0.05": 0.7148427963256836,
"frac_above_0.5": 0.0,
"max": 0.3419005572795868
},
{
"name": "blocks.9.ffn.up.raw.weight",
"shape": [
1280,
768
],
"n": 983040,
"mean": 0.03622148931026459,
"median": 0.024851901456713676,
"q10": 0.000944304745644331,
"q90": 0.08894885331392288,
"q99": 0.15408946573734283,
"frac_below_0.01": 0.31365662813186646,
"frac_below_0.05": 0.7145477533340454,
"frac_above_0.5": 0.0,
"max": 0.3313311040401459
},
{
"name": "blocks.9.ffn.down.raw.weight",
"shape": [
768,
1280
],
"n": 983040,
"mean": 0.0442914180457592,
"median": 0.02870362251996994,
"q10": 0.0009061374003067613,
"q90": 0.11138416081666946,
"q99": 0.1992027461528778,
"frac_below_0.01": 0.3033294975757599,
"frac_below_0.05": 0.6538391709327698,
"frac_above_0.5": 0.0,
"max": 0.44191890954971313
}
],
"attention_entropy": [
{
"layer": 0,
"entropy_per_head": [
0.31201067566871643,
0.25528043508529663,
0.17442357540130615,
0.08394858986139297,
0.03744904324412346,
0.004117322154343128,
6.864327682654103e-12,
0.0,
0.0,
0.0,
0.0,
0.0
],
"max_prob_per_head": [
0.8789216876029968,
0.8898435831069946,
0.9274678230285645,
0.9629272222518921,
0.9853159785270691,
0.9986785054206848,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0
],
"mean_entropy": 0.07226914167404175,
"mean_max_prob": 0.9702628254890442
},
{
"layer": 1,
"entropy_per_head": [
0.2243039309978485,
0.1652330458164215,
0.11748333275318146,
0.055319301784038544,
0.026457656174898148,
0.005040682852268219,
1.5577959115806866e-14,
1.0707181436059495e-40,
0.0,
0.0,
0.0,
0.0
],
"max_prob_per_head": [
0.9096110463142395,
0.9270795583724976,
0.9465883374214172,
0.9738114476203918,
0.9879803657531738,
0.9976929426193237,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0
],
"mean_entropy": 0.049486491829156876,
"mean_max_prob": 0.9785636067390442
},
{
"layer": 2,
"entropy_per_head": [
0.2058943510055542,
0.16393162310123444,
0.10231588780879974,
0.06609746068716049,
0.023581046611070633,
0.009287012740969658,
7.763824034622696e-14,
4.0588610019168326e-41,
0.0,
0.0,
0.0,
0.0
],
"max_prob_per_head": [
0.9163063168525696,
0.9273470640182495,
0.9537574052810669,
0.9697834849357605,
0.9889707565307617,
0.9955309629440308,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0
],
"mean_entropy": 0.04759228229522705,
"mean_max_prob": 0.9793079495429993
},
{
"layer": 3,
"entropy_per_head": [
0.209762841463089,
0.1171560063958168,
0.09794622659683228,
0.06588968634605408,
0.026721913367509842,
0.006907845847308636,
3.580622567379583e-15,
9.686195374952433e-41,
0.0,
0.0,
0.0,
0.0
],
"max_prob_per_head": [
0.9159234166145325,
0.94795161485672,
0.9551283717155457,
0.9698406457901001,
0.9875055551528931,
0.9967637062072754,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0
],
"mean_entropy": 0.04369870945811272,
"mean_max_prob": 0.9810927510261536
},
{
"layer": 4,
"entropy_per_head": [
0.21481843292713165,
0.15817253291606903,
0.10903926193714142,
0.06867136061191559,
0.03494655713438988,
0.01739848032593727,
5.683253889200712e-13,
9.1925179259708e-43,
0.0,
0.0,
0.0,
0.0
],
"max_prob_per_head": [
0.9135462045669556,
0.9296945333480835,
0.9507255554199219,
0.9684354662895203,
0.9833245277404785,
0.9917436838150024,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0
],
"mean_entropy": 0.050253886729478836,
"mean_max_prob": 0.9781224727630615
},
{
"layer": 5,
"entropy_per_head": [
0.20507557690143585,
0.1427854597568512,
0.10421045869588852,
0.04479587450623512,
0.01711101457476616,
0.005781891755759716,
7.97702019456753e-13,
2.397621672459762e-42,
0.0,
0.0,
0.0,
0.0
],
"max_prob_per_head": [
0.917660117149353,
0.9356474876403809,
0.9525343179702759,
0.9798707962036133,
0.9924705624580383,
0.997445285320282,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0
],
"mean_entropy": 0.04331335797905922,
"mean_max_prob": 0.9813024401664734
},
{
"layer": 6,
"entropy_per_head": [
0.1552124321460724,
0.10944928973913193,
0.08590570837259293,
0.03441280126571655,
0.01842230185866356,
0.007083357311785221,
1.0530790649221355e-13,
1.4939242928166875e-41,
0.0,
0.0,
0.0,
0.0
],
"max_prob_per_head": [
0.9378371238708496,
0.9510696530342102,
0.9609823226928711,
0.983575165271759,
0.9912484288215637,
0.9965624809265137,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0
],
"mean_entropy": 0.03420715779066086,
"mean_max_prob": 0.9851062297821045
},
{
"layer": 7,
"entropy_per_head": [
0.20514321327209473,
0.12170679867267609,
0.0744284838438034,
0.051210127770900726,
0.019812947139143944,
0.017020035535097122,
6.806287304594871e-13,
5.125829270832249e-39,
0.0,
0.0,
0.0,
0.0
],
"max_prob_per_head": [
0.9193431735038757,
0.9457699060440063,
0.9668407440185547,
0.976292610168457,
0.9907869100570679,
0.9917880892753601,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0
],
"mean_entropy": 0.040776800364255905,
"mean_max_prob": 0.9825684428215027
},
{
"layer": 8,
"entropy_per_head": [
0.17893101274967194,
0.07504458725452423,
0.059000205248594284,
0.031063487753272057,
0.025076285004615784,
0.007854106836020947,
1.886881761947734e-10,
3.0688436368713494e-43,
0.0,
0.0,
0.0,
0.0
],
"max_prob_per_head": [
0.9290326833724976,
0.9654616117477417,
0.9727820158004761,
0.9861835241317749,
0.9882169961929321,
0.9964586496353149,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0
],
"mean_entropy": 0.03141414001584053,
"mean_max_prob": 0.9865112900733948
},
{
"layer": 9,
"entropy_per_head": [
0.22920922935009003,
0.13090595602989197,
0.07475412636995316,
0.05009441822767258,
0.02543170377612114,
0.006821715272963047,
7.358213973025773e-13,
1.4680002712266784e-41,
0.0,
0.0,
0.0,
0.0
],
"max_prob_per_head": [
0.909065842628479,
0.9412500262260437,
0.9665333032608032,
0.9774504899978638,
0.9881967306137085,
0.9967927932739258,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0
],
"mean_entropy": 0.04310142993927002,
"mean_max_prob": 0.9816074371337891
}
],
"error_breakdown": {
"overall_accuracy": 0.727154541015625,
"per_class": {
"space": {
"accuracy": 0.933965844402277,
"n": 31620
},
"newline": {
"accuracy": 0.5960154952960708,
"n": 1807
},
"lowercase": {
"accuracy": 0.699845422511801,
"n": 120328
},
"uppercase": {
"accuracy": 0.34219190968955787,
"n": 4252
},
"digit": {
"accuracy": 0.0,
"n": 9
},
"punct": {
"accuracy": 0.49141483516483514,
"n": 5824
}
}
},
"teacher_ckpt": "/root/bitnet1/ckpt/fp32_ref_50M_last.pt",
"teacher_val_bpc": 0.8601819578579681,
"student_teacher_similarity": [
{
"layer": 0,
"sign_agreement": 0.577347195148468
},
{
"layer": 1,
"sign_agreement": 0.5173985481262207
},
{
"layer": 2,
"sign_agreement": 0.5035130262374878
},
{
"layer": 3,
"sign_agreement": 0.5001080393791199
},
{
"layer": 4,
"sign_agreement": 0.5003906428813935
},
{
"layer": 5,
"sign_agreement": 0.5008003354072571
},
{
"layer": 6,
"sign_agreement": 0.5006855249404907
},
{
"layer": 7,
"sign_agreement": 0.5008371591567993
},
{
"layer": 8,
"sign_agreement": 0.49889018535614016
},
{
"layer": 9,
"sign_agreement": 0.49474666118621824
}
]
}