nerderlyne commited on
Commit
d7ef083
·
verified ·
1 Parent(s): 30d1f76

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 128000,
9
+ "eos_token_id": [
10
+ 128001,
11
+ 128008,
12
+ 128009
13
+ ],
14
+ "hidden_act": "silu",
15
+ "hidden_size": 4096,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 14336,
18
+ "max_position_embeddings": 131072,
19
+ "mlp_bias": false,
20
+ "model_type": "llama",
21
+ "num_attention_heads": 32,
22
+ "num_hidden_layers": 32,
23
+ "num_key_value_heads": 8,
24
+ "pretraining_tp": 1,
25
+ "rms_norm_eps": 1e-05,
26
+ "rope_scaling": {
27
+ "factor": 8.0,
28
+ "high_freq_factor": 4.0,
29
+ "low_freq_factor": 1.0,
30
+ "original_max_position_embeddings": 8192,
31
+ "rope_type": "llama3"
32
+ },
33
+ "rope_theta": 500000.0,
34
+ "tie_word_embeddings": false,
35
+ "torch_dtype": "bfloat16",
36
+ "transformers_version": "4.43.3",
37
+ "use_cache": true,
38
+ "vocab_size": 128256
39
+ }
direction.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c03b0ade42731c2c5e87c145db111310c405effea28221b1be47716876ac864f
3
+ size 5244070
direction_metadata.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "pos": -2,
3
+ "layer": 12
4
+ }
generate_directions/mean_diffs.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5675bf91ace106199ca3c6e0376fe66e541beef6e3eead2d140c004abc734f5a
3
+ size 5244075
merged_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40d95b3683b2e1ca91a9e854553003bd023cb774e0f37c1730e223c0c381c9a5
3
+ size 32121079000
select_direction/ablation_scores.png ADDED
select_direction/actadd_scores.png ADDED
select_direction/direction_evaluations.json ADDED
@@ -0,0 +1,1122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "position": -5,
4
+ "layer": 0,
5
+ "refusal_score": 7.450636863708496,
6
+ "steering_score": -11.830456733703613,
7
+ "kl_div_score": 0.0
8
+ },
9
+ {
10
+ "position": -5,
11
+ "layer": 1,
12
+ "refusal_score": 10.940382957458496,
13
+ "steering_score": -11.839740753173828,
14
+ "kl_div_score": 0.25063224011473184
15
+ },
16
+ {
17
+ "position": -5,
18
+ "layer": 2,
19
+ "refusal_score": 8.61406135559082,
20
+ "steering_score": -11.801252365112305,
21
+ "kl_div_score": 0.02662166039541102
22
+ },
23
+ {
24
+ "position": -5,
25
+ "layer": 3,
26
+ "refusal_score": 8.32544994354248,
27
+ "steering_score": -11.922663688659668,
28
+ "kl_div_score": 0.06110219972943868
29
+ },
30
+ {
31
+ "position": -5,
32
+ "layer": 4,
33
+ "refusal_score": 7.7651824951171875,
34
+ "steering_score": -11.739885330200195,
35
+ "kl_div_score": 0.06489012377394329
36
+ },
37
+ {
38
+ "position": -5,
39
+ "layer": 5,
40
+ "refusal_score": 5.078960418701172,
41
+ "steering_score": -11.384549140930176,
42
+ "kl_div_score": 0.055092546867949475
43
+ },
44
+ {
45
+ "position": -5,
46
+ "layer": 6,
47
+ "refusal_score": 4.14361572265625,
48
+ "steering_score": -10.486276626586914,
49
+ "kl_div_score": 0.05534075754782512
50
+ },
51
+ {
52
+ "position": -5,
53
+ "layer": 7,
54
+ "refusal_score": 0.5053978562355042,
55
+ "steering_score": -7.856618881225586,
56
+ "kl_div_score": 0.12158955980096406
57
+ },
58
+ {
59
+ "position": -5,
60
+ "layer": 8,
61
+ "refusal_score": -3.547269344329834,
62
+ "steering_score": -1.4370272159576416,
63
+ "kl_div_score": 0.13569090034394904
64
+ },
65
+ {
66
+ "position": -5,
67
+ "layer": 9,
68
+ "refusal_score": -4.746824264526367,
69
+ "steering_score": 2.9055585861206055,
70
+ "kl_div_score": 0.1485775017862303
71
+ },
72
+ {
73
+ "position": -5,
74
+ "layer": 10,
75
+ "refusal_score": -5.376955032348633,
76
+ "steering_score": 7.028485298156738,
77
+ "kl_div_score": 0.08796039998349871
78
+ },
79
+ {
80
+ "position": -5,
81
+ "layer": 11,
82
+ "refusal_score": -8.539012908935547,
83
+ "steering_score": 8.075133323669434,
84
+ "kl_div_score": 0.06541798089942039
85
+ },
86
+ {
87
+ "position": -5,
88
+ "layer": 12,
89
+ "refusal_score": -10.165912628173828,
90
+ "steering_score": 8.994918823242188,
91
+ "kl_div_score": 0.1132835685413567
92
+ },
93
+ {
94
+ "position": -5,
95
+ "layer": 13,
96
+ "refusal_score": -10.355609893798828,
97
+ "steering_score": 5.637273788452148,
98
+ "kl_div_score": 0.2653944454585543
99
+ },
100
+ {
101
+ "position": -5,
102
+ "layer": 14,
103
+ "refusal_score": -6.636038303375244,
104
+ "steering_score": 4.687836170196533,
105
+ "kl_div_score": 0.0141100183367428
106
+ },
107
+ {
108
+ "position": -5,
109
+ "layer": 15,
110
+ "refusal_score": -5.5487470626831055,
111
+ "steering_score": 2.144282102584839,
112
+ "kl_div_score": 0.02395068876429188
113
+ },
114
+ {
115
+ "position": -5,
116
+ "layer": 16,
117
+ "refusal_score": -5.478135108947754,
118
+ "steering_score": 1.0938483476638794,
119
+ "kl_div_score": 0.012701942605351556
120
+ },
121
+ {
122
+ "position": -5,
123
+ "layer": 17,
124
+ "refusal_score": -5.213899612426758,
125
+ "steering_score": -0.17677417397499084,
126
+ "kl_div_score": 0.013754030688354424
127
+ },
128
+ {
129
+ "position": -5,
130
+ "layer": 18,
131
+ "refusal_score": -6.074069976806641,
132
+ "steering_score": -1.1289219856262207,
133
+ "kl_div_score": 0.011300947735016189
134
+ },
135
+ {
136
+ "position": -5,
137
+ "layer": 19,
138
+ "refusal_score": -1.6875636577606201,
139
+ "steering_score": -2.335386276245117,
140
+ "kl_div_score": 0.024858671554434622
141
+ },
142
+ {
143
+ "position": -5,
144
+ "layer": 20,
145
+ "refusal_score": -2.975120782852173,
146
+ "steering_score": -2.684943914413452,
147
+ "kl_div_score": 0.012842250835919534
148
+ },
149
+ {
150
+ "position": -5,
151
+ "layer": 21,
152
+ "refusal_score": -3.183622360229492,
153
+ "steering_score": -3.214367628097534,
154
+ "kl_div_score": 0.021878747008857467
155
+ },
156
+ {
157
+ "position": -5,
158
+ "layer": 22,
159
+ "refusal_score": -1.722489595413208,
160
+ "steering_score": -3.474200487136841,
161
+ "kl_div_score": 0.022382147956212104
162
+ },
163
+ {
164
+ "position": -5,
165
+ "layer": 23,
166
+ "refusal_score": -0.6202753186225891,
167
+ "steering_score": -4.342594146728516,
168
+ "kl_div_score": 0.017418699702721193
169
+ },
170
+ {
171
+ "position": -5,
172
+ "layer": 24,
173
+ "refusal_score": 1.2065739631652832,
174
+ "steering_score": -4.2968645095825195,
175
+ "kl_div_score": 0.016258465730455446
176
+ },
177
+ {
178
+ "position": -5,
179
+ "layer": 25,
180
+ "refusal_score": 1.9451597929000854,
181
+ "steering_score": -4.316882133483887,
182
+ "kl_div_score": 0.013315564529131926
183
+ },
184
+ {
185
+ "position": -5,
186
+ "layer": 26,
187
+ "refusal_score": 2.7124099731445312,
188
+ "steering_score": -4.45754861831665,
189
+ "kl_div_score": 0.011776859122535012
190
+ },
191
+ {
192
+ "position": -5,
193
+ "layer": 27,
194
+ "refusal_score": 4.2641096115112305,
195
+ "steering_score": -4.595405578613281,
196
+ "kl_div_score": 0.009685238960157085
197
+ },
198
+ {
199
+ "position": -5,
200
+ "layer": 28,
201
+ "refusal_score": 5.035680770874023,
202
+ "steering_score": -4.640095233917236,
203
+ "kl_div_score": 0.011553904695468277
204
+ },
205
+ {
206
+ "position": -5,
207
+ "layer": 29,
208
+ "refusal_score": 4.5578694343566895,
209
+ "steering_score": -4.819130897521973,
210
+ "kl_div_score": 0.009942203119875617
211
+ },
212
+ {
213
+ "position": -5,
214
+ "layer": 30,
215
+ "refusal_score": 4.937906265258789,
216
+ "steering_score": -4.4534406661987305,
217
+ "kl_div_score": 0.010485377575664055
218
+ },
219
+ {
220
+ "position": -5,
221
+ "layer": 31,
222
+ "refusal_score": 5.580471038818359,
223
+ "steering_score": -5.183733940124512,
224
+ "kl_div_score": 0.0073827847249879185
225
+ },
226
+ {
227
+ "position": -4,
228
+ "layer": 0,
229
+ "refusal_score": 7.450636863708496,
230
+ "steering_score": -11.830456733703613,
231
+ "kl_div_score": 0.0
232
+ },
233
+ {
234
+ "position": -4,
235
+ "layer": 1,
236
+ "refusal_score": 10.812314987182617,
237
+ "steering_score": -11.795536994934082,
238
+ "kl_div_score": 0.4158383283900479
239
+ },
240
+ {
241
+ "position": -4,
242
+ "layer": 2,
243
+ "refusal_score": 6.827181816101074,
244
+ "steering_score": -11.737068176269531,
245
+ "kl_div_score": 0.20311702320501046
246
+ },
247
+ {
248
+ "position": -4,
249
+ "layer": 3,
250
+ "refusal_score": 8.025232315063477,
251
+ "steering_score": -11.748991966247559,
252
+ "kl_div_score": 0.02712648312854689
253
+ },
254
+ {
255
+ "position": -4,
256
+ "layer": 4,
257
+ "refusal_score": 6.425276279449463,
258
+ "steering_score": -11.467093467712402,
259
+ "kl_div_score": 0.006255747482580087
260
+ },
261
+ {
262
+ "position": -4,
263
+ "layer": 5,
264
+ "refusal_score": 6.909249305725098,
265
+ "steering_score": -10.97586441040039,
266
+ "kl_div_score": 0.009888444520913529
267
+ },
268
+ {
269
+ "position": -4,
270
+ "layer": 6,
271
+ "refusal_score": 4.926372528076172,
272
+ "steering_score": -10.25048828125,
273
+ "kl_div_score": 0.01978332223716977
274
+ },
275
+ {
276
+ "position": -4,
277
+ "layer": 7,
278
+ "refusal_score": 6.651894569396973,
279
+ "steering_score": -9.241510391235352,
280
+ "kl_div_score": 0.04994251613248886
281
+ },
282
+ {
283
+ "position": -4,
284
+ "layer": 8,
285
+ "refusal_score": 6.573302745819092,
286
+ "steering_score": -6.214093208312988,
287
+ "kl_div_score": 0.042811360122624306
288
+ },
289
+ {
290
+ "position": -4,
291
+ "layer": 9,
292
+ "refusal_score": 6.5962138175964355,
293
+ "steering_score": -3.7241485118865967,
294
+ "kl_div_score": 0.033524626113400284
295
+ },
296
+ {
297
+ "position": -4,
298
+ "layer": 10,
299
+ "refusal_score": 4.0028791427612305,
300
+ "steering_score": 4.6848249435424805,
301
+ "kl_div_score": 0.03246775216125604
302
+ },
303
+ {
304
+ "position": -4,
305
+ "layer": 11,
306
+ "refusal_score": 4.505997657775879,
307
+ "steering_score": 3.1285815238952637,
308
+ "kl_div_score": 0.03856063620648774
309
+ },
310
+ {
311
+ "position": -4,
312
+ "layer": 12,
313
+ "refusal_score": 3.0150375366210938,
314
+ "steering_score": 5.0643134117126465,
315
+ "kl_div_score": 0.04881445734757695
316
+ },
317
+ {
318
+ "position": -4,
319
+ "layer": 13,
320
+ "refusal_score": 1.1747349500656128,
321
+ "steering_score": 1.4116528034210205,
322
+ "kl_div_score": 0.026868538149248188
323
+ },
324
+ {
325
+ "position": -4,
326
+ "layer": 14,
327
+ "refusal_score": 3.023860454559326,
328
+ "steering_score": -3.3504014015197754,
329
+ "kl_div_score": 0.04032019237447497
330
+ },
331
+ {
332
+ "position": -4,
333
+ "layer": 15,
334
+ "refusal_score": 5.741189956665039,
335
+ "steering_score": -5.515679359436035,
336
+ "kl_div_score": 0.12204382576863267
337
+ },
338
+ {
339
+ "position": -4,
340
+ "layer": 16,
341
+ "refusal_score": 4.668534755706787,
342
+ "steering_score": -6.197427272796631,
343
+ "kl_div_score": 0.020614937937280454
344
+ },
345
+ {
346
+ "position": -4,
347
+ "layer": 17,
348
+ "refusal_score": 5.006313323974609,
349
+ "steering_score": -7.727458953857422,
350
+ "kl_div_score": 0.014675440656217304
351
+ },
352
+ {
353
+ "position": -4,
354
+ "layer": 18,
355
+ "refusal_score": 5.147755146026611,
356
+ "steering_score": -7.854608058929443,
357
+ "kl_div_score": 0.013922934527556491
358
+ },
359
+ {
360
+ "position": -4,
361
+ "layer": 19,
362
+ "refusal_score": 5.385741233825684,
363
+ "steering_score": -8.62336540222168,
364
+ "kl_div_score": 0.006841440913400462
365
+ },
366
+ {
367
+ "position": -4,
368
+ "layer": 20,
369
+ "refusal_score": 4.483187675476074,
370
+ "steering_score": -8.518564224243164,
371
+ "kl_div_score": 0.018073221561273382
372
+ },
373
+ {
374
+ "position": -4,
375
+ "layer": 21,
376
+ "refusal_score": 4.432735919952393,
377
+ "steering_score": -8.875178337097168,
378
+ "kl_div_score": 0.03618557665671606
379
+ },
380
+ {
381
+ "position": -4,
382
+ "layer": 22,
383
+ "refusal_score": 3.882089614868164,
384
+ "steering_score": -9.056167602539062,
385
+ "kl_div_score": 0.029761310959065115
386
+ },
387
+ {
388
+ "position": -4,
389
+ "layer": 23,
390
+ "refusal_score": 4.477509498596191,
391
+ "steering_score": -9.458629608154297,
392
+ "kl_div_score": 0.04115770017240697
393
+ },
394
+ {
395
+ "position": -4,
396
+ "layer": 24,
397
+ "refusal_score": 5.272890090942383,
398
+ "steering_score": -9.620115280151367,
399
+ "kl_div_score": 0.026080006169360244
400
+ },
401
+ {
402
+ "position": -4,
403
+ "layer": 25,
404
+ "refusal_score": 5.76324462890625,
405
+ "steering_score": -9.744662284851074,
406
+ "kl_div_score": 0.025975317857587155
407
+ },
408
+ {
409
+ "position": -4,
410
+ "layer": 26,
411
+ "refusal_score": 5.6695556640625,
412
+ "steering_score": -9.72978687286377,
413
+ "kl_div_score": 0.021038816397045645
414
+ },
415
+ {
416
+ "position": -4,
417
+ "layer": 27,
418
+ "refusal_score": 6.13779354095459,
419
+ "steering_score": -9.982513427734375,
420
+ "kl_div_score": 0.01604703955541417
421
+ },
422
+ {
423
+ "position": -4,
424
+ "layer": 28,
425
+ "refusal_score": 6.53122615814209,
426
+ "steering_score": -10.206818580627441,
427
+ "kl_div_score": 0.00905406421169553
428
+ },
429
+ {
430
+ "position": -4,
431
+ "layer": 29,
432
+ "refusal_score": 6.67525053024292,
433
+ "steering_score": -10.416938781738281,
434
+ "kl_div_score": 0.010709122069082595
435
+ },
436
+ {
437
+ "position": -4,
438
+ "layer": 30,
439
+ "refusal_score": 6.565028190612793,
440
+ "steering_score": -10.57150936126709,
441
+ "kl_div_score": 0.013185399142395403
442
+ },
443
+ {
444
+ "position": -4,
445
+ "layer": 31,
446
+ "refusal_score": 7.538625717163086,
447
+ "steering_score": -9.432778358459473,
448
+ "kl_div_score": 0.00659994063938069
449
+ },
450
+ {
451
+ "position": -3,
452
+ "layer": 0,
453
+ "refusal_score": 7.450636863708496,
454
+ "steering_score": -11.830456733703613,
455
+ "kl_div_score": 0.0
456
+ },
457
+ {
458
+ "position": -3,
459
+ "layer": 1,
460
+ "refusal_score": 7.351442337036133,
461
+ "steering_score": -11.879144668579102,
462
+ "kl_div_score": 0.05714835705792163
463
+ },
464
+ {
465
+ "position": -3,
466
+ "layer": 2,
467
+ "refusal_score": 7.555194854736328,
468
+ "steering_score": -11.859851837158203,
469
+ "kl_div_score": 0.018264240284618614
470
+ },
471
+ {
472
+ "position": -3,
473
+ "layer": 3,
474
+ "refusal_score": 7.613396167755127,
475
+ "steering_score": -12.02131462097168,
476
+ "kl_div_score": 0.05679509519728466
477
+ },
478
+ {
479
+ "position": -3,
480
+ "layer": 4,
481
+ "refusal_score": 7.602565765380859,
482
+ "steering_score": -11.988609313964844,
483
+ "kl_div_score": 0.017283959215815804
484
+ },
485
+ {
486
+ "position": -3,
487
+ "layer": 5,
488
+ "refusal_score": 7.565119743347168,
489
+ "steering_score": -12.049741744995117,
490
+ "kl_div_score": 0.023579715899906217
491
+ },
492
+ {
493
+ "position": -3,
494
+ "layer": 6,
495
+ "refusal_score": 7.761242389678955,
496
+ "steering_score": -12.232732772827148,
497
+ "kl_div_score": 0.0022932509288439333
498
+ },
499
+ {
500
+ "position": -3,
501
+ "layer": 7,
502
+ "refusal_score": 7.998626708984375,
503
+ "steering_score": -12.119717597961426,
504
+ "kl_div_score": 0.0026027950945717204
505
+ },
506
+ {
507
+ "position": -3,
508
+ "layer": 8,
509
+ "refusal_score": 7.18452262878418,
510
+ "steering_score": -12.026199340820312,
511
+ "kl_div_score": 0.014272619988313409
512
+ },
513
+ {
514
+ "position": -3,
515
+ "layer": 9,
516
+ "refusal_score": 6.099905967712402,
517
+ "steering_score": -11.814745903015137,
518
+ "kl_div_score": 0.022906798340328747
519
+ },
520
+ {
521
+ "position": -3,
522
+ "layer": 10,
523
+ "refusal_score": 1.6819469928741455,
524
+ "steering_score": -10.3506441116333,
525
+ "kl_div_score": 0.02854894664135504
526
+ },
527
+ {
528
+ "position": -3,
529
+ "layer": 11,
530
+ "refusal_score": -7.098132610321045,
531
+ "steering_score": -1.9208875894546509,
532
+ "kl_div_score": 0.03871730206055337
533
+ },
534
+ {
535
+ "position": -3,
536
+ "layer": 12,
537
+ "refusal_score": -9.373048782348633,
538
+ "steering_score": 3.060147523880005,
539
+ "kl_div_score": 0.0867584559325129
540
+ },
541
+ {
542
+ "position": -3,
543
+ "layer": 13,
544
+ "refusal_score": -5.892275810241699,
545
+ "steering_score": 0.07500703632831573,
546
+ "kl_div_score": 0.1633262984950845
547
+ },
548
+ {
549
+ "position": -3,
550
+ "layer": 14,
551
+ "refusal_score": -0.7610650062561035,
552
+ "steering_score": 2.843015193939209,
553
+ "kl_div_score": 0.01738917049392854
554
+ },
555
+ {
556
+ "position": -3,
557
+ "layer": 15,
558
+ "refusal_score": -0.5506424307823181,
559
+ "steering_score": 3.8185596466064453,
560
+ "kl_div_score": 0.0688840249624754
561
+ },
562
+ {
563
+ "position": -3,
564
+ "layer": 16,
565
+ "refusal_score": 0.20622670650482178,
566
+ "steering_score": 2.1972856521606445,
567
+ "kl_div_score": 0.1475035138357805
568
+ },
569
+ {
570
+ "position": -3,
571
+ "layer": 17,
572
+ "refusal_score": 1.5745298862457275,
573
+ "steering_score": -0.0007034424343146384,
574
+ "kl_div_score": 0.680177021099812
575
+ },
576
+ {
577
+ "position": -3,
578
+ "layer": 18,
579
+ "refusal_score": 2.412114381790161,
580
+ "steering_score": -0.9559683799743652,
581
+ "kl_div_score": 0.6144676889726809
582
+ },
583
+ {
584
+ "position": -3,
585
+ "layer": 19,
586
+ "refusal_score": 3.371020793914795,
587
+ "steering_score": -1.605437159538269,
588
+ "kl_div_score": 1.087222243589543
589
+ },
590
+ {
591
+ "position": -3,
592
+ "layer": 20,
593
+ "refusal_score": 3.380979061126709,
594
+ "steering_score": -1.6221394538879395,
595
+ "kl_div_score": 0.8293832443842403
596
+ },
597
+ {
598
+ "position": -3,
599
+ "layer": 21,
600
+ "refusal_score": 2.6941325664520264,
601
+ "steering_score": -1.701312780380249,
602
+ "kl_div_score": 0.5683925898575501
603
+ },
604
+ {
605
+ "position": -3,
606
+ "layer": 22,
607
+ "refusal_score": 3.3659584522247314,
608
+ "steering_score": -1.4302847385406494,
609
+ "kl_div_score": 0.5506433252182473
610
+ },
611
+ {
612
+ "position": -3,
613
+ "layer": 23,
614
+ "refusal_score": 3.5170602798461914,
615
+ "steering_score": -1.8237537145614624,
616
+ "kl_div_score": 0.5088802263664781
617
+ },
618
+ {
619
+ "position": -3,
620
+ "layer": 24,
621
+ "refusal_score": 3.5988001823425293,
622
+ "steering_score": -1.5716158151626587,
623
+ "kl_div_score": 0.4243232694243198
624
+ },
625
+ {
626
+ "position": -3,
627
+ "layer": 25,
628
+ "refusal_score": 3.224271774291992,
629
+ "steering_score": -1.4275472164154053,
630
+ "kl_div_score": 0.32617104795758256
631
+ },
632
+ {
633
+ "position": -3,
634
+ "layer": 26,
635
+ "refusal_score": 3.0915687084198,
636
+ "steering_score": -1.7615240812301636,
637
+ "kl_div_score": 0.24233564495948606
638
+ },
639
+ {
640
+ "position": -3,
641
+ "layer": 27,
642
+ "refusal_score": 3.930178642272949,
643
+ "steering_score": -2.097964286804199,
644
+ "kl_div_score": 0.2857087263265544
645
+ },
646
+ {
647
+ "position": -3,
648
+ "layer": 28,
649
+ "refusal_score": 4.111324310302734,
650
+ "steering_score": -2.3234660625457764,
651
+ "kl_div_score": 0.17600789216823837
652
+ },
653
+ {
654
+ "position": -3,
655
+ "layer": 29,
656
+ "refusal_score": 4.02263879776001,
657
+ "steering_score": -2.341846466064453,
658
+ "kl_div_score": 0.11356473608828227
659
+ },
660
+ {
661
+ "position": -3,
662
+ "layer": 30,
663
+ "refusal_score": 5.118901252746582,
664
+ "steering_score": -2.601374387741089,
665
+ "kl_div_score": 0.037726283129610017
666
+ },
667
+ {
668
+ "position": -3,
669
+ "layer": 31,
670
+ "refusal_score": 7.2652153968811035,
671
+ "steering_score": -2.683837890625,
672
+ "kl_div_score": 0.012861004017420533
673
+ },
674
+ {
675
+ "position": -2,
676
+ "layer": 0,
677
+ "refusal_score": 7.450636863708496,
678
+ "steering_score": -11.830456733703613,
679
+ "kl_div_score": 0.0
680
+ },
681
+ {
682
+ "position": -2,
683
+ "layer": 1,
684
+ "refusal_score": 6.926996231079102,
685
+ "steering_score": -11.798026084899902,
686
+ "kl_div_score": 0.5867804379978319
687
+ },
688
+ {
689
+ "position": -2,
690
+ "layer": 2,
691
+ "refusal_score": -3.8163418769836426,
692
+ "steering_score": -11.823830604553223,
693
+ "kl_div_score": 1.6575609711285022
694
+ },
695
+ {
696
+ "position": -2,
697
+ "layer": 3,
698
+ "refusal_score": 8.180908203125,
699
+ "steering_score": -11.759471893310547,
700
+ "kl_div_score": 0.04485797158089392
701
+ },
702
+ {
703
+ "position": -2,
704
+ "layer": 4,
705
+ "refusal_score": 8.299246788024902,
706
+ "steering_score": -11.92286205291748,
707
+ "kl_div_score": 0.042189485187598366
708
+ },
709
+ {
710
+ "position": -2,
711
+ "layer": 5,
712
+ "refusal_score": 8.099535942077637,
713
+ "steering_score": -11.920945167541504,
714
+ "kl_div_score": 0.013529918829430384
715
+ },
716
+ {
717
+ "position": -2,
718
+ "layer": 6,
719
+ "refusal_score": 6.200526237487793,
720
+ "steering_score": -11.95439624786377,
721
+ "kl_div_score": 0.043231514354219576
722
+ },
723
+ {
724
+ "position": -2,
725
+ "layer": 7,
726
+ "refusal_score": 2.4409306049346924,
727
+ "steering_score": -12.042317390441895,
728
+ "kl_div_score": 0.07690331220794447
729
+ },
730
+ {
731
+ "position": -2,
732
+ "layer": 8,
733
+ "refusal_score": -0.9726417660713196,
734
+ "steering_score": -11.459531784057617,
735
+ "kl_div_score": 0.07554150489178757
736
+ },
737
+ {
738
+ "position": -2,
739
+ "layer": 9,
740
+ "refusal_score": -6.368747234344482,
741
+ "steering_score": -6.816261291503906,
742
+ "kl_div_score": 0.2651261668661564
743
+ },
744
+ {
745
+ "position": -2,
746
+ "layer": 10,
747
+ "refusal_score": -6.758663177490234,
748
+ "steering_score": 0.9893432855606079,
749
+ "kl_div_score": 0.18809623941974607
750
+ },
751
+ {
752
+ "position": -2,
753
+ "layer": 11,
754
+ "refusal_score": -9.023905754089355,
755
+ "steering_score": 8.601274490356445,
756
+ "kl_div_score": 0.04104435475014729
757
+ },
758
+ {
759
+ "position": -2,
760
+ "layer": 12,
761
+ "refusal_score": -9.75497817993164,
762
+ "steering_score": 6.006311893463135,
763
+ "kl_div_score": 0.031104022506242827
764
+ },
765
+ {
766
+ "position": -2,
767
+ "layer": 13,
768
+ "refusal_score": -0.7074866890907288,
769
+ "steering_score": 2.2404544353485107,
770
+ "kl_div_score": 0.029380356415469547
771
+ },
772
+ {
773
+ "position": -2,
774
+ "layer": 14,
775
+ "refusal_score": 5.199129104614258,
776
+ "steering_score": 3.0659422874450684,
777
+ "kl_div_score": 0.5353163095877882
778
+ },
779
+ {
780
+ "position": -2,
781
+ "layer": 15,
782
+ "refusal_score": 4.9762983322143555,
783
+ "steering_score": 3.254157781600952,
784
+ "kl_div_score": 1.0480167571754937
785
+ },
786
+ {
787
+ "position": -2,
788
+ "layer": 16,
789
+ "refusal_score": 3.6731338500976562,
790
+ "steering_score": 3.8821756839752197,
791
+ "kl_div_score": 1.2368674247797558
792
+ },
793
+ {
794
+ "position": -2,
795
+ "layer": 17,
796
+ "refusal_score": 3.1663296222686768,
797
+ "steering_score": 3.5367674827575684,
798
+ "kl_div_score": 3.093188072529282
799
+ },
800
+ {
801
+ "position": -2,
802
+ "layer": 18,
803
+ "refusal_score": 3.152223587036133,
804
+ "steering_score": 2.5594911575317383,
805
+ "kl_div_score": 1.575859430034067
806
+ },
807
+ {
808
+ "position": -2,
809
+ "layer": 19,
810
+ "refusal_score": 3.236422061920166,
811
+ "steering_score": 1.7847882509231567,
812
+ "kl_div_score": 2.04167027746993
813
+ },
814
+ {
815
+ "position": -2,
816
+ "layer": 20,
817
+ "refusal_score": 2.354421615600586,
818
+ "steering_score": 1.7620494365692139,
819
+ "kl_div_score": 1.7265832841839912
820
+ },
821
+ {
822
+ "position": -2,
823
+ "layer": 21,
824
+ "refusal_score": 0.4912188947200775,
825
+ "steering_score": 2.477295398712158,
826
+ "kl_div_score": 1.3980995152256832
827
+ },
828
+ {
829
+ "position": -2,
830
+ "layer": 22,
831
+ "refusal_score": 0.4995027780532837,
832
+ "steering_score": 3.015876054763794,
833
+ "kl_div_score": 0.8391094951104818
834
+ },
835
+ {
836
+ "position": -2,
837
+ "layer": 23,
838
+ "refusal_score": 0.4436635375022888,
839
+ "steering_score": 1.9265347719192505,
840
+ "kl_div_score": 0.5401840366261479
841
+ },
842
+ {
843
+ "position": -2,
844
+ "layer": 24,
845
+ "refusal_score": 1.1446397304534912,
846
+ "steering_score": 1.4338568449020386,
847
+ "kl_div_score": 0.4640195631718479
848
+ },
849
+ {
850
+ "position": -2,
851
+ "layer": 25,
852
+ "refusal_score": 1.5433824062347412,
853
+ "steering_score": 1.1149771213531494,
854
+ "kl_div_score": 0.3339530314691103
855
+ },
856
+ {
857
+ "position": -2,
858
+ "layer": 26,
859
+ "refusal_score": 1.7812745571136475,
860
+ "steering_score": 0.8563902974128723,
861
+ "kl_div_score": 0.2535115344191417
862
+ },
863
+ {
864
+ "position": -2,
865
+ "layer": 27,
866
+ "refusal_score": 2.4479846954345703,
867
+ "steering_score": 0.3418017029762268,
868
+ "kl_div_score": 0.21052595868841084
869
+ },
870
+ {
871
+ "position": -2,
872
+ "layer": 28,
873
+ "refusal_score": 2.3051071166992188,
874
+ "steering_score": 0.10388462990522385,
875
+ "kl_div_score": 0.15181116784191137
876
+ },
877
+ {
878
+ "position": -2,
879
+ "layer": 29,
880
+ "refusal_score": 1.9919395446777344,
881
+ "steering_score": -0.19721129536628723,
882
+ "kl_div_score": 0.08905450779835926
883
+ },
884
+ {
885
+ "position": -2,
886
+ "layer": 30,
887
+ "refusal_score": 2.893739700317383,
888
+ "steering_score": -0.3327680826187134,
889
+ "kl_div_score": 0.08569501484161374
890
+ },
891
+ {
892
+ "position": -2,
893
+ "layer": 31,
894
+ "refusal_score": 3.7339277267456055,
895
+ "steering_score": -0.45471158623695374,
896
+ "kl_div_score": 0.04856147079387752
897
+ },
898
+ {
899
+ "position": -1,
900
+ "layer": 0,
901
+ "refusal_score": 7.450636863708496,
902
+ "steering_score": -11.830456733703613,
903
+ "kl_div_score": 0.0
904
+ },
905
+ {
906
+ "position": -1,
907
+ "layer": 1,
908
+ "refusal_score": 8.079374313354492,
909
+ "steering_score": -11.79311466217041,
910
+ "kl_div_score": 0.006414912641295136
911
+ },
912
+ {
913
+ "position": -1,
914
+ "layer": 2,
915
+ "refusal_score": 7.303004264831543,
916
+ "steering_score": -11.744900703430176,
917
+ "kl_div_score": 0.031120544594983552
918
+ },
919
+ {
920
+ "position": -1,
921
+ "layer": 3,
922
+ "refusal_score": 7.523357391357422,
923
+ "steering_score": -12.060215950012207,
924
+ "kl_div_score": 0.060775408374946265
925
+ },
926
+ {
927
+ "position": -1,
928
+ "layer": 4,
929
+ "refusal_score": 7.348278999328613,
930
+ "steering_score": -11.976644515991211,
931
+ "kl_div_score": 0.07863737032091514
932
+ },
933
+ {
934
+ "position": -1,
935
+ "layer": 5,
936
+ "refusal_score": 7.338820934295654,
937
+ "steering_score": -11.921005249023438,
938
+ "kl_div_score": 0.012741925993061249
939
+ },
940
+ {
941
+ "position": -1,
942
+ "layer": 6,
943
+ "refusal_score": 6.427075386047363,
944
+ "steering_score": -11.922883987426758,
945
+ "kl_div_score": 0.17200264418275032
946
+ },
947
+ {
948
+ "position": -1,
949
+ "layer": 7,
950
+ "refusal_score": 3.605194568634033,
951
+ "steering_score": -11.695231437683105,
952
+ "kl_div_score": 0.1327529109141617
953
+ },
954
+ {
955
+ "position": -1,
956
+ "layer": 8,
957
+ "refusal_score": -4.327999114990234,
958
+ "steering_score": -8.877398490905762,
959
+ "kl_div_score": 0.2133217491746702
960
+ },
961
+ {
962
+ "position": -1,
963
+ "layer": 9,
964
+ "refusal_score": -6.285901069641113,
965
+ "steering_score": -2.485100269317627,
966
+ "kl_div_score": 0.10089911963005772
967
+ },
968
+ {
969
+ "position": -1,
970
+ "layer": 10,
971
+ "refusal_score": -7.331896781921387,
972
+ "steering_score": 4.227353096008301,
973
+ "kl_div_score": 0.08309234215185003
974
+ },
975
+ {
976
+ "position": -1,
977
+ "layer": 11,
978
+ "refusal_score": -8.494709014892578,
979
+ "steering_score": 7.610245227813721,
980
+ "kl_div_score": 0.014720118389356012
981
+ },
982
+ {
983
+ "position": -1,
984
+ "layer": 12,
985
+ "refusal_score": -8.680732727050781,
986
+ "steering_score": 4.7956414222717285,
987
+ "kl_div_score": 0.01707359135099005
988
+ },
989
+ {
990
+ "position": -1,
991
+ "layer": 13,
992
+ "refusal_score": 0.6532330513000488,
993
+ "steering_score": -0.4470141530036926,
994
+ "kl_div_score": 0.016470555894460795
995
+ },
996
+ {
997
+ "position": -1,
998
+ "layer": 14,
999
+ "refusal_score": 6.329411506652832,
1000
+ "steering_score": 2.30622935295105,
1001
+ "kl_div_score": 0.9479847871943258
1002
+ },
1003
+ {
1004
+ "position": -1,
1005
+ "layer": 15,
1006
+ "refusal_score": 6.0013041496276855,
1007
+ "steering_score": 1.3209009170532227,
1008
+ "kl_div_score": 1.0885456680832901
1009
+ },
1010
+ {
1011
+ "position": -1,
1012
+ "layer": 16,
1013
+ "refusal_score": 4.267302513122559,
1014
+ "steering_score": 2.8033483028411865,
1015
+ "kl_div_score": 0.9073458383306865
1016
+ },
1017
+ {
1018
+ "position": -1,
1019
+ "layer": 17,
1020
+ "refusal_score": 5.26901912689209,
1021
+ "steering_score": 1.9798250198364258,
1022
+ "kl_div_score": 2.3829972756327256
1023
+ },
1024
+ {
1025
+ "position": -1,
1026
+ "layer": 18,
1027
+ "refusal_score": 5.262829780578613,
1028
+ "steering_score": 2.076411008834839,
1029
+ "kl_div_score": 2.364046320178272
1030
+ },
1031
+ {
1032
+ "position": -1,
1033
+ "layer": 19,
1034
+ "refusal_score": 3.3191628456115723,
1035
+ "steering_score": 1.7674345970153809,
1036
+ "kl_div_score": 1.8763310828443507
1037
+ },
1038
+ {
1039
+ "position": -1,
1040
+ "layer": 20,
1041
+ "refusal_score": 2.9167158603668213,
1042
+ "steering_score": 1.4590768814086914,
1043
+ "kl_div_score": 1.6069269699166155
1044
+ },
1045
+ {
1046
+ "position": -1,
1047
+ "layer": 21,
1048
+ "refusal_score": 1.8632879257202148,
1049
+ "steering_score": 2.609332323074341,
1050
+ "kl_div_score": 1.5477564115083355
1051
+ },
1052
+ {
1053
+ "position": -1,
1054
+ "layer": 22,
1055
+ "refusal_score": 2.5213961601257324,
1056
+ "steering_score": 2.937309741973877,
1057
+ "kl_div_score": 1.2337463087795868
1058
+ },
1059
+ {
1060
+ "position": -1,
1061
+ "layer": 23,
1062
+ "refusal_score": 2.130760669708252,
1063
+ "steering_score": 2.94647216796875,
1064
+ "kl_div_score": 1.135543047215232
1065
+ },
1066
+ {
1067
+ "position": -1,
1068
+ "layer": 24,
1069
+ "refusal_score": 1.4255648851394653,
1070
+ "steering_score": 2.6254642009735107,
1071
+ "kl_div_score": 1.0170067442120234
1072
+ },
1073
+ {
1074
+ "position": -1,
1075
+ "layer": 25,
1076
+ "refusal_score": 1.8470709323883057,
1077
+ "steering_score": 2.4709389209747314,
1078
+ "kl_div_score": 0.9048952123050806
1079
+ },
1080
+ {
1081
+ "position": -1,
1082
+ "layer": 26,
1083
+ "refusal_score": 1.4695944786071777,
1084
+ "steering_score": 1.7421600818634033,
1085
+ "kl_div_score": 0.6825789057908486
1086
+ },
1087
+ {
1088
+ "position": -1,
1089
+ "layer": 27,
1090
+ "refusal_score": 1.1728484630584717,
1091
+ "steering_score": 1.8248333930969238,
1092
+ "kl_div_score": 0.49910105954810363
1093
+ },
1094
+ {
1095
+ "position": -1,
1096
+ "layer": 28,
1097
+ "refusal_score": 0.4489290118217468,
1098
+ "steering_score": 1.8028056621551514,
1099
+ "kl_div_score": 0.3759350678814426
1100
+ },
1101
+ {
1102
+ "position": -1,
1103
+ "layer": 29,
1104
+ "refusal_score": -0.6745665073394775,
1105
+ "steering_score": 2.546116828918457,
1106
+ "kl_div_score": 0.22260585876335837
1107
+ },
1108
+ {
1109
+ "position": -1,
1110
+ "layer": 30,
1111
+ "refusal_score": -0.34203040599823,
1112
+ "steering_score": 2.926508665084839,
1113
+ "kl_div_score": 0.20333847684556203
1114
+ },
1115
+ {
1116
+ "position": -1,
1117
+ "layer": 31,
1118
+ "refusal_score": 0.09251364320516586,
1119
+ "steering_score": 2.020570993423462,
1120
+ "kl_div_score": 0.13057973266326595
1121
+ }
1122
+ ]
select_direction/direction_evaluations_filtered.json ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "position": -2,
4
+ "layer": 12,
5
+ "refusal_score": -9.75497817993164,
6
+ "steering_score": 6.006311893463135,
7
+ "kl_div_score": 0.031104022506242827
8
+ },
9
+ {
10
+ "position": -3,
11
+ "layer": 12,
12
+ "refusal_score": -9.373048782348633,
13
+ "steering_score": 3.060147523880005,
14
+ "kl_div_score": 0.0867584559325129
15
+ },
16
+ {
17
+ "position": -2,
18
+ "layer": 11,
19
+ "refusal_score": -9.023905754089355,
20
+ "steering_score": 8.601274490356445,
21
+ "kl_div_score": 0.04104435475014729
22
+ },
23
+ {
24
+ "position": -1,
25
+ "layer": 12,
26
+ "refusal_score": -8.680732727050781,
27
+ "steering_score": 4.7956414222717285,
28
+ "kl_div_score": 0.01707359135099005
29
+ },
30
+ {
31
+ "position": -5,
32
+ "layer": 11,
33
+ "refusal_score": -8.539012908935547,
34
+ "steering_score": 8.075133323669434,
35
+ "kl_div_score": 0.06541798089942039
36
+ },
37
+ {
38
+ "position": -1,
39
+ "layer": 11,
40
+ "refusal_score": -8.494709014892578,
41
+ "steering_score": 7.610245227813721,
42
+ "kl_div_score": 0.014720118389356012
43
+ },
44
+ {
45
+ "position": -1,
46
+ "layer": 10,
47
+ "refusal_score": -7.331896781921387,
48
+ "steering_score": 4.227353096008301,
49
+ "kl_div_score": 0.08309234215185003
50
+ },
51
+ {
52
+ "position": -5,
53
+ "layer": 14,
54
+ "refusal_score": -6.636038303375244,
55
+ "steering_score": 4.687836170196533,
56
+ "kl_div_score": 0.0141100183367428
57
+ },
58
+ {
59
+ "position": -5,
60
+ "layer": 15,
61
+ "refusal_score": -5.5487470626831055,
62
+ "steering_score": 2.144282102584839,
63
+ "kl_div_score": 0.02395068876429188
64
+ },
65
+ {
66
+ "position": -5,
67
+ "layer": 16,
68
+ "refusal_score": -5.478135108947754,
69
+ "steering_score": 1.0938483476638794,
70
+ "kl_div_score": 0.012701942605351556
71
+ },
72
+ {
73
+ "position": -5,
74
+ "layer": 10,
75
+ "refusal_score": -5.376955032348633,
76
+ "steering_score": 7.028485298156738,
77
+ "kl_div_score": 0.08796039998349871
78
+ },
79
+ {
80
+ "position": -3,
81
+ "layer": 14,
82
+ "refusal_score": -0.7610650062561035,
83
+ "steering_score": 2.843015193939209,
84
+ "kl_div_score": 0.01738917049392854
85
+ },
86
+ {
87
+ "position": -2,
88
+ "layer": 13,
89
+ "refusal_score": -0.7074866890907288,
90
+ "steering_score": 2.2404544353485107,
91
+ "kl_div_score": 0.029380356415469547
92
+ },
93
+ {
94
+ "position": -3,
95
+ "layer": 15,
96
+ "refusal_score": -0.5506424307823181,
97
+ "steering_score": 3.8185596466064453,
98
+ "kl_div_score": 0.0688840249624754
99
+ },
100
+ {
101
+ "position": -4,
102
+ "layer": 13,
103
+ "refusal_score": 1.1747349500656128,
104
+ "steering_score": 1.4116528034210205,
105
+ "kl_div_score": 0.026868538149248188
106
+ },
107
+ {
108
+ "position": -4,
109
+ "layer": 12,
110
+ "refusal_score": 3.0150375366210938,
111
+ "steering_score": 5.0643134117126465,
112
+ "kl_div_score": 0.04881445734757695
113
+ },
114
+ {
115
+ "position": -4,
116
+ "layer": 10,
117
+ "refusal_score": 4.0028791427612305,
118
+ "steering_score": 4.6848249435424805,
119
+ "kl_div_score": 0.03246775216125604
120
+ },
121
+ {
122
+ "position": -4,
123
+ "layer": 11,
124
+ "refusal_score": 4.505997657775879,
125
+ "steering_score": 3.1285815238952637,
126
+ "kl_div_score": 0.03856063620648774
127
+ }
128
+ ]
select_direction/kl_div_scores.png ADDED