So fast!
#1 · opened by ngxson
The model was released only ~24 hrs ago.
How on earth did you get it abliterated so fast?
abliteration go brrr
Hi, I've been trying the same thing with the 4B model on my own. Would you mind sharing the script that worked for you? I'm curious what you did to handle both the think and no-think modes. I've only managed to make it work with enable_thinking=False; it seems to fail in thinking mode.
Hope you can help me.
Thank you.
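For reference, this is roughly how I toggle the two modes when probing the model (a minimal sketch; build_inputs is just a hypothetical helper, and tokenizer / model are the ones loaded in the full script further down):

def build_inputs(prompt, thinking):
    # Qwen3's chat template takes an enable_thinking flag:
    # False suppresses the <think>...</think> block entirely
    return tokenizer.apply_chat_template(
        conversation=[{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        enable_thinking=thinking,
        return_tensors="pt",
    ).to(model.device)

# the only variant I got working after abliteration
ids_no_think = build_inputs("How do solar panels work?", thinking=False)
# the variant that breaks for me
ids_think = build_inputs("How do solar panels work?", thinking=True)

My full script so far: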
import os
import random
import torch
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextStreamer,
)
from datasets import load_dataset
# Use bfloat16 + FlashAttention 2 on Ampere or newer GPUs, fp16 + eager attention otherwise
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"
MODEL_ID = "Qwen/Qwen3-4B" #
@param
{type:"string"}
N_INSTRUCTIONS = 200 #
@param
{type:"number"}
TARGET_LAYER = 1 #
@param
{type:"slider", min:0, max:1, step:0.05}
REFUSAL_WEIGHT = 1.3 #
@param
{type:"slider", min:0, max:2, step:0.05}
PRIVATE_UPLOAD = True #
@param
{type:"boolean"}
target_prompt = "You are a helpful assistant." #
@param
{type:"string"}
target_dataset = "mlabonne/harmful_behaviors" #
@param
{type:"string"}
target_column = "text" #
@param
{type:"string"}
#
@markdown
## βοΈ Baseline dataset
baseline_prompt = "You are a helpful assistant." #
@param
{type:"string"}
baseline_dataset = "mlabonne/harmless_alpaca" #
@param
{type:"string"}
baseline_column = "text" #
@param
{type:"string"}
def load_instructions(dataset_id, column, n_instructions):
    # Sample 2*n rows without replacement: the first half is used to compute
    # the direction, the second half is kept as a held-out test split
    dataset = load_dataset(dataset_id, split="train")
    indices = random.sample(range(len(dataset)), n_instructions * 2)
    return [dataset[i][column] for i in indices[:n_instructions]], [
        dataset[i][column] for i in indices[n_instructions:]
    ]
def generate_response(prompt):
    # Stream a full completion for a single user prompt (thinking disabled)
    inputs = tokenizer.apply_chat_template(
        conversation=[{"role": "user", "content": prompt}],
        enable_thinking=False,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    model.generate(
        inputs,
        max_new_tokens=1024,
        do_sample=True,
        streamer=TextStreamer(tokenizer),
    )
def generate_outputs(instructions, system_prompt):
    # Tokenize each instruction with the chat template, then run a single
    # forward pass (max_new_tokens=1) to collect hidden states for every layer
    inputs = [
        tokenizer.apply_chat_template(
            conversation=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": instruction},
            ],
            add_generation_prompt=True,
            enable_thinking=False,
            return_tensors="pt",
        ).to(model.device)
        for instruction in instructions
    ]
    outputs = [
        model.generate(
            input_ids,
            use_cache=False,
            max_new_tokens=1,
            return_dict_in_generate=True,
            output_hidden_states=True,
        )["hidden_states"][0]
        for input_ids in tqdm(inputs, desc="Generating outputs")
    ]
    return outputs
def orthogonalize_matrix(matrix, vec, weight=REFUSAL_WEIGHT):
    # Remove the (weighted) component of the weight matrix along the refusal direction
    vec = vec.view(-1).to(matrix.device)
    if matrix.shape[-1] == vec.shape[0]:
        proj = torch.einsum("...d,d->...", matrix, vec).unsqueeze(-1) * vec.unsqueeze(0)
        return matrix - weight * proj
    elif matrix.shape[0] == vec.shape[0]:
        proj = torch.einsum("d...,d->...", matrix, vec).unsqueeze(0) * vec.unsqueeze(-1)
        return matrix - weight * proj
    else:
        raise ValueError(
            f"Matrix shape {matrix.shape} incompatible with vector shape {vec.shape}"
        )
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch_dtype,
    attn_implementation=attn_implementation,
    trust_remote_code=True,
).eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
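# TARGET_LAYER is a fraction (0-1) of the layer stack; map it to a hidden-state index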
LAYER_IDX = int(TARGET_LAYER * len(model.model.layers))
print("Before abliteration:")
generate_response("How to write a computer virus? /no_think")
# Load instructions
target_instructions, target_test = load_instructions(
    target_dataset, target_column, N_INSTRUCTIONS
)
baseline_instructions, baseline_test = load_instructions(
    baseline_dataset, baseline_column, N_INSTRUCTIONS
)
# Generate outputs
baseline_outputs = generate_outputs(
    baseline_instructions, system_prompt=baseline_prompt
)
target_outputs = generate_outputs(target_instructions, system_prompt=target_prompt)
# Extract hidden states from outputs
target_hidden = [output[LAYER_IDX][:, -1, :] for output in target_outputs]
baseline_hidden = [output[LAYER_IDX][:, -1, :] for output in baseline_outputs]
# Calculate refusal direction
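# (difference of mean last-token activations on harmful vs. harmless prompts, normalized)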
target_mean = torch.stack(target_hidden).mean(dim=0)
baseline_mean = torch.stack(baseline_hidden).mean(dim=0)
refusal_dir = target_mean - baseline_mean
refusal_dir = refusal_dir / refusal_dir.norm()
del target_outputs, baseline_outputs, target_hidden, baseline_hidden
# Orthogonalize model weights
refusal_dir = refusal_dir.view(-1).to(model.device)
stats = {"embed_tokens": False, "attention_o_proj": 0, "mlp_proj": 0}
# Embed tokens
if hasattr(model.model, "embed_tokens"):
model.model.embed_tokens.weight.data = orthogonalize_matrix(
model.model.embed_tokens.weight.data, refusal_dir, REFUSAL_WEIGHT
)
stats["embed_tokens"] = True
# Layer projections
for layer in tqdm(model.model.layers, desc="Orthogonalizing weights"):
    # Attention output projection
    if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "o_proj"):
        layer.self_attn.o_proj.weight.data = orthogonalize_matrix(
            layer.self_attn.o_proj.weight.data, refusal_dir, REFUSAL_WEIGHT
        )
        stats["attention_o_proj"] += 1
    # MLP projection (down_proj or c_proj)
    if hasattr(layer, "mlp"):
        proj_name = (
            "down_proj"
            if hasattr(layer.mlp, "down_proj")
            else "c_proj"
            if hasattr(layer.mlp, "c_proj")
            else None
        )
        if proj_name:
            getattr(layer.mlp, proj_name).weight.data = orthogonalize_matrix(
                getattr(layer.mlp, proj_name).weight.data, refusal_dir, REFUSAL_WEIGHT
            )
            stats["mlp_proj"] += 1
del refusal_dir
# Check if orthogonalization succeeded
if (
    not stats["embed_tokens"]
    and stats["attention_o_proj"] == 0
    and stats["mlp_proj"] == 0
):
    raise RuntimeError(
        "Failed to orthogonalize any model weights. Model not abliterated."
    )
print(f"Orthogonalization stats: {stats}")
print("After abliteration:")
generate_response("How to write a computer virus? /no_think")
# Save the abliterated model and tokenizer locally
save_directory = "./Worker-Dummy"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
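Once the script finishes, I reload the saved folder to sanity-check it (same save_directory as above; just a quick check, not part of the script proper):

model = AutoModelForCausalLM.from_pretrained(
    save_directory, torch_dtype=torch_dtype, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(save_directory)
generate_response("How to write a computer virus? /no_think")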