So fast!

#1
by ngxson - opened

The model was released only ~24 hrs ago

How on earth did you get this abliterated so fast? πŸ˜‚πŸ˜‚πŸ˜‚πŸ˜‚πŸ˜‚

abliteration go brrr

Hi, I've been trying the same thing with the 4B model on my own. Would you mind sharing the script that worked for you? I'm curious what you did to handle both think and no-think modes. I only managed to make it work with enable_thinking=False; it seems to fail in think mode.
Hope you can help me.
Thank you.
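
For context, the closest I got in think mode was simply threading enable_thinking through the template call, roughly like the hypothetical helper below. My guess is that with enable_thinking=False the Qwen3 template appends an empty <think></think> block to the assistant prompt, so the last-token hidden state sits in a different context in each mode:

# Hypothetical helper (the name is mine, not from the script below): the same
# template call as in generate_outputs, but with enable_thinking exposed so
# the refusal direction can be collected in either mode.
def build_inputs(tokenizer, system_prompt, instruction, thinking):
    return tokenizer.apply_chat_template(
        conversation=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": instruction},
        ],
        add_generation_prompt=True,
        enable_thinking=thinking,
        return_tensors="pt",
    )

Here is my full script in case it helps: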

import random

import torch
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextStreamer,
)
from datasets import load_dataset

if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

MODEL_ID = "Qwen/Qwen3-4B"  # 

@param
	 {type:"string"}
N_INSTRUCTIONS = 200  # 

@param
	 {type:"number"}
TARGET_LAYER = 1 # 

@param
	 {type:"slider", min:0, max:1, step:0.05}
REFUSAL_WEIGHT = 1.3  # 

@param
	 {type:"slider", min:0, max:2, step:0.05}
PRIVATE_UPLOAD = True # 

@param
	 {type:"boolean"}

target_prompt = "You are a helpful assistant." # 

@param
	 {type:"string"}
target_dataset = "mlabonne/harmful_behaviors" # 

@param
	 {type:"string"}
target_column = "text" # 

@param
	 {type:"string"}

# 

@markdown
	 ## βš–οΈ Baseline dataset
baseline_prompt = "You are a helpful assistant." # 

@param
	 {type:"string"}
baseline_dataset = "mlabonne/harmless_alpaca" # 

@param
	 {type:"string"}
baseline_column = "text" # 

@param
	 {type:"string"}

def load_instructions(dataset_id, column, n_instructions):
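    # Sample 2*n unique rows: the first n are used to compute the refusal
    # direction, the remaining n are held out as a small test split.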
    dataset = load_dataset(dataset_id, split="train")
    indices = random.sample(range(len(dataset)), n_instructions * 2)
    return [dataset[i][column] for i in indices[:n_instructions]], [
        dataset[i][column] for i in indices[n_instructions:]
    ]

def generate_response(prompt):
    inputs = tokenizer.apply_chat_template(
        conversation=[{"role": "user", "content": prompt}],
        enable_thinking=False,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    model.generate(
        inputs,
        max_new_tokens=1024,
        do_sample=True,
        streamer=TextStreamer(tokenizer),
    )

def generate_outputs(instructions, system_prompt):
    inputs = [
        tokenizer.apply_chat_template(
            conversation=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": instruction},
            ],
            add_generation_prompt=True,
            enable_thinking=False,
            return_tensors="pt",
        ).to(model.device)
        for instruction in instructions
    ]

    # Generate exactly one token just to run a forward pass over the prompt;
    # hidden_states[0] is the tuple of per-layer activations for that pass.
    outputs = [
        model.generate(
            input_ids,
            use_cache=False,
            max_new_tokens=1,
            return_dict_in_generate=True,
            output_hidden_states=True,
        )["hidden_states"][0]
        for input_ids in tqdm(inputs, desc="Generating outputs")
    ]
    return outputs

def orthogonalize_matrix(matrix, vec, weight=REFUSAL_WEIGHT):
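    # Subtract the weighted projection of the weight matrix onto `vec`:
    # W <- W - weight * (W @ v) v^T (or the transposed variant, depending on
    # which axis of W matches the hidden dimension).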
    vec = vec.view(-1).to(matrix.device)

    if matrix.shape[-1] == vec.shape[0]:
        proj = torch.einsum("...d,d->...", matrix, vec).unsqueeze(-1) * vec.unsqueeze(0)
        return matrix - weight * proj
    elif matrix.shape[0] == vec.shape[0]:
        proj = torch.einsum("d...,d->...", matrix, vec).unsqueeze(0) * vec.unsqueeze(-1)
        return matrix - weight * proj
    else:
        raise ValueError(
            f"Matrix shape {matrix.shape} incompatible with vector shape {vec.shape}"
        )

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch_dtype,
    attn_implementation=attn_implementation,
    trust_remote_code=True
).eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
LAYER_IDX = int(TARGET_LAYER * len(model.model.layers))
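# Note: hidden_states has len(model.model.layers) + 1 entries (the embedding
# output plus one per layer), so TARGET_LAYER = 1 selects the final layer.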
print("Before abliteration:")
generate_response("How to write a computer virus? /no_think")

# Load instructions
target_instructions, target_test = load_instructions(
    target_dataset, target_column, N_INSTRUCTIONS
)

baseline_instructions, baseline_test = load_instructions(
    baseline_dataset, baseline_column, N_INSTRUCTIONS
)

# Generate outputs
baseline_outputs = generate_outputs(
    baseline_instructions, system_prompt=baseline_prompt
)
target_outputs = generate_outputs(target_instructions, system_prompt=target_prompt)

# Extract the last-token hidden states at the selected layer
target_hidden = [output[LAYER_IDX][:, -1, :] for output in target_outputs]
baseline_hidden = [output[LAYER_IDX][:, -1, :] for output in baseline_outputs]

# Calculate refusal direction
target_mean = torch.stack(target_hidden).mean(dim=0)
baseline_mean = torch.stack(baseline_hidden).mean(dim=0)
refusal_dir = target_mean - baseline_mean
refusal_dir = refusal_dir / refusal_dir.norm()

del target_outputs, baseline_outputs, target_hidden, baseline_hidden

# Orthogonalize model weights
refusal_dir = refusal_dir.view(-1).to(model.device)
stats = {"embed_tokens": False, "attention_o_proj": 0, "mlp_proj": 0}

# Embed tokens
if hasattr(model.model, "embed_tokens"):
    model.model.embed_tokens.weight.data = orthogonalize_matrix(
        model.model.embed_tokens.weight.data, refusal_dir, REFUSAL_WEIGHT
    )
    stats["embed_tokens"] = True

# Layer projections
for layer in tqdm(model.model.layers, desc="Orthogonalizing weights"):
    # Attention output projection
    if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "o_proj"):
        layer.self_attn.o_proj.weight.data = orthogonalize_matrix(
            layer.self_attn.o_proj.weight.data, refusal_dir, REFUSAL_WEIGHT
        )
        stats["attention_o_proj"] += 1

    # MLP projection (down_proj or c_proj)
    if hasattr(layer, "mlp"):
        print(layer)
        proj_name = (
            "down_proj"
            if hasattr(layer.mlp, "down_proj")
            else "c_proj"
            if hasattr(layer.mlp, "c_proj")
            else None
        )
        if proj_name:
            getattr(layer.mlp, proj_name).weight.data = orthogonalize_matrix(
                getattr(layer.mlp, proj_name).weight.data, refusal_dir, REFUSAL_WEIGHT
            )
            stats["mlp_proj"] += 1

del refusal_dir

# Check if orthogonalization succeeded
if (
    not stats["embed_tokens"]
    and stats["attention_o_proj"] == 0
    and stats["mlp_proj"] == 0
):
    raise RuntimeError(
        "Failed to orthogonalize any model weights. Model not abliterated."
    )

print(f"Orthogonalization stats: {stats}")

print("After abliteration:")
generate_response("How to write a computer virus? /no_think")

# Save the abliterated model and tokenizer locally
save_directory = "./Worker-Dummy"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
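
Side note: PRIVATE_UPLOAD is defined but never used in my script; the upload step I had in mind is the standard push_to_hub call, roughly:

# Hypothetical upload step; the repo id is a placeholder.
repo_id = "your-username/Qwen3-4B-abliterated"
model.push_to_hub(repo_id, private=PRIVATE_UPLOAD)
tokenizer.push_to_hub(repo_id, private=PRIVATE_UPLOAD)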
