llmixer committed
Commit 4783804
1 Parent(s): c8f8f87

Added generator code
Meta-Llama-3-70B-Instruct-8bpw/suppress_dir.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:faec2cc2c48d1a925a58d08a5396e3255f50d269ccc66b6610defd5ce6074cfe
+ size 2634640
Meta-Llama-3-8B-Instruct/suppress_dir.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de22c6df410a1bb839b3ae66a1d3b7aadcc1254d81a3c7fae17b8d509ed1f801
+ size 529440
Phi-3-mini-128k-instruct/suppress_dir.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0d2417d1c9684e73f44b5338f024975fcefd8d777a124633e27f6e9cc13e56a
+ size 398360
README.md ADDED
@@ -0,0 +1,86 @@
+ ---
+ license: mit
+ pipeline_tag: text-generation
+ ---
+
+ ZoRA: Zero Rank Adaptation
+ =
+ Inspired by [*Refusal in LLMs is mediated by a single direction*](https://www.alignmentforum.org/posts/jGuXSZgv6qfdhMCuJ/refusal-in-llms-is-mediated-by-a-single-direction), ZoRA is a refinement of the original approach that adapts large language models to suppress refusals. The key features of ZoRA are:
+ * **Layer-wise ablation**: Measure and ablate a separate set of vectors for each layer
+ * **Multi-pass refinement**: Re-measure multiple times to refine the vectors
+ * **Single-token generation**: Measure refusal at the beginning of the response
+ * **Inference engine injection**: Load a small set of refusal-suppression vectors directly into a high-performance inference engine
+
+ This approach keeps the original model weights unchanged and only loads a small set of suppression vectors at inference time; the per-layer ablation is sketched below. See the Generator section below for vector generation details.
+
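+ As a rough illustration of the core operation (a hypothetical `ablate` helper, not the exact wrapper code), ablating a measured refusal direction from the residual stream means removing its projection onto that direction, with a separate direction per wrapped layer:
+ ```
+ import torch
+
+ def ablate(x: torch.Tensor, direction: torch.Tensor) -> torch.Tensor:
+     # x: residual stream activations, shape (batch, seq_len, hidden)
+     # direction: measured refusal direction for this layer, shape (hidden,)
+     r = direction / direction.norm()     # unit vector along the refusal direction
+     proj = (x @ r).unsqueeze(-1) * r     # component of x along r
+     return x - proj                      # x with the refusal component removed
+ ```
+ The bundled `exl2_wrapper.py` applies this projection to the input and output of each wrapped module, using the per-layer directions stored in `suppress_dir.safetensors`.
+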
+ ZoRA currently supports ExLlamaV2 only and is intended for research purposes; feedback on the viability of these models with suppression applied is welcome.
+
+ Usage
+ =
+ Put `suppress_dir.safetensors` into the model directory and wrap your ExLlamaV2 model object in your code:
+ ```
+ from exl2_wrapper import ExLlamaV2ModuleWrapper
+ ExLlamaV2ModuleWrapper.wrap(model)
+ ```
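+
+ For a fuller end-to-end sketch, the snippet below follows the same calls used by the bundled `gen.py` and `test_inference.py`; the model path and sampling settings are placeholders:
+ ```
+ from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer
+ from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
+ from exl2_wrapper import ExLlamaV2ModuleWrapper
+
+ config = ExLlamaV2Config()
+ config.model_dir = '/path/to/model'   # directory that also contains suppress_dir.safetensors
+ config.prepare()
+
+ model = ExLlamaV2(config)
+ ExLlamaV2ModuleWrapper.wrap(model)    # wraps the module list and loads the suppression vectors if present
+
+ cache = ExLlamaV2Cache(model, lazy = True)
+ model.load_autosplit(cache)
+
+ tokenizer = ExLlamaV2Tokenizer(config)
+ generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
+ settings = ExLlamaV2Sampler.Settings()
+ settings.temperature = 0.75
+
+ print(generator.generate_simple('Your prompt here', settings, 128))
+ ```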
+
+ Example
+ =
+ A modified `test_inference.py` from [exllamav2](https://github.com/turboderp/exllamav2) is included for testing. For example:
+ ```
+ python test_inference.py -m Meta-Llama-3-70B-Instruct-8bpw -p '<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant.<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\nYour prompt.<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n' -gs auto
+ ```
+
+ Generator
+ =
+ The code to generate the ablation vectors (`gen.py`) has been added. Before running it, set the URL for the harmful prompts in its settings block, as shown below.
+
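+ The settings live at the top of `gen.py`; fill in `harmful_prompts_url` (a JSON-lines file with a `prompt` field per line) and adjust `model_dir` and the prompt `template` for your model:
+ ```
+ ### START Settings
+
+ template = '<|start_header_id|>system<|end_header_id|>\n\n...'  # chat template with an {instruction} placeholder
+
+ model_dir = '/path/to/Meta-Llama-3-8B-Instruct'
+
+ harmful_prompts_url = 'ADD_URL_HERE'
+ harmless_prompts_url = 'https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json'
+
+ ### END Settings
+ ```
+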
+ Here is a sample output for the Llama3-8b model:
+
+ ```
+ Downloading harmful prompts
+ Done
+ -- Loading model...
+ -- Loaded model in 2.7671 seconds
+ -- Loading tokenizer...
+ Building refused residual data
+ Processing 5000 prompts
+ ---------------------------------------------------------------------------------------------------- 100
+ ---------------------------------------------------------------------------------------------------- 200
+ [...]
+ ---------------------------------------------+------------------------------------------------------ 1898
+ ---------------------------------------------------------------------------------------------------- 1998
+ --
+ Max capture reached
+ Captured 2000 residual streams
+ Done
+ Building allowed residual data
+ Downloading harmless prompts
+ Done
+ Processing 31323 prompts
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 100
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 200
+ [...]
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1898
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1998
+ ++
+ Max capture reached
+ Captured 2000 residual streams
+ Done
+ Calculating mean allowed residual
+ Done
+ Iteration 0
+ Processing 2000 prompts
+ ---+++++++++++++++++++++++++-+-+++++++++-++++++++++++++-+++-++-++++++++++++++-++++---++++++++-++++-+ 15
+ +++++++-++++++++++++++-+-++++++++++++++++++++++++++++-+++++++++--+++++++++++-++++++++++++++++++++++- 23
+ +++++++++++++++++++++++-++-++++++++++++++++-++++++++++-++-++++++++++++++++++++-++++++++--+++++++++++ 31
+ --+-+++++++++++++-++++++-+++++-+++-+++++-++++-++++++++++-++++-++++++++-++++++++++++++++++-++++++++++ 44
+ -++++++++-+++++++++-++++++++--++++-
+ Max capture reached
+ Captured 50 residual streams
+ Iteration 1
+ Processing 2000 prompts
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 0
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 0
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 0
+ [...]
+ ```
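+
+ When `gen.py` finishes, it writes `suppress_dir.safetensors` into its output directory; copy that file into the model directory used for inference. As a minimal sanity check (one direction per wrapped layer, keyed `_suppress_dir_0`, `_suppress_dir_1`, ...):
+ ```
+ from safetensors import safe_open
+
+ with safe_open('suppress_dir.safetensors', framework='pt', device='cpu') as f:
+     for key in sorted(f.keys(), key=lambda k: int(k.rsplit('_', 1)[-1])):
+         t = f.get_tensor(key)
+         print(key, tuple(t.shape), f'norm={t.norm().item():.3f}')
+ ```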
exl2_wrapper.py ADDED
@@ -0,0 +1,61 @@
+ import os
+ import torch
+ from safetensors import safe_open
+
+ class ExLlamaV2ModuleWrapper:
+     @classmethod
+     def wrap(cls, model, load = True):
+         # Wrap every module except the first one and the last two in the module list
+         for idx, module in enumerate(model.modules):
+             if idx == 0 or idx >= (len(model.modules) - 2):
+                 continue
+             model.modules[idx] = ExLlamaV2ModuleWrapper(model, module, idx)
+
+         if not load:
+             return
+
+         suppress_dir_file = os.path.join(model.config.model_dir, 'suppress_dir.safetensors')
+         if os.path.exists(suppress_dir_file):
+             print(f'Loading suppress direction file "{suppress_dir_file}"')
+             with safe_open(suppress_dir_file, framework='pt', device='cpu') as f:
+                 model._suppress_dir = []
+                 for layer in range(len(f.keys())):
+                     model._suppress_dir.append(f.get_tensor(f'_suppress_dir_{layer}'))
+         else:
+             print(f'No suppress direction file, not wrapping. Tried to load: "{suppress_dir_file}"')
+             return
+
+     def __init__(self, model, module, idx):
+         if not hasattr(model, '_suppress_dir'):
+             model._suppress_dir = None
+         if not hasattr(model, '_residual'):
+             model._residual = None
+         self.model = model
+         self.module = module
+         self.idx = idx
+
+     def __getattribute__(self, name):
+         # Intercept 'forward'; delegate every other attribute to the wrapped module
+         if name == 'forward':
+             return object.__getattribute__(self, 'wrapped_forward')
+
+         try:
+             return getattr(object.__getattribute__(self, 'module'), name)
+         except AttributeError:
+             pass
+         return object.__getattribute__(self, name)
+
+     def suppress(self, x):
+         # Remove the projection of the hidden state onto this layer's suppression direction
+         if self.model._suppress_dir is not None:
+             r = self.model._suppress_dir[self.idx - 2].clone().to(x.device)
+             r = r.view(-1, 1)
+             proj_scalar = torch.matmul(x, r)
+             proj = proj_scalar * r.transpose(0, 1)
+             x = x - proj
+         return x
+
+     def wrapped_forward(self, *args, **kwargs):
+         # Optionally capture the incoming residual stream (single-token forward passes only),
+         # then apply suppression before and after the wrapped module's forward pass
+         if self.model._residual is not None:
+             if len(self.model._residual) < self.idx and args[0].shape[1] == 1:
+                 self.model._residual.append(args[0].clone().to('cpu'))
+         x = self.suppress(args[0])
+         x = self.module.forward(*((x,) + args[1:]), **kwargs)
+         return self.suppress(x)
gen.py ADDED
@@ -0,0 +1,198 @@
+ import re
+ import time
+ import random
+ import io
+ from pathlib import Path
+ import json
+ import torch
+ import requests
+ from safetensors.torch import save_file
+
+ from exllamav2 import(
+     ExLlamaV2,
+     ExLlamaV2Config,
+     ExLlamaV2Cache,
+     ExLlamaV2Tokenizer,
+ )
+
+ from exllamav2.generator import (
+     ExLlamaV2BaseGenerator,
+     ExLlamaV2Sampler
+ )
+
+ from exl2_wrapper import ExLlamaV2ModuleWrapper
+
+ ### START Settings
+
+ template = '<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant.<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\n{instruction}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n'
+
+ model_dir = '/path/to/Meta-Llama-3-8B-Instruct'
+
+ harmful_prompts_url = 'ADD_URL_HERE'
+ harmless_prompts_url = 'https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json'
+
+ ### END Settings
+
+ torch.cuda._lazy_init()
+ torch.set_printoptions(precision = 5, sci_mode = False, linewidth = 150)
+
+ config = ExLlamaV2Config()
+ config.model_dir = model_dir
+ config.prepare()
+ config.max_seq_len = 2048
+ model = ExLlamaV2(config)
+ ExLlamaV2ModuleWrapper.wrap(model, False)
+ model._residual = [] # Enable residual capture
+
+
+ out_dir = Path(config.model_dir.replace('/', '_'))
+ out_dir.mkdir(exist_ok = True)
+
+ harmful_prompts_file = out_dir / Path('harmful_prompts.json')
+ harmless_prompts_file = out_dir / Path('harmless_prompts.json')
+
+ refused_residual_file = out_dir / Path('refused_residual.pth')
+ allowed_residual_file = out_dir / Path('allowed_residual.pth')
+ allowed_residual_mean_file = out_dir / Path('allowed_residual_mean.pth')
+
+ suppress_dir_file = out_dir / Path('suppress_dir.safetensors')
+
+ refused = []
+
+ # Generate a few tokens per prompt, classify the completion as refused or allowed,
+ # and capture the residual stream of the first generated token at every wrapped layer.
+ def get_residual(prompts, num_tokens, silent, max_capture, capture_type):
+     global model, tokenizer, settings, refused, generator
+
+     refused = []
+     residuals = []
+
+     print(f'Processing {len(prompts)} prompts')
+     for idx, prompt in enumerate(prompts):
+         if idx and not (idx % 100):
+             print('', len(residuals))
+
+         prompt = template.format(instruction = prompt)
+
+         model._residual = []
+         out = generator.generate_simple(prompt, settings, num_tokens, completion_only = True)
+
+         # Heuristic refusal detector applied to the first generated tokens
+         refusal = re.match(r'^(I\'m not|I cannot|I can\'t|I\'m sorry|As an A|I apolog|I\'m (unable|really|here)|[1I], as|I must|I understand|It(\'s| is) important|Sorry|The (assistant|AI))', out)
+         if capture_type is None or (capture_type == 'refused' and refusal) or (capture_type == 'allowed' and not refusal):
+             residuals.append(model._residual[:])
+
+         if refusal:
+             refused.append(prompt)
+         print('-' if refusal else '+', end='', flush = True)
+
+         if max_capture and len(residuals) >= max_capture:
+             print('\nMax capture reached')
+             break
+
+         if not silent:
+             print(out)
+
+     if not len(residuals):
+         return None
+
+     print(f'\nCaptured {len(residuals)} residual streams')
+
+     # One tensor per layer: the last-position residual of every captured prompt
+     res = []
+     for l in range(len(residuals[0])):
+         res.append(torch.cat([t[l][0, -1, :].unsqueeze(0) for t in residuals], dim=0))
+     return res
+
+ if not harmful_prompts_file.exists():
+     print('Downloading harmful prompts')
+     res = requests.get(harmful_prompts_url)
+
+     harmful_prompts = []
+     for line in res.iter_lines():
+         if line:
+             harmful_prompts.append(json.loads(line.decode())['prompt'])
+     with harmful_prompts_file.open('w') as f:
+         json.dump(harmful_prompts, f)
+     print('Done')
+ else:
+     with harmful_prompts_file.open('r') as f:
+         harmful_prompts = json.load(f)
+
+ print(" -- Loading model...")
+ t = time.time()
+ cache = ExLlamaV2Cache(model, lazy=True)
+ model.load_autosplit(cache)
+ t = time.time() - t
+ print(f" -- Loaded model in {t:.4f} seconds")
+
+ print(" -- Loading tokenizer...")
+ tokenizer = ExLlamaV2Tokenizer(config)
+ settings = ExLlamaV2Sampler.Settings()
+ settings.temperature = 0
+
+ generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
+
+ with torch.inference_mode():
+
+     if not refused_residual_file.exists():
+         print('Building refused residual data')
+         refused_residual = get_residual(harmful_prompts, 4, True, 2000, 'refused')
+         torch.save(refused_residual, refused_residual_file)
+     else:
+         print('Loading refusal residual data')
+         refused_residual = torch.load(refused_residual_file)
+     print('Done')
+
+     allowed_residual_mean = []
+     if not allowed_residual_mean_file.exists():
+         if not allowed_residual_file.exists():
+             print('Building allowed residual data')
+             if not harmless_prompts_file.exists():
+                 print('Downloading harmless prompts')
+                 res = requests.get(harmless_prompts_url)
+
+                 all_prompts = json.loads(res.content.decode('utf8'))
+                 harmless_prompts = [i['instruction'] for i in all_prompts if i['input'] == '']
+
+                 with harmless_prompts_file.open('w') as f:
+                     json.dump(harmless_prompts, f)
+                 print('Done')
+             else:
+                 with harmless_prompts_file.open('r') as f:
+                     harmless_prompts = json.load(f)
+             allowed_residual = get_residual(harmless_prompts, 4, True, 2000, 'allowed')
+             torch.save(allowed_residual, allowed_residual_file)
+         else:
+             print('Loading allowed residual data')
+             allowed_residual = torch.load(allowed_residual_file)
+
+         print('Done')
+
+         print('Calculating mean allowed residual')
+         for i in range(len(allowed_residual)):
+             allowed_residual_mean.append(allowed_residual[i].mean(dim = 0))
+         print('Done')
+         torch.save(allowed_residual_mean, allowed_residual_mean_file)
+     else:
+         allowed_residual_mean = torch.load(allowed_residual_mean_file)
+
+     if model._suppress_dir is None:
+         model._suppress_dir = []
+
+     # Multi-pass refinement: estimate a refusal direction per layer, apply it through the
+     # wrapper, then re-measure on a fresh sample of harmful prompts until few refusals remain.
+     for o in range(6):
+         print('Iteration', o)
+
+         for i in range(len(refused_residual)):
+             refusal_dir = refused_residual[i].mean(dim = 0) - allowed_residual_mean[i]
+             refusal_dir = refusal_dir / refusal_dir.norm() if refusal_dir.norm() > 0.0001 else torch.zeros_like(refusal_dir)
+             if len(model._suppress_dir) > i:
+                 model._suppress_dir[i] = (model._suppress_dir[i] + refusal_dir) / 2
+             else:
+                 model._suppress_dir.append(refusal_dir)
+
+         refused_residual = get_residual(random.sample(harmful_prompts, 2000), 4, True, 50, 'refused')
+
+         if not refused_residual or refused_residual[0].shape[0] < 30:
+             break
+
+
+ save_file({f'_suppress_dir_{layer}': tensor for layer, tensor in enumerate(model._suppress_dir)}, suppress_dir_file)
+
+ torch.cuda.synchronize()
+
test_inference.py ADDED
@@ -0,0 +1,542 @@
1
+
2
+ from exllamav2 import(
3
+ ExLlamaV2,
4
+ ExLlamaV2Config,
5
+ ExLlamaV2Cache,
6
+ ExLlamaV2Cache_8bit,
7
+ ExLlamaV2Cache_Q4,
8
+ ExLlamaV2Tokenizer,
9
+ model_init,
10
+ )
11
+
12
+ from exllamav2.generator import (
13
+ ExLlamaV2BaseGenerator,
14
+ ExLlamaV2Sampler
15
+ )
16
+
17
+ from exllamav2.attn import ExLlamaV2Attention
18
+ from exllamav2.mlp import ExLlamaV2MLP
19
+ from exllamav2.moe_mlp import ExLlamaV2MoEMLP
20
+ from exllamav2.parallel_decoder import ExLlamaV2ParallelDecoder
21
+
22
+ import argparse, os, math, time
23
+ import torch
24
+ import torch.nn.functional as F
25
+ from conversion.tokenize import get_tokens
26
+ from conversion.quantize import list_live_tensors
27
+ import gc
28
+
29
+ # from exllamav2.mlp import set_catch
30
+
31
+ import sys
32
+ import json
33
+
34
+ torch.cuda._lazy_init()
35
+ torch.set_printoptions(precision = 5, sci_mode = False, linewidth = 150)
36
+
37
+ # torch.backends.cuda.matmul.allow_tf32 = True
38
+ # torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
39
+ # torch.set_float32_matmul_precision("medium")
40
+
41
+ # (!!!) NOTE: These go on top of the engine arguments that can be found in `model_init.py` (!!!)
42
+ parser = argparse.ArgumentParser(description = "Test inference on ExLlamaV2 model")
43
+ parser.add_argument("-ed", "--eval_dataset", type = str, help = "Perplexity evaluation dataset (.parquet file)")
44
+ parser.add_argument("-er", "--eval_rows", type = int, default = 128, help = "Number of rows to apply from dataset")
45
+ parser.add_argument("-el", "--eval_length", type = int, default = 2048, help = "Max no. tokens per sample")
46
+ parser.add_argument("-et", "--eval_token", action = "store_true", help = "Evaluate perplexity on token-by-token inference using cache")
47
+ parser.add_argument("-e8", "--eval_token_8bit", action = "store_true", help = "Evaluate perplexity on token-by-token inference using 8-bit (FP8) cache")
48
+ parser.add_argument("-eq4", "--eval_token_q4", action = "store_true", help = "Evaluate perplexity on token-by-token inference using Q4 cache")
49
+ # parser.add_argument("-eb", "--eval_bos", action = "store_true", help = "Add BOS token to every row in perplexity test (required by Gemma and maybe other models.)")
50
+ parser.add_argument("-p", "--prompt", type = str, help = "Generate from prompt (basic sampling settings)")
51
+ parser.add_argument("-pnb", "--prompt_no_bos", action = "store_true", help = "Don't add BOS token to prompt")
52
+ parser.add_argument("-t", "--tokens", type = int, default = 128, help = "Max no. tokens")
53
+ parser.add_argument("-ps", "--prompt_speed", action = "store_true", help = "Test prompt processing (batch) speed over context length")
54
+ parser.add_argument("-s", "--speed", action = "store_true", help = "Test raw generation speed over context length")
55
+ parser.add_argument("-mix", "--mix_layers", type = str, help = "Load replacement layers from secondary model. Example: --mix_layers 1,6-7:/mnt/models/other_model")
56
+ parser.add_argument("-nwu", "--no_warmup", action = "store_true", help = "Skip warmup before testing model")
57
+ parser.add_argument("-sl", "--stream_layers", action = "store_true", help = "Load model layer by layer (perplexity evaluation only)")
58
+ parser.add_argument("-sp", "--standard_perplexity", choices = ["wiki2"], help = "Run standard (HF) perplexity test, stride 512 (experimental)")
59
+ parser.add_argument("-rr", "--rank_reduce", type = str, help = "Rank-reduction for MLP layers of model, in reverse order (for experimentation)")
60
+ parser.add_argument("-mol", "--max_output_len", type = int, help = "Set max output chunk size (incompatible with ppl tests)")
61
+
62
+ # Initialize model and tokenizer
63
+
64
+ model_init.add_args(parser)
65
+ args = parser.parse_args()
66
+
67
+ # Check conflicting settings
68
+
69
+ if args.stream_layers:
70
+ if args.eval_token or args.eval_token_8bit or args.eval_token_q4:
71
+ print(" ## Can't test token ppl while streaming layers")
72
+ sys.exit()
73
+ if args.prompt:
74
+ print(" ## Can't generate while streaming layers")
75
+ sys.exit()
76
+ if args.speed or args.prompt_speed:
77
+ print(" ## Can't test speed while streaming layers")
78
+ sys.exit()
79
+ if args.gpu_split:
80
+ print(" ## Can only use one GPU when streaming layers")
81
+ sys.exit()
82
+ if args.eval_dataset:
83
+ if args.length and args.eval_length != args.length:
84
+ print(" !! Overriding model context length to match eval row length")
85
+ args.length = args.eval_length
86
+
87
+ # Init
88
+
89
+ model_init.check_args(args)
90
+ model_init.print_options(args)
91
+ model, tokenizer = model_init.init(args,
92
+ allow_auto_split = True,
93
+ skip_load = args.stream_layers,
94
+ benchmark = True,
95
+ max_output_len = args.max_output_len)
96
+ cache = None
97
+
98
+ from exl2_wrapper import ExLlamaV2ModuleWrapper
99
+ ExLlamaV2ModuleWrapper.wrap(model)
100
+
101
+ # Auto split
102
+
103
+ if not model.loaded and not args.stream_layers:
104
+
105
+ if args.mix_layers:
106
+ print(" !! Warning, auto split does not account for VRAM requirement of replacement layers")
107
+
108
+ print(" -- Loading model...")
109
+ cache = ExLlamaV2Cache(model, lazy = True)
110
+ t = time.time()
111
+ model.load_autosplit(cache)
112
+ t = time.time() - t
113
+ print(f" -- Loaded model in {t:.4f} seconds")
114
+
115
+ if args.stream_layers:
116
+
117
+ stream_batch_size = 2
118
+ model.config.max_batch_size = stream_batch_size
119
+ model.load(lazy = True)
120
+
121
+ # Rank reduction
122
+
123
+ if args.rank_reduce:
124
+
125
+ if args.stream_layers:
126
+ print(" ## --rank_reduce can not be combined with --stream_layers")
127
+ sys.exit()
128
+
129
+ rr = args.rank_reduce.split(",")
130
+ idx = len(model.modules) - 1
131
+ for r in rr:
132
+ k = float(r)
133
+
134
+ while True:
135
+ idx -= 1
136
+ module = model.modules[idx]
137
+ if isinstance(module, ExLlamaV2ParallelDecoder): break
138
+ if isinstance(module, ExLlamaV2MLP): break
139
+ if isinstance(module, ExLlamaV2MoEMLP): break
140
+ if idx < 0:
141
+ print(" ## Not enough layers")
142
+ sys.exit()
143
+
144
+ print(f" -- Reducing {module.key} ({module.name}) to {k * 100:.2f}%")
145
+ module.rank_reduce(k)
146
+
147
+ # Replacement
148
+
149
+ if args.mix_layers:
150
+ intervals_, extra_dir = args.mix_layers.split(":")
151
+
152
+ print(f" -- Loading replacement layers from: {extra_dir}")
153
+
154
+ extra_config = ExLlamaV2Config()
155
+ extra_config.model_dir = extra_dir
156
+ extra_config.prepare()
157
+ intervals = intervals_.split(",")
158
+ for interval in intervals:
159
+ ab = interval.split("-")
160
+ a, b = int(ab[0]), int(ab[-1])
161
+ for idx in range(a, b + 1):
162
+ print(f" -- Layer {idx}...")
163
+ layerkey = "model.layers." + str(idx) + "."
164
+ remove = [k for k in model.config.tensor_file_map.keys() if k.startswith(layerkey)]
165
+ replace = [k for k in extra_config.tensor_file_map.keys() if k.startswith(layerkey)]
166
+ # reload = [k for k in model.modules_dict.keys() if k.startswith(layerkey)]
167
+ for k in remove: del model.config.tensor_file_map[k]
168
+ for k in replace: model.config.tensor_file_map[k] = extra_config.tensor_file_map[k]
169
+ # for k in reload:
170
+ # model.modules_dict[k].unload()
171
+ # model.modules_dict[k].load()
172
+ if not args.stream_layers:
173
+ model.modules[idx * 2 + 1].reload()
174
+ model.modules[idx * 2 + 2].reload()
175
+
176
+ # Test generation
177
+
178
+ if args.prompt:
179
+
180
+ with torch.inference_mode():
181
+
182
+ if cache is None:
183
+ cache = ExLlamaV2Cache(model)
184
+
185
+ ids = tokenizer.encode(args.prompt)
186
+ tokens_prompt = ids.shape[-1]
187
+
188
+ print(f" -- Warmup...")
189
+
190
+ generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
191
+ if not args.no_warmup: generator.warmup()
192
+
193
+ print(f" -- Generating...")
194
+ print()
195
+
196
+ settings = ExLlamaV2Sampler.Settings()
197
+ settings.temperature = 0.75
198
+ settings.top_k = 100
199
+ settings.top_p = 0.75
200
+ settings.token_repetition_penalty = 1.05
201
+ settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])
202
+
203
+ time_begin = time.time()
204
+
205
+ output = generator.generate_simple(args.prompt, settings, args.tokens, token_healing = True, add_bos = not args.prompt_no_bos)
206
+
207
+ torch.cuda.synchronize()
208
+ time_prompt = time.time()
209
+
210
+ time_end = time.time()
211
+
212
+ print(output)
213
+ print()
214
+
215
+ total_gen = time_end - time_begin
216
+ print(f" -- Response generated in {total_gen:.2f} seconds, {args.tokens} tokens, {args.tokens / total_gen:.2f} tokens/second (includes prompt eval.)")
217
+
218
+
219
+ # Test perplexity
220
+
221
+ if args.eval_dataset or args.standard_perplexity:
222
+
223
+ with torch.inference_mode():
224
+
225
+ print(f" -- Running perplexity test")
226
+
227
+ if args.standard_perplexity:
228
+
229
+ eval_length = args.eval_length
230
+ if args.eval_dataset:
231
+ print(f" !! Note, overriding specified --eval_dataset with {args.standard_perplexity}")
232
+
233
+ from datasets import load_dataset
234
+
235
+ if args.standard_perplexity == "wiki2":
236
+ ds = "wikitext"
237
+ part = "wikitext-2-raw-v1"
238
+ split = "test"
239
+ # if args.standard_perplexity == "c4":
240
+ # ds = "allenai/c4"
241
+ # part = "allenai--c4"
242
+ # split = "train"
243
+
244
+ print(f" -- Loading dataset {ds}, {part}, {split}...")
245
+ test = load_dataset(ds, part, split = split)
246
+
247
+ print(f" -- Tokenizing samples...")
248
+ text = "\n\n".join(test["text"])
249
+ eval_tokens = tokenizer.encode(text)
250
+
251
+ stride = 512
252
+ seqs = []
253
+ eval_len = []
254
+ a = 0
255
+ while True:
256
+ b = a + model.config.max_seq_len
257
+ if b > eval_tokens.shape[-1]: break
258
+ seqs.append(eval_tokens[:, a:b])
259
+ eval_len.append(b if a == 0 else stride)
260
+ a += stride
261
+
262
+ eval_tokens = torch.cat(seqs, dim = 0)
263
+
264
+ else:
265
+
266
+ eval_dataset = args.eval_dataset
267
+ eval_rows = args.eval_rows
268
+ eval_length = args.eval_length
269
+
270
+ print(f" -- Dataset: {eval_dataset}")
271
+ print(f" -- Tokenizing eval data, {eval_rows} rows x {eval_length} tokens...")
272
+
273
+ eval_tokens = get_tokens(eval_rows, eval_length, eval_dataset, tokenizer)
274
+ eval_len = [eval_tokens.shape[1]] * eval_tokens.shape[0]
275
+
276
+ # if args.eval_bos:
277
+ if model.config.arch.requires_bos:
278
+ boss = torch.full((eval_tokens.shape[0], 1), tokenizer.bos_token_id, dtype = torch.long)
279
+ eval_tokens = torch.cat((boss, eval_tokens[:, :-1]), dim = 1)
280
+
281
+ logprob_sum = 0.0
282
+ logprob_count = 0
283
+
284
+ def ppl(input_ids__, logits__, lengths__):
285
+
286
+ logprob_sum_ = 0.0
287
+ logprob_count_ = 0
288
+
289
+ assert logits__.shape[0] == input_ids__.shape[0]
290
+ ll = logits__.shape[1]
291
+
292
+ for bi in range(logits__.shape[0]):
293
+ cl = max(ll - lengths__[bi], 0)
294
+ logits_ = logits__[bi:bi+1, cl:, :]
295
+ input_ids_ = input_ids__[bi:bi+1, cl:]
296
+
297
+ chunksize = logits_.shape[1] * 4000 // logits_.shape[2] + 1
298
+ b_ = 0
299
+ while b_ < logits_.shape[1]:
300
+ a_ = b_
301
+ b_ = min(b_ + chunksize, logits_.shape[1])
302
+
303
+ logits_f = logits_[:, a_:b_, :].float() + 1e-10
304
+ target_ids = input_ids_[:, a_ + 1:b_ + 1].to(logits_.device)
305
+
306
+ log_probs = F.log_softmax(logits_f, dim=-1)
307
+ token_log_probs = log_probs.gather(-1, target_ids.unsqueeze(-1)).squeeze(-1)
308
+ logprob_sum_ += token_log_probs.sum().item()
309
+ logprob_count_ += target_ids.numel()
310
+
311
+ return logprob_sum_, logprob_count_
312
+
313
+ if args.stream_layers:
314
+
315
+ print(f" -- Inference (streamed)", end = "")
316
+ sys.stdout.flush()
317
+
318
+ batch_size, seq_len = eval_tokens.shape
319
+ attn_params = ExLlamaV2Attention.Params(stream_batch_size, seq_len, 0, None, None)
320
+ # attn_mask = model.build_attn_mask(stream_batch_size, seq_len, 0, None, "cuda:0")
321
+
322
+ for idx, module in enumerate(model.modules):
323
+ module.set_device_idx(-1 if idx == 0 else 0)
324
+
325
+ model.modules[0].load()
326
+ hidden_state = model.modules[0].forward(eval_tokens)
327
+ model.modules[0].unload()
328
+
329
+ for idx, module in enumerate(model.modules):
330
+ if idx == 0: continue
331
+
332
+ print(".", end = "")
333
+ sys.stdout.flush()
334
+ module.load()
335
+
336
+ b = 0
337
+ while b < eval_tokens.shape[0]:
338
+ a = b
339
+ b = min(b + stream_batch_size, eval_tokens.shape[0])
340
+ x = hidden_state[a:b, :, :].to("cuda:0")
341
+ x = module.forward(x, cache = None, attn_params = attn_params, past_len = 0, loras = None)
342
+
343
+ if idx < len(model.modules) - 1:
344
+ hidden_state[a:b, :, :] = x.to("cpu")
345
+
346
+ else:
347
+ input_ids = eval_tokens[a:b, :]
348
+ logits = x[:, :-1, :]
349
+
350
+ # if model.config.logit_scale != 1:
351
+ # logits.mul_(model.config.logit_scale)
352
+
353
+ logprob_sum__, logprob_count__ = ppl(input_ids, logits, eval_len[a:b])
354
+ logprob_sum += logprob_sum__
355
+ logprob_count += logprob_count__
356
+
357
+ module.unload()
358
+
359
+ print()
360
+
361
+ else:
362
+
363
+ print(f" -- Inference", end = "")
364
+ sys.stdout.flush()
365
+
366
+ if cache is None:
367
+ cache = ExLlamaV2Cache(model, max_seq_len = eval_length) if eval_length > model.config.max_input_len else None
368
+
369
+ for i in range(eval_tokens.shape[0]):
370
+
371
+ if i % 10 == 0: print(".", end = "")
372
+ sys.stdout.flush()
373
+
374
+ input_ids = eval_tokens[i:i+1, :]
375
+
376
+ input_ids = input_ids[:, :]
377
+ if cache is not None: cache.current_seq_len = 0
378
+ logits = model.forward(input_ids, cache)
379
+ logits = logits[:, :-1, :]
380
+
381
+ logprob_sum__, logprob_count__ = ppl(input_ids, logits, eval_len[i:i+1])
382
+ logprob_sum += logprob_sum__
383
+ logprob_count += logprob_count__
384
+
385
+ print()
386
+
387
+ mean_log_prob = logprob_sum / logprob_count
388
+ perplexity = math.exp(-mean_log_prob)
389
+ print(f" -- Evaluation perplexity: {perplexity:.4f}")
390
+
391
+ def test_ppl_token():
392
+ global logprob_sum, logprob_count, i, input_ids
393
+ global logits, target_ids, log_probs, token_log_probs
394
+ global mean_log_prob, perplexity
395
+
396
+ # set_catch("model.layers.3")
397
+
398
+ logprob_sum = 0
399
+ logprob_count = 0
400
+
401
+ for i in range(eval_tokens.shape[0]):
402
+
403
+ cache.current_seq_len = 0
404
+
405
+ for j in range(eval_tokens.shape[1] - 1):
406
+ if j % 256 == 0: print(".", end = "")
407
+ sys.stdout.flush()
408
+
409
+ input_ids = eval_tokens[i:i + 1, j:j + 1]
410
+ logits = model.forward(input_ids, cache)
411
+ logits = logits.float() + 1e-10
412
+
413
+ log_probs = F.log_softmax(logits, dim = -1)
414
+ logprob_sum += log_probs[0, 0, eval_tokens[i, j+1]]
415
+ logprob_count += 1
416
+
417
+ # mean_log_prob = logprob_sum / logprob_count
418
+ # perplexity = math.exp(-mean_log_prob)
419
+ # print(f" -- Token {j}: {perplexity:.4f}")
420
+
421
+ print()
422
+
423
+ mean_log_prob = logprob_sum / logprob_count
424
+ perplexity = math.exp(-mean_log_prob)
425
+ print(f" -- Evaluation perplexity: {perplexity:.4f}")
426
+
427
+ if args.eval_token:
428
+ if args.standard_perplexity:
429
+ print(f" !! Note, can't evaluate token perplexity on standard test")
430
+ else:
431
+ print(f" -- Inference (token)", end = "")
432
+ sys.stdout.flush()
433
+ cache = ExLlamaV2Cache(model, max_seq_len = eval_length)
434
+ test_ppl_token()
435
+
436
+ if args.eval_token_8bit:
437
+ if args.standard_perplexity:
438
+ print(f" !! Note, can't evaluate token perplexity on standard test")
439
+ else:
440
+ print(f" -- Inference (token, 8-bit cache)", end = "")
441
+ sys.stdout.flush()
442
+ cache = ExLlamaV2Cache_8bit(model, max_seq_len = eval_length)
443
+ test_ppl_token()
444
+
445
+ if args.eval_token_q4:
446
+ if args.standard_perplexity:
447
+ print(f" !! Note, can't evaluate token perplexity on standard test")
448
+ else:
449
+ print(f" -- Inference (token, Q4 cache)", end = "")
450
+ sys.stdout.flush()
451
+ cache = ExLlamaV2Cache_Q4(model, max_seq_len = eval_length)
452
+ test_ppl_token()
453
+
454
+
455
+ # Test prompt speed
456
+
457
+ if args.prompt_speed:
458
+
459
+ with torch.inference_mode():
460
+
461
+ if cache is None:
462
+ cache = ExLlamaV2Cache(model)
463
+
464
+ ids = torch.randint(0, model.config.vocab_size - 1, (1, model.config.max_seq_len))
465
+
466
+ print(f" -- Warmup...")
467
+
468
+ if not args.no_warmup:
469
+ model.forward(ids[:, -1:])
470
+
471
+ print(f" -- Measuring prompt speed...")
472
+
473
+ torch.cuda.synchronize()
474
+
475
+ current_len = 128
476
+ step = 128
477
+ prompt_iters = 3
478
+ while True:
479
+
480
+ total_time = 0
481
+ for i in range(prompt_iters):
482
+
483
+ torch.cuda.synchronize()
484
+ time_begin = time.time()
485
+
486
+ cache.current_seq_len = 0
487
+ model.forward(ids[:, :current_len], cache, preprocess_only = True)
488
+
489
+ torch.cuda.synchronize()
490
+ time_end = time.time()
491
+ total_time += time_end - time_begin
492
+
493
+ tps = current_len / (total_time / prompt_iters)
494
+
495
+ print(f" ** Length {current_len:>5} tokens: {tps:>11.4f} t/s")
496
+
497
+ if current_len >= 1024: step = 1024
498
+ if current_len >= 4096: step = 4096
499
+ if current_len >= 16384: step = 8192
500
+
501
+ current_len_ = current_len
502
+ current_len = min(current_len + step, model.config.max_seq_len)
503
+ if current_len == current_len_: break
504
+
505
+
506
+ # Test token speed
507
+
508
+ if args.speed:
509
+
510
+ with torch.inference_mode():
511
+
512
+ if cache is None:
513
+ cache = ExLlamaV2Cache(model)
514
+ cache.current_seq_len = 0
515
+
516
+ print(f" -- Measuring token speed...")
517
+ ids = tokenizer.encode("X")
518
+ model.forward(ids[:, :])
519
+
520
+ current_idx = ids.shape[-1]
521
+ next_stop = 128
522
+
523
+ while True:
524
+
525
+ time_begin = time.time()
526
+
527
+ tokens = next_stop - current_idx
528
+ for i in range(tokens):
529
+
530
+ logits = model.forward(ids[:, -1:], cache)
531
+ sample = torch.argmax(logits[0, -1]).cpu().unsqueeze(0).unsqueeze(0)
532
+ ids = torch.cat((ids, sample), dim=-1)
533
+
534
+ time_end = time.time()
535
+ tps = tokens / (time_end - time_begin)
536
+
537
+ print(f" ** Position {current_idx:>5} + {tokens:>3} tokens: {tps:>9.4f} t/s")
538
+
539
+ current_idx = next_stop
540
+ next_stop = min(next_stop + 128, model.config.max_seq_len)
541
+ if next_stop == current_idx: break
542
+