from model import ExLlamaConfig, Ex4bitLinear
import torch
import json
from safetensors.torch import load_file as safe_load_file
from torch import load as load_file

class ExLlamaLora:

    lora_config_path: str
    lora_path: str
    lora_r: int
    lora_alpha: float
    lora_scaling: float
    config: ExLlamaConfig
    tensors: dict[str, torch.Tensor]
    bias_ignored: bool

    def __init__(self, model, lora_config_path, lora_path):

        self.lora_config_path = lora_config_path
        self.lora_path = lora_path
        self.model = model
        self.config = model.config
        self.tensors = {}
        self.bias_ignored = False

        # Grab relevant items from LoRA config

        with open(lora_config_path) as f:
            read_config = json.load(f)

        self.lora_r = read_config["r"]
        self.lora_alpha = float(read_config["lora_alpha"])
        self.lora_scaling = self.lora_alpha / self.lora_r

        if "fan_in_fan_out" in read_config and read_config["fan_in_fan_out"]:
            raise ValueError(" ## Error: fan_in_fan_out mode not supported.")

        # Load LoRA weights

        if self.lora_path.endswith(".safetensors"):
            f = safe_load_file(self.lora_path, device = "cpu")
        else:
            f = load_file(self.lora_path, map_location = "cpu")

        for key in f.keys():
            tensor = f[key]

            # Find target

            i = key.find("model.layers.")
            if i == -1: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}")

            target_key = key[i:]
            ks = target_key.split(".")
            decoder_idx = int(ks[2])
            decoder_part = ks[3]
            decoder_layer = ks[4]
            lora_half = ks[5]
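
            # For a PEFT-style key such as (illustrative):
            #
            #     "base_model.model.model.layers.10.self_attn.q_proj.lora_A.weight"
            #
            # target_key becomes "model.layers.10.self_attn.q_proj.lora_A.weight", so
            # decoder_idx = 10, decoder_part = "self_attn", decoder_layer = "q_proj"
            # and lora_half = "lora_A".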
if lora_half == "bias":
epsilon = 1e-6
if torch.max(tensor) > epsilon or torch.max(tensor) < -epsilon:
raise ValueError(f" ## Error: unsupported bias target {self.lora_path}: {key}")
self.bias_ignored = True
continue

            target_module = self.model.layers[decoder_idx]
            if decoder_part == "self_attn": target_module = target_module.self_attn
            elif decoder_part == "mlp": target_module = target_module.mlp
            else: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}")

            if decoder_layer == "q_proj": target_module = target_module.q_proj
            elif decoder_layer == "k_proj": target_module = target_module.k_proj
            elif decoder_layer == "v_proj": target_module = target_module.v_proj
            elif decoder_layer == "o_proj": target_module = target_module.o_proj
            elif decoder_layer == "gate_proj": target_module = target_module.gate_proj
            elif decoder_layer == "up_proj": target_module = target_module.up_proj
            elif decoder_layer == "down_proj": target_module = target_module.down_proj
            else: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}")

            # Check that shape is compatible. In the PEFT layout, lora_A is stored
            # as [r, in_features] and lora_B as [out_features, r]

            assert isinstance(target_module, Ex4bitLinear)

            if lora_half == "lora_A":
                in_features = tensor.shape[1]
                out_features = None
            elif lora_half == "lora_B":
                in_features = None
                out_features = tensor.shape[0]
            else: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}")
            if (in_features and in_features != target_module.in_features) or (out_features and out_features != target_module.out_features):
                raise ValueError(f" ## Error: incompatible tensor shape in {self.lora_path}: {key}")

            # For efficiency, transpose the adapter once at load time instead of
            # transposing the hidden state during inference

            tensor = tensor.T.contiguous()

            # Pre-scale lora_B by alpha / r so no extra multiply is needed at inference time

            if lora_half == "lora_B" and self.lora_scaling != 1.0: tensor.mul_(self.lora_scaling)

            # Check that dtype is compatible, or convert

            if tensor.dtype == torch.bfloat16 or tensor.dtype == torch.float32:
                tensor = tensor.to(torch.float16)
            elif tensor.dtype != torch.float16:
                raise ValueError(f" ## Error: unsupported tensor dtype in {self.lora_path}")

            # Move to target device

            device = self.config.device_map.map(target_key)
            tensor = tensor.to(device, non_blocking = True)

            # Store adapter tensor

            self.tensors[target_key] = tensor
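
# A minimal usage sketch (illustrative paths; ExLlama and ExLlamaConfig come
# from model.py in this repo):
#
#     config = ExLlamaConfig("models/llama-7b/config.json")   # hypothetical paths
#     model = ExLlama(config)
#     lora = ExLlamaLora(model,
#                        "loras/my_lora/adapter_config.json",
#                        "loras/my_lora/adapter_model.safetensors")
#
# After the transpose above, lora_A is stored as [in_features, r] and lora_B as
# [r, out_features], with the alpha / r scaling folded into lora_B, so a forward
# pass only needs one extra low-rank term:
#
#     y = x @ W + (x @ lora_A) @ lora_B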