from model import ExLlamaConfig, Ex4bitLinear
import torch
import json
from safetensors.torch import load_file as safe_load_file
from torch import load as load_file


class ExLlamaLora:

    lora_config_path: str
    lora_path: str
    lora_r: int
    lora_alpha: float
    lora_scaling: float
    config: ExLlamaConfig
    tensors: dict[str, torch.Tensor]
|
bias_ignored: bool |
|
|
|
def __init__(self, model, lora_config_path, lora_path): |
|
|
|
        self.lora_config_path = lora_config_path
        self.lora_path = lora_path
        self.model = model
        self.config = model.config
        self.tensors = {}
        self.bias_ignored = False

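        # Grab the relevant items from the LoRA config; alpha / r gives the scaling factor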
        with open(lora_config_path) as f:
            read_config = json.load(f)

        self.lora_r = read_config["r"]
        self.lora_alpha = float(read_config["lora_alpha"])
        self.lora_scaling = self.lora_alpha / self.lora_r

        if "fan_in_fan_out" in read_config and read_config["fan_in_fan_out"]:
            raise ValueError(" ## Error: fan_in_fan_out mode not supported.")

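        # Load the adapter weights, from a .safetensors file or a regular PyTorch checkpoint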
        if self.lora_path.endswith(".safetensors"):
            f = safe_load_file(self.lora_path, device = "cpu")
        else:
            f = load_file(self.lora_path, map_location = "cpu")

        for key in f.keys():
            tensor = f[key]

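            # Locate the target module from the tensor name,
            # e.g. "model.layers.0.self_attn.q_proj.lora_A.weight"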
            i = key.find("model.layers.")
            if i == -1: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}")

            target_key = key[i:]
            ks = target_key.split(".")
            decoder_idx = int(ks[2])
            decoder_part = ks[3]
            decoder_layer = ks[4]
            lora_half = ks[5]

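            # Bias tensors aren't supported; accept them only if they are effectively zero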
            if lora_half == "bias":
                epsilon = 1e-6
                if torch.max(tensor) > epsilon or torch.min(tensor) < -epsilon:
                    raise ValueError(f" ## Error: unsupported bias target {self.lora_path}: {key}")
                self.bias_ignored = True
                continue

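            # Resolve the decoder submodule this tensor applies to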
            target_module = self.model.layers[decoder_idx]
            if decoder_part == "self_attn": target_module = target_module.self_attn
            elif decoder_part == "mlp": target_module = target_module.mlp
            else: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}")

            if decoder_layer == "q_proj": target_module = target_module.q_proj
            elif decoder_layer == "k_proj": target_module = target_module.k_proj
            elif decoder_layer == "v_proj": target_module = target_module.v_proj
            elif decoder_layer == "o_proj": target_module = target_module.o_proj
            elif decoder_layer == "gate_proj": target_module = target_module.gate_proj
            elif decoder_layer == "up_proj": target_module = target_module.up_proj
            elif decoder_layer == "down_proj": target_module = target_module.down_proj
            else: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}")

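            # Check the tensor shape against the target module: lora_A is stored as
            # (r, in_features), lora_B as (out_features, r)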
            assert isinstance(target_module, Ex4bitLinear)

            if lora_half == "lora_A":
                in_features = tensor.shape[1]
                out_features = None
            elif lora_half == "lora_B":
                in_features = None
                out_features = tensor.shape[0]
            else: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}")

            if (in_features and in_features != target_module.in_features) or (out_features and out_features != target_module.out_features):
                raise ValueError(f" ## Error: incompatible tensor shape in {self.lora_path}: {key}")

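            # Transpose to (in_features, r) / (r, out_features)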
            tensor = tensor.T.contiguous()

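            # Fold the scaling factor (alpha / r) into lora_B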
            if lora_half == "lora_B" and self.lora_scaling != 1.0: tensor.mul_(self.lora_scaling)

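            # Convert to float16; only float16, float32 and bfloat16 source dtypes are supported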
            if tensor.dtype == torch.bfloat16:
                tensor = tensor.to(torch.float16)
            elif tensor.dtype == torch.float32:
                tensor = tensor.to(torch.float16)
            elif tensor.dtype == torch.float16:
                pass
            else: raise ValueError(f" ## Error: unsupported tensor dtype in {self.lora_path}")

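            # Move the tensor to the device the target key is mapped to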
            device = self.config.device_map.map(target_key)
            tensor = tensor.to(device, non_blocking = True)

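            # Store the prepared tensor under its truncated key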
            self.tensors[target_key] = tensor