"""GPTQ post-training weight quantization utilities for ChatGLM-style models."""
import contextlib
import logging
import math
from typing import List, Optional

import torch
import transformers
from torch import nn

LOGGER = logging.getLogger(__name__)

QUANT_LAYERS = [nn.Linear, nn.Conv2d, transformers.Conv1D]


def is_transformer_conv1d(layer):
    return isinstance(layer, transformers.Conv1D)


# The two helpers below implement per-channel symmetric quantization of weights only.
def get_weight_scale(weight, weight_bit_width):
    weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half()
    return weight_scale


def fake_quantize_weight(weight, weight_scale):
    weight_scale = weight_scale[:, None]
    fake_quantized_weight = torch.round(weight / weight_scale) * weight_scale
    return fake_quantized_weight


class GPTQLayerWrapper:
    """Collects the input Hessian of a single layer and quantizes its weight with GPTQ."""

    def __init__(self, layer_name, layer, weight_bit_width):
        super().__init__()
        self.layer_name = layer_name
        self.layer = layer
        self.device = layer.weight.device
        # transformers.Conv1D stores its weight transposed as (in_features, out_features)
        if is_transformer_conv1d(layer):
            columns = layer.weight.shape[0]
        else:
            columns = layer.weight.shape[1]
        self.columns = columns
        self.H = torch.zeros((columns, columns), device=self.device)
        self.nsamples = 0
        self.is_record = True
        self.weight_bit_width = weight_bit_width
        self.weight_scale = None

    def record_h(self, x):
        # Accumulate the running Hessian estimate H = (2 / nsamples) * sum(x @ x.T)
        # over the calibration inputs seen so far.
        if self.is_record:
            x = x.detach().clone()
            if len(x.shape) == 2:
                x = x.unsqueeze(0)
            batch = x.shape[0]
            if isinstance(self.layer, nn.Linear) or is_transformer_conv1d(self.layer):
                if len(x.shape) == 3:
                    x = x.reshape((-1, x.shape[-1]))
                x = x.t()
            if isinstance(self.layer, nn.Conv2d):
                unfold = nn.Unfold(
                    self.layer.kernel_size,
                    dilation=self.layer.dilation,
                    padding=self.layer.padding,
                    stride=self.layer.stride,
                )
                x = unfold(x)
                x = x.permute([1, 0, 2])
                x = x.flatten(1)
            self.H *= self.nsamples / (self.nsamples + batch)
            self.nsamples += batch
            x = math.sqrt(2 / self.nsamples) * x.float()
            self.H += x.matmul(x.t())

    def quant_weight(self, blocksize=128, percdamp=.01, groupsize=-1):
        if groupsize != -1:
            raise RuntimeError("Group quantization of the GPTQ quantizer is not supported for now")

        weight = self.layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            weight = weight.flatten(1)
        if is_transformer_conv1d(self.layer):
            weight = weight.t()
        weight = weight.float()

        weight_scale = get_weight_scale(weight, self.weight_bit_width)
        # todo: use a buffer to store the scale
        self.weight_scale = weight_scale

        H = self.H
        dead = torch.diag(H) == 0
        H[dead, dead] = 1
        weight[:, dead] = 0

        losses = torch.zeros_like(weight)
        Q = torch.zeros_like(weight)

        # Dampen the Hessian diagonal so the Cholesky factorization stays stable.
        damp = percdamp * torch.mean(torch.diag(H))
        diag = torch.arange(self.columns, device=self.device)
        H[diag, diag] += damp

        try:
            H = torch.linalg.cholesky(H)
            H = torch.cholesky_inverse(H)
            H = torch.linalg.cholesky(H, upper=True)
        except Exception:
            LOGGER.warning(f"Cannot compress layer {self.layer_name} because the Hessian inverse failed")
            return

        if H.isnan().any():
            LOGGER.warning(f"Cannot compress layer {self.layer_name} because the Hessian inverse failed")
            return

        hinv = H

        # Quantize the weight block by block of columns, propagating the quantization
        # error of each column into the columns that have not been quantized yet.
        for i1 in range(0, self.columns, blocksize):
            i2 = min(i1 + blocksize, self.columns)
            count = i2 - i1

            w1 = weight[:, i1:i2].clone()
            q1 = torch.zeros_like(w1)
            total_err = torch.zeros_like(w1)
            losses1 = torch.zeros_like(w1)
            hinv1 = hinv[i1:i2, i1:i2]

            for i in range(count):
                w = w1[:, i]
                d = hinv1[i, i]

                q = fake_quantize_weight(w.unsqueeze(1), weight_scale).flatten()
                q1[:, i] = q
                losses1[:, i] = (w - q) ** 2 / d ** 2

                err = (w - q) / d
                w1[:, i:] -= err.unsqueeze(1).matmul(hinv1[i, i:].unsqueeze(0))
                total_err[:, i] = err

            Q[:, i1:i2] = q1
            losses[:, i1:i2] = losses1 / 2

            weight[:, i2:] -= total_err.matmul(hinv[i1:i2, i2:])

        if torch.cuda.is_available():
            torch.cuda.synchronize()

        if is_transformer_conv1d(self.layer):
            Q = Q.t()

        self.layer.weight = nn.Parameter(
            Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype),
            requires_grad=False,
        )
        del self.H
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def release_gpu_memory(self):
        if hasattr(self, "H"):
            del self.H


class GPTQBlockWrapper:
    """Wraps every quantizable layer inside one block and records their calibration inputs."""

    def __init__(self, module_name: str, module: nn.Module, weight_bit_width=8):
        self.layer_wrappers = {}
        self.hook_handles = []
        # forward order of this module in the whole network
        self.order = 0
        self.module_name = module_name

        def get_hook(layer_name):
            def record_hook(_, x):
                self.layer_wrappers[layer_name].record_h(x[0])
            return record_hook

        for layer_name, layer in module.named_modules():
            if isinstance(layer, tuple(QUANT_LAYERS)):
                full_layer_name = f"{module_name}.{layer_name}" if layer_name else f"{module_name}"
                self.layer_wrappers[full_layer_name] = GPTQLayerWrapper(full_layer_name, layer, weight_bit_width)
                handle = layer.register_forward_pre_hook(get_hook(full_layer_name))
                self.hook_handles.append(handle)

    def quant_module(self):
        for _, wrapper in self.layer_wrappers.items():
            wrapper.quant_weight()
        for h in self.hook_handles:
            h.remove()

    def set_order(self, idx):
        self.order = idx

    def get_order(self):
        return self.order

    def enable(self):
        for _, wrapper in self.layer_wrappers.items():
            wrapper.is_record = True

    def disable(self):
        for _, wrapper in self.layer_wrappers.items():
            wrapper.is_record = False

    def release_gpu_memory(self):
        for _, wrapper in self.layer_wrappers.items():
            wrapper.release_gpu_memory()


class GPTQuantizer:
    """Drives block-by-block GPTQ calibration and weight quantization."""

    def __init__(self, block_type: Optional[List[type]] = None):
        self.gptq_block_wrappers = {}
        self.block_type = block_type

    def wrap_model(self, model: nn.Module, weight_bit_width=8):
        def wrap_block(m, prefix=""):
            for name, child in m.named_children():
                child_prefix = f"{prefix}.{name}" if prefix else name
                if isinstance(child, tuple(self.block_type)):
                    self.gptq_block_wrappers[name] = GPTQBlockWrapper(child_prefix, child, weight_bit_width)
                    LOGGER.debug(f"Calibrate module {child_prefix} as a whole block in GPTQ")
                else:
                    wrap_block(child, child_prefix)

        wrap_block(model)
        return model

    def quantize(self, model: nn.Module):
        for _, module_wrapper in self.gptq_block_wrappers.items():
            module_wrapper.quant_module()
        return model

    @property
    def calibration_iters(self):
        return len(self.gptq_block_wrappers)

    @contextlib.contextmanager
    def record_order(self):
        """Temporarily hook one layer per block to record the forward order of the wrapped blocks."""
        counter = 0
        record_handles = []
        orders = {}
        try:
            def get_record_order_hook(module_name):
                def record_hook(*args, **kwargs):
                    nonlocal counter
                    if module_name not in orders:
                        orders[module_name] = counter
                        counter += 1
                return record_hook

            for module_name, module_wrapper in self.gptq_block_wrappers.items():
                # disable recording while the forward order is being traced
                for _, layer_wrapper in module_wrapper.layer_wrappers.items():
                    layer_wrapper.is_record = False

                one_layer_wrapper_in_module = list(module_wrapper.layer_wrappers.values())[0]
                handle = one_layer_wrapper_in_module.layer.register_forward_pre_hook(
                    get_record_order_hook(module_name)
                )
                record_handles.append(handle)
            yield
        except Exception as e:
            LOGGER.warning(e)
        finally:
            for module_name, order in orders.items():
                self.gptq_block_wrappers[module_name].set_order(order)

            for h in record_handles:
                h.remove()

            for module_name, module_wrapper in self.gptq_block_wrappers.items():
                # re-enable recording for the calibration passes
                for _, layer_wrapper in module_wrapper.layer_wrappers.items():
                    layer_wrapper.is_record = True

    @contextlib.contextmanager
    def start_calib_iter(self, i):
        """Record calibration data only for the block whose forward order is i, and quantize it on exit."""
        assert i < len(self.gptq_block_wrappers)
        target_module_wrapper = None
        try:
            for _, module_wrapper in self.gptq_block_wrappers.items():
                if module_wrapper.get_order() == i:
                    module_wrapper.enable()
                    target_module_wrapper = module_wrapper
                else:
                    module_wrapper.disable()
            yield
        finally:
            # quantize the block that was just calibrated, if any block matched this order
            if target_module_wrapper is not None:
                target_module_wrapper.quant_module()

    def release_gpu_memory(self):
        for _, block_wrapper in self.gptq_block_wrappers.items():
            block_wrapper.release_gpu_memory()
        torch.cuda.empty_cache()


def locate_parent(root: nn.Module, full_path: str):
    """Return the parent module of a dotted module path and the attribute name inside that parent."""
    parent = root
    path = full_path.split('.')
    for p in path[:-1]:
        parent = getattr(parent, p)
    return parent, path[-1]


@torch.no_grad()
def gptq_quantize(model, tokenizer, weight_bit_width, calib_data):
    """Calibrate the model with GPTQ on calib_data and replace its fp16 linears with quantized layers."""
    from .modeling_chatglm import GLMBlock
    from .quantization import QuantizedLinear

    quantizer = GPTQuantizer([GLMBlock])
    calib_model = quantizer.wrap_model(model, weight_bit_width)

    # one forward pass to discover the execution order of the wrapped blocks
    with quantizer.record_order():
        calib_model.chat(tokenizer, calib_data[0], history=[])

    LOGGER.info("Starting GPTQ calibration")
    for i in range(quantizer.calibration_iters):
        LOGGER.info(f"Progress: {i + 1}/{quantizer.calibration_iters}")
        # todo: should add an early return to speed up the calibration
        with quantizer.start_calib_iter(i):
            for prompt in calib_data:
                model.chat(tokenizer, prompt, history=[])

    # replace the fp16 linear layers with quantized linear layers
    for _, block_wrapper in quantizer.gptq_block_wrappers.items():
        for layer_name, layer_wrapper in block_wrapper.layer_wrappers.items():
            layer = layer_wrapper.layer
            parent, name_in_parent = locate_parent(model, layer_name)
            quantized_layer = QuantizedLinear(
                weight_bit_width=weight_bit_width,
                weight_tensor=layer.weight,
                bias_tensor=layer.bias,
                weight_scale=layer_wrapper.weight_scale,
                in_features=layer.in_features,
                out_features=layer.out_features,
                bias=True,
                dtype=torch.half,
                device=layer_wrapper.device,
                empty_init=False,
            )
            parent.add_module(name_in_parent, quantized_layer)

    torch.cuda.empty_cache()
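

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of this module): assumes a ChatGLM-style
# checkpoint whose repository ships the companion modeling_chatglm.py and
# quantization.py next to this file; the model id, prompts, and bit width below
# are placeholders.
#
#   from transformers import AutoModel, AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
#   model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda().eval()
#
#   calib_prompts = ["Hello", "Explain the GPTQ algorithm in one sentence."]
#   gptq_quantize(model, tokenizer, weight_bit_width=8, calib_data=calib_prompts)
# ---------------------------------------------------------------------------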