# Copied from rut5compressed/util.py of rut5compressed repository.
import logging
import re
from functools import wraps
from re import Pattern
from typing import Callable, Dict, Optional, Tuple
import numpy as np
import torch as T
from .modules import SVDCompressedLinear
def map_module(root: T.nn.Module,
func: Callable[[T.nn.Module, str], T.nn.Module],
patt: Optional[str] = None) -> T.nn.Module:
"""Function ``map_module`` applies a function to each leaf of module tree
which matches to a specified pattern.
Parameters
----------
root : torch.nn.Module
Module to modify.
func : callable
        Function to be applied to every module in the module tree whose path
        matches the pattern.
patt : str, optional
Pattern to filter modules by path in module tree.
Returns
-------
torch.nn.Module
Module modified in-place.
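
    Examples
    --------
    A hypothetical traversal that only logs matching paths and returns the
    tree unchanged:

    >>> model = map_module(model, lambda m, path: print(path) or m,
    ...                    r'.*/DenseReluDense/w.*')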
"""
@wraps(func)
def func_safe(*args, **kwargs):
node = func(*args, **kwargs)
if not isinstance(node, T.nn.Module):
            raise ValueError('Mapped result must be of torch.nn.Module type '
f'but given {type(node)}.')
return node
return _map_module(root, func_safe, re.compile(patt or r'.*'), '')
def _map_module(root: T.nn.Module,
func: Callable[[T.nn.Module, str], T.nn.Module], patt: Pattern,
path: str) -> T.nn.Module:
for name, child in root.named_children():
node = _map_module(child, func, patt, f'{path}/{name}')
        if node is not child:
setattr(root, name, node)
if patt.match(path or '/'):
root = func(root, path or '/')
return root
def convert_linear(module: T.nn.Linear, ctor, **kwargs) -> T.nn.Module:
"""Function convert_linear takes module and returns linear module with
approximate matmul. Non-linear modules are returned intact.
"""
if not isinstance(module, T.nn.Linear):
return module
raise NotImplementedError
def numel(module: T.nn.Module):
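    """Function ``numel`` counts the number of elements in a module, i.e. the
    total number of elements in its parameters and buffers, with corrections
    for pruning masks and quantized linear layers.
    """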
value = sum(x.numel() for x in module.parameters()) + \
sum(x.numel() for x in module.buffers())
def account_prunned(module: T.nn.Module, path: str):
nonlocal value
for name, attr in vars(module).items():
if not name.endswith('_mask') or not isinstance(attr, T.Tensor):
continue
weight_name = name[:-5]
if not hasattr(module, weight_name):
continue
weight = getattr(module, weight_name)
            value -= weight.numel() - int(attr.sum())
value += attr.numel()
return module
def account_quantized(module: T.nn.Module, path: str):
nonlocal value
if isinstance(module, T.nn.quantized.Linear):
value += module.weight().numel()
if module.bias() is not None:
value += module.bias().numel()
return module
def account_rest(module: T.nn.Module, path: str):
account_prunned(module, path)
account_quantized(module, path)
return module
map_module(module, account_rest)
return value
def sizeof(module: T.nn.Module):
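    """Function ``sizeof`` estimates the size of a module in bytes, i.e. the
    total size of its parameters and buffers, with corrections for pruning
    masks and quantized linear layers.
    """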
value = sum(x.numel() * x.element_size() for x in module.parameters()) + \
sum(x.numel() * x.element_size() for x in module.buffers())
def account_prunned(module: T.nn.Module, path: str):
nonlocal value
for name, attr in vars(module).items():
if not name.endswith('_mask') or not isinstance(attr, T.Tensor):
continue
weight_name = name[:-5]
if not hasattr(module, weight_name):
continue
weight = getattr(module, weight_name)
            value -= (weight.numel() - int(attr.sum())) * weight.element_size()
value += attr.numel() * attr.element_size()
return module
def account_quantized(module: T.nn.Module, path: str):
nonlocal value
if isinstance(module, T.nn.quantized.Linear):
value += module.weight().numel() * module.weight().element_size()
if (bias := module.bias()) is not None:
value += bias.numel() * bias.element_size()
return module
def account_rest(module: T.nn.Module, path: str):
account_prunned(module, path)
account_quantized(module, path)
return module
map_module(module, account_rest)
return value
def flatten_module(module: T.nn.Module, regexp=None) -> Dict[str, T.nn.Module]:
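    """Function ``flatten_module`` returns a mapping from paths in the module
    tree (optionally filtered by ``regexp``) to the corresponding submodules.
    """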
modules = {}
map_module(module, lambda x, y: modules.update(**{y: x}) or x, regexp)
return modules
def print_flatten(module: T.nn.Module):
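    """Function ``print_flatten`` prints a table of all submodules with their
    index, path in the module tree, and layer (class) name.
    """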
paths = []
path_len = 0
names = []
name_len = 0
indx_len = 0
def func(module, path):
nonlocal path_len, name_len, indx_len
paths.append(path)
path_len = max(path_len, len(path))
name = module.__class__.__name__
names.append(name)
name_len = max(name_len, len(name))
indx_len += 1
return module
map_module(module, func)
indx_len = int(np.ceil(np.log10(indx_len)))
fmt = f'{{indx:>{indx_len}s}} {{path:{path_len}s}} {{name:{name_len}s}}'
print(fmt.format(indx='#', path='Path', name='Layer'))
print('-' * (indx_len + path_len + name_len + 2))
for i, (path, name) in enumerate(zip(paths, names)):
print(fmt.format(indx=str(i), path=path, name=name))
def compress_linear_svd(module: T.nn.Module, path: str,
rank: Optional[int] = None) -> T.nn.Module:
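    """Function ``compress_linear_svd`` replaces a linear layer with an
    SVD-factorized ``SVDCompressedLinear`` layer of the given rank. If no rank
    is given, it is chosen so that the SVD factors have roughly as many
    elements as the original weight matrix. Non-linear modules are returned
    intact.
    """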
if not isinstance(module, T.nn.Linear):
return module
    # Do not factorize if the rank equals the size of the smallest dimension.
norows, nocols = module.weight.shape
if rank == min(norows, nocols):
return module
    # If no rank is given, choose the rank at which the number of elements in
    # the SVD factors approximately equals the number of elements in the
    # original matrix.
if rank is None:
ratio = norows * nocols / (norows + nocols)
rank = int(np.floor(ratio))
return SVDCompressedLinear.from_linear(module, rank)
def compress_linear_tt(module: T.nn.Module, path: str,
shape: Tuple[Tuple[int], Tuple[int]],
rank: int) -> T.nn.Module:
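    """Function ``compress_linear_tt`` replaces a linear layer with a
    tensor-train factorized ``TTCompressedLinear`` layer whose input and
    output features are reshaped according to ``shape`` and factorized with
    the given rank. Non-linear modules are returned intact.
    """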
if not isinstance(module, T.nn.Linear):
return module
    # TODO(@not-found): We need a proper compression config.
inp_size = np.prod(shape[0])
out_size = np.prod(shape[1])
if inp_size == module.in_features and out_size == module.out_features:
pass
elif inp_size == module.out_features and out_size == module.in_features:
shape = (shape[1], shape[0])
else:
raise ValueError(
            'Input and output features do not match the compression shape: '
f'{shape[0]} vs {module.in_features} and {shape[1]} vs '
f'{module.out_features}.')
logging.info('apply tt compression to layer %s', path)
return TTCompressedLinear.from_linear(module, shape, rank) # noqa: F821
def compress(module: T.nn.Module, rank: int) -> T.nn.Module:
"""Function compress substitutes in-place linear layer of T5 model with
linear layer which weight matrix is factorized with SVD.
:param module: Model to compress.
:param rank: Desired rank of compressed layer.
"""
return map_module(
root=module,
func=lambda x, y: compress_linear_svd(x, y, rank),
patt=r'.*/DenseReluDense/w.*') # TODO(@not-found): Remove?
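

if __name__ == '__main__':
    # A minimal usage sketch, not part of the original module. It assumes the
    # Hugging Face ``transformers`` package and the public ``t5-small``
    # checkpoint are available, and that this file is run as a module so that
    # the relative import above resolves.
    from transformers import T5ForConditionalGeneration

    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    print('size before compression:', sizeof(model), 'bytes')
    # Factorize the feed-forward layers (DenseReluDense/w*) with rank-512 SVD.
    model = compress(model, rank=512)
    print('size after compression:', sizeof(model), 'bytes')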