# Copied from rut5compressed/util.py of rut5compressed repository.

import logging
import re
from functools import wraps
from re import Pattern
from typing import Callable, Dict, Optional, Tuple

import numpy as np
import torch as T

from .modules import SVDCompressedLinear


def map_module(root: T.nn.Module,
               func: Callable[[T.nn.Module, str], T.nn.Module],
               patt: Optional[str] = None) -> T.nn.Module:
    """Function ``map_module`` applies a function to each leaf of module tree
    which matches to a specified pattern.

    Parameters
    ----------
    root : torch.nn.Module
        Module to modify.
    func : callable
        Function applied to every module in the tree whose path matches the
        pattern. It takes the module and its path and returns a module.
    patt : str, optional
        Regular expression used to filter modules by their path in the module
        tree. By default every module matches.

    Returns
    -------
    torch.nn.Module
        Module modified in-place.
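
    Examples
    --------
    A minimal sketch: print the path of every submodule of a toy model while
    leaving the model unchanged (the tiny ``Sequential`` below is purely
    illustrative).

    >>> net = T.nn.Sequential(T.nn.Linear(8, 4), T.nn.ReLU())
    >>> _ = map_module(net, lambda mod, path: print(path) or mod)
    /0
    /1
    /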
    """
    @wraps(func)
    def func_safe(*args, **kwargs):
        node = func(*args, **kwargs)
        if not isinstance(node, T.nn.Module):
            raise ValueError('Mapped result must be of torch.nn.Module type '
                             f'but given {type(node)}.')
        return node

    return _map_module(root, func_safe, re.compile(patt or r'.*'), '')


def _map_module(root: T.nn.Module,
                func: Callable[[T.nn.Module, str], T.nn.Module], patt: Pattern,
                path: str) -> T.nn.Module:
    for name, child in root.named_children():
        node = _map_module(child, func, patt, f'{path}/{name}')
        if node != child:
            setattr(root, name, node)
    if patt.match(path or '/'):
        root = func(root, path or '/')
    return root


def convert_linear(module: T.nn.Linear, ctor, **kwargs) -> T.nn.Module:
    """Function convert_linear takes module and returns linear module with
    approximate matmul. Non-linear modules are returned intact.
    """
    if not isinstance(module, T.nn.Linear):
        return module
    raise NotImplementedError


def numel(module: T.nn.Module):
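    """Function ``numel`` estimates the number of elements stored by a
    module: the elements of all parameters and buffers, adjusted for pruning
    masks and for quantized linear layers whose packed weights are not
    exposed as parameters.
    """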
    value = sum(x.numel() for x in module.parameters()) + \
            sum(x.numel() for x in module.buffers())

    def account_prunned(module: T.nn.Module, path: str):
        nonlocal value
        for name, attr in vars(module).items():
            if not name.endswith('_mask') or not isinstance(attr, T.Tensor):
                continue

            weight_name = name[:-5]
            if not hasattr(module, weight_name):
                continue

            weight = getattr(module, weight_name)
            value -= weight.numel() - int(attr.sum().item())
            value += attr.numel()
        return module

    def account_quantized(module: T.nn.Module, path: str):
        nonlocal value
        if isinstance(module, T.nn.quantized.Linear):
            value += module.weight().numel()
            if module.bias() is not None:
                value += module.bias().numel()
        return module

    def account_rest(module: T.nn.Module, path: str):
        account_prunned(module, path)
        account_quantized(module, path)
        return module

    map_module(module, account_rest)
    return value


def sizeof(module: T.nn.Module):
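    """Function ``sizeof`` estimates the size of a module in bytes: the bytes
    occupied by all parameters and buffers, adjusted for pruning masks and
    for quantized linear layers whose packed weights are not exposed as
    parameters.
    """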
    value = sum(x.numel() * x.element_size() for x in module.parameters()) + \
            sum(x.numel() * x.element_size() for x in module.buffers())

    def account_prunned(module: T.nn.Module, path: str):
        nonlocal value
        for name, attr in vars(module).items():
            if not name.endswith('_mask') or not isinstance(attr, T.Tensor):
                continue

            weight_name = name[:-5]
            if not hasattr(module, weight_name):
                continue

            weight = getattr(module, weight_name)
            pruned = weight.numel() - int(attr.sum().item())
            value -= pruned * weight.element_size()
            value += attr.numel() * attr.element_size()
        return module

    def account_quantized(module: T.nn.Module, path: str):
        nonlocal value
        if isinstance(module, T.nn.quantized.Linear):
            value += module.weight().numel() * module.weight().element_size()
            if (bias := module.bias()) is not None:
                value += bias.numel() * bias.element_size()
        return module

    def account_rest(module: T.nn.Module, path: str):
        account_prunned(module, path)
        account_quantized(module, path)
        return module

    map_module(module, account_rest)
    return value


def flatten_module(module: T.nn.Module, regexp=None) -> Dict[str, T.nn.Module]:
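    """Function ``flatten_module`` returns a mapping from path in the module
    tree (paths look like ``/child/grandchild``; the root is ``/``) to the
    corresponding submodule, optionally filtered by ``regexp``.
    """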
    modules = {}
    map_module(module, lambda x, y: modules.update(**{y: x}) or x, regexp)
    return modules


def print_flatten(module: T.nn.Module):
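    """Function ``print_flatten`` prints a table with the index, path, and
    class name of every submodule of a module.
    """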
    paths = []
    path_len = 0
    names = []
    name_len = 0
    indx_len = 0

    def func(module, path):
        nonlocal path_len, name_len, indx_len
        paths.append(path)
        path_len = max(path_len, len(path))
        name = module.__class__.__name__
        names.append(name)
        name_len = max(name_len, len(name))
        indx_len += 1
        return module

    map_module(module, func)

    indx_len = int(np.ceil(np.log10(indx_len)))
    fmt = f'{{indx:>{indx_len}s}} {{path:{path_len}s}} {{name:{name_len}s}}'
    print(fmt.format(indx='#', path='Path', name='Layer'))
    print('-' * (indx_len + path_len + name_len + 2))
    for i, (path, name) in enumerate(zip(paths, names)):
        print(fmt.format(indx=str(i), path=path, name=name))


def compress_linear_svd(module: T.nn.Module, path: str,
                        rank: Optional[int] = None) -> T.nn.Module:
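    """Function ``compress_linear_svd`` replaces a linear layer with an
    ``SVDCompressedLinear`` of the given rank. If no rank is given, it is
    chosen so that the SVD factors hold roughly as many elements as the
    original weight matrix. Non-linear modules are returned intact.
    """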
    if not isinstance(module, T.nn.Linear):
        return module

    # Do not factorize if the rank equals the size of the smallest dimension:
    # the factorization would not reduce the number of parameters.
    norows, nocols = module.weight.shape
    if rank == min(norows, nocols):
        return module

    # If no rank is given, choose the rank at which the number of elements in
    # the SVD factors is approximately equal to the number of elements in the
    # original matrix.
    if rank is None:
        ratio = norows * nocols / (norows + nocols)
        rank = int(np.floor(ratio))
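        # For example (purely illustrative numbers), a 512 x 2048 weight has
        # 1048576 elements and 512 * 2048 / (512 + 2048) = 409.6, so rank 409
        # gives factors with 409 * (512 + 2048) = 1047040 elements in total.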

    return SVDCompressedLinear.from_linear(module, rank)


def compress_linear_tt(module: T.nn.Module, path: str,
                       shape: Tuple[Tuple[int], Tuple[int]],
                       rank: int) -> T.nn.Module:
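    """Function ``compress_linear_tt`` replaces a linear layer with a
    TT-compressed one. ``shape`` holds factorizations of the input and output
    dimensions (their products must equal ``in_features`` and
    ``out_features``, in either order) and ``rank`` is the TT-rank.
    Non-linear modules are returned intact.
    """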
    if not isinstance(module, T.nn.Linear):
        return module

    # TODO(@not-found): We need a proper compression config.
    inp_size = np.prod(shape[0])
    out_size = np.prod(shape[1])
    if inp_size == module.in_features and out_size == module.out_features:
        pass
    elif inp_size == module.out_features and out_size == module.in_features:
        shape = (shape[1], shape[0])
    else:
        raise ValueError(
            'Input and output features do not match the compression shape: '
            f'{shape[0]} vs {module.in_features} and {shape[1]} vs '
            f'{module.out_features}.')

    logging.info('apply tt compression to layer %s', path)
    return TTCompressedLinear.from_linear(module, shape, rank)  # noqa: F821


def compress(module: T.nn.Module, rank: int) -> T.nn.Module:
    """Function compress substitutes in-place linear layer of T5 model with
    linear layer which weight matrix is factorized with SVD.

    :param module: Model to compress.
    :param rank: Desired rank of compressed layer.
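
    A minimal usage sketch (the ``transformers`` checkpoint name below is
    illustrative only)::

        from transformers import T5ForConditionalGeneration

        model = T5ForConditionalGeneration.from_pretrained('t5-small')
        model = compress(model, rank=64)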
    """
    return map_module(
        root=module,
        func=lambda x, y: compress_linear_svd(x, y, rank),
        patt=r'.*/DenseReluDense/w.*')  # TODO(@not-found): Remove?