# Reconstructed from a whitespace-collapsed `git diff` that adds fn_gen/nlr_t_cos/1/.
# Patch entries sharing this span, kept as comments so nothing from the diff is lost:
#   new binary files: 1/distortion.png (46a6695), 1/loss.png (c71b179),
#                     1/quantization.png (ee190c1), 10/distortion.png (c430eaa)
#   1/expressions.txt (0341382, no trailing newline):
#       asin(_0*x)/_s
#       sin(_s*x)/_0
#   1/fn.py (7ec285d, hunk "@@ -0,0 +1,481 @@") is the module below.
from __future__ import annotations

import copy
from typing import Dict, Any, Tuple, List, Callable

import numpy as np
import torch
import torch.nn as nn
from scipy.optimize import curve_fit
from torch import amin  # Necessary for arcsin


def quantization(x, **params):
    """Map ``x`` to the quantized domain: ``asin(_0 * x) / _s``.

    A zero ``_s`` is replaced by 10000 before inverting, and the ``asin``
    argument is clamped to ``[-0.99999, 0.99999]`` with NaNs mapped to 0.
    """
    inv_scale = torch.reciprocal(replace_num(params['_s'], num=0, to=10000))
    guarded = domain_guard(params['_0'] * x, min=-0.99999, max=0.99999, nan=0)
    return inv_scale * torch.asin(guarded)


def dequantization(x, **params):
    """Map quantized values back: ``sin(_s * x) / _0`` (zero ``_0`` -> 10000)."""
    inv_0 = torch.reciprocal(replace_num(params['_0'], num=0, to=10000))
    return inv_0 * torch.sin(params['_s'] * x)


def _int_dtype_for_bits(bits: int) -> torch.dtype:
    """Return the narrowest signed torch integer dtype that can hold ``bits`` bits."""
    for width, dtype in ((8, torch.int8), (16, torch.int16), (32, torch.int32)):
        if bits <= width:
            return dtype
    return torch.int64


def init_params(x: torch.Tensor, **kwargs: Any) -> Dict[str, nn.Parameter]:
    """Initialise and train the ``_0`` / ``_s`` parameters for ``x``.

    Pipeline: random space search for ``_0``, linear scale for ``_s``,
    per-channel non-linear regression, then 500 epochs of gradient descent.

    Args:
        x: Tensor to quantize; the helpers reduce over ``dim=1``.
        **kwargs: Must contain ``bits``. May contain the optional callbacks
            ``post_init_hook``, ``post_method_hook`` and ``post_train_hook``,
            each called as ``hook(parameters=...)``. All kwargs are forwarded
            to the init helpers.

    Returns:
        The learned parameters, keyed ``'_0'`` and ``'_s'``.
    """
    names = ['_0', '_s']
    base_p0 = {
        '_0': init_space_search(
            x, qtz_func=quantization, deqtz_func=dequantization,
            params_list=names, param='_0', **kwargs),
    }
    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    if 'post_init_hook' in kwargs:
        kwargs['post_init_hook'](parameters=base_p0)

    fitted = init_non_linear_regression_fit(
        x, p0=base_p0, np_fit_func=fit_func, qtz_func=quantization,
        deqtz_func=dequantization, params_list=names, **kwargs)
    params = {k: nn.Parameter(v, requires_grad=False) for k, v in fitted.items()}
    if 'post_method_hook' in kwargs:
        kwargs['post_method_hook'](parameters=params)

    bits = kwargs['bits']
    params = learn_parameters(
        x, params,
        qtz_func=quantization,
        deqtz_func=dequantization,
        bits=bits,
        # int8 was hard-coded before; wider bit widths would wrap around in the cast.
        target_dtype=_int_dtype_for_bits(bits),
        epochs=500,
        early_stop=False,
    )
    if 'post_train_hook' in kwargs:
        kwargs['post_train_hook'](parameters=params)

    return params


############### Numpy Qtz ###############


def np_quantization(x, _0, _s):
    """NumPy twin of :func:`quantization`."""
    guarded = np_domain_guard(_0 * x, min=-0.99999, max=0.99999, nan=0)
    return np.divide(1, np_replace_num(_s, num=0, to=10000)) * np.arcsin(guarded)


def np_dequantization(x, _0, _s):
    """NumPy twin of :func:`dequantization`."""
    return np.divide(1, np_replace_num(_0, num=0, to=10000)) * np.sin(_s * x)


def fit_func(x, _0, _s):
    """Quantize then dequantize ``x`` in NumPy; the model handed to ``curve_fit``."""
    return np_dequantization(np_quantization(x, _0, _s), _0, _s)


############### HELPERS ###############

def domain_guard(
        x: torch.Tensor,
        min: float = None,
        max: float = None,
        posinf: float = None,
        neginf: float = None,
        nan: float = None
    ) -> torch.Tensor:
    """Replace NaN/Inf entries of ``x`` and optionally clamp it to ``[min, max]``.

    ``None`` for ``nan``/``posinf``/``neginf`` keeps the ``torch.nan_to_num``
    defaults; ``None`` for a bound leaves that side open.
    """
    cleaned = torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
    if min is None and max is None:
        return cleaned
    return torch.clamp(cleaned, min=min, max=max)


def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Return ``x`` with every element equal to ``num`` replaced by ``to``.

    Args:
        x (torch.Tensor): The input tensor.
        num (float): The number to replace.
        to (float): The number to replace with.

    Returns:
        torch.Tensor: The tensor with the number replaced.
    """
    return torch.where(x == num, to, x)


def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Raise ``x`` to ``exp``; for ``exp < 1`` negative bases are zeroed first."""
    if exp >= 1:
        return torch.pow(x, exp)
    return torch.pow(torch.relu(x), exp)


def init_ones(x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
    """Return float32 ones shaped like ``x`` reduced over ``dim=1``."""
    reduced = torch.amin(x, dim=1)  # only used for its shape
    return torch.ones_like(reduced, dtype=torch.float32, device=x.device)


def init_rand(x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
    """Return float32 standard-normal samples shaped like ``x`` reduced over ``dim=1``."""
    reduced = torch.amin(x, dim=1)  # only used for its shape
    return torch.randn_like(reduced, dtype=torch.float32, device=x.device)


def init_space_search(
        x: torch.Tensor,
        **kwargs: Any,
    ) -> torch.Tensor:
    """Random search for the value of one parameter that minimises round-trip MSE.

    The other parameters in ``params_list`` are fixed to ones. Candidates are
    scored by ``MSE(x, deqtz_func(qtz_func(x)))`` without rounding.

    Args:
        x: Input tensor; transposed on ``(0, 1)`` before the functions are applied.
        **kwargs: Required: ``qtz_func``, ``deqtz_func``, ``params_list``,
            ``param``. Optional overrides of the former constants:
            ``n_runs`` (50), ``n_random_params`` (50), ``n_best_to_pick`` (5),
            ``max_initial`` (10000).

    Returns:
        The best of: the search result, ``init_ones`` and one ``init_rand``
        draw. Ties favour them in that order.
    """
    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs['qtz_func']
    deqtz_func = kwargs['deqtz_func']
    params_list = kwargs['params_list']
    param = kwargs['param']

    n_runs = kwargs.get('n_runs', 50)                    # search rounds
    n_random_params = kwargs.get('n_random_params', 50)  # candidates per round
    n_best_to_pick = kwargs.get('n_best_to_pick', 5)     # leaders kept between rounds
    max_initial = kwargs.get('max_initial', 10000)       # std-dev of the first draw

    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}

    def _initial_candidates():
        """First round: 10x more candidates, drawn from N(0, max_initial**2)."""
        for _ in range(n_random_params * 10):
            yield init_rand(x) * max_initial

    def _candidates_around(best: List[torch.Tensor]):
        """Later rounds: sample around the mean of the current leaders."""
        stacked = torch.stack(best)
        low, high = torch.aminmax(stacked, dim=0)
        radius = torch.max(-low, high)
        centre = torch.mean(stacked, dim=0)
        for _ in range(n_random_params):
            yield torch.randn_like(low) * radius + centre

    def _round_trip_loss(candidate: torch.Tensor) -> float:
        """Score a candidate; invalid or non-finite results score ``inf``."""
        trial = {**base_params, param: candidate}
        try:
            with torch.no_grad():
                x_ = qtz_func(x=x.transpose(0, 1), **trial)
                x_ = deqtz_func(x=x_, **trial).transpose(0, 1)
                value = nn.MSELoss()(x, x_).item()
        except (ArithmeticError, RuntimeError, ValueError):
            # The parameters might not be valid for the function's domain.
            return float("inf")
        return value if np.isfinite(value) else float("inf")

    # Leaders persist across rounds so a good early candidate is never lost.
    leaders: List[Tuple[torch.Tensor, float]] = []
    candidates = _initial_candidates()
    for _ in range(n_runs):
        for candidate in candidates:
            loss = _round_trip_loss(candidate)
            if loss == float("inf"):
                continue  # NaN/Inf losses would corrupt the ranking
            if len(leaders) < n_best_to_pick or loss < leaders[-1][1]:
                leaders.append((candidate, loss))
                leaders.sort(key=lambda entry: entry[1])
                del leaders[n_best_to_pick:]
        if leaders:
            candidates = _candidates_around([c for c, _ in leaders])
        else:
            candidates = _initial_candidates()  # nothing usable yet: draw again

    # Compare against the trivial initialisations; works with an empty leader list.
    finalists = leaders[:1]
    for fallback in (init_ones(x, **kwargs), init_rand(x, **kwargs)):
        finalists.append((fallback, _round_trip_loss(fallback)))
    return min(finalists, key=lambda entry: entry[1])[0]


def init_linear_scale(  # Symmetric scale. From the study folder
        x: torch.Tensor,
        **kwargs: Any,
    ) -> torch.Tensor:
    """Compute a symmetric scale from the quantized range of ``x``.

    ``x`` is passed through ``qtz_func`` with ``_s`` fixed to ones. The
    largest absolute value along ``dim=1`` is divided by half the signed
    integer range for ``bits``. The result is clamped to at least float32 eps.

    Args:
        x: Input tensor.
        **kwargs: Required: ``bits``, ``params`` (already-initialised
            parameters) and ``qtz_func``.
    """
    assert "bits" in kwargs, "bits must be provided."
    assert "params" in kwargs, "params must be provided."
    assert "qtz_func" in kwargs, "qtz_func must be provided."

    bits = kwargs['bits']
    qtz_func = kwargs['qtz_func']
    # Merging avoids a duplicate-keyword error when ``params`` already holds ``_s``.
    unit_scale_params = {**kwargs['params'], '_s': init_ones(x, **kwargs)}

    x_ = qtz_func(x=x.transpose(0, 1), **unit_scale_params).transpose(0, 1)

    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    min_vals, max_vals = torch.aminmax(x_, dim=1)
    zeros = torch.zeros_like(min_vals)
    abs_max_val_per_ch = torch.max(-torch.min(min_vals, zeros), torch.max(max_vals, zeros))

    scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2)
    eps = torch.finfo(torch.float32).eps
    scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device)

    # NOTE(diogo): Adding noise to the scale was tried and disproven -- it does
    # not help learning. Kept for future reference, to be removed later.
    # scale = scale + 0.01 * torch.randn_like(scale)

    return scale


def init_non_linear_regression_fit(
        x: torch.Tensor,
        **kwargs: Any,
    ) -> Dict[str, torch.Tensor]:
    """Refine ``p0`` per channel by fitting ``np_fit_func(x) ~= x`` with ``curve_fit``.

    Args:
        x: Input tensor; each row along dim 0 is fitted independently.
        **kwargs: Required: ``params_list``, ``np_fit_func`` and ``p0`` (dict
            of per-channel initial tensors). ``np_fit_func`` must take its
            parameters in sorted-name order.

    Returns:
        Dict of float32 tensors on ``x.device``. If any channel cannot be
        fitted, the initial guess ``p0`` is returned for all channels.
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs['np_fit_func']
    # Sorted so the order matches the numpy fit function's argument order.
    params_list = sorted(kwargs['params_list'])
    p0 = {k: v.detach().cpu().numpy() for k, v in kwargs['p0'].items()}

    # Sorting each channel makes the data easier to fit.
    sorted_xdata = np.sort(x.detach().cpu().numpy(), axis=-1)

    try:
        per_channel = []
        for i, channel in enumerate(sorted_xdata):
            popt, _ = curve_fit(
                np_fit_func,
                channel,
                channel,  # target is the identity: dequant(quant(x)) should equal x
                maxfev=1000,
                p0=[p0[p][i] for p in params_list],
                method='lm',
            )
            per_channel.append(popt)
    except (ValueError, RuntimeError) as e:
        # curve_fit raises ValueError for NaNs / bad options and RuntimeError
        # when the least-squares minimisation fails (e.g. maxfev exceeded).
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }

    table = np.asarray(per_channel, dtype=np.float64).reshape(len(per_channel), len(params_list))
    return {
        p: torch.tensor(table[:, col], dtype=torch.float32).to(x.device)
        for col, p in enumerate(params_list)
    }


def init_zeros(x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
    """Return float32 zeros shaped like ``x`` reduced over ``dim=1``."""
    reduced = torch.amin(x, dim=1)  # only used for its shape
    return torch.zeros_like(reduced, dtype=torch.float32, device=x.device)


def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Return the factor that maps the zero-including range of ``tensor`` onto ``[_min, _max]``.

    With an infinite ``_max`` no scaling is needed and ones are returned.
    The data range is clamped to float32 eps so an all-zero row does not
    divide by zero.
    """
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    # Compare by value: ``is`` only matched the default object, not float('inf').
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    span = torch.clamp(x_max - x_min, min=torch.finfo(torch.float32).eps)
    return (_max - _min) / span


############## Quant ###############

@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: Callable,
    deqtz_func: Callable,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
):
    """Tune ``params`` with Adam to minimise ``MSE(x, dequantize(quantize(x)))``.

    ``params`` is updated in place by the optimizer and left with
    ``requires_grad=True``. The returned dict is an independent copy.

    NOTE(review): ``quantize`` casts to ``target_dtype``. With an integer
    dtype no gradient flows through ``qtz_func``, so only the dependence of
    ``deqtz_func`` on the parameters is trained.

    Args:
        x: Tensor to reconstruct.
        params: Initial parameters.
        qtz_func / deqtz_func: Called as ``func(x=..., **params)``.
        bits: Bit width used for clamping.
        target_dtype: Dtype of the quantized tensor.
        epochs: Number of optimisation steps.
        early_stop: Stop when the loss moved less than 1e-7 over 10% of ``epochs``.
        do_report: Print progress and also return the per-epoch losses.

    Returns:
        ``best_params``, or ``(best_params, losses)`` when ``do_report`` is set.

    Raises:
        Exception: If the loss becomes NaN or Inf.
    """
    loss_fn = nn.MSELoss()

    def _round_trip_loss() -> torch.Tensor:
        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        return loss_fn(x, dequant)

    def _snapshot() -> Dict[str, nn.Parameter]:
        return {
            k: nn.Parameter(v.detach().clone(), requires_grad=False)
            for k, v in params.items()
        }

    # The learning rate is 0.1 scaled by half the order of magnitude of the
    # initial loss. A zero loss is clamped so log10 stays finite.
    initial_loss = _round_trip_loss().item()
    if not np.isfinite(initial_loss):
        raise Exception("Loss is NaN or Inf. Stopping the search.")
    base_lr = 0.1
    tiny = float(torch.finfo(torch.float32).tiny)
    exponent = int(np.floor(np.log10(max(initial_loss, tiny))))
    lr = base_lr * (10 ** (exponent // 2))

    for p in params.values():
        p.requires_grad = True
        p.grad = None

    optimizer = torch.optim.Adam(list(params.values()), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0=max(1, epochs // 10),  # T_0 must be a positive integer
        T_mult=1,
        eta_min=lr * 0.1,
        last_epoch=-1,
    )

    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    epochs_before_stop = int(epochs * 0.1)

    for i in range(epochs):
        optimizer.zero_grad()
        loss = _round_trip_loss()
        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")
        loss_value = loss.item()

        # Snapshot BEFORE stepping: this loss belongs to the current values,
        # not to the ones the optimizer is about to write.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = _snapshot()

        loss.backward()
        optimizer.step()
        scheduler.step()
        acc_loss.append(loss_value)

        # Reports loss every 10 steps
        if do_report and i % 10 == 0:
            print(f"Epoch {i}: Loss {loss_value}")

        # Stop if the loss has not changed considerably during the last 10% of epochs
        if (early_stop and i > epochs_before_stop
                and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta):
            break

    # The last update has not been scored yet; keep it if it is the best.
    with torch.no_grad():
        final_loss = _round_trip_loss().item()
    if best_params is None or final_loss < best_loss:
        best_loss = final_loss
        best_params = _snapshot()

    if do_report:
        print(f"Best loss: {best_loss}")
        return best_params, acc_loss
    return best_params


def quantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: Callable,
    bits: int,
    target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    """Apply ``func``, round, clamp to the signed ``bits`` range and cast to ``target_dtype``."""
    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    # Transposed so the per-channel parameters broadcast over the last dim.
    transformed = func(x=x.transpose(0, 1), **params).transpose(0, 1)
    rounded = round_func_BPDA(transformed)
    return torch.clamp(rounded, quant_min, quant_max).to(target_dtype)


def dequantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: Callable,
    bits: int,
    out_dtype: torch.dtype
) -> torch.Tensor:
    """Cast ``x`` to ``out_dtype`` and apply ``func``; ``bits`` is unused here."""
    as_float = x.to(dtype=out_dtype).transpose(0, 1)
    return func(x=as_float, **params).transpose(0, 1)


def round_func_BPDA(input):
    """Round in the forward pass while behaving as identity in the backward pass."""
    # Swapping ``.data`` keeps the autograd history of ``clone`` (identity)
    # but makes the forward value the rounded one.
    out = input.clone()
    out.data = torch.round(input).data
    return out


def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    """Return ``(min, max)`` of a signed two's-complement integer of ``bit_width`` bits."""
    half_range = 2 ** (bit_width - 1)
    return -half_range, half_range - 1


############## Numpy ###############

def np_domain_guard(
        x: np.ndarray,
        min: float = None,
        max: float = None,
        posinf: float = None,
        neginf: float = None,
        nan: float = None
    ) -> np.ndarray:
    """NumPy twin of :func:`domain_guard`."""
    # np.nan_to_num has no ``None`` sentinel for ``nan``; use its default of
    # 0.0, which matches what torch.nan_to_num does for ``nan=None``.
    fill = 0.0 if nan is None else nan
    cleaned = np.nan_to_num(x, nan=fill, posinf=posinf, neginf=neginf)
    if min is None and max is None:
        return cleaned
    return np.clip(cleaned, min, max)


def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray:
    """Return ``x`` with every element equal to ``num`` replaced by ``to``.

    Args:
        x (np.ndarray): The input array.
        num (float): The number to replace.
        to (float): The number to replace with.

    Returns:
        np.ndarray: The array with the number replaced.
    """
    return np.where(x == num, to, x)


def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Raise ``x`` to ``exp``; for ``exp < 1`` negative bases are zeroed first."""
    if exp >= 1:
        return np.power(x, exp)
    return np.power(np.maximum(x, 0), exp)
# Reconstructed from a whitespace-collapsed `git diff` that adds fn_gen/nlr_t_cos/10/.
# Patch entries sharing this span, kept as comments so nothing from the diff is lost:
#   new binary files: 10/loss.png (9608970), 10/quantization.png (17935b1),
#                     11/distortion.png (14b621b)
#   10/expressions.txt (e8458af, no trailing newline):
#       sqrt(_0*x)/_s
#       _s**2*x**2/_0
#   10/fn.py (9d433b6, hunk "@@ -0,0 +1,481 @@") is the module below.
from __future__ import annotations

import copy
from typing import Dict, Any, Tuple, List, Callable

import numpy as np
import torch
import torch.nn as nn
from scipy.optimize import curve_fit
from torch import amin  # Necessary for arcsin


def quantization(x, **params):
    """Map ``x`` to the quantized domain: ``sqrt(_0 * x) / _s``.

    A zero ``_s`` is replaced by 10000 before inverting, and the ``sqrt``
    argument is clamped to at least 0.1 with NaNs mapped to 0.1.
    """
    inv_scale = torch.reciprocal(replace_num(params['_s'], num=0, to=10000))
    radicand = domain_guard(params['_0'] * x, min=0.1, nan=0.1)
    return inv_scale * torch.sqrt(radicand)


def dequantization(x, **params):
    """Map quantized values back: ``_s**2 * x**2 / _0`` (zero ``_0`` -> 10000)."""
    inv_0 = torch.reciprocal(replace_num(params['_0'], num=0, to=10000))
    scale_sq = guarded_torch_power(params['_s'], torch.tensor(2))
    return inv_0 * scale_sq * guarded_torch_power(x, torch.tensor(2))


def _int_dtype_for_bits(bits: int) -> torch.dtype:
    """Return the narrowest signed torch integer dtype that can hold ``bits`` bits."""
    for width, dtype in ((8, torch.int8), (16, torch.int16), (32, torch.int32)):
        if bits <= width:
            return dtype
    return torch.int64


def init_params(x: torch.Tensor, **kwargs: Any) -> Dict[str, nn.Parameter]:
    """Initialise and train the ``_0`` / ``_s`` parameters for ``x``.

    Pipeline: random space search for ``_0``, linear scale for ``_s``,
    per-channel non-linear regression, then 500 epochs of gradient descent.

    Args:
        x: Tensor to quantize; the helpers reduce over ``dim=1``.
        **kwargs: Must contain ``bits``. May contain the optional callbacks
            ``post_init_hook``, ``post_method_hook`` and ``post_train_hook``,
            each called as ``hook(parameters=...)``. All kwargs are forwarded
            to the init helpers.

    Returns:
        The learned parameters, keyed ``'_0'`` and ``'_s'``.
    """
    names = ['_0', '_s']
    base_p0 = {
        '_0': init_space_search(
            x, qtz_func=quantization, deqtz_func=dequantization,
            params_list=names, param='_0', **kwargs),
    }
    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    if 'post_init_hook' in kwargs:
        kwargs['post_init_hook'](parameters=base_p0)

    fitted = init_non_linear_regression_fit(
        x, p0=base_p0, np_fit_func=fit_func, qtz_func=quantization,
        deqtz_func=dequantization, params_list=names, **kwargs)
    params = {k: nn.Parameter(v, requires_grad=False) for k, v in fitted.items()}
    if 'post_method_hook' in kwargs:
        kwargs['post_method_hook'](parameters=params)

    bits = kwargs['bits']
    params = learn_parameters(
        x, params,
        qtz_func=quantization,
        deqtz_func=dequantization,
        bits=bits,
        # int8 was hard-coded before; wider bit widths would wrap around in the cast.
        target_dtype=_int_dtype_for_bits(bits),
        epochs=500,
        early_stop=False,
    )
    if 'post_train_hook' in kwargs:
        kwargs['post_train_hook'](parameters=params)

    return params


############### Numpy Qtz ###############


def np_quantization(x, _0, _s):
    """NumPy twin of :func:`quantization`."""
    radicand = np_domain_guard(_0 * x, min=0.1, nan=0.1)
    return np.divide(1, np_replace_num(_s, num=0, to=10000)) * np.sqrt(radicand)


def np_dequantization(x, _0, _s):
    """NumPy twin of :func:`dequantization`."""
    inv_0 = np.divide(1, np_replace_num(_0, num=0, to=10000))
    return inv_0 * np_guarded_power(_s, np.array(2)) * np_guarded_power(x, np.array(2))


def fit_func(x, _0, _s):
    """Quantize then dequantize ``x`` in NumPy; the model handed to ``curve_fit``."""
    return np_dequantization(np_quantization(x, _0, _s), _0, _s)


############### HELPERS ###############

def domain_guard(
        x: torch.Tensor,
        min: float = None,
        max: float = None,
        posinf: float = None,
        neginf: float = None,
        nan: float = None
    ) -> torch.Tensor:
    """Replace NaN/Inf entries of ``x`` and optionally clamp it to ``[min, max]``.

    ``None`` for ``nan``/``posinf``/``neginf`` keeps the ``torch.nan_to_num``
    defaults; ``None`` for a bound leaves that side open.
    """
    cleaned = torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
    if min is None and max is None:
        return cleaned
    return torch.clamp(cleaned, min=min, max=max)


def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Return ``x`` with every element equal to ``num`` replaced by ``to``.

    Args:
        x (torch.Tensor): The input tensor.
        num (float): The number to replace.
        to (float): The number to replace with.

    Returns:
        torch.Tensor: The tensor with the number replaced.
    """
    return torch.where(x == num, to, x)


def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Raise ``x`` to ``exp``; for ``exp < 1`` negative bases are zeroed first."""
    if exp >= 1:
        return torch.pow(x, exp)
    return torch.pow(torch.relu(x), exp)


def init_ones(x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
    """Return float32 ones shaped like ``x`` reduced over ``dim=1``."""
    reduced = torch.amin(x, dim=1)  # only used for its shape
    return torch.ones_like(reduced, dtype=torch.float32, device=x.device)


def init_rand(x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
    """Return float32 standard-normal samples shaped like ``x`` reduced over ``dim=1``."""
    reduced = torch.amin(x, dim=1)  # only used for its shape
    return torch.randn_like(reduced, dtype=torch.float32, device=x.device)


def init_space_search(
        x: torch.Tensor,
        **kwargs: Any,
    ) -> torch.Tensor:
    """Random search for the value of one parameter that minimises round-trip MSE.

    The other parameters in ``params_list`` are fixed to ones. Candidates are
    scored by ``MSE(x, deqtz_func(qtz_func(x)))`` without rounding.

    Args:
        x: Input tensor; transposed on ``(0, 1)`` before the functions are applied.
        **kwargs: Required: ``qtz_func``, ``deqtz_func``, ``params_list``,
            ``param``. Optional overrides of the former constants:
            ``n_runs`` (50), ``n_random_params`` (50), ``n_best_to_pick`` (5),
            ``max_initial`` (10000).

    Returns:
        The best of: the search result, ``init_ones`` and one ``init_rand``
        draw. Ties favour them in that order.
    """
    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs['qtz_func']
    deqtz_func = kwargs['deqtz_func']
    params_list = kwargs['params_list']
    param = kwargs['param']

    n_runs = kwargs.get('n_runs', 50)                    # search rounds
    n_random_params = kwargs.get('n_random_params', 50)  # candidates per round
    n_best_to_pick = kwargs.get('n_best_to_pick', 5)     # leaders kept between rounds
    max_initial = kwargs.get('max_initial', 10000)       # std-dev of the first draw

    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}

    def _initial_candidates():
        """First round: 10x more candidates, drawn from N(0, max_initial**2)."""
        for _ in range(n_random_params * 10):
            yield init_rand(x) * max_initial

    def _candidates_around(best: List[torch.Tensor]):
        """Later rounds: sample around the mean of the current leaders."""
        stacked = torch.stack(best)
        low, high = torch.aminmax(stacked, dim=0)
        radius = torch.max(-low, high)
        centre = torch.mean(stacked, dim=0)
        for _ in range(n_random_params):
            yield torch.randn_like(low) * radius + centre

    def _round_trip_loss(candidate: torch.Tensor) -> float:
        """Score a candidate; invalid or non-finite results score ``inf``."""
        trial = {**base_params, param: candidate}
        try:
            with torch.no_grad():
                x_ = qtz_func(x=x.transpose(0, 1), **trial)
                x_ = deqtz_func(x=x_, **trial).transpose(0, 1)
                value = nn.MSELoss()(x, x_).item()
        except (ArithmeticError, RuntimeError, ValueError):
            # The parameters might not be valid for the function's domain.
            return float("inf")
        return value if np.isfinite(value) else float("inf")

    # Leaders persist across rounds so a good early candidate is never lost.
    leaders: List[Tuple[torch.Tensor, float]] = []
    candidates = _initial_candidates()
    for _ in range(n_runs):
        for candidate in candidates:
            loss = _round_trip_loss(candidate)
            if loss == float("inf"):
                continue  # NaN/Inf losses would corrupt the ranking
            if len(leaders) < n_best_to_pick or loss < leaders[-1][1]:
                leaders.append((candidate, loss))
                leaders.sort(key=lambda entry: entry[1])
                del leaders[n_best_to_pick:]
        if leaders:
            candidates = _candidates_around([c for c, _ in leaders])
        else:
            candidates = _initial_candidates()  # nothing usable yet: draw again

    # Compare against the trivial initialisations; works with an empty leader list.
    finalists = leaders[:1]
    for fallback in (init_ones(x, **kwargs), init_rand(x, **kwargs)):
        finalists.append((fallback, _round_trip_loss(fallback)))
    return min(finalists, key=lambda entry: entry[1])[0]


def init_linear_scale(  # Symmetric scale. From the study folder
        x: torch.Tensor,
        **kwargs: Any,
    ) -> torch.Tensor:
    """Compute a symmetric scale from the quantized range of ``x``.

    ``x`` is passed through ``qtz_func`` with ``_s`` fixed to ones. The
    largest absolute value along ``dim=1`` is divided by half the signed
    integer range for ``bits``. The result is clamped to at least float32 eps.

    Args:
        x: Input tensor.
        **kwargs: Required: ``bits``, ``params`` (already-initialised
            parameters) and ``qtz_func``.
    """
    assert "bits" in kwargs, "bits must be provided."
    assert "params" in kwargs, "params must be provided."
    assert "qtz_func" in kwargs, "qtz_func must be provided."

    bits = kwargs['bits']
    qtz_func = kwargs['qtz_func']
    # Merging avoids a duplicate-keyword error when ``params`` already holds ``_s``.
    unit_scale_params = {**kwargs['params'], '_s': init_ones(x, **kwargs)}

    x_ = qtz_func(x=x.transpose(0, 1), **unit_scale_params).transpose(0, 1)

    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    min_vals, max_vals = torch.aminmax(x_, dim=1)
    zeros = torch.zeros_like(min_vals)
    abs_max_val_per_ch = torch.max(-torch.min(min_vals, zeros), torch.max(max_vals, zeros))

    scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2)
    eps = torch.finfo(torch.float32).eps
    scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device)

    # NOTE(diogo): Adding noise to the scale was tried and disproven -- it does
    # not help learning. Kept for future reference, to be removed later.
    # scale = scale + 0.01 * torch.randn_like(scale)

    return scale


def init_non_linear_regression_fit(
        x: torch.Tensor,
        **kwargs: Any,
    ) -> Dict[str, torch.Tensor]:
    """Refine ``p0`` per channel by fitting ``np_fit_func(x) ~= x`` with ``curve_fit``.

    Args:
        x: Input tensor; each row along dim 0 is fitted independently.
        **kwargs: Required: ``params_list``, ``np_fit_func`` and ``p0`` (dict
            of per-channel initial tensors). ``np_fit_func`` must take its
            parameters in sorted-name order.

    Returns:
        Dict of float32 tensors on ``x.device``. If any channel cannot be
        fitted, the initial guess ``p0`` is returned for all channels.
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs['np_fit_func']
    # Sorted so the order matches the numpy fit function's argument order.
    params_list = sorted(kwargs['params_list'])
    p0 = {k: v.detach().cpu().numpy() for k, v in kwargs['p0'].items()}

    # Sorting each channel makes the data easier to fit.
    sorted_xdata = np.sort(x.detach().cpu().numpy(), axis=-1)

    try:
        per_channel = []
        for i, channel in enumerate(sorted_xdata):
            popt, _ = curve_fit(
                np_fit_func,
                channel,
                channel,  # target is the identity: dequant(quant(x)) should equal x
                maxfev=1000,
                p0=[p0[p][i] for p in params_list],
                method='lm',
            )
            per_channel.append(popt)
    except (ValueError, RuntimeError) as e:
        # curve_fit raises ValueError for NaNs / bad options and RuntimeError
        # when the least-squares minimisation fails (e.g. maxfev exceeded).
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }

    table = np.asarray(per_channel, dtype=np.float64).reshape(len(per_channel), len(params_list))
    return {
        p: torch.tensor(table[:, col], dtype=torch.float32).to(x.device)
        for col, p in enumerate(params_list)
    }


def init_zeros(x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
    """Return float32 zeros shaped like ``x`` reduced over ``dim=1``."""
    reduced = torch.amin(x, dim=1)  # only used for its shape
    return torch.zeros_like(reduced, dtype=torch.float32, device=x.device)


def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Return the factor that maps the zero-including range of ``tensor`` onto ``[_min, _max]``.

    With an infinite ``_max`` no scaling is needed and ones are returned.
    The data range is clamped to float32 eps so an all-zero row does not
    divide by zero.
    """
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    # Compare by value: ``is`` only matched the default object, not float('inf').
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    span = torch.clamp(x_max - x_min, min=torch.finfo(torch.float32).eps)
    return (_max - _min) / span


############## Quant ###############

@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: Callable,
    deqtz_func: Callable,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
):
    """Tune ``params`` with Adam to minimise ``MSE(x, dequantize(quantize(x)))``.

    ``params`` is updated in place by the optimizer and left with
    ``requires_grad=True``. The returned dict is an independent copy.

    NOTE(review): ``quantize`` casts to ``target_dtype``. With an integer
    dtype no gradient flows through ``qtz_func``, so only the dependence of
    ``deqtz_func`` on the parameters is trained.

    Args:
        x: Tensor to reconstruct.
        params: Initial parameters.
        qtz_func / deqtz_func: Called as ``func(x=..., **params)``.
        bits: Bit width used for clamping.
        target_dtype: Dtype of the quantized tensor.
        epochs: Number of optimisation steps.
        early_stop: Stop when the loss moved less than 1e-7 over 10% of ``epochs``.
        do_report: Print progress and also return the per-epoch losses.

    Returns:
        ``best_params``, or ``(best_params, losses)`` when ``do_report`` is set.

    Raises:
        Exception: If the loss becomes NaN or Inf.
    """
    loss_fn = nn.MSELoss()

    def _round_trip_loss() -> torch.Tensor:
        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        return loss_fn(x, dequant)

    def _snapshot() -> Dict[str, nn.Parameter]:
        return {
            k: nn.Parameter(v.detach().clone(), requires_grad=False)
            for k, v in params.items()
        }

    # The learning rate is 0.1 scaled by half the order of magnitude of the
    # initial loss. A zero loss is clamped so log10 stays finite.
    initial_loss = _round_trip_loss().item()
    if not np.isfinite(initial_loss):
        raise Exception("Loss is NaN or Inf. Stopping the search.")
    base_lr = 0.1
    tiny = float(torch.finfo(torch.float32).tiny)
    exponent = int(np.floor(np.log10(max(initial_loss, tiny))))
    lr = base_lr * (10 ** (exponent // 2))

    for p in params.values():
        p.requires_grad = True
        p.grad = None

    optimizer = torch.optim.Adam(list(params.values()), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0=max(1, epochs // 10),  # T_0 must be a positive integer
        T_mult=1,
        eta_min=lr * 0.1,
        last_epoch=-1,
    )

    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    epochs_before_stop = int(epochs * 0.1)

    for i in range(epochs):
        optimizer.zero_grad()
        loss = _round_trip_loss()
        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")
        loss_value = loss.item()

        # Snapshot BEFORE stepping: this loss belongs to the current values,
        # not to the ones the optimizer is about to write.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = _snapshot()

        loss.backward()
        optimizer.step()
        scheduler.step()
        acc_loss.append(loss_value)

        # Reports loss every 10 steps
        if do_report and i % 10 == 0:
            print(f"Epoch {i}: Loss {loss_value}")

        # Stop if the loss has not changed considerably during the last 10% of epochs
        if (early_stop and i > epochs_before_stop
                and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta):
            break

    # The last update has not been scored yet; keep it if it is the best.
    with torch.no_grad():
        final_loss = _round_trip_loss().item()
    if best_params is None or final_loss < best_loss:
        best_loss = final_loss
        best_params = _snapshot()

    if do_report:
        print(f"Best loss: {best_loss}")
        return best_params, acc_loss
    return best_params


def quantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: Callable,
    bits: int,
    target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    """Apply ``func``, round, clamp to the signed ``bits`` range and cast to ``target_dtype``."""
    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    # Transposed so the per-channel parameters broadcast over the last dim.
    transformed = func(x=x.transpose(0, 1), **params).transpose(0, 1)
    rounded = round_func_BPDA(transformed)
    return torch.clamp(rounded, quant_min, quant_max).to(target_dtype)


def dequantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: Callable,
    bits: int,
    out_dtype: torch.dtype
) -> torch.Tensor:
    """Cast ``x`` to ``out_dtype`` and apply ``func``; ``bits`` is unused here."""
    as_float = x.to(dtype=out_dtype).transpose(0, 1)
    return func(x=as_float, **params).transpose(0, 1)


def round_func_BPDA(input):
    """Round in the forward pass while behaving as identity in the backward pass."""
    # Swapping ``.data`` keeps the autograd history of ``clone`` (identity)
    # but makes the forward value the rounded one.
    out = input.clone()
    out.data = torch.round(input).data
    return out


def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    """Return ``(min, max)`` of a signed two's-complement integer of ``bit_width`` bits."""
    half_range = 2 ** (bit_width - 1)
    return -half_range, half_range - 1


############## Numpy ###############

def np_domain_guard(
        x: np.ndarray,
        min: float = None,
        max: float = None,
        posinf: float = None,
        neginf: float = None,
        nan: float = None
    ) -> np.ndarray:
    """NumPy twin of :func:`domain_guard`."""
    # np.nan_to_num has no ``None`` sentinel for ``nan``; use its default of
    # 0.0, which matches what torch.nan_to_num does for ``nan=None``.
    fill = 0.0 if nan is None else nan
    cleaned = np.nan_to_num(x, nan=fill, posinf=posinf, neginf=neginf)
    if min is None and max is None:
        return cleaned
    return np.clip(cleaned, min, max)


def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray:
    """Return ``x`` with every element equal to ``num`` replaced by ``to``.

    Args:
        x (np.ndarray): The input array.
        num (float): The number to replace.
        to (float): The number to replace with.

    Returns:
        np.ndarray: The array with the number replaced.
    """
    return np.where(x == num, to, x)


def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Raise ``x`` to ``exp``; for ``exp < 1`` negative bases are zeroed first."""
    if exp >= 1:
        return np.power(x, exp)
    return np.power(np.maximum(x, 0), exp)
a/fn_gen/nlr_t_cos/11/expressions.txt b/fn_gen/nlr_t_cos/11/expressions.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed99293c42843616c361d59b23d32ae553cc0f8d --- /dev/null +++ b/fn_gen/nlr_t_cos/11/expressions.txt @@ -0,0 +1,2 @@ +atanh(_0*x)/_s +tanh(_s*x)/_0 \ No newline at end of file diff --git a/fn_gen/nlr_t_cos/11/fn.py b/fn_gen/nlr_t_cos/11/fn.py new file mode 100644 index 0000000000000000000000000000000000000000..70901ca0737d413e2f2e47f99389813037b5cc9d --- /dev/null +++ b/fn_gen/nlr_t_cos/11/fn.py @@ -0,0 +1,481 @@ +from __future__ import annotations + +import torch +from torch import amin # Necessary for arcsin +import copy +import torch.nn as nn +import numpy as np + +from scipy.optimize import curve_fit +from typing import Dict, Any, Tuple, List, Callable + + +def quantization(x, **params): + return (torch.div(1, replace_num(params['_s'], num=0, to=10000)) * torch.atanh(domain_guard((params['_0'] * x), min=-0.9999, max=0.9999, nan=0))) + + +def dequantization(x, **params): + return (torch.div(1, replace_num(params['_0'], num=0, to=10000)) * torch.tanh((params['_s'] * x))) + + +def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]: + base_p0 = { + '_0': init_space_search(x, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], param='_0', **kwargs), + } + + base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs) + if 'post_init_hook' in kwargs: + kwargs['post_init_hook'](parameters=base_p0) + + params = init_non_linear_regression_fit(x, p0=base_p0, np_fit_func=fit_func, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], **kwargs) + params = {k: nn.Parameter(v, requires_grad=False) for k, v in params.items()} + if 'post_method_hook' in kwargs: + kwargs['post_method_hook'](parameters=params) + + params = learn_parameters(x, params, + qtz_func=quantization, + deqtz_func=dequantization, + bits=kwargs['bits'], + 
target_dtype=torch.int8, + epochs=500, + early_stop=False, + ) + if 'post_train_hook' in kwargs: + kwargs['post_train_hook'](parameters=params) + + return params + + +############### Numpy Qtz ############### + + +def np_quantization(x, _0, _s): + return (np.divide(1, np_replace_num(_s, num=0, to=10000)) * np.arctanh(np_domain_guard((_0 * x), min=-0.9999, max=0.9999, nan=0))) + + +def np_dequantization(x, _0, _s): + return (np.divide(1, np_replace_num(_0, num=0, to=10000)) * np.tanh((_s * x))) + + +def fit_func(x, _0, _s): + x_ = np_quantization(x, _0, _s) + x_ = np_dequantization(x_, _0, _s) + return x_ + + + +############### HELPERS ############### + +def domain_guard( + x: torch.Tensor, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> torch.Tensor: + """Guard a tensor to a valid domain.""" + x = torch.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = torch.clamp(x, min=min, max=max) + return x + + +def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor: + """Replace a number in a tensor with another number. + + Args: + x (torch.Tensor): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + torch.Tensor: The tensor with the number replaced. 
+ """ + return torch.where(x == num, to, x) + + +def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor: + """Guard the power operation to a valid domain.""" + return torch.pow(x, exp) if exp >= 1 else torch.pow(torch.relu(x), exp) + + +def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.ones_like(val, dtype=torch.float32, device=x.device) + + +def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.randn_like(val, dtype=torch.float32, device=x.device) + + +def init_space_search( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int): + """Generates the initial set of parameters. The first iteration generates 10 times more parameters.""" + for _ in range(n_params * 10): # The first iteration generates 10 times more parameters + yield init_rand(tensor) * max_initial # Generates n_params in range [-max_initial, max_initial] + + def _search_param(tensors: List[torch.tensor], n_params): + """Takes the best parameters and generates new parameters around the mean of the best parameters.""" + torch_tensors = torch.stack(tensors) + min_vals, max_vals = torch.aminmax(torch_tensors, dim=0) + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + mean = torch.mean(torch_tensors, dim=0) + for _ in range(n_params): # Generates n_params around the mean of the tensors + yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean + + def _calc(x, qtz_func, deqtz_func, **params): + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params) + x_ = deqtz_func(x=x_, **params) + x_ = x_.transpose(0, 1) + return x_ + + assert "qtz_func" in kwargs, "qtz_func must be provided." + assert "deqtz_func" in kwargs, "deqtz_func must be provided." + assert "params_list" in kwargs, "params list must be provided." + assert "param" in kwargs, "param must be provided." 
+ + qtz_func = kwargs.get('qtz_func') + deqtz_func = kwargs.get('deqtz_func') + params_list = kwargs.get('params_list') + param = kwargs.get('param') + + n_runs = 50 # Number of runs to try to find the best parameters + n_random_params = 50 # Number of random parameters to generate + n_best_to_pick = 5 # Number of best parameters to pick after each run + max_initial = 10000 # Maximum value to initialize the parameters + + # Initializes the parameters + base_params = { p: init_ones(x, **kwargs) for p in params_list if p != param } + params = _build_initial_param(x, max_initial, n_random_params) + + # Performs the search + for _ in range(n_runs): + + best_params = [] + for param_ in params: + try: + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: param_}) + loss_ones = nn.MSELoss()(x, x_) + + if len(best_params) < n_best_to_pick: + best_params.append((param_, loss_ones.item())) + best_params = sorted(best_params, key=lambda x: x[1]) + elif loss_ones < best_params[-1][1]: + best_params[-1] = (param_, loss_ones.item()) + best_params = sorted(best_params, key=lambda x: x[1]) + + except Exception: # The parameters might not be valid for the function's domain + continue + + # Generates new parameters around the mean + params = _search_param([p for p, _ in best_params], n_random_params) + + # Checks if the best parameter is better than the init_ones + p_ones = init_ones(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_ones}) + loss_ones = nn.MSELoss()(x, x_) + + # Checks if the best parameter is better than the init_rand + p_rand = init_rand(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_rand}) + loss_rand = nn.MSELoss()(x, x_) + + if loss_rand < best_params[0][1] and loss_rand < loss_ones: + return p_rand + elif loss_ones < best_params[0][1] and loss_ones < loss_rand: + return p_ones + else: + return best_params[0][0] + + +def init_linear_scale( # Symmetric scale. 
From the study folder + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + assert "bits" in kwargs, "bits must be provided." + assert "params" in kwargs, "params must be provided." + assert "qtz_func" in kwargs, "qtz_func must be provided." + + bits = kwargs.get('bits') + params = kwargs.get('params') + qtz_func = kwargs.get('qtz_func') + + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params, _s=init_ones(x, **kwargs)) + x_ = x_.transpose(0, 1) + + quant_min, quant_max = get_min_max_from_bits_signed(bits) + min_vals, max_vals = torch.aminmax(x_, dim=1) + min_vals = torch.min(min_vals, torch.zeros_like(min_vals)) + max_vals = torch.max(max_vals, torch.zeros_like(max_vals)) + + eps = torch.finfo(torch.float32).eps + + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2) + + scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device) + + # Introduces some noise in scale + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything. + # NOTE(diogo): This has been disproven. The noise does not help the learning process but I still + # left it here for future reference. Will be removed later. + # scale = scale + 0.01 * torch.randn_like(scale) + + return scale + + +def init_non_linear_regression_fit( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + assert "params_list" in kwargs, "params list must be provided." + assert "np_fit_func" in kwargs, "np_fit_func must be provided." + assert "p0" in kwargs, "p0 must be provided." + np_fit_func = kwargs.get('np_fit_func') + params_list = kwargs.get('params_list') + p0 = kwargs.get('p0') + + def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]): + popt, _ = curve_fit( + func, + xdata, + ydata, + maxfev=1000, + p0=p0, + method='lm' + ) + return popt + + # 1. 
Needs to convert the torch tensor to numpy tensor + xdata = x.cpu().numpy() + + # 2. Sorts the data so that it makes it easier to fit to it + sorted_xdata = np.sort(xdata, axis=-1) + + p0 = {k: v.cpu().numpy() for k, v in p0.items()} + params_list = sorted(params_list) # We need to make sure that it matches the numpy fit func arg order + + # 3. Finds the best parameters for each channel + try: + params = [] + for i in range(sorted_xdata.shape[0]): + xdata_ = sorted_xdata[i] + p0_ = [p0[p][i] for p in params_list] + ch_params = _fit(xdata_, xdata_, np_fit_func, p0_) + params.append(ch_params) + + # 4. Builds the parameters + result = {} + for i, p in enumerate(params_list): + result[p] = torch.tensor([p_[i] for p_ in params], dtype=torch.float32).to(x.device) + + return result + + except ValueError as e: + print(f"Could not fit the function with error: {e}") + print(f"Using fallback result...") + return { + k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items() + } + + +def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.zeros_like(val, dtype=torch.float32, device=x.device) + + +def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor: + # Calculate the original minimum and maximum values + min_vals, max_vals = torch.aminmax(tensor, dim=-1) + x_min = torch.min(min_vals, torch.zeros_like(min_vals)) + x_max = torch.max(max_vals, torch.zeros_like(max_vals)) + + if _max is torch.inf: # We do not need to scale the tensor. 
Just need to move it + return torch.ones_like(x_min) + + # Calculate the scale factor + scale = (_max - _min) / (x_max - x_min) + return scale + + + +############## Quant ############### + +@torch.enable_grad() +def learn_parameters( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + qtz_func: nn.Module, + deqtz_func: nn.Module, + bits: int, + target_dtype: torch.dtype, + epochs: int = 1000, + early_stop: bool = True, + do_report: bool = False +) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]: + loss_fn = nn.MSELoss() + + # Determines the initial learning rate by computing the initial loss and multiplying it by + # the order of magnitude of the loss divided by 2 + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + base_lr = 0.1 + exponent = int(np.floor(np.log10(loss.item()))) + lr = base_lr * (10 ** (exponent // 2)) + + # Requires gradients in the parameters + for p in params.values(): + p.requires_grad = True + p.grad = None + + param_keys = list(params.keys()) + param_values = list(params.values()) + + # Defines optimizer and loss function + optimizer = torch.optim.Adam(param_values, lr=lr) + scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=epochs // 10, T_mult=1, eta_min=lr * 0.1, last_epoch=-1) + + # Contains the best loss and the best parameters + best_loss = float("inf") + best_params = None + + # Used to stop the search early + min_delta = 1e-7 + acc_loss = [] + percent_epochs_before_stop = 0.1 + + for i in range(epochs): + optimizer.zero_grad() + + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + if loss.isnan() or loss.isinf(): + raise Exception("Loss is NaN or Inf. 
Stopping the search.") + + loss.backward() + optimizer.step() + scheduler.step() + + acc_loss.append(loss.item()) + + # Reports loss every 10 steps + if i % 10 == 0 and do_report: + print(f"Epoch {i}: Loss {loss.item()}") + + # Optimizes the parameter search by storing the best loss and the parameters + if loss.item() < best_loss: + best_loss = loss.item() + best_params = copy.deepcopy({ + k: v for k, v in params.items() if k in param_keys + }) + + # We also stop the search if the loss has not considerably during the last 10% epochs + if early_stop: + epochs_before_stop = int(epochs * percent_epochs_before_stop) + if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta: + break + + # No longer requires gradients in the parameters + for p in best_params.values(): + p.requires_grad = False + p.grad = None + + if do_report: + print(f"Best loss: {best_loss}") + return best_params, acc_loss + else: + return best_params + + +def quantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + target_dtype: torch.dtype = torch.int8 +) -> torch.Tensor: + quant_min, quant_max = get_min_max_from_bits_signed(bits) + x = x.transpose(0, 1) # Aligns shapes + x = func(x=x, **params) + x = x.transpose(0, 1) + x = torch.clamp(round_func_BPDA(x), quant_min, quant_max).to(target_dtype) + return x + + +def dequantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + out_dtype: torch.dtype +) -> torch.Tensor: + x = x.to(dtype=out_dtype) + x = x.transpose(0, 1) + x = func(x=x, **params) + x = x.transpose(0, 1) + return x + + +def round_func_BPDA(input): + # This is equivalent to replacing round function (non-differentiable) with + # an identity function (differentiable) only when backward. 
+ forward_value = torch.round(input) + out = input.clone() + out.data = forward_value.data + return out + + +def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]: + return -2 ** (bit_width - 1), 2 ** (bit_width - 1) - 1 + + + +############## Numpy ############### + +def np_domain_guard( + x: np.ndarray, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> np.ndarray: + """Guard a tensor to a valid domain.""" + x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = np.clip(x, min, max) + return x + + +def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray: + """Replace a number in a tensor with another number. + + Args: + x (np.ndarray): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + np.ndarray: The tensor with the number replaced. + """ + return np.where(x == num, to, x) + + +def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray: + """Guard the power operation to a valid domain.""" + return np.power(x, exp) if exp >= 1 else np.power(np.maximum(x, 0), exp) + diff --git a/fn_gen/nlr_t_cos/11/loss.png b/fn_gen/nlr_t_cos/11/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..37ecf57bffa8d83cba4874f0435c1b7be03cc2e6 Binary files /dev/null and b/fn_gen/nlr_t_cos/11/loss.png differ diff --git a/fn_gen/nlr_t_cos/11/quantization.png b/fn_gen/nlr_t_cos/11/quantization.png new file mode 100644 index 0000000000000000000000000000000000000000..b10e6e3132d98e22c4a880249b2ae9cf772c97a2 Binary files /dev/null and b/fn_gen/nlr_t_cos/11/quantization.png differ diff --git a/fn_gen/nlr_t_cos/13/distortion.png b/fn_gen/nlr_t_cos/13/distortion.png new file mode 100644 index 0000000000000000000000000000000000000000..5d9b24fc7fa3a342286b217ea711c2b8d756db41 Binary files /dev/null and b/fn_gen/nlr_t_cos/13/distortion.png differ diff --git 
a/fn_gen/nlr_t_cos/13/expressions.txt b/fn_gen/nlr_t_cos/13/expressions.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec55493201f7b2b8effaefed75e0a9258fc25c56 --- /dev/null +++ b/fn_gen/nlr_t_cos/13/expressions.txt @@ -0,0 +1,2 @@ +tanh(_0*x)/_s +log((-_s*x - 1)/(_s*x - 1))/_0 \ No newline at end of file diff --git a/fn_gen/nlr_t_cos/13/fn.py b/fn_gen/nlr_t_cos/13/fn.py new file mode 100644 index 0000000000000000000000000000000000000000..967e091fde137a298b6e79ea13cf9bc5ca1a661b --- /dev/null +++ b/fn_gen/nlr_t_cos/13/fn.py @@ -0,0 +1,481 @@ +from __future__ import annotations + +import torch +from torch import amin # Necessary for arcsin +import copy +import torch.nn as nn +import numpy as np + +from scipy.optimize import curve_fit +from typing import Dict, Any, Tuple, List, Callable + + +def quantization(x, **params): + return (torch.div(1, replace_num(params['_s'], num=0, to=10000)) * torch.tanh((params['_0'] * x))) + + +def dequantization(x, **params): + return (torch.div(1, replace_num(params['_0'], num=0, to=10000)) * torch.log(domain_guard((torch.div(1, replace_num((torch.tensor(-1) + (params['_s'] * x)), num=0, to=10000)) * (torch.tensor(-1) + (torch.tensor(-1) * params['_s'] * x))), min=1e-5, nan=1e-5))) + + +def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]: + base_p0 = { + '_0': init_space_search(x, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], param='_0', **kwargs), + } + + base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs) + if 'post_init_hook' in kwargs: + kwargs['post_init_hook'](parameters=base_p0) + + params = init_non_linear_regression_fit(x, p0=base_p0, np_fit_func=fit_func, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], **kwargs) + params = {k: nn.Parameter(v, requires_grad=False) for k, v in params.items()} + if 'post_method_hook' in kwargs: + kwargs['post_method_hook'](parameters=params) + 
+ params = learn_parameters(x, params, + qtz_func=quantization, + deqtz_func=dequantization, + bits=kwargs['bits'], + target_dtype=torch.int8, + epochs=500, + early_stop=False, + ) + if 'post_train_hook' in kwargs: + kwargs['post_train_hook'](parameters=params) + + return params + + +############### Numpy Qtz ############### + + +def np_quantization(x, _0, _s): + return (np.divide(1, np_replace_num(_s, num=0, to=10000)) * np.tanh((_0 * x))) + + +def np_dequantization(x, _0, _s): + return (np.divide(1, np_replace_num(_0, num=0, to=10000)) * np.log(np_domain_guard((np.divide(1, np_replace_num((np.array(-1) + (_s * x)), num=0, to=10000)) * (np.array(-1) + (np.array(-1) * _s * x))), min=1e-5, nan=1e-5))) + + +def fit_func(x, _0, _s): + x_ = np_quantization(x, _0, _s) + x_ = np_dequantization(x_, _0, _s) + return x_ + + + +############### HELPERS ############### + +def domain_guard( + x: torch.Tensor, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> torch.Tensor: + """Guard a tensor to a valid domain.""" + x = torch.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = torch.clamp(x, min=min, max=max) + return x + + +def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor: + """Replace a number in a tensor with another number. + + Args: + x (torch.Tensor): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + torch.Tensor: The tensor with the number replaced. 
+ """ + return torch.where(x == num, to, x) + + +def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor: + """Guard the power operation to a valid domain.""" + return torch.pow(x, exp) if exp >= 1 else torch.pow(torch.relu(x), exp) + + +def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.ones_like(val, dtype=torch.float32, device=x.device) + + +def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.randn_like(val, dtype=torch.float32, device=x.device) + + +def init_space_search( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int): + """Generates the initial set of parameters. The first iteration generates 10 times more parameters.""" + for _ in range(n_params * 10): # The first iteration generates 10 times more parameters + yield init_rand(tensor) * max_initial # Generates n_params in range [-max_initial, max_initial] + + def _search_param(tensors: List[torch.tensor], n_params): + """Takes the best parameters and generates new parameters around the mean of the best parameters.""" + torch_tensors = torch.stack(tensors) + min_vals, max_vals = torch.aminmax(torch_tensors, dim=0) + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + mean = torch.mean(torch_tensors, dim=0) + for _ in range(n_params): # Generates n_params around the mean of the tensors + yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean + + def _calc(x, qtz_func, deqtz_func, **params): + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params) + x_ = deqtz_func(x=x_, **params) + x_ = x_.transpose(0, 1) + return x_ + + assert "qtz_func" in kwargs, "qtz_func must be provided." + assert "deqtz_func" in kwargs, "deqtz_func must be provided." + assert "params_list" in kwargs, "params list must be provided." + assert "param" in kwargs, "param must be provided." 
+ + qtz_func = kwargs.get('qtz_func') + deqtz_func = kwargs.get('deqtz_func') + params_list = kwargs.get('params_list') + param = kwargs.get('param') + + n_runs = 50 # Number of runs to try to find the best parameters + n_random_params = 50 # Number of random parameters to generate + n_best_to_pick = 5 # Number of best parameters to pick after each run + max_initial = 10000 # Maximum value to initialize the parameters + + # Initializes the parameters + base_params = { p: init_ones(x, **kwargs) for p in params_list if p != param } + params = _build_initial_param(x, max_initial, n_random_params) + + # Performs the search + for _ in range(n_runs): + + best_params = [] + for param_ in params: + try: + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: param_}) + loss_ones = nn.MSELoss()(x, x_) + + if len(best_params) < n_best_to_pick: + best_params.append((param_, loss_ones.item())) + best_params = sorted(best_params, key=lambda x: x[1]) + elif loss_ones < best_params[-1][1]: + best_params[-1] = (param_, loss_ones.item()) + best_params = sorted(best_params, key=lambda x: x[1]) + + except Exception: # The parameters might not be valid for the function's domain + continue + + # Generates new parameters around the mean + params = _search_param([p for p, _ in best_params], n_random_params) + + # Checks if the best parameter is better than the init_ones + p_ones = init_ones(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_ones}) + loss_ones = nn.MSELoss()(x, x_) + + # Checks if the best parameter is better than the init_rand + p_rand = init_rand(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_rand}) + loss_rand = nn.MSELoss()(x, x_) + + if loss_rand < best_params[0][1] and loss_rand < loss_ones: + return p_rand + elif loss_ones < best_params[0][1] and loss_ones < loss_rand: + return p_ones + else: + return best_params[0][0] + + +def init_linear_scale( # Symmetric scale. 
From the study folder + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + assert "bits" in kwargs, "bits must be provided." + assert "params" in kwargs, "params must be provided." + assert "qtz_func" in kwargs, "qtz_func must be provided." + + bits = kwargs.get('bits') + params = kwargs.get('params') + qtz_func = kwargs.get('qtz_func') + + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params, _s=init_ones(x, **kwargs)) + x_ = x_.transpose(0, 1) + + quant_min, quant_max = get_min_max_from_bits_signed(bits) + min_vals, max_vals = torch.aminmax(x_, dim=1) + min_vals = torch.min(min_vals, torch.zeros_like(min_vals)) + max_vals = torch.max(max_vals, torch.zeros_like(max_vals)) + + eps = torch.finfo(torch.float32).eps + + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2) + + scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device) + + # Introduces some noise in scale + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything. + # NOTE(diogo): This has been disproven. The noise does not help the learning process but I still + # left it here for future reference. Will be removed later. + # scale = scale + 0.01 * torch.randn_like(scale) + + return scale + + +def init_non_linear_regression_fit( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + assert "params_list" in kwargs, "params list must be provided." + assert "np_fit_func" in kwargs, "np_fit_func must be provided." + assert "p0" in kwargs, "p0 must be provided." + np_fit_func = kwargs.get('np_fit_func') + params_list = kwargs.get('params_list') + p0 = kwargs.get('p0') + + def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]): + popt, _ = curve_fit( + func, + xdata, + ydata, + maxfev=1000, + p0=p0, + method='lm' + ) + return popt + + # 1. 
Needs to convert the torch tensor to numpy tensor + xdata = x.cpu().numpy() + + # 2. Sorts the data so that it makes it easier to fit to it + sorted_xdata = np.sort(xdata, axis=-1) + + p0 = {k: v.cpu().numpy() for k, v in p0.items()} + params_list = sorted(params_list) # We need to make sure that it matches the numpy fit func arg order + + # 3. Finds the best parameters for each channel + try: + params = [] + for i in range(sorted_xdata.shape[0]): + xdata_ = sorted_xdata[i] + p0_ = [p0[p][i] for p in params_list] + ch_params = _fit(xdata_, xdata_, np_fit_func, p0_) + params.append(ch_params) + + # 4. Builds the parameters + result = {} + for i, p in enumerate(params_list): + result[p] = torch.tensor([p_[i] for p_ in params], dtype=torch.float32).to(x.device) + + return result + + except ValueError as e: + print(f"Could not fit the function with error: {e}") + print(f"Using fallback result...") + return { + k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items() + } + + +def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.zeros_like(val, dtype=torch.float32, device=x.device) + + +def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor: + # Calculate the original minimum and maximum values + min_vals, max_vals = torch.aminmax(tensor, dim=-1) + x_min = torch.min(min_vals, torch.zeros_like(min_vals)) + x_max = torch.max(max_vals, torch.zeros_like(max_vals)) + + if _max is torch.inf: # We do not need to scale the tensor. 
Just need to move it + return torch.ones_like(x_min) + + # Calculate the scale factor + scale = (_max - _min) / (x_max - x_min) + return scale + + + +############## Quant ############### + +@torch.enable_grad() +def learn_parameters( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + qtz_func: nn.Module, + deqtz_func: nn.Module, + bits: int, + target_dtype: torch.dtype, + epochs: int = 1000, + early_stop: bool = True, + do_report: bool = False +) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]: + loss_fn = nn.MSELoss() + + # Determines the initial learning rate by computing the initial loss and multiplying it by + # the order of magnitude of the loss divided by 2 + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + base_lr = 0.1 + exponent = int(np.floor(np.log10(loss.item()))) + lr = base_lr * (10 ** (exponent // 2)) + + # Requires gradients in the parameters + for p in params.values(): + p.requires_grad = True + p.grad = None + + param_keys = list(params.keys()) + param_values = list(params.values()) + + # Defines optimizer and loss function + optimizer = torch.optim.Adam(param_values, lr=lr) + scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=epochs // 10, T_mult=1, eta_min=lr * 0.1, last_epoch=-1) + + # Contains the best loss and the best parameters + best_loss = float("inf") + best_params = None + + # Used to stop the search early + min_delta = 1e-7 + acc_loss = [] + percent_epochs_before_stop = 0.1 + + for i in range(epochs): + optimizer.zero_grad() + + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + if loss.isnan() or loss.isinf(): + raise Exception("Loss is NaN or Inf. 
Stopping the search.") + + loss.backward() + optimizer.step() + scheduler.step() + + acc_loss.append(loss.item()) + + # Reports loss every 10 steps + if i % 10 == 0 and do_report: + print(f"Epoch {i}: Loss {loss.item()}") + + # Optimizes the parameter search by storing the best loss and the parameters + if loss.item() < best_loss: + best_loss = loss.item() + best_params = copy.deepcopy({ + k: v for k, v in params.items() if k in param_keys + }) + + # We also stop the search if the loss has not considerably during the last 10% epochs + if early_stop: + epochs_before_stop = int(epochs * percent_epochs_before_stop) + if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta: + break + + # No longer requires gradients in the parameters + for p in best_params.values(): + p.requires_grad = False + p.grad = None + + if do_report: + print(f"Best loss: {best_loss}") + return best_params, acc_loss + else: + return best_params + + +def quantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + target_dtype: torch.dtype = torch.int8 +) -> torch.Tensor: + quant_min, quant_max = get_min_max_from_bits_signed(bits) + x = x.transpose(0, 1) # Aligns shapes + x = func(x=x, **params) + x = x.transpose(0, 1) + x = torch.clamp(round_func_BPDA(x), quant_min, quant_max).to(target_dtype) + return x + + +def dequantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + out_dtype: torch.dtype +) -> torch.Tensor: + x = x.to(dtype=out_dtype) + x = x.transpose(0, 1) + x = func(x=x, **params) + x = x.transpose(0, 1) + return x + + +def round_func_BPDA(input): + # This is equivalent to replacing round function (non-differentiable) with + # an identity function (differentiable) only when backward. 
+ forward_value = torch.round(input) + out = input.clone() + out.data = forward_value.data + return out + + +def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]: + return -2 ** (bit_width - 1), 2 ** (bit_width - 1) - 1 + + + +############## Numpy ############### + +def np_domain_guard( + x: np.ndarray, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> np.ndarray: + """Guard a tensor to a valid domain.""" + x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = np.clip(x, min, max) + return x + + +def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray: + """Replace a number in a tensor with another number. + + Args: + x (np.ndarray): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + np.ndarray: The tensor with the number replaced. + """ + return np.where(x == num, to, x) + + +def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray: + """Guard the power operation to a valid domain.""" + return np.power(x, exp) if exp >= 1 else np.power(np.maximum(x, 0), exp) + diff --git a/fn_gen/nlr_t_cos/13/loss.png b/fn_gen/nlr_t_cos/13/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..ebeffcbd9b8b2a3cdfe61a2a5ae8aafc44b3e423 Binary files /dev/null and b/fn_gen/nlr_t_cos/13/loss.png differ diff --git a/fn_gen/nlr_t_cos/13/quantization.png b/fn_gen/nlr_t_cos/13/quantization.png new file mode 100644 index 0000000000000000000000000000000000000000..5e9234c79797412ed7bd4555a93623ae3b6f7f28 Binary files /dev/null and b/fn_gen/nlr_t_cos/13/quantization.png differ diff --git a/fn_gen/nlr_t_cos/14/distortion.png b/fn_gen/nlr_t_cos/14/distortion.png new file mode 100644 index 0000000000000000000000000000000000000000..af6d0222662ee6b7a15d587f68e70bf0b48de8cf Binary files /dev/null and b/fn_gen/nlr_t_cos/14/distortion.png differ diff --git 
def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]:
    """Initialise and train the parameters ``_0`` and ``_s`` for ``x``.

    The pipeline runs in three stages: a random space search for ``_0`` followed
    by a linear scale for ``_s``, a per-row ``curve_fit`` refinement, and finally
    gradient-based training with ``learn_parameters``.

    Args:
        x: Tensor the parameters are fitted to.
        **kwargs: Must contain ``bits``. May contain the callables
            ``post_init_hook``, ``post_method_hook`` and ``post_train_hook``;
            each is called with ``parameters=<current dict>`` after its stage.

    Returns:
        The parameter dict returned by ``learn_parameters``.
    """
    # Stage 1a: search `_0` (the other parameters are held at ones inside the search).
    base_p0 = {
        '_0': init_space_search(x, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], param='_0', **kwargs),
    }

    # Stage 1b: derive `_s` from the quantized range obtained with the `_0` found above.
    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    if 'post_init_hook' in kwargs:
        kwargs['post_init_hook'](parameters=base_p0)

    # Stage 2: refine both parameters with a non-linear least-squares fit.
    params = init_non_linear_regression_fit(x, p0=base_p0, np_fit_func=fit_func, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], **kwargs)
    params = {k: nn.Parameter(v, requires_grad=False) for k, v in params.items()}
    if 'post_method_hook' in kwargs:
        kwargs['post_method_hook'](parameters=params)

    # Stage 3: gradient-based training.
    # NOTE(review): target_dtype is fixed to int8 regardless of kwargs['bits'] --
    # confirm bits never exceeds 8 here.
    params = learn_parameters(x, params,
        qtz_func=quantization,
        deqtz_func=dequantization,
        bits=kwargs['bits'],
        target_dtype=torch.int8,
        epochs=500,
        early_stop=False,
    )
    if 'post_train_hook' in kwargs:
        kwargs['post_train_hook'](parameters=params)

    return params
def init_space_search(
        x: torch.Tensor,
        **kwargs: Dict[str, Any],
    ) -> torch.Tensor:
    """Randomly search for a good initial value of one parameter.

    Candidates for ``param`` are scored by the MSE between ``x`` and
    ``deqtz_func(qtz_func(x))`` while every other parameter in ``params_list``
    is held at ones. Each run keeps the ``n_best_to_pick`` best candidates and
    samples the next generation around their mean.

    Args:
        x: Tensor the parameter is fitted to.
        **kwargs: Must contain ``qtz_func``, ``deqtz_func``, ``params_list``
            and ``param`` (the name of the parameter being searched).

    Returns:
        The candidate with the lowest loss among the best one seen in any run,
        ``init_ones`` and one ``init_rand`` draw. Ties favour the searched
        candidate, then ``init_ones``.
    """

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int):
        """Yield the first generation, which is 10 times larger than later ones."""
        for _ in range(n_params * 10):
            # Standard-normal draws scaled by max_initial.
            yield init_rand(tensor) * max_initial

    def _search_param(tensors: List[torch.tensor], n_params):
        """Yield new candidates sampled around the mean of the best ones."""
        torch_tensors = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(torch_tensors, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(torch_tensors, dim=0)
        for _ in range(n_params):
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        x_ = x_.transpose(0, 1)
        return x_

    def _loss_of(candidate: torch.Tensor) -> float:
        """Return the round-trip MSE for ``candidate``; ``inf`` if it is unusable."""
        try:
            x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: candidate})
            loss = nn.MSELoss()(x, x_).item()
        except Exception:  # The parameters might not be valid for the function's domain
            return float("inf")
        # NaN compares false with everything and would poison the ranking.
        return loss if np.isfinite(loss) else float("inf")

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50  # Number of runs to try to find the best parameters
    n_random_params = 50  # Number of random parameters to generate
    n_best_to_pick = 5  # Number of best parameters to pick after each run
    max_initial = 10000  # Scale of the first generation

    # Initializes the parameters
    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}
    params = _build_initial_param(x, max_initial, n_random_params)

    overall_best = None  # (candidate, loss) with the lowest loss over all runs

    # Performs the search
    for _ in range(n_runs):
        best_params = []
        for param_ in params:
            loss = _loss_of(param_)
            if loss == float("inf"):
                continue

            if len(best_params) < n_best_to_pick:
                best_params.append((param_, loss))
            elif loss < best_params[-1][1]:
                best_params[-1] = (param_, loss)
            else:
                continue
            best_params.sort(key=lambda entry: entry[1])

        if not best_params:
            # No valid candidate in this generation: nothing to resample around.
            break

        # best_params is rebuilt every run, so remember the global winner.
        if overall_best is None or best_params[0][1] < overall_best[1]:
            overall_best = best_params[0]

        # Generates new parameters around the mean
        params = _search_param([p for p, _ in best_params], n_random_params)

    # Compares the searched candidate against the trivial initialisations.
    p_ones = init_ones(x, **kwargs)
    loss_ones = _loss_of(p_ones)
    p_rand = init_rand(x, **kwargs)
    loss_rand = _loss_of(p_rand)

    finalists = [] if overall_best is None else [overall_best]
    finalists.append((p_ones, loss_ones))
    finalists.append((p_rand, loss_rand))
    # min() returns the first minimum, which gives the tie order in the docstring.
    return min(finalists, key=lambda entry: entry[1])[0]
def init_non_linear_regression_fit(
        x: torch.Tensor,
        **kwargs: Dict[str, Any],
    ) -> Dict[str, torch.Tensor]:
    """Refine the initial parameters with a per-row non-linear least-squares fit.

    Each row of ``x`` is sorted and ``np_fit_func`` is fitted so that it maps
    the row onto itself, starting from that row's entries of ``p0``.

    Args:
        x: Tensor to fit; rows are indexed along dimension 0.
        **kwargs: Must contain ``params_list`` (parameter names), ``np_fit_func``
            (numpy callable ``f(x, *params)``) and ``p0`` (dict mapping each
            name to a tensor with one start value per row). Other keys are
            ignored.

    Returns:
        Dict mapping each parameter name to a float32 tensor on ``x.device``.
        If the fit fails for any row, the unmodified ``p0`` values are
        returned for all rows.
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs.get('np_fit_func')
    params_list = kwargs.get('params_list')
    p0 = kwargs.get('p0')

    def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]):
        popt, _ = curve_fit(
            func,
            xdata,
            ydata,
            maxfev=1000,
            p0=p0,
            method='lm'
        )
        return popt

    # 1. Needs to convert the torch tensor to numpy
    xdata = x.detach().cpu().numpy()

    # 2. Sorts the data so that it makes it easier to fit to it
    sorted_xdata = np.sort(xdata, axis=-1)

    p0 = {k: v.cpu().numpy() for k, v in p0.items()}
    # We need to make sure that it matches the numpy fit func arg order
    params_list = sorted(params_list)

    # 3. Finds the best parameters for each row
    try:
        params = [
            _fit(row, row, np_fit_func, [p0[p][i] for p in params_list])
            for i, row in enumerate(sorted_xdata)
        ]
    except (ValueError, RuntimeError) as e:
        # curve_fit raises ValueError for invalid data and RuntimeError when the
        # least-squares minimisation fails; both mean "keep the initial guess".
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }

    # 4. Builds the parameters
    return {
        p: torch.tensor([row_params[i] for row_params in params], dtype=torch.float32).to(x.device)
        for i, p in enumerate(params_list)
    }
@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: nn.Module,
    deqtz_func: nn.Module,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]:
    """Train ``params`` to minimise the quantize/dequantize round-trip MSE.

    Uses Adam with a cosine-annealing warm-restart schedule. The parameters
    that produced the lowest loss are returned, not the final ones.

    Args:
        x: Tensor to reconstruct.
        params: Parameters to train; they are switched to ``requires_grad=True``
            and updated in place.
        qtz_func: Quantization callable, passed to ``quantize``.
        deqtz_func: Dequantization callable, passed to ``dequantize``.
        bits: Bit width of the integer grid.
        target_dtype: Integer dtype of the quantized tensor.
        epochs: Maximum number of optimisation steps.
        early_stop: Stop when the loss changed by less than ``1e-7`` over the
            last 10% of ``epochs``.
        do_report: Print progress and additionally return the loss history.

    Returns:
        The best parameters (``requires_grad=False``). When ``do_report`` is
        true, a tuple ``(best_params, losses)`` where ``losses`` is the list of
        per-epoch loss values.

    Raises:
        Exception: If the loss becomes NaN or infinite.
    """
    loss_fn = nn.MSELoss()

    # The initial learning rate follows the order of magnitude of the initial loss.
    quant = quantize(x, params, qtz_func, bits, target_dtype)
    dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
    initial_loss = loss_fn(x, dequant).item()

    base_lr = 0.1
    if np.isfinite(initial_loss) and initial_loss > 0:
        exponent = int(np.floor(np.log10(initial_loss)))
    else:
        # log10 is undefined for a zero or non-finite loss; fall back to base_lr.
        # A non-finite loss is reported by the check inside the loop below.
        exponent = 0
    lr = base_lr * (10 ** (exponent // 2))

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    optimizer = torch.optim.Adam(list(params.values()), lr=lr)
    # T_0 must be a positive integer, which epochs // 10 is not for epochs < 10.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=max(1, epochs // 10), T_mult=1, eta_min=lr * 0.1, last_epoch=-1
    )

    # Contains the best loss and the best parameters
    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    percent_epochs_before_stop = 0.1
    # At least one epoch, otherwise the loss would be compared with itself.
    epochs_before_stop = max(1, int(epochs * percent_epochs_before_stop))

    for i in range(epochs):
        optimizer.zero_grad()

        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        loss = loss_fn(x, dequant)

        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")

        loss_value = loss.item()

        # Snapshot BEFORE the optimizer step: this loss belongs to the current
        # parameter values, not to the ones produced by the update below.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = copy.deepcopy(dict(params))

        loss.backward()
        optimizer.step()
        scheduler.step()

        acc_loss.append(loss_value)

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {loss_value}")

        # Stops if the loss has not changed considerably during the last 10% of epochs
        if early_stop and i > epochs_before_stop \
                and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta:
            break

    if best_params is None:
        # No epoch ran (epochs <= 0): return the parameters as given.
        best_params = copy.deepcopy(dict(params))

    # No longer requires gradients in the parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        print(f"Best loss: {best_loss}")
        return best_params, acc_loss
    else:
        return best_params
def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Raise ``x`` to ``exp`` without leaving the real domain.

    A real power of a negative base is only defined for integral exponents, so
    the base is clamped to ``>= 0`` exactly when the exponent is fractional.
    Integral exponents, including negative ones, see the unmodified base.

    Args:
        x: Base array or scalar.
        exp: Exponent; a Python number or an array that broadcasts with ``x``.

    Returns:
        ``x ** exp`` with negative bases replaced by 0 wherever ``exp`` is
        fractional.
    """
    is_integral = np.equal(np.mod(np.asarray(exp), 1), 0)
    base = np.where(is_integral, x, np.maximum(x, 0))
    return np.power(base, exp)
def dequantization(x, **params):
    """Map quantized values back by multiplying them with the scale ``_s``."""
    scale = params['_s']
    return scale * x
def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Raise ``x`` to ``exp`` without leaving the real domain.

    A real power of a negative base is only defined for integral exponents, so
    the base goes through ``relu`` exactly when the exponent is fractional.
    Integral exponents, including negative ones, see the unmodified base.

    Args:
        x: Base tensor.
        exp: Scalar exponent; a Python number or a one-element tensor.

    Returns:
        ``x ** exp`` with negative bases replaced by 0 when ``exp`` is
        fractional.
    """
    # float() accepts Python numbers and one-element tensors alike.
    if float(exp).is_integer():
        return torch.pow(x, exp)
    return torch.pow(torch.relu(x), exp)
The first iteration generates 10 times more parameters.""" + for _ in range(n_params * 10): # The first iteration generates 10 times more parameters + yield init_rand(tensor) * max_initial # Generates n_params in range [-max_initial, max_initial] + + def _search_param(tensors: List[torch.tensor], n_params): + """Takes the best parameters and generates new parameters around the mean of the best parameters.""" + torch_tensors = torch.stack(tensors) + min_vals, max_vals = torch.aminmax(torch_tensors, dim=0) + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + mean = torch.mean(torch_tensors, dim=0) + for _ in range(n_params): # Generates n_params around the mean of the tensors + yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean + + def _calc(x, qtz_func, deqtz_func, **params): + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params) + x_ = deqtz_func(x=x_, **params) + x_ = x_.transpose(0, 1) + return x_ + + assert "qtz_func" in kwargs, "qtz_func must be provided." + assert "deqtz_func" in kwargs, "deqtz_func must be provided." + assert "params_list" in kwargs, "params list must be provided." + assert "param" in kwargs, "param must be provided." 
def init_linear_scale( # Symmetric scale. From the study folder
        x: torch.Tensor,
        **kwargs: Dict[str, Any],
    ) -> torch.Tensor:
    """Compute a symmetric linear scale from the range of ``qtz_func(x)``.

    ``qtz_func`` is evaluated with ``_s`` set to ones; the largest absolute
    value along dimension 1 of the result is divided by half the width of the
    signed integer range for ``bits``.

    Args:
        x: Input tensor.
        **kwargs: Must contain ``bits``, ``params`` (the other parameters
            forwarded to ``qtz_func``) and ``qtz_func``.

    Returns:
        float32 tensor of scales, clamped from below to the float32 machine
        epsilon.
    """
    assert "bits" in kwargs, "bits must be provided."
    assert "params" in kwargs, "params must be provided."
    assert "qtz_func" in kwargs, "qtz_func must be provided."

    bits = kwargs.get('bits')
    params = kwargs.get('params')
    qtz_func = kwargs.get('qtz_func')

    # Evaluates the quantization expression with a neutral scale (_s = 1).
    # `params` must not already contain `_s`, or the call would receive it twice.
    x_ = x.transpose(0, 1)
    x_ = qtz_func(x=x_, **params, _s=init_ones(x, **kwargs))
    x_ = x_.transpose(0, 1)

    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    min_vals, max_vals = torch.aminmax(x_, dim=1)
    # Forces the range to include zero.
    min_vals = torch.min(min_vals, torch.zeros_like(min_vals))
    max_vals = torch.max(max_vals, torch.zeros_like(max_vals))

    eps = torch.finfo(torch.float32).eps

    abs_max_val_per_ch = torch.max(-min_vals, max_vals)
    scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2)

    # Clamps to eps so the scale is never zero.
    scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device)

    # NOTE(diogo): Adding noise to the scale was once believed necessary for
    # learning. This has been disproven; the line is kept for future reference
    # and will be removed later.
    # scale = scale + 0.01 * torch.randn_like(scale)

    return scale
def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Compute the factor that rescales ``tensor`` to the range ``[_min, _max]``.

    The data range is taken along the last dimension and always includes zero.

    Args:
        tensor: Input tensor.
        _min: Lower end of the target range.
        _max: Upper end of the target range; positive infinity (the default)
            means "no rescaling".

    Returns:
        A tensor of ones when ``_max`` is infinite, otherwise
        ``(_max - _min) / (x_max - x_min)`` for each slice along the last
        dimension.
    """
    # Calculate the original minimum and maximum values
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    # Compare by value: an identity test (`is torch.inf`) only matches that one
    # float object and misses float("inf") or np.inf.
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    # Calculate the scale factor
    scale = (_max - _min) / (x_max - x_min)
    return scale
def dequantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    out_dtype: torch.dtype
) -> torch.Tensor:
    """Cast ``x`` to ``out_dtype`` and apply the dequantization callable.

    Dimensions 0 and 1 are swapped around the call to ``func``, mirroring
    ``quantize``. ``bits`` is accepted for signature symmetry and is unused.
    """
    swapped = x.to(dtype=out_dtype).transpose(0, 1)
    restored = func(x=swapped, **params)
    return restored.transpose(0, 1)
def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    """Return the ``(min, max)`` values of a signed integer with ``bit_width`` bits."""
    half_range = 2 ** (bit_width - 1)
    lowest = -half_range
    highest = half_range - 1
    return lowest, highest
def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]:
    """Initialise and then train the ``_0`` / ``_s`` parameters for ``x``.

    Stages, in order: a random space search for ``_0``, a linear scale for
    ``_s``, a non-linear regression fit of both, and a 500-epoch gradient
    refinement without early stopping. ``kwargs`` must contain ``bits`` and
    may contain the callbacks ``post_init_hook``, ``post_method_hook`` and
    ``post_train_hook``; each one is invoked as ``hook(parameters=...)``
    right after its stage.
    """

    def _fire(hook_name, payload):
        # Hooks are optional: only a key that is present in kwargs is called.
        if hook_name in kwargs:
            kwargs[hook_name](parameters=payload)

    # Stage 1: starting point. `_s` is derived from a dict that holds only `_0`.
    base_p0 = {}
    base_p0['_0'] = init_space_search(
        x,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_s'],
        param='_0',
        **kwargs,
    )
    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    _fire('post_init_hook', base_p0)

    # Stage 2: regression fit, then wrap the tensors as frozen parameters.
    fitted = init_non_linear_regression_fit(
        x,
        p0=base_p0,
        np_fit_func=fit_func,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_s'],
        **kwargs,
    )
    params = {name: nn.Parameter(value, requires_grad=False) for name, value in fitted.items()}
    _fire('post_method_hook', params)

    # Stage 3: gradient-based refinement.
    params = learn_parameters(
        x,
        params,
        qtz_func=quantization,
        deqtz_func=dequantization,
        bits=kwargs['bits'],
        target_dtype=torch.int8,
        epochs=500,
        early_stop=False,
    )
    _fire('post_train_hook', params)

    return params
def init_space_search(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Randomly search for a good initial value of one parameter.

    Required keyword arguments:
        qtz_func: Quantization callable, invoked as ``qtz_func(x=..., **params)``.
        deqtz_func: Dequantization callable, invoked the same way.
        params_list: Names of every parameter the two callables expect.
        param: Name of the parameter being searched. Every other name in
            ``params_list`` is held at ``init_ones(x)``.

    Candidates are scored by the MSE between ``x`` and
    ``deqtz_func(qtz_func(x))``. The first run draws ``10 * n_random_params``
    candidates from ``init_rand(x) * max_initial``; each later run draws
    ``n_random_params`` candidates around the mean of the best candidates
    found so far. The best candidates are carried across runs, so the result
    can never get worse from one run to the next.

    Returns:
        The lowest-loss tensor among the search winner, ``init_ones(x)`` and
        one fresh ``init_rand(x)`` draw. If no candidate produced a finite
        loss, the choice is made between the latter two only.
    """

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int):
        """Yield the first batch of candidates, 10 times larger than later ones."""
        for _ in range(n_params * 10):
            # Standard-normal draw scaled by max_initial: the spread is
            # max_initial, the values are not bounded by it.
            yield init_rand(tensor) * max_initial

    def _search_param(tensors: List[torch.Tensor], n_params: int):
        """Yield candidates drawn around the mean of ``tensors``."""
        stacked = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(stacked, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(stacked, dim=0)
        for _ in range(n_params):
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        """Round-trip ``x`` through quantization and dequantization."""
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        x_ = x_.transpose(0, 1)
        return x_

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50            # Number of refinement runs
    n_random_params = 50   # Candidates generated per run (10x in the first run)
    n_best_to_pick = 5     # Size of the elite set the next run samples around
    max_initial = 10000    # Spread of the very first candidates

    loss_fn = nn.MSELoss()
    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}

    def _score(candidate: torch.Tensor) -> torch.Tensor:
        """Return the reconstruction loss obtained with ``candidate``."""
        return loss_fn(x, _calc(x, qtz_func, deqtz_func, **base_params, **{param: candidate}))

    candidates = _build_initial_param(x, max_initial, n_random_params)

    # Elite set, sorted by ascending loss. It is kept across runs; resetting
    # it per run would throw away the best candidates found earlier.
    best_params: List[Tuple[torch.Tensor, float]] = []

    for _ in range(n_runs):
        for candidate in candidates:
            try:
                loss = _score(candidate)
            except Exception:
                # Deliberate best-effort: the candidate may fall outside the
                # domain of the quantization functions.
                continue

            # A NaN/Inf loss cannot be ranked and would poison the sort.
            if loss.isnan() or loss.isinf():
                continue

            entry = (candidate, loss.item())
            if len(best_params) < n_best_to_pick:
                best_params.append(entry)
            elif entry[1] < best_params[-1][1]:
                best_params[-1] = entry  # Evict the current worst elite
            else:
                continue
            best_params.sort(key=lambda item: item[1])

        if not best_params:
            # Nothing usable to sample around; fall through to the baselines.
            break

        # Lazily generates the next batch around the mean of the elites.
        candidates = _search_param([p for p, _ in best_params], n_random_params)

    # Baselines the search result has to beat.
    p_ones = init_ones(x, **kwargs)
    loss_ones = _score(p_ones).item()

    p_rand = init_rand(x, **kwargs)
    loss_rand = _score(p_rand).item()

    winner, winner_loss = best_params[0] if best_params else (p_ones, float("inf"))
    for challenger, challenger_loss in ((p_ones, loss_ones), (p_rand, loss_rand)):
        # Strict comparison: a NaN baseline never replaces the current winner.
        if challenger_loss < winner_loss:
            winner, winner_loss = challenger, challenger_loss
    return winner
def init_non_linear_regression_fit(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> Dict[str, torch.Tensor]:
    """Fit the parameters per channel with a Levenberg-Marquardt curve fit.

    Each row ``x[i]`` is sorted and ``np_fit_func`` is fitted so that
    ``np_fit_func(row, *params)`` reproduces ``row``.

    Required keyword arguments:
        params_list: Parameter names. They are sorted before use so that
            their order matches the positional arguments of ``np_fit_func``.
        np_fit_func: NumPy callable ``f(xdata, *params)``.
        p0: Mapping from parameter name to a tensor of initial guesses,
            indexed by channel.

    Other keyword arguments are accepted and ignored.

    Returns:
        A dict mapping each parameter name to a float32 tensor on
        ``x.device``. If the fit fails for any channel, a message is printed
        and the initial guesses ``p0`` are returned for all channels.
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs.get('np_fit_func')
    # Sorted so that the order matches the numpy fit function's arguments.
    params_list = sorted(kwargs.get('params_list'))
    p0 = {k: v.cpu().numpy() for k, v in kwargs.get('p0').items()}

    def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]):
        popt, _ = curve_fit(
            func,
            xdata,
            ydata,
            maxfev=1000,
            p0=p0,
            method='lm'
        )
        return popt

    # Sorting each channel makes the data easier to fit.
    sorted_xdata = np.sort(x.cpu().numpy(), axis=-1)

    try:
        fitted = [
            _fit(channel, channel, np_fit_func, [p0[name][i] for name in params_list])
            for i, channel in enumerate(sorted_xdata)
        ]
        return {
            name: torch.tensor([ch_params[i] for ch_params in fitted], dtype=torch.float32).to(x.device)
            for i, name in enumerate(params_list)
        }
    except (ValueError, RuntimeError) as e:
        # curve_fit raises ValueError for invalid data and RuntimeError when
        # the least-squares minimisation does not converge (e.g. maxfev hit).
        # Both must reach the fallback instead of aborting initialisation.
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }
@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: nn.Module,
    deqtz_func: nn.Module,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]:
    """Refine ``params`` with Adam so that dequantize(quantize(x)) matches ``x``.

    Args:
        x: Tensor to reconstruct.
        params: Parameters passed to ``qtz_func`` / ``deqtz_func``. They are
            optimised in place and left with ``requires_grad=True``.
        qtz_func: Quantization callable handed to ``quantize``.
        deqtz_func: Dequantization callable handed to ``dequantize``.
        bits: Bit width forwarded to ``quantize`` / ``dequantize``.
        target_dtype: Integer dtype of the quantized tensor.
        epochs: Number of optimisation steps.
        early_stop: Stop when the loss changed by less than ``1e-7`` over the
            last 10% of ``epochs``.
        do_report: Print progress and also return the loss history.

    Returns:
        A deep copy of the parameters that produced the lowest observed loss,
        with ``requires_grad`` disabled. When ``do_report`` is true, the tuple
        ``(best_params, acc_loss)`` is returned instead, where ``acc_loss`` is
        the list of per-epoch loss values.

    Raises:
        Exception: If the loss becomes NaN or Inf during training.
    """
    loss_fn = nn.MSELoss()

    # The initial learning rate follows the order of magnitude of the initial
    # loss: lr = 0.1 * 10 ** (floor(log10(loss)) // 2).
    quant = quantize(x, params, qtz_func, bits, target_dtype)
    dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
    initial_loss = loss_fn(x, dequant).item()

    base_lr = 0.1
    # log10(0) is -inf and cannot be converted to int; a perfect initial
    # reconstruction simply keeps the base learning rate.
    exponent = 0 if initial_loss == 0 else int(np.floor(np.log10(initial_loss)))
    lr = base_lr * (10 ** (exponent // 2))

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    optimizer = torch.optim.Adam(list(params.values()), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0=max(1, epochs // 10),  # T_0 must be a positive integer
        T_mult=1,
        eta_min=lr * 0.1,
        last_epoch=-1,
    )

    # Contains the best loss and the best parameters
    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    percent_epochs_before_stop = 0.1
    epochs_before_stop = int(epochs * percent_epochs_before_stop)

    for i in range(epochs):
        optimizer.zero_grad()

        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        loss = loss_fn(x, dequant)

        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")

        loss_value = loss.item()

        # Snapshot BEFORE the optimizer step: `loss` was computed with the
        # current parameter values, so these are the ones that achieved it.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = copy.deepcopy(dict(params))

        loss.backward()
        optimizer.step()
        scheduler.step()

        acc_loss.append(loss_value)

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {loss_value}")

        # Stops if the loss has not changed considerably during the last 10% of epochs
        if (
            early_stop
            and i > epochs_before_stop
            and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta
        ):
            break

    if best_params is None:
        # No epoch ran (epochs == 0): return the untouched starting point.
        best_params = copy.deepcopy(dict(params))

    # No longer requires gradients in the parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        print(f"Best loss: {best_loss}")
        return best_params, acc_loss
    else:
        return best_params
def np_domain_guard(
    x: np.ndarray,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> np.ndarray:
    """Guard an array to a valid domain.

    NaN and infinite entries are replaced first, then the result is clipped.

    Args:
        x: Input array. It is not modified.
        min: Lower clip bound, or ``None`` for no lower bound.
        max: Upper clip bound, or ``None`` for no upper bound.
        posinf: Replacement for ``+inf``; ``None`` keeps ``np.nan_to_num``'s default.
        neginf: Replacement for ``-inf``; ``None`` keeps ``np.nan_to_num``'s default.
        nan: Replacement for NaN; ``None`` means ``0.0``.

    Returns:
        The guarded array.
    """
    # np.nan_to_num has no None sentinel for `nan` (its default is 0.0), so
    # the None default of this function is mapped to that value explicitly.
    nan_fill = 0.0 if nan is None else nan
    x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan_fill)
    if min is not None or max is not None:
        x = np.clip(x, min, max)
    return x
def quantization(x, **params):
    """Evaluate the quantization transform ``tan(_0 * x) / _s``.

    A zero ``_s`` is swapped for 10000 before the division, and the argument
    of the tangent has NaN replaced by 0 and +/-inf replaced by +/-1.
    """
    safe_scale = replace_num(params['_s'], num=0, to=10000)
    guarded_arg = domain_guard(params['_0'] * x, posinf=1, neginf=-1, nan=0)
    return torch.div(1, safe_scale) * torch.tan(guarded_arg)
def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Return a tensor in which every entry of ``x`` equal to ``num`` is ``to``.

    Args:
        x (torch.Tensor): The input tensor; it is left untouched.
        num (float): The value to look for.
        to (float): The value written wherever ``num`` is found.

    Returns:
        torch.Tensor: A new tensor with the matching entries replaced.
    """
    hit_mask = torch.eq(x, num)
    return torch.where(hit_mask, to, x)
def init_linear_scale(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Compute a symmetric linear scale for the quantized values of ``x``.

    Required keyword arguments:
        bits: Bit width of the signed integer target range.
        params: Parameters forwarded to ``qtz_func``; must not contain ``_s``,
            which is supplied here as ``init_ones(x)``.
        qtz_func: Quantization callable, invoked as ``qtz_func(x=..., **params, _s=...)``.

    Returns:
        A float32 tensor: the largest absolute transformed value along dim 1,
        divided by half the width of the integer range and clamped from below
        at float32 machine epsilon.
    """
    assert "bits" in kwargs, "bits must be provided."
    assert "params" in kwargs, "params must be provided."
    assert "qtz_func" in kwargs, "qtz_func must be provided."

    bits = kwargs.get('bits')
    params = kwargs.get('params')
    qtz_func = kwargs.get('qtz_func')

    # Transform the data with a unit scale to see the range it would occupy.
    projected = qtz_func(x=x.transpose(0, 1), **params, _s=init_ones(x, **kwargs)).transpose(0, 1)

    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    half_levels = float(quant_max - quant_min) / 2

    # Zero is always included in the range.
    lowest, highest = torch.aminmax(projected, dim=1)
    lowest = torch.min(lowest, torch.zeros_like(lowest))
    highest = torch.max(highest, torch.zeros_like(highest))

    magnitude = torch.max(-lowest, highest)
    eps = torch.finfo(torch.float32).eps

    # NOTE(diogo): adding noise to the scale was tried and found not to help
    # the learning process, so none is added.
    return torch.clamp(magnitude / half_levels, min=eps).to(dtype=torch.float32, device=lowest.device)
def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Return the factor that maps the range of ``tensor`` onto ``[_min, _max]``.

    The range is taken along the last dimension and always includes zero.

    Args:
        tensor: Input values.
        _min: Lower end of the target range.
        _max: Upper end of the target range. Positive infinity (the default)
            means that no scaling is wanted.

    Returns:
        A tensor of ones when ``_max`` is positive infinity, otherwise
        ``(_max - _min) / (x_max - x_min)``. An all-zero slice has
        ``x_max - x_min == 0`` and therefore yields inf or NaN.
    """
    # Calculate the original minimum and maximum values
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    # Compare by value: an identity check (`is`) misses infinities that are
    # equal to, but not the same object as, torch.inf (e.g. float("inf")).
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    # Calculate the scale factor
    scale = (_max - _min) / (x_max - x_min)
    return scale
def round_func_BPDA(input):
    """Round ``input`` in the forward pass while keeping an identity gradient.

    The returned tensor holds ``torch.round(input)``, but autograd only sees
    the ``clone`` of ``input``, so the backward pass treats the operation as
    the identity function. ``input`` itself is not modified.
    """
    out = input.clone()
    # Overwrite the values without recording the (non-differentiable) round.
    with torch.no_grad():
        out.copy_(torch.round(input))
    return out
a/fn_gen/nlr_t_cos/18/expressions.txt b/fn_gen/nlr_t_cos/18/expressions.txt new file mode 100644 index 0000000000000000000000000000000000000000..c545adce8b3c320e195336b81461c79d0cc385e6 --- /dev/null +++ b/fn_gen/nlr_t_cos/18/expressions.txt @@ -0,0 +1,2 @@ +asinh(_0*x)/_s +sinh(_s*x)/_0 \ No newline at end of file diff --git a/fn_gen/nlr_t_cos/18/fn.py b/fn_gen/nlr_t_cos/18/fn.py new file mode 100644 index 0000000000000000000000000000000000000000..c888ebf706b26ac49774a3453caeae57a7c768b2 --- /dev/null +++ b/fn_gen/nlr_t_cos/18/fn.py @@ -0,0 +1,481 @@ +from __future__ import annotations + +import torch +from torch import amin # Necessary for arcsin +import copy +import torch.nn as nn +import numpy as np + +from scipy.optimize import curve_fit +from typing import Dict, Any, Tuple, List, Callable + + +def quantization(x, **params): + return (torch.div(1, replace_num(params['_s'], num=0, to=10000)) * torch.asinh((params['_0'] * x))) + + +def dequantization(x, **params): + return (torch.div(1, replace_num(params['_0'], num=0, to=10000)) * torch.sinh((params['_s'] * x))) + + +def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]: + base_p0 = { + '_0': init_space_search(x, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], param='_0', **kwargs), + } + + base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs) + if 'post_init_hook' in kwargs: + kwargs['post_init_hook'](parameters=base_p0) + + params = init_non_linear_regression_fit(x, p0=base_p0, np_fit_func=fit_func, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], **kwargs) + params = {k: nn.Parameter(v, requires_grad=False) for k, v in params.items()} + if 'post_method_hook' in kwargs: + kwargs['post_method_hook'](parameters=params) + + params = learn_parameters(x, params, + qtz_func=quantization, + deqtz_func=dequantization, + bits=kwargs['bits'], + target_dtype=torch.int8, + epochs=500, + early_stop=False, 
+ ) + if 'post_train_hook' in kwargs: + kwargs['post_train_hook'](parameters=params) + + return params + + +############### Numpy Qtz ############### + + +def np_quantization(x, _0, _s): + return (np.divide(1, np_replace_num(_s, num=0, to=10000)) * np.arcsinh((_0 * x))) + + +def np_dequantization(x, _0, _s): + return (np.divide(1, np_replace_num(_0, num=0, to=10000)) * np.sinh((_s * x))) + + +def fit_func(x, _0, _s): + x_ = np_quantization(x, _0, _s) + x_ = np_dequantization(x_, _0, _s) + return x_ + + + +############### HELPERS ############### + +def domain_guard( + x: torch.Tensor, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> torch.Tensor: + """Guard a tensor to a valid domain.""" + x = torch.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = torch.clamp(x, min=min, max=max) + return x + + +def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor: + """Replace a number in a tensor with another number. + + Args: + x (torch.Tensor): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + torch.Tensor: The tensor with the number replaced. 
+ """ + return torch.where(x == num, to, x) + + +def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor: + """Guard the power operation to a valid domain.""" + return torch.pow(x, exp) if exp >= 1 else torch.pow(torch.relu(x), exp) + + +def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.ones_like(val, dtype=torch.float32, device=x.device) + + +def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.randn_like(val, dtype=torch.float32, device=x.device) + + +def init_space_search( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int): + """Generates the initial set of parameters. The first iteration generates 10 times more parameters.""" + for _ in range(n_params * 10): # The first iteration generates 10 times more parameters + yield init_rand(tensor) * max_initial # Generates n_params in range [-max_initial, max_initial] + + def _search_param(tensors: List[torch.tensor], n_params): + """Takes the best parameters and generates new parameters around the mean of the best parameters.""" + torch_tensors = torch.stack(tensors) + min_vals, max_vals = torch.aminmax(torch_tensors, dim=0) + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + mean = torch.mean(torch_tensors, dim=0) + for _ in range(n_params): # Generates n_params around the mean of the tensors + yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean + + def _calc(x, qtz_func, deqtz_func, **params): + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params) + x_ = deqtz_func(x=x_, **params) + x_ = x_.transpose(0, 1) + return x_ + + assert "qtz_func" in kwargs, "qtz_func must be provided." + assert "deqtz_func" in kwargs, "deqtz_func must be provided." + assert "params_list" in kwargs, "params list must be provided." + assert "param" in kwargs, "param must be provided." 
+ + qtz_func = kwargs.get('qtz_func') + deqtz_func = kwargs.get('deqtz_func') + params_list = kwargs.get('params_list') + param = kwargs.get('param') + + n_runs = 50 # Number of runs to try to find the best parameters + n_random_params = 50 # Number of random parameters to generate + n_best_to_pick = 5 # Number of best parameters to pick after each run + max_initial = 10000 # Maximum value to initialize the parameters + + # Initializes the parameters + base_params = { p: init_ones(x, **kwargs) for p in params_list if p != param } + params = _build_initial_param(x, max_initial, n_random_params) + + # Performs the search + for _ in range(n_runs): + + best_params = [] + for param_ in params: + try: + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: param_}) + loss_ones = nn.MSELoss()(x, x_) + + if len(best_params) < n_best_to_pick: + best_params.append((param_, loss_ones.item())) + best_params = sorted(best_params, key=lambda x: x[1]) + elif loss_ones < best_params[-1][1]: + best_params[-1] = (param_, loss_ones.item()) + best_params = sorted(best_params, key=lambda x: x[1]) + + except Exception: # The parameters might not be valid for the function's domain + continue + + # Generates new parameters around the mean + params = _search_param([p for p, _ in best_params], n_random_params) + + # Checks if the best parameter is better than the init_ones + p_ones = init_ones(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_ones}) + loss_ones = nn.MSELoss()(x, x_) + + # Checks if the best parameter is better than the init_rand + p_rand = init_rand(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_rand}) + loss_rand = nn.MSELoss()(x, x_) + + if loss_rand < best_params[0][1] and loss_rand < loss_ones: + return p_rand + elif loss_ones < best_params[0][1] and loss_ones < loss_rand: + return p_ones + else: + return best_params[0][0] + + +def init_linear_scale( # Symmetric scale. 
From the study folder + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + assert "bits" in kwargs, "bits must be provided." + assert "params" in kwargs, "params must be provided." + assert "qtz_func" in kwargs, "qtz_func must be provided." + + bits = kwargs.get('bits') + params = kwargs.get('params') + qtz_func = kwargs.get('qtz_func') + + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params, _s=init_ones(x, **kwargs)) + x_ = x_.transpose(0, 1) + + quant_min, quant_max = get_min_max_from_bits_signed(bits) + min_vals, max_vals = torch.aminmax(x_, dim=1) + min_vals = torch.min(min_vals, torch.zeros_like(min_vals)) + max_vals = torch.max(max_vals, torch.zeros_like(max_vals)) + + eps = torch.finfo(torch.float32).eps + + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2) + + scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device) + + # Introduces some noise in scale + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything. + # NOTE(diogo): This has been disproven. The noise does not help the learning process but I still + # left it here for future reference. Will be removed later. + # scale = scale + 0.01 * torch.randn_like(scale) + + return scale + + +def init_non_linear_regression_fit( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + assert "params_list" in kwargs, "params list must be provided." + assert "np_fit_func" in kwargs, "np_fit_func must be provided." + assert "p0" in kwargs, "p0 must be provided." + np_fit_func = kwargs.get('np_fit_func') + params_list = kwargs.get('params_list') + p0 = kwargs.get('p0') + + def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]): + popt, _ = curve_fit( + func, + xdata, + ydata, + maxfev=1000, + p0=p0, + method='lm' + ) + return popt + + # 1. 
Needs to convert the torch tensor to numpy tensor + xdata = x.cpu().numpy() + + # 2. Sorts the data so that it makes it easier to fit to it + sorted_xdata = np.sort(xdata, axis=-1) + + p0 = {k: v.cpu().numpy() for k, v in p0.items()} + params_list = sorted(params_list) # We need to make sure that it matches the numpy fit func arg order + + # 3. Finds the best parameters for each channel + try: + params = [] + for i in range(sorted_xdata.shape[0]): + xdata_ = sorted_xdata[i] + p0_ = [p0[p][i] for p in params_list] + ch_params = _fit(xdata_, xdata_, np_fit_func, p0_) + params.append(ch_params) + + # 4. Builds the parameters + result = {} + for i, p in enumerate(params_list): + result[p] = torch.tensor([p_[i] for p_ in params], dtype=torch.float32).to(x.device) + + return result + + except ValueError as e: + print(f"Could not fit the function with error: {e}") + print(f"Using fallback result...") + return { + k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items() + } + + +def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.zeros_like(val, dtype=torch.float32, device=x.device) + + +def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor: + # Calculate the original minimum and maximum values + min_vals, max_vals = torch.aminmax(tensor, dim=-1) + x_min = torch.min(min_vals, torch.zeros_like(min_vals)) + x_max = torch.max(max_vals, torch.zeros_like(max_vals)) + + if _max is torch.inf: # We do not need to scale the tensor. 
Just need to move it + return torch.ones_like(x_min) + + # Calculate the scale factor + scale = (_max - _min) / (x_max - x_min) + return scale + + + +############## Quant ############### + +@torch.enable_grad() +def learn_parameters( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + qtz_func: nn.Module, + deqtz_func: nn.Module, + bits: int, + target_dtype: torch.dtype, + epochs: int = 1000, + early_stop: bool = True, + do_report: bool = False +) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]: + loss_fn = nn.MSELoss() + + # Determines the initial learning rate by computing the initial loss and multiplying it by + # the order of magnitude of the loss divided by 2 + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + base_lr = 0.1 + exponent = int(np.floor(np.log10(loss.item()))) + lr = base_lr * (10 ** (exponent // 2)) + + # Requires gradients in the parameters + for p in params.values(): + p.requires_grad = True + p.grad = None + + param_keys = list(params.keys()) + param_values = list(params.values()) + + # Defines optimizer and loss function + optimizer = torch.optim.Adam(param_values, lr=lr) + scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=epochs // 10, T_mult=1, eta_min=lr * 0.1, last_epoch=-1) + + # Contains the best loss and the best parameters + best_loss = float("inf") + best_params = None + + # Used to stop the search early + min_delta = 1e-7 + acc_loss = [] + percent_epochs_before_stop = 0.1 + + for i in range(epochs): + optimizer.zero_grad() + + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + if loss.isnan() or loss.isinf(): + raise Exception("Loss is NaN or Inf. 
Stopping the search.") + + loss.backward() + optimizer.step() + scheduler.step() + + acc_loss.append(loss.item()) + + # Reports loss every 10 steps + if i % 10 == 0 and do_report: + print(f"Epoch {i}: Loss {loss.item()}") + + # Optimizes the parameter search by storing the best loss and the parameters + if loss.item() < best_loss: + best_loss = loss.item() + best_params = copy.deepcopy({ + k: v for k, v in params.items() if k in param_keys + }) + + # We also stop the search if the loss has not considerably during the last 10% epochs + if early_stop: + epochs_before_stop = int(epochs * percent_epochs_before_stop) + if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta: + break + + # No longer requires gradients in the parameters + for p in best_params.values(): + p.requires_grad = False + p.grad = None + + if do_report: + print(f"Best loss: {best_loss}") + return best_params, acc_loss + else: + return best_params + + +def quantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + target_dtype: torch.dtype = torch.int8 +) -> torch.Tensor: + quant_min, quant_max = get_min_max_from_bits_signed(bits) + x = x.transpose(0, 1) # Aligns shapes + x = func(x=x, **params) + x = x.transpose(0, 1) + x = torch.clamp(round_func_BPDA(x), quant_min, quant_max).to(target_dtype) + return x + + +def dequantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + out_dtype: torch.dtype +) -> torch.Tensor: + x = x.to(dtype=out_dtype) + x = x.transpose(0, 1) + x = func(x=x, **params) + x = x.transpose(0, 1) + return x + + +def round_func_BPDA(input): + # This is equivalent to replacing round function (non-differentiable) with + # an identity function (differentiable) only when backward. 
+ forward_value = torch.round(input) + out = input.clone() + out.data = forward_value.data + return out + + +def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]: + return -2 ** (bit_width - 1), 2 ** (bit_width - 1) - 1 + + + +############## Numpy ############### + +def np_domain_guard( + x: np.ndarray, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> np.ndarray: + """Guard a tensor to a valid domain.""" + x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = np.clip(x, min, max) + return x + + +def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray: + """Replace a number in a tensor with another number. + + Args: + x (np.ndarray): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + np.ndarray: The tensor with the number replaced. + """ + return np.where(x == num, to, x) + + +def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray: + """Guard the power operation to a valid domain.""" + return np.power(x, exp) if exp >= 1 else np.power(np.maximum(x, 0), exp) + diff --git a/fn_gen/nlr_t_cos/18/loss.png b/fn_gen/nlr_t_cos/18/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..3bf128b73ea128f11f328d663cb9e930dd609705 Binary files /dev/null and b/fn_gen/nlr_t_cos/18/loss.png differ diff --git a/fn_gen/nlr_t_cos/18/quantization.png b/fn_gen/nlr_t_cos/18/quantization.png new file mode 100644 index 0000000000000000000000000000000000000000..60241fa5c4d8cc0e0ba5f2cbdd50b303e3e34f87 Binary files /dev/null and b/fn_gen/nlr_t_cos/18/quantization.png differ diff --git a/fn_gen/nlr_t_cos/2/distortion.png b/fn_gen/nlr_t_cos/2/distortion.png new file mode 100644 index 0000000000000000000000000000000000000000..bafade3e7d668661feea20628e7ef55cca49281f Binary files /dev/null and b/fn_gen/nlr_t_cos/2/distortion.png differ diff --git 
a/fn_gen/nlr_t_cos/2/expressions.txt b/fn_gen/nlr_t_cos/2/expressions.txt new file mode 100644 index 0000000000000000000000000000000000000000..74791fc40576643d62f6366a8b4eda20eb1ad252 --- /dev/null +++ b/fn_gen/nlr_t_cos/2/expressions.txt @@ -0,0 +1,2 @@ +x**3/_s +(_s*x)**(1/3) \ No newline at end of file diff --git a/fn_gen/nlr_t_cos/2/fn.py b/fn_gen/nlr_t_cos/2/fn.py new file mode 100644 index 0000000000000000000000000000000000000000..216825d76db1add4054d181fd7b78cf037032235 --- /dev/null +++ b/fn_gen/nlr_t_cos/2/fn.py @@ -0,0 +1,480 @@ +from __future__ import annotations + +import torch +from torch import amin # Necessary for arcsin +import copy +import torch.nn as nn +import numpy as np + +from scipy.optimize import curve_fit +from typing import Dict, Any, Tuple, List, Callable + + +def quantization(x, **params): + return (torch.div(1, replace_num(params['_s'], num=0, to=10000)) * guarded_torch_power(x, torch.tensor(3))) + + +def dequantization(x, **params): + return guarded_torch_power((params['_s'] * x), 1 / 3) + + +def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]: + base_p0 = { + } + + base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs) + if 'post_init_hook' in kwargs: + kwargs['post_init_hook'](parameters=base_p0) + + params = init_non_linear_regression_fit(x, p0=base_p0, np_fit_func=fit_func, qtz_func=quantization, deqtz_func=dequantization, params_list=['_s'], **kwargs) + params = {k: nn.Parameter(v, requires_grad=False) for k, v in params.items()} + if 'post_method_hook' in kwargs: + kwargs['post_method_hook'](parameters=params) + + params = learn_parameters(x, params, + qtz_func=quantization, + deqtz_func=dequantization, + bits=kwargs['bits'], + target_dtype=torch.int8, + epochs=500, + early_stop=False, + ) + if 'post_train_hook' in kwargs: + kwargs['post_train_hook'](parameters=params) + + return params + + +############### Numpy Qtz ############### + + +def np_quantization(x, _s): 
+ return (np.divide(1, np_replace_num(_s, num=0, to=10000)) * np_guarded_power(x, np.array(3))) + + +def np_dequantization(x, _s): + return np_guarded_power((_s * x), 1 / 3) + + +def fit_func(x, _s): + x_ = np_quantization(x, _s) + x_ = np_dequantization(x_, _s) + return x_ + + + +############### HELPERS ############### + +def domain_guard( + x: torch.Tensor, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> torch.Tensor: + """Guard a tensor to a valid domain.""" + x = torch.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = torch.clamp(x, min=min, max=max) + return x + + +def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor: + """Replace a number in a tensor with another number. + + Args: + x (torch.Tensor): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + torch.Tensor: The tensor with the number replaced. + """ + return torch.where(x == num, to, x) + + +def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor: + """Guard the power operation to a valid domain.""" + return torch.pow(x, exp) if exp >= 1 else torch.pow(torch.relu(x), exp) + + +def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.ones_like(val, dtype=torch.float32, device=x.device) + + +def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.randn_like(val, dtype=torch.float32, device=x.device) + + +def init_space_search( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int): + """Generates the initial set of parameters. 
The first iteration generates 10 times more parameters.""" + for _ in range(n_params * 10): # The first iteration generates 10 times more parameters + yield init_rand(tensor) * max_initial # Generates n_params in range [-max_initial, max_initial] + + def _search_param(tensors: List[torch.tensor], n_params): + """Takes the best parameters and generates new parameters around the mean of the best parameters.""" + torch_tensors = torch.stack(tensors) + min_vals, max_vals = torch.aminmax(torch_tensors, dim=0) + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + mean = torch.mean(torch_tensors, dim=0) + for _ in range(n_params): # Generates n_params around the mean of the tensors + yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean + + def _calc(x, qtz_func, deqtz_func, **params): + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params) + x_ = deqtz_func(x=x_, **params) + x_ = x_.transpose(0, 1) + return x_ + + assert "qtz_func" in kwargs, "qtz_func must be provided." + assert "deqtz_func" in kwargs, "deqtz_func must be provided." + assert "params_list" in kwargs, "params list must be provided." + assert "param" in kwargs, "param must be provided." 
+ + qtz_func = kwargs.get('qtz_func') + deqtz_func = kwargs.get('deqtz_func') + params_list = kwargs.get('params_list') + param = kwargs.get('param') + + n_runs = 50 # Number of runs to try to find the best parameters + n_random_params = 50 # Number of random parameters to generate + n_best_to_pick = 5 # Number of best parameters to pick after each run + max_initial = 10000 # Maximum value to initialize the parameters + + # Initializes the parameters + base_params = { p: init_ones(x, **kwargs) for p in params_list if p != param } + params = _build_initial_param(x, max_initial, n_random_params) + + # Performs the search + for _ in range(n_runs): + + best_params = [] + for param_ in params: + try: + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: param_}) + loss_ones = nn.MSELoss()(x, x_) + + if len(best_params) < n_best_to_pick: + best_params.append((param_, loss_ones.item())) + best_params = sorted(best_params, key=lambda x: x[1]) + elif loss_ones < best_params[-1][1]: + best_params[-1] = (param_, loss_ones.item()) + best_params = sorted(best_params, key=lambda x: x[1]) + + except Exception: # The parameters might not be valid for the function's domain + continue + + # Generates new parameters around the mean + params = _search_param([p for p, _ in best_params], n_random_params) + + # Checks if the best parameter is better than the init_ones + p_ones = init_ones(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_ones}) + loss_ones = nn.MSELoss()(x, x_) + + # Checks if the best parameter is better than the init_rand + p_rand = init_rand(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_rand}) + loss_rand = nn.MSELoss()(x, x_) + + if loss_rand < best_params[0][1] and loss_rand < loss_ones: + return p_rand + elif loss_ones < best_params[0][1] and loss_ones < loss_rand: + return p_ones + else: + return best_params[0][0] + + +def init_linear_scale( # Symmetric scale. 
From the study folder + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + assert "bits" in kwargs, "bits must be provided." + assert "params" in kwargs, "params must be provided." + assert "qtz_func" in kwargs, "qtz_func must be provided." + + bits = kwargs.get('bits') + params = kwargs.get('params') + qtz_func = kwargs.get('qtz_func') + + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params, _s=init_ones(x, **kwargs)) + x_ = x_.transpose(0, 1) + + quant_min, quant_max = get_min_max_from_bits_signed(bits) + min_vals, max_vals = torch.aminmax(x_, dim=1) + min_vals = torch.min(min_vals, torch.zeros_like(min_vals)) + max_vals = torch.max(max_vals, torch.zeros_like(max_vals)) + + eps = torch.finfo(torch.float32).eps + + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2) + + scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device) + + # Introduces some noise in scale + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything. + # NOTE(diogo): This has been disproven. The noise does not help the learning process but I still + # left it here for future reference. Will be removed later. + # scale = scale + 0.01 * torch.randn_like(scale) + + return scale + + +def init_non_linear_regression_fit( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + assert "params_list" in kwargs, "params list must be provided." + assert "np_fit_func" in kwargs, "np_fit_func must be provided." + assert "p0" in kwargs, "p0 must be provided." + np_fit_func = kwargs.get('np_fit_func') + params_list = kwargs.get('params_list') + p0 = kwargs.get('p0') + + def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]): + popt, _ = curve_fit( + func, + xdata, + ydata, + maxfev=1000, + p0=p0, + method='lm' + ) + return popt + + # 1. 
Needs to convert the torch tensor to numpy tensor + xdata = x.cpu().numpy() + + # 2. Sorts the data so that it makes it easier to fit to it + sorted_xdata = np.sort(xdata, axis=-1) + + p0 = {k: v.cpu().numpy() for k, v in p0.items()} + params_list = sorted(params_list) # We need to make sure that it matches the numpy fit func arg order + + # 3. Finds the best parameters for each channel + try: + params = [] + for i in range(sorted_xdata.shape[0]): + xdata_ = sorted_xdata[i] + p0_ = [p0[p][i] for p in params_list] + ch_params = _fit(xdata_, xdata_, np_fit_func, p0_) + params.append(ch_params) + + # 4. Builds the parameters + result = {} + for i, p in enumerate(params_list): + result[p] = torch.tensor([p_[i] for p_ in params], dtype=torch.float32).to(x.device) + + return result + + except ValueError as e: + print(f"Could not fit the function with error: {e}") + print(f"Using fallback result...") + return { + k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items() + } + + +def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.zeros_like(val, dtype=torch.float32, device=x.device) + + +def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor: + # Calculate the original minimum and maximum values + min_vals, max_vals = torch.aminmax(tensor, dim=-1) + x_min = torch.min(min_vals, torch.zeros_like(min_vals)) + x_max = torch.max(max_vals, torch.zeros_like(max_vals)) + + if _max is torch.inf: # We do not need to scale the tensor. 
Just need to move it + return torch.ones_like(x_min) + + # Calculate the scale factor + scale = (_max - _min) / (x_max - x_min) + return scale + + + +############## Quant ############### + +@torch.enable_grad() +def learn_parameters( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + qtz_func: nn.Module, + deqtz_func: nn.Module, + bits: int, + target_dtype: torch.dtype, + epochs: int = 1000, + early_stop: bool = True, + do_report: bool = False +) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]: + loss_fn = nn.MSELoss() + + # Determines the initial learning rate by computing the initial loss and multiplying it by + # the order of magnitude of the loss divided by 2 + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + base_lr = 0.1 + exponent = int(np.floor(np.log10(loss.item()))) + lr = base_lr * (10 ** (exponent // 2)) + + # Requires gradients in the parameters + for p in params.values(): + p.requires_grad = True + p.grad = None + + param_keys = list(params.keys()) + param_values = list(params.values()) + + # Defines optimizer and loss function + optimizer = torch.optim.Adam(param_values, lr=lr) + scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=epochs // 10, T_mult=1, eta_min=lr * 0.1, last_epoch=-1) + + # Contains the best loss and the best parameters + best_loss = float("inf") + best_params = None + + # Used to stop the search early + min_delta = 1e-7 + acc_loss = [] + percent_epochs_before_stop = 0.1 + + for i in range(epochs): + optimizer.zero_grad() + + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + if loss.isnan() or loss.isinf(): + raise Exception("Loss is NaN or Inf. 
Stopping the search.") + + loss.backward() + optimizer.step() + scheduler.step() + + acc_loss.append(loss.item()) + + # Reports loss every 10 steps + if i % 10 == 0 and do_report: + print(f"Epoch {i}: Loss {loss.item()}") + + # Optimizes the parameter search by storing the best loss and the parameters + if loss.item() < best_loss: + best_loss = loss.item() + best_params = copy.deepcopy({ + k: v for k, v in params.items() if k in param_keys + }) + + # We also stop the search if the loss has not considerably during the last 10% epochs + if early_stop: + epochs_before_stop = int(epochs * percent_epochs_before_stop) + if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta: + break + + # No longer requires gradients in the parameters + for p in best_params.values(): + p.requires_grad = False + p.grad = None + + if do_report: + print(f"Best loss: {best_loss}") + return best_params, acc_loss + else: + return best_params + + +def quantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + target_dtype: torch.dtype = torch.int8 +) -> torch.Tensor: + quant_min, quant_max = get_min_max_from_bits_signed(bits) + x = x.transpose(0, 1) # Aligns shapes + x = func(x=x, **params) + x = x.transpose(0, 1) + x = torch.clamp(round_func_BPDA(x), quant_min, quant_max).to(target_dtype) + return x + + +def dequantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + out_dtype: torch.dtype +) -> torch.Tensor: + x = x.to(dtype=out_dtype) + x = x.transpose(0, 1) + x = func(x=x, **params) + x = x.transpose(0, 1) + return x + + +def round_func_BPDA(input): + # This is equivalent to replacing round function (non-differentiable) with + # an identity function (differentiable) only when backward. 
+ forward_value = torch.round(input) + out = input.clone() + out.data = forward_value.data + return out + + +def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]: + return -2 ** (bit_width - 1), 2 ** (bit_width - 1) - 1 + + + +############## Numpy ############### + +def np_domain_guard( + x: np.ndarray, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> np.ndarray: + """Guard a tensor to a valid domain.""" + x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = np.clip(x, min, max) + return x + + +def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray: + """Replace a number in a tensor with another number. + + Args: + x (np.ndarray): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + np.ndarray: The tensor with the number replaced. + """ + return np.where(x == num, to, x) + + +def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray: + """Guard the power operation to a valid domain.""" + return np.power(x, exp) if exp >= 1 else np.power(np.maximum(x, 0), exp) + diff --git a/fn_gen/nlr_t_cos/2/loss.png b/fn_gen/nlr_t_cos/2/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..6f3b5b9c368b4439bab00804dceb4be6276971b3 Binary files /dev/null and b/fn_gen/nlr_t_cos/2/loss.png differ diff --git a/fn_gen/nlr_t_cos/2/quantization.png b/fn_gen/nlr_t_cos/2/quantization.png new file mode 100644 index 0000000000000000000000000000000000000000..025e0dd8ac44e2e9810a207b55bec64d19b80e6b Binary files /dev/null and b/fn_gen/nlr_t_cos/2/quantization.png differ diff --git a/fn_gen/nlr_t_cos/3/distortion.png b/fn_gen/nlr_t_cos/3/distortion.png new file mode 100644 index 0000000000000000000000000000000000000000..674369eed1fae6a9382f1d0ae759d5459cb7e2c3 Binary files /dev/null and b/fn_gen/nlr_t_cos/3/distortion.png differ diff --git 
a/fn_gen/nlr_t_cos/3/expressions.txt b/fn_gen/nlr_t_cos/3/expressions.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a7abbbdac98c7d53123fe0b9807e7644bc00acf --- /dev/null +++ b/fn_gen/nlr_t_cos/3/expressions.txt @@ -0,0 +1,2 @@ +acosh(_0*x)/_s +cosh(_s*x)/_0 \ No newline at end of file diff --git a/fn_gen/nlr_t_cos/3/fn.py b/fn_gen/nlr_t_cos/3/fn.py new file mode 100644 index 0000000000000000000000000000000000000000..adfb524a988b4e5ec3828aade091ef442d37ec1f --- /dev/null +++ b/fn_gen/nlr_t_cos/3/fn.py @@ -0,0 +1,481 @@ +from __future__ import annotations + +import torch +from torch import amin # Necessary for arcsin +import copy +import torch.nn as nn +import numpy as np + +from scipy.optimize import curve_fit +from typing import Dict, Any, Tuple, List, Callable + + +def quantization(x, **params): + return (torch.div(1, replace_num(params['_s'], num=0, to=10000)) * torch.acosh(domain_guard((params['_0'] * x), min=1, nan=1))) + + +def dequantization(x, **params): + return (torch.div(1, replace_num(params['_0'], num=0, to=10000)) * torch.cosh((params['_s'] * x))) + + +def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]: + base_p0 = { + '_0': init_space_search(x, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], param='_0', **kwargs), + } + + base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs) + if 'post_init_hook' in kwargs: + kwargs['post_init_hook'](parameters=base_p0) + + params = init_non_linear_regression_fit(x, p0=base_p0, np_fit_func=fit_func, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], **kwargs) + params = {k: nn.Parameter(v, requires_grad=False) for k, v in params.items()} + if 'post_method_hook' in kwargs: + kwargs['post_method_hook'](parameters=params) + + params = learn_parameters(x, params, + qtz_func=quantization, + deqtz_func=dequantization, + bits=kwargs['bits'], + target_dtype=torch.int8, + 
epochs=500, + early_stop=False, + ) + if 'post_train_hook' in kwargs: + kwargs['post_train_hook'](parameters=params) + + return params + + +############### Numpy Qtz ############### + + +def np_quantization(x, _0, _s): + return (np.divide(1, np_replace_num(_s, num=0, to=10000)) * np.arccosh(np_domain_guard((_0 * x), min=1, nan=1))) + + +def np_dequantization(x, _0, _s): + return (np.divide(1, np_replace_num(_0, num=0, to=10000)) * np.cosh((_s * x))) + + +def fit_func(x, _0, _s): + x_ = np_quantization(x, _0, _s) + x_ = np_dequantization(x_, _0, _s) + return x_ + + + +############### HELPERS ############### + +def domain_guard( + x: torch.Tensor, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> torch.Tensor: + """Guard a tensor to a valid domain.""" + x = torch.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = torch.clamp(x, min=min, max=max) + return x + + +def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor: + """Replace a number in a tensor with another number. + + Args: + x (torch.Tensor): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + torch.Tensor: The tensor with the number replaced. 
+ """ + return torch.where(x == num, to, x) + + +def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor: + """Guard the power operation to a valid domain.""" + return torch.pow(x, exp) if exp >= 1 else torch.pow(torch.relu(x), exp) + + +def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.ones_like(val, dtype=torch.float32, device=x.device) + + +def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.randn_like(val, dtype=torch.float32, device=x.device) + + +def init_space_search( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int): + """Generates the initial set of parameters. The first iteration generates 10 times more parameters.""" + for _ in range(n_params * 10): # The first iteration generates 10 times more parameters + yield init_rand(tensor) * max_initial # Generates n_params in range [-max_initial, max_initial] + + def _search_param(tensors: List[torch.tensor], n_params): + """Takes the best parameters and generates new parameters around the mean of the best parameters.""" + torch_tensors = torch.stack(tensors) + min_vals, max_vals = torch.aminmax(torch_tensors, dim=0) + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + mean = torch.mean(torch_tensors, dim=0) + for _ in range(n_params): # Generates n_params around the mean of the tensors + yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean + + def _calc(x, qtz_func, deqtz_func, **params): + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params) + x_ = deqtz_func(x=x_, **params) + x_ = x_.transpose(0, 1) + return x_ + + assert "qtz_func" in kwargs, "qtz_func must be provided." + assert "deqtz_func" in kwargs, "deqtz_func must be provided." + assert "params_list" in kwargs, "params list must be provided." + assert "param" in kwargs, "param must be provided." 
+ + qtz_func = kwargs.get('qtz_func') + deqtz_func = kwargs.get('deqtz_func') + params_list = kwargs.get('params_list') + param = kwargs.get('param') + + n_runs = 50 # Number of runs to try to find the best parameters + n_random_params = 50 # Number of random parameters to generate + n_best_to_pick = 5 # Number of best parameters to pick after each run + max_initial = 10000 # Maximum value to initialize the parameters + + # Initializes the parameters + base_params = { p: init_ones(x, **kwargs) for p in params_list if p != param } + params = _build_initial_param(x, max_initial, n_random_params) + + # Performs the search + for _ in range(n_runs): + + best_params = [] + for param_ in params: + try: + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: param_}) + loss_ones = nn.MSELoss()(x, x_) + + if len(best_params) < n_best_to_pick: + best_params.append((param_, loss_ones.item())) + best_params = sorted(best_params, key=lambda x: x[1]) + elif loss_ones < best_params[-1][1]: + best_params[-1] = (param_, loss_ones.item()) + best_params = sorted(best_params, key=lambda x: x[1]) + + except Exception: # The parameters might not be valid for the function's domain + continue + + # Generates new parameters around the mean + params = _search_param([p for p, _ in best_params], n_random_params) + + # Checks if the best parameter is better than the init_ones + p_ones = init_ones(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_ones}) + loss_ones = nn.MSELoss()(x, x_) + + # Checks if the best parameter is better than the init_rand + p_rand = init_rand(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_rand}) + loss_rand = nn.MSELoss()(x, x_) + + if loss_rand < best_params[0][1] and loss_rand < loss_ones: + return p_rand + elif loss_ones < best_params[0][1] and loss_ones < loss_rand: + return p_ones + else: + return best_params[0][0] + + +def init_linear_scale( # Symmetric scale. 
From the study folder + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + assert "bits" in kwargs, "bits must be provided." + assert "params" in kwargs, "params must be provided." + assert "qtz_func" in kwargs, "qtz_func must be provided." + + bits = kwargs.get('bits') + params = kwargs.get('params') + qtz_func = kwargs.get('qtz_func') + + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params, _s=init_ones(x, **kwargs)) + x_ = x_.transpose(0, 1) + + quant_min, quant_max = get_min_max_from_bits_signed(bits) + min_vals, max_vals = torch.aminmax(x_, dim=1) + min_vals = torch.min(min_vals, torch.zeros_like(min_vals)) + max_vals = torch.max(max_vals, torch.zeros_like(max_vals)) + + eps = torch.finfo(torch.float32).eps + + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2) + + scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device) + + # Introduces some noise in scale + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything. + # NOTE(diogo): This has been disproven. The noise does not help the learning process but I still + # left it here for future reference. Will be removed later. + # scale = scale + 0.01 * torch.randn_like(scale) + + return scale + + +def init_non_linear_regression_fit( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + assert "params_list" in kwargs, "params list must be provided." + assert "np_fit_func" in kwargs, "np_fit_func must be provided." + assert "p0" in kwargs, "p0 must be provided." + np_fit_func = kwargs.get('np_fit_func') + params_list = kwargs.get('params_list') + p0 = kwargs.get('p0') + + def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]): + popt, _ = curve_fit( + func, + xdata, + ydata, + maxfev=1000, + p0=p0, + method='lm' + ) + return popt + + # 1. 
Needs to convert the torch tensor to numpy tensor + xdata = x.cpu().numpy() + + # 2. Sorts the data so that it makes it easier to fit to it + sorted_xdata = np.sort(xdata, axis=-1) + + p0 = {k: v.cpu().numpy() for k, v in p0.items()} + params_list = sorted(params_list) # We need to make sure that it matches the numpy fit func arg order + + # 3. Finds the best parameters for each channel + try: + params = [] + for i in range(sorted_xdata.shape[0]): + xdata_ = sorted_xdata[i] + p0_ = [p0[p][i] for p in params_list] + ch_params = _fit(xdata_, xdata_, np_fit_func, p0_) + params.append(ch_params) + + # 4. Builds the parameters + result = {} + for i, p in enumerate(params_list): + result[p] = torch.tensor([p_[i] for p_ in params], dtype=torch.float32).to(x.device) + + return result + + except ValueError as e: + print(f"Could not fit the function with error: {e}") + print(f"Using fallback result...") + return { + k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items() + } + + +def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.zeros_like(val, dtype=torch.float32, device=x.device) + + +def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor: + # Calculate the original minimum and maximum values + min_vals, max_vals = torch.aminmax(tensor, dim=-1) + x_min = torch.min(min_vals, torch.zeros_like(min_vals)) + x_max = torch.max(max_vals, torch.zeros_like(max_vals)) + + if _max is torch.inf: # We do not need to scale the tensor. 
Just need to move it + return torch.ones_like(x_min) + + # Calculate the scale factor + scale = (_max - _min) / (x_max - x_min) + return scale + + + +############## Quant ############### + +@torch.enable_grad() +def learn_parameters( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + qtz_func: nn.Module, + deqtz_func: nn.Module, + bits: int, + target_dtype: torch.dtype, + epochs: int = 1000, + early_stop: bool = True, + do_report: bool = False +) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]: + loss_fn = nn.MSELoss() + + # Determines the initial learning rate by computing the initial loss and multiplying it by + # the order of magnitude of the loss divided by 2 + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + base_lr = 0.1 + exponent = int(np.floor(np.log10(loss.item()))) + lr = base_lr * (10 ** (exponent // 2)) + + # Requires gradients in the parameters + for p in params.values(): + p.requires_grad = True + p.grad = None + + param_keys = list(params.keys()) + param_values = list(params.values()) + + # Defines optimizer and loss function + optimizer = torch.optim.Adam(param_values, lr=lr) + scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=epochs // 10, T_mult=1, eta_min=lr * 0.1, last_epoch=-1) + + # Contains the best loss and the best parameters + best_loss = float("inf") + best_params = None + + # Used to stop the search early + min_delta = 1e-7 + acc_loss = [] + percent_epochs_before_stop = 0.1 + + for i in range(epochs): + optimizer.zero_grad() + + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + if loss.isnan() or loss.isinf(): + raise Exception("Loss is NaN or Inf. 
Stopping the search.") + + loss.backward() + optimizer.step() + scheduler.step() + + acc_loss.append(loss.item()) + + # Reports loss every 10 steps + if i % 10 == 0 and do_report: + print(f"Epoch {i}: Loss {loss.item()}") + + # Optimizes the parameter search by storing the best loss and the parameters + if loss.item() < best_loss: + best_loss = loss.item() + best_params = copy.deepcopy({ + k: v for k, v in params.items() if k in param_keys + }) + + # We also stop the search if the loss has not considerably during the last 10% epochs + if early_stop: + epochs_before_stop = int(epochs * percent_epochs_before_stop) + if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta: + break + + # No longer requires gradients in the parameters + for p in best_params.values(): + p.requires_grad = False + p.grad = None + + if do_report: + print(f"Best loss: {best_loss}") + return best_params, acc_loss + else: + return best_params + + +def quantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + target_dtype: torch.dtype = torch.int8 +) -> torch.Tensor: + quant_min, quant_max = get_min_max_from_bits_signed(bits) + x = x.transpose(0, 1) # Aligns shapes + x = func(x=x, **params) + x = x.transpose(0, 1) + x = torch.clamp(round_func_BPDA(x), quant_min, quant_max).to(target_dtype) + return x + + +def dequantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + out_dtype: torch.dtype +) -> torch.Tensor: + x = x.to(dtype=out_dtype) + x = x.transpose(0, 1) + x = func(x=x, **params) + x = x.transpose(0, 1) + return x + + +def round_func_BPDA(input): + # This is equivalent to replacing round function (non-differentiable) with + # an identity function (differentiable) only when backward. 
+ forward_value = torch.round(input) + out = input.clone() + out.data = forward_value.data + return out + + +def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]: + return -2 ** (bit_width - 1), 2 ** (bit_width - 1) - 1 + + + +############## Numpy ############### + +def np_domain_guard( + x: np.ndarray, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> np.ndarray: + """Guard a tensor to a valid domain.""" + x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = np.clip(x, min, max) + return x + + +def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray: + """Replace a number in a tensor with another number. + + Args: + x (np.ndarray): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + np.ndarray: The tensor with the number replaced. + """ + return np.where(x == num, to, x) + + +def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray: + """Guard the power operation to a valid domain.""" + return np.power(x, exp) if exp >= 1 else np.power(np.maximum(x, 0), exp) + diff --git a/fn_gen/nlr_t_cos/3/loss.png b/fn_gen/nlr_t_cos/3/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..7d7a5afe5eecef518023ffecb74793a6e9cd45f7 Binary files /dev/null and b/fn_gen/nlr_t_cos/3/loss.png differ diff --git a/fn_gen/nlr_t_cos/3/quantization.png b/fn_gen/nlr_t_cos/3/quantization.png new file mode 100644 index 0000000000000000000000000000000000000000..a87d8c09d35da1dcbacdc49ad716c48296b12bbb Binary files /dev/null and b/fn_gen/nlr_t_cos/3/quantization.png differ diff --git a/fn_gen/nlr_t_cos/4/distortion.png b/fn_gen/nlr_t_cos/4/distortion.png new file mode 100644 index 0000000000000000000000000000000000000000..e45b51137a75221a6e17ff94c824fd1615aa51a1 Binary files /dev/null and b/fn_gen/nlr_t_cos/4/distortion.png differ diff --git 
a/fn_gen/nlr_t_cos/4/expressions.txt b/fn_gen/nlr_t_cos/4/expressions.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a7e5be4566beeb4727d82f95d24241966d158dc --- /dev/null +++ b/fn_gen/nlr_t_cos/4/expressions.txt @@ -0,0 +1,2 @@ +log(_0*x)/_s +exp(_s*x)/_0 \ No newline at end of file diff --git a/fn_gen/nlr_t_cos/4/fn.py b/fn_gen/nlr_t_cos/4/fn.py new file mode 100644 index 0000000000000000000000000000000000000000..9c6e4dc45229a8bd0ba58749e47db378bb57b895 --- /dev/null +++ b/fn_gen/nlr_t_cos/4/fn.py @@ -0,0 +1,481 @@ +from __future__ import annotations + +import torch +from torch import amin # Necessary for arcsin +import copy +import torch.nn as nn +import numpy as np + +from scipy.optimize import curve_fit +from typing import Dict, Any, Tuple, List, Callable + + +def quantization(x, **params): + return (torch.div(1, replace_num(params['_s'], num=0, to=10000)) * torch.log(domain_guard((params['_0'] * x), min=1e-5, nan=1e-5))) + + +def dequantization(x, **params): + return (torch.div(1, replace_num(params['_0'], num=0, to=10000)) * torch.exp((params['_s'] * x))) + + +def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]: + base_p0 = { + '_0': init_space_search(x, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], param='_0', **kwargs), + } + + base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs) + if 'post_init_hook' in kwargs: + kwargs['post_init_hook'](parameters=base_p0) + + params = init_non_linear_regression_fit(x, p0=base_p0, np_fit_func=fit_func, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], **kwargs) + params = {k: nn.Parameter(v, requires_grad=False) for k, v in params.items()} + if 'post_method_hook' in kwargs: + kwargs['post_method_hook'](parameters=params) + + params = learn_parameters(x, params, + qtz_func=quantization, + deqtz_func=dequantization, + bits=kwargs['bits'], + target_dtype=torch.int8, + 
epochs=500, + early_stop=False, + ) + if 'post_train_hook' in kwargs: + kwargs['post_train_hook'](parameters=params) + + return params + + +############### Numpy Qtz ############### + + +def np_quantization(x, _0, _s): + return (np.divide(1, np_replace_num(_s, num=0, to=10000)) * np.log(np_domain_guard((_0 * x), min=1e-5, nan=1e-5))) + + +def np_dequantization(x, _0, _s): + return (np.divide(1, np_replace_num(_0, num=0, to=10000)) * np.exp((_s * x))) + + +def fit_func(x, _0, _s): + x_ = np_quantization(x, _0, _s) + x_ = np_dequantization(x_, _0, _s) + return x_ + + + +############### HELPERS ############### + +def domain_guard( + x: torch.Tensor, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> torch.Tensor: + """Guard a tensor to a valid domain.""" + x = torch.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = torch.clamp(x, min=min, max=max) + return x + + +def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor: + """Replace a number in a tensor with another number. + + Args: + x (torch.Tensor): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + torch.Tensor: The tensor with the number replaced. 
+ """ + return torch.where(x == num, to, x) + + +def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor: + """Guard the power operation to a valid domain.""" + return torch.pow(x, exp) if exp >= 1 else torch.pow(torch.relu(x), exp) + + +def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.ones_like(val, dtype=torch.float32, device=x.device) + + +def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.randn_like(val, dtype=torch.float32, device=x.device) + + +def init_space_search( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int): + """Generates the initial set of parameters. The first iteration generates 10 times more parameters.""" + for _ in range(n_params * 10): # The first iteration generates 10 times more parameters + yield init_rand(tensor) * max_initial # Generates n_params in range [-max_initial, max_initial] + + def _search_param(tensors: List[torch.tensor], n_params): + """Takes the best parameters and generates new parameters around the mean of the best parameters.""" + torch_tensors = torch.stack(tensors) + min_vals, max_vals = torch.aminmax(torch_tensors, dim=0) + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + mean = torch.mean(torch_tensors, dim=0) + for _ in range(n_params): # Generates n_params around the mean of the tensors + yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean + + def _calc(x, qtz_func, deqtz_func, **params): + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params) + x_ = deqtz_func(x=x_, **params) + x_ = x_.transpose(0, 1) + return x_ + + assert "qtz_func" in kwargs, "qtz_func must be provided." + assert "deqtz_func" in kwargs, "deqtz_func must be provided." + assert "params_list" in kwargs, "params list must be provided." + assert "param" in kwargs, "param must be provided." 
+ + qtz_func = kwargs.get('qtz_func') + deqtz_func = kwargs.get('deqtz_func') + params_list = kwargs.get('params_list') + param = kwargs.get('param') + + n_runs = 50 # Number of runs to try to find the best parameters + n_random_params = 50 # Number of random parameters to generate + n_best_to_pick = 5 # Number of best parameters to pick after each run + max_initial = 10000 # Maximum value to initialize the parameters + + # Initializes the parameters + base_params = { p: init_ones(x, **kwargs) for p in params_list if p != param } + params = _build_initial_param(x, max_initial, n_random_params) + + # Performs the search + for _ in range(n_runs): + + best_params = [] + for param_ in params: + try: + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: param_}) + loss_ones = nn.MSELoss()(x, x_) + + if len(best_params) < n_best_to_pick: + best_params.append((param_, loss_ones.item())) + best_params = sorted(best_params, key=lambda x: x[1]) + elif loss_ones < best_params[-1][1]: + best_params[-1] = (param_, loss_ones.item()) + best_params = sorted(best_params, key=lambda x: x[1]) + + except Exception: # The parameters might not be valid for the function's domain + continue + + # Generates new parameters around the mean + params = _search_param([p for p, _ in best_params], n_random_params) + + # Checks if the best parameter is better than the init_ones + p_ones = init_ones(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_ones}) + loss_ones = nn.MSELoss()(x, x_) + + # Checks if the best parameter is better than the init_rand + p_rand = init_rand(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_rand}) + loss_rand = nn.MSELoss()(x, x_) + + if loss_rand < best_params[0][1] and loss_rand < loss_ones: + return p_rand + elif loss_ones < best_params[0][1] and loss_ones < loss_rand: + return p_ones + else: + return best_params[0][0] + + +def init_linear_scale( # Symmetric scale. 
From the study folder + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + assert "bits" in kwargs, "bits must be provided." + assert "params" in kwargs, "params must be provided." + assert "qtz_func" in kwargs, "qtz_func must be provided." + + bits = kwargs.get('bits') + params = kwargs.get('params') + qtz_func = kwargs.get('qtz_func') + + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params, _s=init_ones(x, **kwargs)) + x_ = x_.transpose(0, 1) + + quant_min, quant_max = get_min_max_from_bits_signed(bits) + min_vals, max_vals = torch.aminmax(x_, dim=1) + min_vals = torch.min(min_vals, torch.zeros_like(min_vals)) + max_vals = torch.max(max_vals, torch.zeros_like(max_vals)) + + eps = torch.finfo(torch.float32).eps + + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2) + + scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device) + + # Introduces some noise in scale + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything. + # NOTE(diogo): This has been disproven. The noise does not help the learning process but I still + # left it here for future reference. Will be removed later. + # scale = scale + 0.01 * torch.randn_like(scale) + + return scale + + +def init_non_linear_regression_fit( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + assert "params_list" in kwargs, "params list must be provided." + assert "np_fit_func" in kwargs, "np_fit_func must be provided." + assert "p0" in kwargs, "p0 must be provided." + np_fit_func = kwargs.get('np_fit_func') + params_list = kwargs.get('params_list') + p0 = kwargs.get('p0') + + def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]): + popt, _ = curve_fit( + func, + xdata, + ydata, + maxfev=1000, + p0=p0, + method='lm' + ) + return popt + + # 1. 
Needs to convert the torch tensor to numpy tensor + xdata = x.cpu().numpy() + + # 2. Sorts the data so that it makes it easier to fit to it + sorted_xdata = np.sort(xdata, axis=-1) + + p0 = {k: v.cpu().numpy() for k, v in p0.items()} + params_list = sorted(params_list) # We need to make sure that it matches the numpy fit func arg order + + # 3. Finds the best parameters for each channel + try: + params = [] + for i in range(sorted_xdata.shape[0]): + xdata_ = sorted_xdata[i] + p0_ = [p0[p][i] for p in params_list] + ch_params = _fit(xdata_, xdata_, np_fit_func, p0_) + params.append(ch_params) + + # 4. Builds the parameters + result = {} + for i, p in enumerate(params_list): + result[p] = torch.tensor([p_[i] for p_ in params], dtype=torch.float32).to(x.device) + + return result + + except ValueError as e: + print(f"Could not fit the function with error: {e}") + print(f"Using fallback result...") + return { + k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items() + } + + +def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.zeros_like(val, dtype=torch.float32, device=x.device) + + +def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor: + # Calculate the original minimum and maximum values + min_vals, max_vals = torch.aminmax(tensor, dim=-1) + x_min = torch.min(min_vals, torch.zeros_like(min_vals)) + x_max = torch.max(max_vals, torch.zeros_like(max_vals)) + + if _max is torch.inf: # We do not need to scale the tensor. 
Just need to move it + return torch.ones_like(x_min) + + # Calculate the scale factor + scale = (_max - _min) / (x_max - x_min) + return scale + + + +############## Quant ############### + +@torch.enable_grad() +def learn_parameters( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + qtz_func: nn.Module, + deqtz_func: nn.Module, + bits: int, + target_dtype: torch.dtype, + epochs: int = 1000, + early_stop: bool = True, + do_report: bool = False +) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]: + loss_fn = nn.MSELoss() + + # Determines the initial learning rate by computing the initial loss and multiplying it by + # the order of magnitude of the loss divided by 2 + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + base_lr = 0.1 + exponent = int(np.floor(np.log10(loss.item()))) + lr = base_lr * (10 ** (exponent // 2)) + + # Requires gradients in the parameters + for p in params.values(): + p.requires_grad = True + p.grad = None + + param_keys = list(params.keys()) + param_values = list(params.values()) + + # Defines optimizer and loss function + optimizer = torch.optim.Adam(param_values, lr=lr) + scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=epochs // 10, T_mult=1, eta_min=lr * 0.1, last_epoch=-1) + + # Contains the best loss and the best parameters + best_loss = float("inf") + best_params = None + + # Used to stop the search early + min_delta = 1e-7 + acc_loss = [] + percent_epochs_before_stop = 0.1 + + for i in range(epochs): + optimizer.zero_grad() + + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + if loss.isnan() or loss.isinf(): + raise Exception("Loss is NaN or Inf. 
Stopping the search.") + + loss.backward() + optimizer.step() + scheduler.step() + + acc_loss.append(loss.item()) + + # Reports loss every 10 steps + if i % 10 == 0 and do_report: + print(f"Epoch {i}: Loss {loss.item()}") + + # Optimizes the parameter search by storing the best loss and the parameters + if loss.item() < best_loss: + best_loss = loss.item() + best_params = copy.deepcopy({ + k: v for k, v in params.items() if k in param_keys + }) + + # We also stop the search if the loss has not considerably during the last 10% epochs + if early_stop: + epochs_before_stop = int(epochs * percent_epochs_before_stop) + if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta: + break + + # No longer requires gradients in the parameters + for p in best_params.values(): + p.requires_grad = False + p.grad = None + + if do_report: + print(f"Best loss: {best_loss}") + return best_params, acc_loss + else: + return best_params + + +def quantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + target_dtype: torch.dtype = torch.int8 +) -> torch.Tensor: + quant_min, quant_max = get_min_max_from_bits_signed(bits) + x = x.transpose(0, 1) # Aligns shapes + x = func(x=x, **params) + x = x.transpose(0, 1) + x = torch.clamp(round_func_BPDA(x), quant_min, quant_max).to(target_dtype) + return x + + +def dequantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + out_dtype: torch.dtype +) -> torch.Tensor: + x = x.to(dtype=out_dtype) + x = x.transpose(0, 1) + x = func(x=x, **params) + x = x.transpose(0, 1) + return x + + +def round_func_BPDA(input): + # This is equivalent to replacing round function (non-differentiable) with + # an identity function (differentiable) only when backward. 
+ forward_value = torch.round(input) + out = input.clone() + out.data = forward_value.data + return out + + +def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]: + return -2 ** (bit_width - 1), 2 ** (bit_width - 1) - 1 + + + +############## Numpy ############### + +def np_domain_guard( + x: np.ndarray, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> np.ndarray: + """Guard a tensor to a valid domain.""" + x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = np.clip(x, min, max) + return x + + +def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray: + """Replace a number in a tensor with another number. + + Args: + x (np.ndarray): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + np.ndarray: The tensor with the number replaced. + """ + return np.where(x == num, to, x) + + +def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray: + """Guard the power operation to a valid domain.""" + return np.power(x, exp) if exp >= 1 else np.power(np.maximum(x, 0), exp) + diff --git a/fn_gen/nlr_t_cos/4/loss.png b/fn_gen/nlr_t_cos/4/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..8039190fdbc2409e853013d3a0339944967594a0 Binary files /dev/null and b/fn_gen/nlr_t_cos/4/loss.png differ diff --git a/fn_gen/nlr_t_cos/4/quantization.png b/fn_gen/nlr_t_cos/4/quantization.png new file mode 100644 index 0000000000000000000000000000000000000000..dc3d6d9631445002b0bba7f98d3ab94c1521c079 Binary files /dev/null and b/fn_gen/nlr_t_cos/4/quantization.png differ diff --git a/fn_gen/nlr_t_cos/5/distortion.png b/fn_gen/nlr_t_cos/5/distortion.png new file mode 100644 index 0000000000000000000000000000000000000000..17e3bdd89df824678cc50f3f3fdc935e6c73ff96 Binary files /dev/null and b/fn_gen/nlr_t_cos/5/distortion.png differ diff --git 
def quantization(x, **params):
    """Apply the forward transform ``atan(_0 * x) / _s``.

    ``params`` must provide ``_0`` and ``_s``. Entries of ``_s`` that are
    exactly 0 are swapped for 10000 before the reciprocal is taken, so the
    division never hits zero.
    """
    safe_scale = replace_num(params['_s'], num=0, to=10000)
    warped = torch.atan(params['_0'] * x)
    return torch.div(1, safe_scale) * warped
def domain_guard(
    x: torch.Tensor,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
    ) -> torch.Tensor:
    """Sanitize a tensor so it stays inside a function's valid domain.

    NaN and +/-Inf are first replaced through ``torch.nan_to_num`` (using
    ``nan``, ``posinf`` and ``neginf``); the result is then clamped to
    ``[min, max]`` when at least one bound is given.
    """
    cleaned = torch.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan)
    if min is None and max is None:
        # No bound requested: only the non-finite replacement applies.
        return cleaned
    return torch.clamp(cleaned, min=min, max=max)
def init_space_search(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
    ) -> torch.Tensor:
    """Randomly search for a good initial value of a single parameter.

    A candidate for ``param`` is scored by the MSE between ``x`` and
    ``deqtz_func(qtz_func(x))`` while every other entry of ``params_list`` is
    held at ones. The first run scores a large random batch; each later run
    samples new candidates around the mean of the current best ones.

    Differences from a naive restart-per-run search:
      * candidates whose loss is NaN/Inf are discarded instead of entering the
        pool of best candidates;
      * the best candidates survive from one run to the next, so the result
        can never get worse as more runs are executed;
      * when no candidate of a run is usable the search restarts from a fresh
        random batch instead of failing on an empty ``torch.stack``.

    Args:
        x: Tensor the round-trip error is measured on.
        **kwargs: Must contain ``qtz_func``, ``deqtz_func``, ``params_list``
            and ``param``. All kwargs are also forwarded to ``init_ones`` and
            ``init_rand`` for the final baseline comparison.

    Returns:
        The candidate with the lowest finite loss among the search result,
        one ``init_rand`` draw and ``init_ones`` (ties favour the search
        result). Falls back to ``init_ones`` when nothing is usable.
    """

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int):
        """Yield the first batch (10x larger): normal noise scaled by ``max_initial``."""
        for _ in range(n_params * 10):
            yield init_rand(tensor) * max_initial

    def _search_param(tensors: List[torch.Tensor], n_params: int):
        """Yield candidates sampled around the mean of the current best ones."""
        stacked = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(stacked, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(stacked, dim=0)
        for _ in range(n_params):
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        """Round-trip ``x`` through quantization and dequantization."""
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        return x_.transpose(0, 1)

    def _score(candidate: torch.Tensor):
        """Return the round-trip MSE as a float, or None if the candidate is unusable."""
        try:
            x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: candidate})
            loss = nn.MSELoss()(x, x_).item()
        except Exception:  # The parameters might not be valid for the function's domain
            return None
        # NaN compares False with everything and would corrupt the ordering.
        return loss if np.isfinite(loss) else None

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50  # Number of runs to try to find the best parameters
    n_random_params = 50  # Number of random parameters to generate
    n_best_to_pick = 5  # Number of best parameters to pick after each run
    max_initial = 10000  # Scale applied to the first random batch

    # Every parameter other than the searched one is held at ones
    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}
    params = _build_initial_param(x, max_initial, n_random_params)

    best_params = []  # (candidate, loss) pairs sorted by ascending loss
    for _ in range(n_runs):
        # Seed the run with the previous winners so progress is never lost
        run_best = list(best_params)
        for candidate in params:
            loss = _score(candidate)
            if loss is None:
                continue
            if len(run_best) < n_best_to_pick:
                run_best.append((candidate, loss))
            elif loss < run_best[-1][1]:
                run_best[-1] = (candidate, loss)
            else:
                continue
            run_best.sort(key=lambda item: item[1])
        best_params = run_best

        if best_params:
            # Generates new parameters around the mean of the winners
            params = _search_param([p for p, _ in best_params], n_random_params)
        else:
            # Nothing usable yet: draw a fresh random batch
            params = _build_initial_param(x, max_initial, n_random_params)

    # Compare the search result against the two plain baselines
    finalists = list(best_params[:1])
    for baseline in (init_rand(x, **kwargs), init_ones(x, **kwargs)):
        loss = _score(baseline)
        if loss is not None:
            finalists.append((baseline, loss))

    if not finalists:
        return init_ones(x, **kwargs)
    # min() keeps the first of equal losses, i.e. the search result on ties
    return min(finalists, key=lambda item: item[1])[0]
def init_non_linear_regression_fit(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
    ) -> Dict[str, torch.Tensor]:
    """Refine initial parameters channel by channel with a curve fit.

    Every row of ``x`` is sorted and ``np_fit_func`` is fitted
    (Levenberg-Marquardt, ``maxfev=1000``) so that it reproduces the row from
    itself, i.e. ``ydata`` is ``xdata``, starting from ``p0``.

    Args:
        x: Tensor whose rows are fitted independently.
        **kwargs: Must contain
            ``params_list`` - parameter names; they are sorted so their order
            matches the positional arguments of ``np_fit_func``;
            ``np_fit_func`` - NumPy model called as ``f(xdata, *params)``;
            ``p0`` - dict mapping each name to a tensor of initial values,
            indexed by row.

    Returns:
        Dict mapping each parameter name to a float32 tensor on ``x.device``.
        When the fit fails (``ValueError`` for invalid data/options, or
        ``RuntimeError`` when the least-squares minimization does not
        converge) the initial values ``p0`` are returned instead.
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs.get('np_fit_func')
    params_list = kwargs.get('params_list')
    p0 = kwargs.get('p0')

    def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]):
        """Fit ``func`` to one channel and return the optimal parameters."""
        popt, _ = curve_fit(
            func,
            xdata,
            ydata,
            maxfev=1000,
            p0=p0,
            method='lm'
        )
        return popt

    # 1. Needs to convert the torch tensor to numpy (detach: autograd state is irrelevant here)
    xdata = x.detach().cpu().numpy()

    # 2. Sorts the data so that it makes it easier to fit to it
    sorted_xdata = np.sort(xdata, axis=-1)

    p0 = {k: v.detach().cpu().numpy() for k, v in p0.items()}
    params_list = sorted(params_list)  # We need to make sure that it matches the numpy fit func arg order

    # 3. Finds the best parameters for each channel
    try:
        params = [
            _fit(row, row, np_fit_func, [p0[p][i] for p in params_list])
            for i, row in enumerate(sorted_xdata)
        ]
    except (ValueError, RuntimeError) as e:
        # curve_fit signals a non-converging fit with RuntimeError, not ValueError
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }

    # 4. Builds the parameters: one tensor per name, one entry per channel
    return {
        p: torch.tensor([ch_params[i] for ch_params in params], dtype=torch.float32).to(x.device)
        for i, p in enumerate(params_list)
    }
@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: nn.Module,
    deqtz_func: nn.Module,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]:
    """Tune the quantization parameters by minimising the round-trip MSE.

    Runs Adam with a cosine-annealing warm-restart schedule on the loss
    ``MSE(x, dequantize(quantize(x)))`` and keeps the parameter values that
    produced the lowest loss.

    Args:
        x: Tensor to reconstruct.
        params: Parameters to optimise; ``requires_grad`` is switched on for
            them and they are updated in place by the optimizer.
        qtz_func: Quantization callable passed to ``quantize``.
        deqtz_func: Dequantization callable passed to ``dequantize``.
        bits: Bit width passed to ``quantize`` / ``dequantize``.
        target_dtype: Dtype passed to ``quantize``.
        epochs: Maximum number of optimisation steps.
        early_stop: Stop once the loss moved by less than ``1e-7`` over the
            last 10% of ``epochs``.
        do_report: Print progress and also return the loss history.

    Returns:
        The best parameters (deep copies, ``requires_grad=False``); when
        ``do_report`` is true, a ``(best_params, loss_history)`` tuple where
        the history is a list of floats.

    Raises:
        Exception: If the loss becomes NaN or Inf during training.
    """
    loss_fn = nn.MSELoss()

    # Determines the initial learning rate from the order of magnitude of the
    # initial loss (exponent halved)
    quant = quantize(x, params, qtz_func, bits, target_dtype)
    dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
    loss = loss_fn(x, dequant)

    base_lr = 0.1
    # A perfect initial fit has loss 0, and int(floor(log10(0))) overflows:
    # clamp to the smallest normal float32 before taking the logarithm.
    initial_loss = max(loss.item(), torch.finfo(torch.float32).tiny)
    exponent = int(np.floor(np.log10(initial_loss)))
    lr = base_lr * (10 ** (exponent // 2))

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    # Defines optimizer and scheduler; T_0 must be a positive integer, which
    # ``epochs // 10`` is not when epochs < 10
    optimizer = torch.optim.Adam(list(params.values()), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=max(1, epochs // 10), T_mult=1, eta_min=lr * 0.1, last_epoch=-1
    )

    # Contains the best loss and the best parameters. Starting from a copy of
    # the inputs keeps the return value valid even when no epoch runs.
    best_loss = float("inf")
    best_params = copy.deepcopy(dict(params))

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    percent_epochs_before_stop = 0.1
    # At least 1, otherwise the loss would be compared with itself
    epochs_before_stop = max(1, int(epochs * percent_epochs_before_stop))

    for i in range(epochs):
        optimizer.zero_grad()

        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        loss = loss_fn(x, dequant)

        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")

        loss_value = loss.item()

        # Snapshot BEFORE the optimizer step: these are the values that
        # actually produced ``loss_value``; after the step they have moved on.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = copy.deepcopy(dict(params))

        loss.backward()
        optimizer.step()
        scheduler.step()

        acc_loss.append(loss_value)

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {loss_value}")

        # We also stop the search if the loss has not changed considerably
        # during the last 10% of the epochs
        if (
            early_stop
            and i > epochs_before_stop
            and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta
        ):
            break

    # No longer requires gradients in the parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        print(f"Best loss: {best_loss}")
        return best_params, acc_loss
    return best_params
def np_domain_guard(
    x: np.ndarray,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
    ) -> np.ndarray:
    """Guard an array to a valid domain (NumPy twin of ``domain_guard``).

    Args:
        x: Input array.
        min: Lower clip bound, or None for no lower bound.
        max: Upper clip bound, or None for no upper bound.
        posinf: Replacement for +Inf; None keeps NumPy's default.
        neginf: Replacement for -Inf; None keeps NumPy's default.
        nan: Replacement for NaN; None means 0.0.

    Returns:
        A copy of ``x`` with non-finite values replaced and, when a bound is
        given, clipped to ``[min, max]``.
    """
    # Unlike torch.nan_to_num, np.nan_to_num does not read ``nan=None`` as
    # "use the default": its default is the literal 0.0. Map None explicitly
    # so the guard can be called without ``nan``.
    nan_fill = 0.0 if nan is None else nan
    x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan_fill)
    if min is not None or max is not None:
        x = np.clip(x, min, max)
    return x
def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]:
    """Initialise and train the ``_s`` parameter for tensor ``x``.

    Three stages, each followed by an optional hook taken from ``kwargs``
    (``post_init_hook``, ``post_method_hook``, ``post_train_hook``) that is
    called with ``parameters=<current dict>``:
      1. linear scale initialisation,
      2. non-linear regression fit,
      3. gradient-based refinement (500 epochs, no early stop).

    ``kwargs`` must contain ``bits`` and is forwarded to the first two stages.
    """

    def _fire(hook_name, parameters):
        """Call the named hook if the caller supplied one."""
        if hook_name in kwargs:
            kwargs[hook_name](parameters=parameters)

    # Stage 1: the (still empty) dict is handed over as ``params`` and only
    # afterwards receives the computed scale.
    base_p0 = {}
    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    _fire('post_init_hook', base_p0)

    # Stage 2
    fitted = init_non_linear_regression_fit(
        x,
        p0=base_p0,
        np_fit_func=fit_func,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_s'],
        **kwargs
    )
    params = {name: nn.Parameter(value, requires_grad=False) for name, value in fitted.items()}
    _fire('post_method_hook', params)

    # Stage 3
    params = learn_parameters(
        x,
        params,
        qtz_func=quantization,
        deqtz_func=dequantization,
        bits=kwargs['bits'],
        target_dtype=torch.int8,
        epochs=500,
        early_stop=False,
    )
    _fire('post_train_hook', params)

    return params
def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Raise ``x`` to ``exp`` while keeping the base in a valid domain.

    For ``exp >= 1`` this is a plain ``torch.pow``. For smaller exponents
    negative entries of ``x`` are set to 0 (ReLU) before the power is taken.
    """
    if exp >= 1:
        return torch.pow(x, exp)
    non_negative = torch.relu(x)
    return torch.pow(non_negative, exp)
The first iteration generates 10 times more parameters.""" + for _ in range(n_params * 10): # The first iteration generates 10 times more parameters + yield init_rand(tensor) * max_initial # Generates n_params in range [-max_initial, max_initial] + + def _search_param(tensors: List[torch.tensor], n_params): + """Takes the best parameters and generates new parameters around the mean of the best parameters.""" + torch_tensors = torch.stack(tensors) + min_vals, max_vals = torch.aminmax(torch_tensors, dim=0) + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + mean = torch.mean(torch_tensors, dim=0) + for _ in range(n_params): # Generates n_params around the mean of the tensors + yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean + + def _calc(x, qtz_func, deqtz_func, **params): + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params) + x_ = deqtz_func(x=x_, **params) + x_ = x_.transpose(0, 1) + return x_ + + assert "qtz_func" in kwargs, "qtz_func must be provided." + assert "deqtz_func" in kwargs, "deqtz_func must be provided." + assert "params_list" in kwargs, "params list must be provided." + assert "param" in kwargs, "param must be provided." 
def init_linear_scale(  # Symmetric scale. From the study folder
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
    ) -> torch.Tensor:
    """Compute a symmetric linear scale from the transformed tensor.

    ``x`` is passed through ``qtz_func`` with ``_s`` set to ones; the largest
    magnitude found along dim 1 of the result is divided by half the number
    of quantization steps available for ``bits``.

    Args:
        x: Tensor to derive the scale from.
        **kwargs: Must contain ``bits``, ``params`` (the other parameters of
            ``qtz_func``) and ``qtz_func``. All kwargs are also forwarded to
            ``init_ones``.

    Returns:
        float32 scale tensor, floored at the float32 machine epsilon.
    """
    assert "bits" in kwargs, "bits must be provided."
    assert "params" in kwargs, "params must be provided."
    assert "qtz_func" in kwargs, "qtz_func must be provided."

    bits, params, qtz_func = kwargs.get('bits'), kwargs.get('params'), kwargs.get('qtz_func')

    # Apply the non-linear part only: the scale itself is neutral (ones)
    swapped = x.transpose(0, 1)
    transformed = qtz_func(x=swapped, **params, _s=init_ones(x, **kwargs)).transpose(0, 1)

    quant_min, quant_max = get_min_max_from_bits_signed(bits)

    # The range always includes zero
    lowest, highest = torch.aminmax(transformed, dim=1)
    lowest = torch.min(lowest, torch.zeros_like(lowest))
    highest = torch.max(highest, torch.zeros_like(highest))

    largest_magnitude = torch.max(-lowest, highest)
    half_steps = float(quant_max - quant_min) / 2
    scale = largest_magnitude / half_steps

    # NOTE(diogo): adding noise to the scale was tried and does not help the
    # learning process, so none is added.
    eps = torch.finfo(torch.float32).eps
    return torch.clamp(scale, min=eps).to(dtype=torch.float32, device=lowest.device)
def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Compute the factor that maps each row's value range onto ``[_min, _max]``.

    The row range is measured along the last dimension and always includes 0.

    Args:
        tensor: Input tensor.
        _min: Lower end of the target interval.
        _max: Upper end of the target interval; an infinite value means
            "no scaling needed".

    Returns:
        Ones (one entry per row) when ``_max`` is infinite, otherwise
        ``(_max - _min) / (row_max - row_min)``. For floating-point input a
        zero row range is floored at the dtype's machine epsilon so the
        result stays finite.
    """
    # Calculate the original minimum and maximum values
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    # Compare by value: an identity test only matches the default object, so
    # an explicitly passed float("inf") would not be recognised.
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    # Calculate the scale factor; an all-zero row would otherwise divide by 0
    value_range = x_max - x_min
    if value_range.is_floating_point():
        value_range = torch.clamp(value_range, min=torch.finfo(value_range.dtype).eps)
    return (_max - _min) / value_range
def quantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    """Quantize ``x`` with ``func`` and store the result as ``target_dtype``.

    Args:
        x: Tensor to quantize; dimensions 0 and 1 are swapped around the call
            to ``func``.
        params: Keyword parameters forwarded verbatim to ``func``.
        func: Callable invoked as ``func(x=..., **params)``.
        bits: Bit width that defines the signed clamp range.
        target_dtype: Dtype of the returned tensor.

    Returns:
        The rounded output of ``func``, clamped to the signed ``bits`` range
        and cast to ``target_dtype``.
    """
    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    # Swap dims so the per-channel parameters line up with the last dimension
    projected = func(x=x.transpose(0, 1), **params).transpose(0, 1)
    rounded = round_func_BPDA(projected)
    return torch.clamp(rounded, quant_min, quant_max).to(target_dtype)
def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Raise ``x`` to ``exp`` while keeping the base in a valid domain.

    For ``exp >= 1`` this is a plain ``np.power``. For smaller exponents
    negative entries of ``x`` are raised to 0 before the power is taken.
    """
    if exp >= 1:
        return np.power(x, exp)
    non_negative = np.maximum(x, 0)
    return np.power(non_negative, exp)
a/fn_gen/nlr_t_cos/7/expressions.txt b/fn_gen/nlr_t_cos/7/expressions.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c0b1579c06c048d5603aa39c80e392c5906a879 --- /dev/null +++ b/fn_gen/nlr_t_cos/7/expressions.txt @@ -0,0 +1,2 @@ +cos(_0*x)/_s +acos(_s*x)/_0 \ No newline at end of file diff --git a/fn_gen/nlr_t_cos/7/fn.py b/fn_gen/nlr_t_cos/7/fn.py new file mode 100644 index 0000000000000000000000000000000000000000..999860d115a292350b7f0ab69de4c93fe4b9311d --- /dev/null +++ b/fn_gen/nlr_t_cos/7/fn.py @@ -0,0 +1,481 @@ +from __future__ import annotations + +import torch +from torch import amin # Necessary for arcsin +import copy +import torch.nn as nn +import numpy as np + +from scipy.optimize import curve_fit +from typing import Dict, Any, Tuple, List, Callable + + +def quantization(x, **params): + return (torch.div(1, replace_num(params['_s'], num=0, to=10000)) * torch.cos((params['_0'] * x))) + + +def dequantization(x, **params): + return (torch.div(1, replace_num(params['_0'], num=0, to=10000)) * torch.acos(domain_guard((params['_s'] * x), min=-0.99999, max=0.99999, nan=0))) + + +def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]: + base_p0 = { + '_0': init_space_search(x, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], param='_0', **kwargs), + } + + base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs) + if 'post_init_hook' in kwargs: + kwargs['post_init_hook'](parameters=base_p0) + + params = init_non_linear_regression_fit(x, p0=base_p0, np_fit_func=fit_func, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], **kwargs) + params = {k: nn.Parameter(v, requires_grad=False) for k, v in params.items()} + if 'post_method_hook' in kwargs: + kwargs['post_method_hook'](parameters=params) + + params = learn_parameters(x, params, + qtz_func=quantization, + deqtz_func=dequantization, + bits=kwargs['bits'], + 
target_dtype=torch.int8, + epochs=500, + early_stop=False, + ) + if 'post_train_hook' in kwargs: + kwargs['post_train_hook'](parameters=params) + + return params + + +############### Numpy Qtz ############### + + +def np_quantization(x, _0, _s): + return (np.divide(1, np_replace_num(_s, num=0, to=10000)) * np.cos((_0 * x))) + + +def np_dequantization(x, _0, _s): + return (np.divide(1, np_replace_num(_0, num=0, to=10000)) * np.arccos(np_domain_guard((_s * x), min=-0.99999, max=0.99999, nan=0))) + + +def fit_func(x, _0, _s): + x_ = np_quantization(x, _0, _s) + x_ = np_dequantization(x_, _0, _s) + return x_ + + + +############### HELPERS ############### + +def domain_guard( + x: torch.Tensor, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> torch.Tensor: + """Guard a tensor to a valid domain.""" + x = torch.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = torch.clamp(x, min=min, max=max) + return x + + +def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor: + """Replace a number in a tensor with another number. + + Args: + x (torch.Tensor): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + torch.Tensor: The tensor with the number replaced. 
+ """ + return torch.where(x == num, to, x) + + +def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor: + """Guard the power operation to a valid domain.""" + return torch.pow(x, exp) if exp >= 1 else torch.pow(torch.relu(x), exp) + + +def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.ones_like(val, dtype=torch.float32, device=x.device) + + +def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.randn_like(val, dtype=torch.float32, device=x.device) + + +def init_space_search( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int): + """Generates the initial set of parameters. The first iteration generates 10 times more parameters.""" + for _ in range(n_params * 10): # The first iteration generates 10 times more parameters + yield init_rand(tensor) * max_initial # Generates n_params in range [-max_initial, max_initial] + + def _search_param(tensors: List[torch.tensor], n_params): + """Takes the best parameters and generates new parameters around the mean of the best parameters.""" + torch_tensors = torch.stack(tensors) + min_vals, max_vals = torch.aminmax(torch_tensors, dim=0) + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + mean = torch.mean(torch_tensors, dim=0) + for _ in range(n_params): # Generates n_params around the mean of the tensors + yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean + + def _calc(x, qtz_func, deqtz_func, **params): + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params) + x_ = deqtz_func(x=x_, **params) + x_ = x_.transpose(0, 1) + return x_ + + assert "qtz_func" in kwargs, "qtz_func must be provided." + assert "deqtz_func" in kwargs, "deqtz_func must be provided." + assert "params_list" in kwargs, "params list must be provided." + assert "param" in kwargs, "param must be provided." 
+ + qtz_func = kwargs.get('qtz_func') + deqtz_func = kwargs.get('deqtz_func') + params_list = kwargs.get('params_list') + param = kwargs.get('param') + + n_runs = 50 # Number of runs to try to find the best parameters + n_random_params = 50 # Number of random parameters to generate + n_best_to_pick = 5 # Number of best parameters to pick after each run + max_initial = 10000 # Maximum value to initialize the parameters + + # Initializes the parameters + base_params = { p: init_ones(x, **kwargs) for p in params_list if p != param } + params = _build_initial_param(x, max_initial, n_random_params) + + # Performs the search + for _ in range(n_runs): + + best_params = [] + for param_ in params: + try: + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: param_}) + loss_ones = nn.MSELoss()(x, x_) + + if len(best_params) < n_best_to_pick: + best_params.append((param_, loss_ones.item())) + best_params = sorted(best_params, key=lambda x: x[1]) + elif loss_ones < best_params[-1][1]: + best_params[-1] = (param_, loss_ones.item()) + best_params = sorted(best_params, key=lambda x: x[1]) + + except Exception: # The parameters might not be valid for the function's domain + continue + + # Generates new parameters around the mean + params = _search_param([p for p, _ in best_params], n_random_params) + + # Checks if the best parameter is better than the init_ones + p_ones = init_ones(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_ones}) + loss_ones = nn.MSELoss()(x, x_) + + # Checks if the best parameter is better than the init_rand + p_rand = init_rand(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_rand}) + loss_rand = nn.MSELoss()(x, x_) + + if loss_rand < best_params[0][1] and loss_rand < loss_ones: + return p_rand + elif loss_ones < best_params[0][1] and loss_ones < loss_rand: + return p_ones + else: + return best_params[0][0] + + +def init_linear_scale( # Symmetric scale. 
From the study folder + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + assert "bits" in kwargs, "bits must be provided." + assert "params" in kwargs, "params must be provided." + assert "qtz_func" in kwargs, "qtz_func must be provided." + + bits = kwargs.get('bits') + params = kwargs.get('params') + qtz_func = kwargs.get('qtz_func') + + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params, _s=init_ones(x, **kwargs)) + x_ = x_.transpose(0, 1) + + quant_min, quant_max = get_min_max_from_bits_signed(bits) + min_vals, max_vals = torch.aminmax(x_, dim=1) + min_vals = torch.min(min_vals, torch.zeros_like(min_vals)) + max_vals = torch.max(max_vals, torch.zeros_like(max_vals)) + + eps = torch.finfo(torch.float32).eps + + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2) + + scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device) + + # Introduces some noise in scale + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything. + # NOTE(diogo): This has been disproven. The noise does not help the learning process but I still + # left it here for future reference. Will be removed later. + # scale = scale + 0.01 * torch.randn_like(scale) + + return scale + + +def init_non_linear_regression_fit( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + assert "params_list" in kwargs, "params list must be provided." + assert "np_fit_func" in kwargs, "np_fit_func must be provided." + assert "p0" in kwargs, "p0 must be provided." + np_fit_func = kwargs.get('np_fit_func') + params_list = kwargs.get('params_list') + p0 = kwargs.get('p0') + + def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]): + popt, _ = curve_fit( + func, + xdata, + ydata, + maxfev=1000, + p0=p0, + method='lm' + ) + return popt + + # 1. 
Needs to convert the torch tensor to numpy tensor + xdata = x.cpu().numpy() + + # 2. Sorts the data so that it makes it easier to fit to it + sorted_xdata = np.sort(xdata, axis=-1) + + p0 = {k: v.cpu().numpy() for k, v in p0.items()} + params_list = sorted(params_list) # We need to make sure that it matches the numpy fit func arg order + + # 3. Finds the best parameters for each channel + try: + params = [] + for i in range(sorted_xdata.shape[0]): + xdata_ = sorted_xdata[i] + p0_ = [p0[p][i] for p in params_list] + ch_params = _fit(xdata_, xdata_, np_fit_func, p0_) + params.append(ch_params) + + # 4. Builds the parameters + result = {} + for i, p in enumerate(params_list): + result[p] = torch.tensor([p_[i] for p_ in params], dtype=torch.float32).to(x.device) + + return result + + except ValueError as e: + print(f"Could not fit the function with error: {e}") + print(f"Using fallback result...") + return { + k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items() + } + + +def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.zeros_like(val, dtype=torch.float32, device=x.device) + + +def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor: + # Calculate the original minimum and maximum values + min_vals, max_vals = torch.aminmax(tensor, dim=-1) + x_min = torch.min(min_vals, torch.zeros_like(min_vals)) + x_max = torch.max(max_vals, torch.zeros_like(max_vals)) + + if _max is torch.inf: # We do not need to scale the tensor. 
Just need to move it + return torch.ones_like(x_min) + + # Calculate the scale factor + scale = (_max - _min) / (x_max - x_min) + return scale + + + +############## Quant ############### + +@torch.enable_grad() +def learn_parameters( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + qtz_func: nn.Module, + deqtz_func: nn.Module, + bits: int, + target_dtype: torch.dtype, + epochs: int = 1000, + early_stop: bool = True, + do_report: bool = False +) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]: + loss_fn = nn.MSELoss() + + # Determines the initial learning rate by computing the initial loss and multiplying it by + # the order of magnitude of the loss divided by 2 + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + base_lr = 0.1 + exponent = int(np.floor(np.log10(loss.item()))) + lr = base_lr * (10 ** (exponent // 2)) + + # Requires gradients in the parameters + for p in params.values(): + p.requires_grad = True + p.grad = None + + param_keys = list(params.keys()) + param_values = list(params.values()) + + # Defines optimizer and loss function + optimizer = torch.optim.Adam(param_values, lr=lr) + scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=epochs // 10, T_mult=1, eta_min=lr * 0.1, last_epoch=-1) + + # Contains the best loss and the best parameters + best_loss = float("inf") + best_params = None + + # Used to stop the search early + min_delta = 1e-7 + acc_loss = [] + percent_epochs_before_stop = 0.1 + + for i in range(epochs): + optimizer.zero_grad() + + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + if loss.isnan() or loss.isinf(): + raise Exception("Loss is NaN or Inf. 
Stopping the search.") + + loss.backward() + optimizer.step() + scheduler.step() + + acc_loss.append(loss.item()) + + # Reports loss every 10 steps + if i % 10 == 0 and do_report: + print(f"Epoch {i}: Loss {loss.item()}") + + # Optimizes the parameter search by storing the best loss and the parameters + if loss.item() < best_loss: + best_loss = loss.item() + best_params = copy.deepcopy({ + k: v for k, v in params.items() if k in param_keys + }) + + # We also stop the search if the loss has not considerably during the last 10% epochs + if early_stop: + epochs_before_stop = int(epochs * percent_epochs_before_stop) + if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta: + break + + # No longer requires gradients in the parameters + for p in best_params.values(): + p.requires_grad = False + p.grad = None + + if do_report: + print(f"Best loss: {best_loss}") + return best_params, acc_loss + else: + return best_params + + +def quantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + target_dtype: torch.dtype = torch.int8 +) -> torch.Tensor: + quant_min, quant_max = get_min_max_from_bits_signed(bits) + x = x.transpose(0, 1) # Aligns shapes + x = func(x=x, **params) + x = x.transpose(0, 1) + x = torch.clamp(round_func_BPDA(x), quant_min, quant_max).to(target_dtype) + return x + + +def dequantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + out_dtype: torch.dtype +) -> torch.Tensor: + x = x.to(dtype=out_dtype) + x = x.transpose(0, 1) + x = func(x=x, **params) + x = x.transpose(0, 1) + return x + + +def round_func_BPDA(input): + # This is equivalent to replacing round function (non-differentiable) with + # an identity function (differentiable) only when backward. 
+ forward_value = torch.round(input) + out = input.clone() + out.data = forward_value.data + return out + + +def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]: + return -2 ** (bit_width - 1), 2 ** (bit_width - 1) - 1 + + + +############## Numpy ############### + +def np_domain_guard( + x: np.ndarray, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> np.ndarray: + """Guard a tensor to a valid domain.""" + x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = np.clip(x, min, max) + return x + + +def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray: + """Replace a number in a tensor with another number. + + Args: + x (np.ndarray): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + np.ndarray: The tensor with the number replaced. + """ + return np.where(x == num, to, x) + + +def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray: + """Guard the power operation to a valid domain.""" + return np.power(x, exp) if exp >= 1 else np.power(np.maximum(x, 0), exp) + diff --git a/fn_gen/nlr_t_cos/7/loss.png b/fn_gen/nlr_t_cos/7/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..f178c5975550d4cdb0571704d62762d73bbc2f0e Binary files /dev/null and b/fn_gen/nlr_t_cos/7/loss.png differ diff --git a/fn_gen/nlr_t_cos/7/quantization.png b/fn_gen/nlr_t_cos/7/quantization.png new file mode 100644 index 0000000000000000000000000000000000000000..38396fbaebd0a4a07100bc167c4a88d6e526a1d0 Binary files /dev/null and b/fn_gen/nlr_t_cos/7/quantization.png differ diff --git a/fn_gen/nlr_t_cos/8/distortion.png b/fn_gen/nlr_t_cos/8/distortion.png new file mode 100644 index 0000000000000000000000000000000000000000..ed57391b57587c10e611889560cab9a943c5d2cf Binary files /dev/null and b/fn_gen/nlr_t_cos/8/distortion.png differ diff --git 
a/fn_gen/nlr_t_cos/8/expressions.txt b/fn_gen/nlr_t_cos/8/expressions.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecd6e238827dcdb95f4bcb390c1c300696f34254 --- /dev/null +++ b/fn_gen/nlr_t_cos/8/expressions.txt @@ -0,0 +1,2 @@ +sin(_0*x)/_s +asin(_s*x)/_0 \ No newline at end of file diff --git a/fn_gen/nlr_t_cos/8/fn.py b/fn_gen/nlr_t_cos/8/fn.py new file mode 100644 index 0000000000000000000000000000000000000000..20cd63a2d37aed23b21c4f3d59a3e4991e6401ca --- /dev/null +++ b/fn_gen/nlr_t_cos/8/fn.py @@ -0,0 +1,481 @@ +from __future__ import annotations + +import torch +from torch import amin # Necessary for arcsin +import copy +import torch.nn as nn +import numpy as np + +from scipy.optimize import curve_fit +from typing import Dict, Any, Tuple, List, Callable + + +def quantization(x, **params): + return (torch.div(1, replace_num(params['_s'], num=0, to=10000)) * torch.sin((params['_0'] * x))) + + +def dequantization(x, **params): + return (torch.div(1, replace_num(params['_0'], num=0, to=10000)) * torch.asin(domain_guard((params['_s'] * x), min=-0.99999, max=0.99999, nan=0))) + + +def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]: + base_p0 = { + '_0': init_space_search(x, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], param='_0', **kwargs), + } + + base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs) + if 'post_init_hook' in kwargs: + kwargs['post_init_hook'](parameters=base_p0) + + params = init_non_linear_regression_fit(x, p0=base_p0, np_fit_func=fit_func, qtz_func=quantization, deqtz_func=dequantization, params_list=['_0', '_s'], **kwargs) + params = {k: nn.Parameter(v, requires_grad=False) for k, v in params.items()} + if 'post_method_hook' in kwargs: + kwargs['post_method_hook'](parameters=params) + + params = learn_parameters(x, params, + qtz_func=quantization, + deqtz_func=dequantization, + bits=kwargs['bits'], + 
target_dtype=torch.int8, + epochs=500, + early_stop=False, + ) + if 'post_train_hook' in kwargs: + kwargs['post_train_hook'](parameters=params) + + return params + + +############### Numpy Qtz ############### + + +def np_quantization(x, _0, _s): + return (np.divide(1, np_replace_num(_s, num=0, to=10000)) * np.sin((_0 * x))) + + +def np_dequantization(x, _0, _s): + return (np.divide(1, np_replace_num(_0, num=0, to=10000)) * np.arcsin(np_domain_guard((_s * x), min=-0.99999, max=0.99999, nan=0))) + + +def fit_func(x, _0, _s): + x_ = np_quantization(x, _0, _s) + x_ = np_dequantization(x_, _0, _s) + return x_ + + + +############### HELPERS ############### + +def domain_guard( + x: torch.Tensor, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> torch.Tensor: + """Guard a tensor to a valid domain.""" + x = torch.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = torch.clamp(x, min=min, max=max) + return x + + +def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor: + """Replace a number in a tensor with another number. + + Args: + x (torch.Tensor): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + torch.Tensor: The tensor with the number replaced. 
+ """ + return torch.where(x == num, to, x) + + +def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor: + """Guard the power operation to a valid domain.""" + return torch.pow(x, exp) if exp >= 1 else torch.pow(torch.relu(x), exp) + + +def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.ones_like(val, dtype=torch.float32, device=x.device) + + +def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.randn_like(val, dtype=torch.float32, device=x.device) + + +def init_space_search( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int): + """Generates the initial set of parameters. The first iteration generates 10 times more parameters.""" + for _ in range(n_params * 10): # The first iteration generates 10 times more parameters + yield init_rand(tensor) * max_initial # Generates n_params in range [-max_initial, max_initial] + + def _search_param(tensors: List[torch.tensor], n_params): + """Takes the best parameters and generates new parameters around the mean of the best parameters.""" + torch_tensors = torch.stack(tensors) + min_vals, max_vals = torch.aminmax(torch_tensors, dim=0) + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + mean = torch.mean(torch_tensors, dim=0) + for _ in range(n_params): # Generates n_params around the mean of the tensors + yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean + + def _calc(x, qtz_func, deqtz_func, **params): + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params) + x_ = deqtz_func(x=x_, **params) + x_ = x_.transpose(0, 1) + return x_ + + assert "qtz_func" in kwargs, "qtz_func must be provided." + assert "deqtz_func" in kwargs, "deqtz_func must be provided." + assert "params_list" in kwargs, "params list must be provided." + assert "param" in kwargs, "param must be provided." 
+ + qtz_func = kwargs.get('qtz_func') + deqtz_func = kwargs.get('deqtz_func') + params_list = kwargs.get('params_list') + param = kwargs.get('param') + + n_runs = 50 # Number of runs to try to find the best parameters + n_random_params = 50 # Number of random parameters to generate + n_best_to_pick = 5 # Number of best parameters to pick after each run + max_initial = 10000 # Maximum value to initialize the parameters + + # Initializes the parameters + base_params = { p: init_ones(x, **kwargs) for p in params_list if p != param } + params = _build_initial_param(x, max_initial, n_random_params) + + # Performs the search + for _ in range(n_runs): + + best_params = [] + for param_ in params: + try: + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: param_}) + loss_ones = nn.MSELoss()(x, x_) + + if len(best_params) < n_best_to_pick: + best_params.append((param_, loss_ones.item())) + best_params = sorted(best_params, key=lambda x: x[1]) + elif loss_ones < best_params[-1][1]: + best_params[-1] = (param_, loss_ones.item()) + best_params = sorted(best_params, key=lambda x: x[1]) + + except Exception: # The parameters might not be valid for the function's domain + continue + + # Generates new parameters around the mean + params = _search_param([p for p, _ in best_params], n_random_params) + + # Checks if the best parameter is better than the init_ones + p_ones = init_ones(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_ones}) + loss_ones = nn.MSELoss()(x, x_) + + # Checks if the best parameter is better than the init_rand + p_rand = init_rand(x, **kwargs) + x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_rand}) + loss_rand = nn.MSELoss()(x, x_) + + if loss_rand < best_params[0][1] and loss_rand < loss_ones: + return p_rand + elif loss_ones < best_params[0][1] and loss_ones < loss_rand: + return p_ones + else: + return best_params[0][0] + + +def init_linear_scale( # Symmetric scale. 
From the study folder + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + assert "bits" in kwargs, "bits must be provided." + assert "params" in kwargs, "params must be provided." + assert "qtz_func" in kwargs, "qtz_func must be provided." + + bits = kwargs.get('bits') + params = kwargs.get('params') + qtz_func = kwargs.get('qtz_func') + + x_ = x.transpose(0, 1) + x_ = qtz_func(x=x_, **params, _s=init_ones(x, **kwargs)) + x_ = x_.transpose(0, 1) + + quant_min, quant_max = get_min_max_from_bits_signed(bits) + min_vals, max_vals = torch.aminmax(x_, dim=1) + min_vals = torch.min(min_vals, torch.zeros_like(min_vals)) + max_vals = torch.max(max_vals, torch.zeros_like(max_vals)) + + eps = torch.finfo(torch.float32).eps + + abs_max_val_per_ch = torch.max(-min_vals, max_vals) + scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2) + + scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device) + + # Introduces some noise in scale + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything + # If I don't introduce noise, the accuracy is going to be 0.0 and not learn anything. + # NOTE(diogo): This has been disproven. The noise does not help the learning process but I still + # left it here for future reference. Will be removed later. + # scale = scale + 0.01 * torch.randn_like(scale) + + return scale + + +def init_non_linear_regression_fit( + x: torch.Tensor, + **kwargs: Dict[str, Any], + ) -> torch.Tensor: + + assert "params_list" in kwargs, "params list must be provided." + assert "np_fit_func" in kwargs, "np_fit_func must be provided." + assert "p0" in kwargs, "p0 must be provided." + np_fit_func = kwargs.get('np_fit_func') + params_list = kwargs.get('params_list') + p0 = kwargs.get('p0') + + def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]): + popt, _ = curve_fit( + func, + xdata, + ydata, + maxfev=1000, + p0=p0, + method='lm' + ) + return popt + + # 1. 
Needs to convert the torch tensor to numpy tensor + xdata = x.cpu().numpy() + + # 2. Sorts the data so that it makes it easier to fit to it + sorted_xdata = np.sort(xdata, axis=-1) + + p0 = {k: v.cpu().numpy() for k, v in p0.items()} + params_list = sorted(params_list) # We need to make sure that it matches the numpy fit func arg order + + # 3. Finds the best parameters for each channel + try: + params = [] + for i in range(sorted_xdata.shape[0]): + xdata_ = sorted_xdata[i] + p0_ = [p0[p][i] for p in params_list] + ch_params = _fit(xdata_, xdata_, np_fit_func, p0_) + params.append(ch_params) + + # 4. Builds the parameters + result = {} + for i, p in enumerate(params_list): + result[p] = torch.tensor([p_[i] for p_ in params], dtype=torch.float32).to(x.device) + + return result + + except ValueError as e: + print(f"Could not fit the function with error: {e}") + print(f"Using fallback result...") + return { + k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items() + } + + +def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor: + val = torch.amin(x, dim=1) + return torch.zeros_like(val, dtype=torch.float32, device=x.device) + + +def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor: + # Calculate the original minimum and maximum values + min_vals, max_vals = torch.aminmax(tensor, dim=-1) + x_min = torch.min(min_vals, torch.zeros_like(min_vals)) + x_max = torch.max(max_vals, torch.zeros_like(max_vals)) + + if _max is torch.inf: # We do not need to scale the tensor. 
Just need to move it + return torch.ones_like(x_min) + + # Calculate the scale factor + scale = (_max - _min) / (x_max - x_min) + return scale + + + +############## Quant ############### + +@torch.enable_grad() +def learn_parameters( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + qtz_func: nn.Module, + deqtz_func: nn.Module, + bits: int, + target_dtype: torch.dtype, + epochs: int = 1000, + early_stop: bool = True, + do_report: bool = False +) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]: + loss_fn = nn.MSELoss() + + # Determines the initial learning rate by computing the initial loss and multiplying it by + # the order of magnitude of the loss divided by 2 + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + base_lr = 0.1 + exponent = int(np.floor(np.log10(loss.item()))) + lr = base_lr * (10 ** (exponent // 2)) + + # Requires gradients in the parameters + for p in params.values(): + p.requires_grad = True + p.grad = None + + param_keys = list(params.keys()) + param_values = list(params.values()) + + # Defines optimizer and loss function + optimizer = torch.optim.Adam(param_values, lr=lr) + scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=epochs // 10, T_mult=1, eta_min=lr * 0.1, last_epoch=-1) + + # Contains the best loss and the best parameters + best_loss = float("inf") + best_params = None + + # Used to stop the search early + min_delta = 1e-7 + acc_loss = [] + percent_epochs_before_stop = 0.1 + + for i in range(epochs): + optimizer.zero_grad() + + quant = quantize(x, params, qtz_func, bits, target_dtype) + dequant = dequantize(quant, params, deqtz_func, bits, x.dtype) + loss = loss_fn(x, dequant) + + if loss.isnan() or loss.isinf(): + raise Exception("Loss is NaN or Inf. 
Stopping the search.") + + loss.backward() + optimizer.step() + scheduler.step() + + acc_loss.append(loss.item()) + + # Reports loss every 10 steps + if i % 10 == 0 and do_report: + print(f"Epoch {i}: Loss {loss.item()}") + + # Optimizes the parameter search by storing the best loss and the parameters + if loss.item() < best_loss: + best_loss = loss.item() + best_params = copy.deepcopy({ + k: v for k, v in params.items() if k in param_keys + }) + + # We also stop the search if the loss has not considerably during the last 10% epochs + if early_stop: + epochs_before_stop = int(epochs * percent_epochs_before_stop) + if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta: + break + + # No longer requires gradients in the parameters + for p in best_params.values(): + p.requires_grad = False + p.grad = None + + if do_report: + print(f"Best loss: {best_loss}") + return best_params, acc_loss + else: + return best_params + + +def quantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + target_dtype: torch.dtype = torch.int8 +) -> torch.Tensor: + quant_min, quant_max = get_min_max_from_bits_signed(bits) + x = x.transpose(0, 1) # Aligns shapes + x = func(x=x, **params) + x = x.transpose(0, 1) + x = torch.clamp(round_func_BPDA(x), quant_min, quant_max).to(target_dtype) + return x + + +def dequantize( + x: torch.Tensor, + params: Dict[str, nn.Parameter], + func: nn.Module, + bits: int, + out_dtype: torch.dtype +) -> torch.Tensor: + x = x.to(dtype=out_dtype) + x = x.transpose(0, 1) + x = func(x=x, **params) + x = x.transpose(0, 1) + return x + + +def round_func_BPDA(input): + # This is equivalent to replacing round function (non-differentiable) with + # an identity function (differentiable) only when backward. 
+ forward_value = torch.round(input) + out = input.clone() + out.data = forward_value.data + return out + + +def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]: + return -2 ** (bit_width - 1), 2 ** (bit_width - 1) - 1 + + + +############## Numpy ############### + +def np_domain_guard( + x: np.ndarray, + min: float = None, + max: float = None, + posinf: float = None, + neginf: float = None, + nan: float = None + ) -> np.ndarray: + """Guard a tensor to a valid domain.""" + x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan) + if min is not None or max is not None: + x = np.clip(x, min, max) + return x + + +def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray: + """Replace a number in a tensor with another number. + + Args: + x (np.ndarray): The input tensor. + num (float): The number to replace. + to (float): The number to replace with. + + Returns: + np.ndarray: The tensor with the number replaced. + """ + return np.where(x == num, to, x) + + +def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray: + """Guard the power operation to a valid domain.""" + return np.power(x, exp) if exp >= 1 else np.power(np.maximum(x, 0), exp) + diff --git a/fn_gen/nlr_t_cos/8/loss.png b/fn_gen/nlr_t_cos/8/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..55522f0ff34ea3543690c0c906e4e72ca6aea378 Binary files /dev/null and b/fn_gen/nlr_t_cos/8/loss.png differ diff --git a/fn_gen/nlr_t_cos/8/quantization.png b/fn_gen/nlr_t_cos/8/quantization.png new file mode 100644 index 0000000000000000000000000000000000000000..a20d5340aa3b4d7de610efe447264cfe85ed6187 Binary files /dev/null and b/fn_gen/nlr_t_cos/8/quantization.png differ diff --git a/fn_gen/nlr_t_cos/9/distortion.png b/fn_gen/nlr_t_cos/9/distortion.png new file mode 100644 index 0000000000000000000000000000000000000000..3d5678a0f73d0731216b1ccc57428d582b8207f1 Binary files /dev/null and b/fn_gen/nlr_t_cos/9/distortion.png differ diff --git 
from __future__ import annotations

# Learned non-linear quantizer generated for the expression pair
#
#     quantization:   exp(_0 * x) / _s
#     dequantization: log(_s * x) / _0
#
# Input tensors are treated as 2-D with one row per channel; every parameter
# holds one value per row.  `init_params` is the entry point: it initialises
# `_0` with a random space search and `_s` with a symmetric linear scale,
# refines both with a per-channel non-linear least-squares fit, and finally
# tunes them by gradient descent on the reconstruction error.

import torch
from torch import amin  # Necessary for arcsin
import copy
import torch.nn as nn
import numpy as np

from scipy.optimize import curve_fit
from typing import Dict, Any, Tuple, List, Callable, Iterator, Optional, Union


def quantization(x, **params):
    """Map ``x`` to the quantized domain: ``exp(_0 * x) / _s``.

    A scale ``_s`` that is exactly zero is replaced by 10000 before the
    reciprocal is taken, so the division never produces ``inf``.
    """
    inv_scale = torch.div(1, replace_num(params['_s'], num=0, to=10000))
    return inv_scale * torch.exp(params['_0'] * x)


def dequantization(x, **params):
    """Map quantized values back: ``log(_s * x) / _0``.

    The argument of the logarithm is clamped to at least ``1e-5`` (NaN is
    mapped to ``1e-5`` as well) and a zero ``_0`` is replaced by 10000.
    """
    inv_0 = torch.div(1, replace_num(params['_0'], num=0, to=10000))
    log_arg = domain_guard(params['_s'] * x, min=1e-5, nan=1e-5)
    return inv_0 * torch.log(log_arg)


def _int_dtype_for_bits(bits: int) -> torch.dtype:
    """Return the smallest signed integer dtype that can hold ``bits`` bits."""
    if bits <= 8:
        return torch.int8
    if bits <= 16:
        return torch.int16
    if bits <= 32:
        return torch.int32
    return torch.int64


def init_params(x: torch.Tensor, **kwargs: Any) -> Dict[str, nn.Parameter]:
    """Initialise and learn the quantizer parameters ``_0`` and ``_s`` for ``x``.

    Args:
        x: The tensor to quantize (one row per channel).
        **kwargs: Must contain ``bits``.  May contain the callbacks
            ``post_init_hook``, ``post_method_hook`` and ``post_train_hook``;
            each is called with ``parameters=<current dict>`` after the
            corresponding stage.

    Returns:
        The learned parameters, with gradients disabled.
    """
    base_p0 = {
        '_0': init_space_search(
            x,
            qtz_func=quantization,
            deqtz_func=dequantization,
            params_list=['_0', '_s'],
            param='_0',
            **kwargs,
        ),
    }

    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    if 'post_init_hook' in kwargs:
        kwargs['post_init_hook'](parameters=base_p0)

    params = init_non_linear_regression_fit(
        x,
        p0=base_p0,
        np_fit_func=fit_func,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_s'],
        **kwargs,
    )
    params = {k: nn.Parameter(v, requires_grad=False) for k, v in params.items()}
    if 'post_method_hook' in kwargs:
        kwargs['post_method_hook'](parameters=params)

    params = learn_parameters(
        x,
        params,
        qtz_func=quantization,
        deqtz_func=dequantization,
        bits=kwargs['bits'],
        # int8 for bits <= 8 (as before); a wider dtype avoids integer
        # overflow when more than 8 bits are requested.
        target_dtype=_int_dtype_for_bits(kwargs['bits']),
        epochs=500,
        early_stop=False,
    )
    if 'post_train_hook' in kwargs:
        kwargs['post_train_hook'](parameters=params)

    return params


############### Numpy Qtz ###############


def np_quantization(x, _0, _s):
    """NumPy counterpart of `quantization`: ``exp(_0 * x) / _s``."""
    inv_scale = np.divide(1, np_replace_num(_s, num=0, to=10000))
    return inv_scale * np.exp(_0 * x)


def np_dequantization(x, _0, _s):
    """NumPy counterpart of `dequantization`: ``log(_s * x) / _0``."""
    inv_0 = np.divide(1, np_replace_num(_0, num=0, to=10000))
    log_arg = np_domain_guard(_s * x, min=1e-5, nan=1e-5)
    return inv_0 * np.log(log_arg)


def fit_func(x, _0, _s):
    """Quantize then dequantize ``x`` (no rounding); the model given to `curve_fit`."""
    x_ = np_quantization(x, _0, _s)
    x_ = np_dequantization(x_, _0, _s)
    return x_


############### HELPERS ###############

def domain_guard(
        x: torch.Tensor,
        min: float = None,
        max: float = None,
        posinf: float = None,
        neginf: float = None,
        nan: float = None
    ) -> torch.Tensor:
    """Guard a tensor to a valid domain.

    NaN and +/-inf are replaced first (`torch.nan_to_num`), then the result is
    clamped to ``[min, max]`` when at least one bound is given.
    """
    x = torch.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan)
    if min is not None or max is not None:
        x = torch.clamp(x, min=min, max=max)
    return x


def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Replace a number in a tensor with another number.

    Args:
        x (torch.Tensor): The input tensor.
        num (float): The number to replace.
        to (float): The number to replace with.

    Returns:
        torch.Tensor: The tensor with the number replaced.
    """
    return torch.where(x == num, to, x)


def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Guard the power operation to a valid domain.

    For exponents below 1 negative bases are clamped to zero first.
    """
    return torch.pow(x, exp) if exp >= 1 else torch.pow(torch.relu(x), exp)


def init_ones(x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
    """Return a float32 tensor of ones with one entry per row of ``x``."""
    val = torch.amin(x, dim=1)  # only used for its shape
    return torch.ones_like(val, dtype=torch.float32, device=x.device)


def init_rand(x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
    """Return a float32 standard-normal tensor with one entry per row of ``x``."""
    val = torch.amin(x, dim=1)  # only used for its shape
    return torch.randn_like(val, dtype=torch.float32, device=x.device)


def init_space_search(
        x: torch.Tensor,
        **kwargs: Any,
    ) -> torch.Tensor:
    """Random search for a good initial value of a single parameter.

    All other parameters are fixed to ones.  Candidates are scored by the MSE
    between ``x`` and ``deqtz_func(qtz_func(x))``; each run keeps the best few
    candidates and samples the next population around their mean.

    Args:
        x: The tensor to quantize (one row per channel).
        **kwargs: Requires ``qtz_func``, ``deqtz_func``, ``params_list`` (all
            parameter names) and ``param`` (the name being searched).

    Returns:
        The candidate with the lowest finite loss among everything evaluated,
        including the all-ones and a fresh random baseline.  Falls back to
        ones if no candidate produced a finite loss.
    """

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int) -> Iterator[torch.Tensor]:
        """Generates the initial set of parameters (10x more than later runs)."""
        for _ in range(n_params * 10):
            # Normally distributed with standard deviation `max_initial`.
            yield init_rand(tensor) * max_initial

    def _search_param(tensors: List[torch.Tensor], n_params: int) -> Iterator[torch.Tensor]:
        """Generates new parameters around the mean of the best parameters."""
        torch_tensors = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(torch_tensors, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(torch_tensors, dim=0)
        for _ in range(n_params):
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        x_ = x_.transpose(0, 1)
        return x_

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50            # Number of runs to try to find the best parameters
    n_random_params = 50   # Number of random parameters to generate
    n_best_to_pick = 5     # Number of best parameters to pick after each run
    max_initial = 10000    # Spread of the initial random parameters

    # Every parameter except the searched one is fixed to ones.
    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}
    loss_fn = nn.MSELoss()

    def _loss_of(candidate: torch.Tensor) -> float:
        x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: candidate})
        return loss_fn(x, x_).item()

    candidates = _build_initial_param(x, max_initial, n_random_params)
    survivors: List[Tuple[torch.Tensor, float]] = []
    overall_best: Optional[Tuple[torch.Tensor, float]] = None

    for _ in range(n_runs):
        run_best: List[Tuple[torch.Tensor, float]] = []
        for candidate in candidates:
            try:
                loss = _loss_of(candidate)
            except Exception:  # The parameters might not be valid for the function's domain
                continue
            if not np.isfinite(loss):
                # NaN/inf losses cannot be ordered meaningfully; drop them.
                continue

            if len(run_best) < n_best_to_pick:
                run_best.append((candidate, loss))
                run_best.sort(key=lambda entry: entry[1])
            elif loss < run_best[-1][1]:
                run_best[-1] = (candidate, loss)
                run_best.sort(key=lambda entry: entry[1])

        if run_best:
            survivors = run_best
            # Remember the best candidate ever seen: later runs resample
            # around the mean and are not guaranteed to improve.
            if overall_best is None or run_best[0][1] < overall_best[1]:
                overall_best = run_best[0]

        if survivors:
            candidates = _search_param([p for p, _ in survivors], n_random_params)
        else:
            # Nothing valid so far: draw a fresh random population.
            candidates = _build_initial_param(x, max_initial, n_random_params)

    # Compare the search result against the all-ones and a random baseline.
    # Earlier entries win ties.
    contenders: List[Tuple[float, torch.Tensor]] = []
    if overall_best is not None:
        contenders.append((overall_best[1], overall_best[0]))

    p_ones = init_ones(x, **kwargs)
    p_rand = init_rand(x, **kwargs)
    for baseline in (p_ones, p_rand):
        baseline_loss = _loss_of(baseline)
        if np.isfinite(baseline_loss):
            contenders.append((baseline_loss, baseline))

    if not contenders:
        return p_ones
    return min(contenders, key=lambda entry: entry[0])[1]


def init_linear_scale(  # Symmetric scale. From the study folder
        x: torch.Tensor,
        **kwargs: Any,
    ) -> torch.Tensor:
    """Compute a symmetric per-channel scale for the quantized values.

    ``x`` is passed through ``qtz_func`` with ``_s`` forced to ones, and the
    largest absolute output per row is divided by half the integer range.

    Args:
        x: The tensor to quantize (one row per channel).
        **kwargs: Requires ``bits``, ``params`` (the other quantizer
            parameters) and ``qtz_func``.

    Returns:
        A float32 tensor with one scale per row, never smaller than the
        float32 machine epsilon.
    """
    assert "bits" in kwargs, "bits must be provided."
    assert "params" in kwargs, "params must be provided."
    assert "qtz_func" in kwargs, "qtz_func must be provided."

    bits = kwargs.get('bits')
    params = kwargs.get('params')
    qtz_func = kwargs.get('qtz_func')

    # Merge instead of passing `_s` as a second keyword so that a `params`
    # dict which already holds `_s` does not raise a duplicate-keyword error.
    unscaled_params = {**params, '_s': init_ones(x, **kwargs)}

    x_ = x.transpose(0, 1)
    x_ = qtz_func(x=x_, **unscaled_params)
    x_ = x_.transpose(0, 1)

    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    min_vals, max_vals = torch.aminmax(x_, dim=1)
    # The range always includes zero.
    min_vals = torch.min(min_vals, torch.zeros_like(min_vals))
    max_vals = torch.max(max_vals, torch.zeros_like(max_vals))

    eps = torch.finfo(torch.float32).eps

    abs_max_val_per_ch = torch.max(-min_vals, max_vals)
    scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2)

    scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device)

    # NOTE(diogo): Adding noise to the scale was tried and does not help the
    # learning process, so no noise is introduced here.

    return scale


def init_non_linear_regression_fit(
        x: torch.Tensor,
        **kwargs: Any,
    ) -> Dict[str, torch.Tensor]:
    """Refine the initial parameters with a per-channel least-squares fit.

    For every row of ``x`` the sorted values are fitted onto themselves through
    ``np_fit_func`` (quantize followed by dequantize), starting from ``p0``.

    Args:
        x: The tensor to quantize (one row per channel).
        **kwargs: Requires ``params_list`` (parameter names), ``np_fit_func``
            (callable ``f(x, *params)`` whose positional parameters follow the
            sorted order of ``params_list``) and ``p0`` (dict of initial
            per-channel tensors).

    Returns:
        A dict of float32 tensors on ``x.device``.  If the fit fails for any
        channel the initial values ``p0`` are returned unchanged.
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs.get('np_fit_func')
    params_list = kwargs.get('params_list')
    p0 = kwargs.get('p0')

    def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]):
        popt, _ = curve_fit(
            func,
            xdata,
            ydata,
            maxfev=1000,
            p0=p0,
            method='lm'
        )
        return popt

    # 1. Needs to convert the torch tensor to numpy tensor
    xdata = x.cpu().numpy()

    # 2. Sorts the data so that it makes it easier to fit to it
    sorted_xdata = np.sort(xdata, axis=-1)

    p0 = {k: v.cpu().numpy() for k, v in p0.items()}
    params_list = sorted(params_list)  # We need to make sure that it matches the numpy fit func arg order

    # 3. Finds the best parameters for each channel
    try:
        params = []
        for i, xdata_ in enumerate(sorted_xdata):
            p0_ = [p0[p][i] for p in params_list]
            params.append(_fit(xdata_, xdata_, np_fit_func, p0_))

        # 4. Builds the parameters
        return {
            p: torch.tensor([ch[i] for ch in params], dtype=torch.float32).to(x.device)
            for i, p in enumerate(params_list)
        }

    # curve_fit raises ValueError for invalid data and RuntimeError when the
    # least-squares minimisation does not converge (e.g. maxfev is reached).
    except (ValueError, RuntimeError) as e:
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }


def init_zeros(x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
    """Return a float32 tensor of zeros with one entry per row of ``x``."""
    val = torch.amin(x, dim=1)  # only used for its shape
    return torch.zeros_like(val, dtype=torch.float32, device=x.device)


def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Scale that maps the value range of ``tensor`` onto ``[_min, _max]``.

    The observed range is taken along the last dimension and always includes
    zero.  When either bound is infinite there is no finite target range and
    a tensor of ones is returned.
    """
    # Calculate the original minimum and maximum values
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    # Value check rather than `is torch.inf`: an identity test misses
    # infinities created elsewhere, such as float('inf').
    if np.isinf(_min) or np.isinf(_max):  # We do not need to scale the tensor
        return torch.ones_like(x_min)

    # Calculate the scale factor
    return (_max - _min) / (x_max - x_min)


############## Quant ###############

def _initial_learning_rate(initial_loss: float, base_lr: float = 0.1) -> float:
    """Derive the learning rate from the order of magnitude of the initial loss.

    Returns ``base_lr * 10 ** (floor(log10(loss)) // 2)``.  A zero or
    non-finite loss has no order of magnitude, so ``base_lr`` is used.
    """
    if not np.isfinite(initial_loss) or initial_loss <= 0:
        return base_lr
    exponent = int(np.floor(np.log10(initial_loss)))
    return base_lr * (10 ** (exponent // 2))


@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: Callable,
    deqtz_func: Callable,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
) -> Union[Dict[str, nn.Parameter], Tuple[Dict[str, nn.Parameter], List[float]]]:
    """Tune the quantizer parameters by minimising the reconstruction MSE.

    Args:
        x: The tensor to quantize (one row per channel).
        params: Parameters to optimise.  They are switched to
            ``requires_grad=True`` and updated in place.
        qtz_func: Quantization function ``f(x=..., **params)``.
        deqtz_func: Dequantization function ``f(x=..., **params)``.
        bits: Bit width of the signed integer range.
        target_dtype: Integer dtype of the quantized tensor.
        epochs: Maximum number of optimisation steps.
        early_stop: Stop once the loss changed by less than ``1e-7`` over
            the last 10% of ``epochs``.
        do_report: Print the loss every 10 epochs and also return the
            per-epoch loss history.

    Returns:
        A detached copy of the parameters that produced the lowest loss, or
        ``(parameters, loss_history)`` when ``do_report`` is true.

    Raises:
        Exception: If the loss becomes NaN or infinite during training.
    """
    loss_fn = nn.MSELoss()

    # The initial learning rate follows the order of magnitude of the initial loss.
    quant = quantize(x, params, qtz_func, bits, target_dtype)
    dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
    lr = _initial_learning_rate(loss_fn(x, dequant).item())

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    # Defines optimizer and scheduler.  T_0 must be a positive integer, so it
    # is clamped for runs shorter than 10 epochs.
    optimizer = torch.optim.Adam(list(params.values()), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=max(1, epochs // 10), T_mult=1, eta_min=lr * 0.1, last_epoch=-1
    )

    # Best loss seen so far and the parameters that produced it.  Starting
    # from a copy of the input keeps the result valid when epochs == 0.
    best_loss = float("inf")
    best_params = copy.deepcopy(dict(params))

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss: List[float] = []
    percent_epochs_before_stop = 0.1
    # At least one epoch, otherwise a loss would be compared with itself.
    epochs_before_stop = max(1, int(epochs * percent_epochs_before_stop))

    for i in range(epochs):
        optimizer.zero_grad()

        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        loss = loss_fn(x, dequant)

        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")

        loss_value = loss.item()

        # Snapshot BEFORE the optimizer step so that the stored parameters
        # are the ones that actually produced `loss_value`.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = copy.deepcopy(dict(params))

        loss.backward()
        optimizer.step()
        scheduler.step()

        acc_loss.append(loss_value)

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {loss_value}")

        # Stop if the loss has not changed considerably during the last 10% epochs
        if (
            early_stop
            and i > epochs_before_stop
            and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta
        ):
            break

    # No longer requires gradients in the parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        print(f"Best loss: {best_loss}")
        return best_params, acc_loss
    return best_params


def quantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: Callable,
    bits: int,
    target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    """Apply ``func``, round, clamp to the signed ``bits`` range and cast.

    NOTE(review): the final cast to an integer dtype is not differentiable,
    so gradients presumably reach the parameters only through the
    dequantization step -- confirm this is intended.
    """
    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    x = x.transpose(0, 1)  # Aligns shapes
    x = func(x=x, **params)
    x = x.transpose(0, 1)
    x = torch.clamp(round_func_BPDA(x), quant_min, quant_max).to(target_dtype)
    return x


def dequantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: Callable,
    bits: int,
    out_dtype: torch.dtype
) -> torch.Tensor:
    """Cast ``x`` to ``out_dtype`` and apply ``func``.  ``bits`` is unused."""
    x = x.to(dtype=out_dtype)
    x = x.transpose(0, 1)  # Aligns shapes
    x = func(x=x, **params)
    x = x.transpose(0, 1)
    return x


def round_func_BPDA(input):
    """Round ``input`` in the forward pass while keeping an identity gradient.

    This is equivalent to replacing the round function (non-differentiable)
    with an identity function (differentiable) only when going backward.
    """
    forward_value = torch.round(input)
    out = input.clone()
    out.data = forward_value.data
    return out


def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    """Return ``(min, max)`` of a signed integer with ``bit_width`` bits."""
    return -2 ** (bit_width - 1), 2 ** (bit_width - 1) - 1


############## Numpy ###############

def np_domain_guard(
        x: np.ndarray,
        min: float = None,
        max: float = None,
        posinf: float = None,
        neginf: float = None,
        nan: float = None
    ) -> np.ndarray:
    """Guard an array to a valid domain (NumPy counterpart of `domain_guard`)."""
    x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan)
    if min is not None or max is not None:
        x = np.clip(x, min, max)
    return x


def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray:
    """Replace a number in a tensor with another number.

    Args:
        x (np.ndarray): The input tensor.
        num (float): The number to replace.
        to (float): The number to replace with.

    Returns:
        np.ndarray: The tensor with the number replaced.
    """
    return np.where(x == num, to, x)


def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Guard the power operation to a valid domain.

    For exponents below 1 negative bases are clamped to zero first.
    """
    return np.power(x, exp) if exp >= 1 else np.power(np.maximum(x, 0), exp)