Upload learned functions
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- fn_gen/norm_nlr/0/__pycache__/fn.cpython-311.pyc +0 -0
- fn_gen/norm_nlr/0/distortion.png +0 -0
- fn_gen/norm_nlr/0/expressions.txt +2 -0
- fn_gen/norm_nlr/0/fn.py +469 -0
- fn_gen/norm_nlr/0/loss.png +0 -0
- fn_gen/norm_nlr/0/quantization.png +0 -0
- fn_gen/norm_nlr/1/__pycache__/fn.cpython-311.pyc +0 -0
- fn_gen/norm_nlr/1/distortion.png +0 -0
- fn_gen/norm_nlr/1/expressions.txt +2 -0
- fn_gen/norm_nlr/1/fn.py +468 -0
- fn_gen/norm_nlr/1/loss.png +0 -0
- fn_gen/norm_nlr/1/quantization.png +0 -0
- fn_gen/norm_nlr/10/__pycache__/fn.cpython-311.pyc +0 -0
- fn_gen/norm_nlr/10/distortion.png +0 -0
- fn_gen/norm_nlr/10/expressions.txt +2 -0
- fn_gen/norm_nlr/10/fn.py +470 -0
- fn_gen/norm_nlr/10/loss.png +0 -0
- fn_gen/norm_nlr/10/quantization.png +0 -0
- fn_gen/norm_nlr/11/__pycache__/fn.cpython-311.pyc +0 -0
- fn_gen/norm_nlr/11/distortion.png +0 -0
- fn_gen/norm_nlr/11/expressions.txt +2 -0
- fn_gen/norm_nlr/11/fn.py +469 -0
- fn_gen/norm_nlr/11/loss.png +0 -0
- fn_gen/norm_nlr/11/quantization.png +0 -0
- fn_gen/norm_nlr/12/__pycache__/fn.cpython-311.pyc +0 -0
- fn_gen/norm_nlr/12/distortion.png +0 -0
- fn_gen/norm_nlr/12/expressions.txt +2 -0
- fn_gen/norm_nlr/12/fn.py +470 -0
- fn_gen/norm_nlr/12/loss.png +0 -0
- fn_gen/norm_nlr/12/quantization.png +0 -0
- fn_gen/norm_nlr/13/__pycache__/fn.cpython-311.pyc +0 -0
- fn_gen/norm_nlr/13/distortion.png +0 -0
- fn_gen/norm_nlr/13/expressions.txt +2 -0
- fn_gen/norm_nlr/13/fn.py +470 -0
- fn_gen/norm_nlr/13/loss.png +0 -0
- fn_gen/norm_nlr/13/quantization.png +0 -0
- fn_gen/norm_nlr/14/__pycache__/fn.cpython-311.pyc +0 -0
- fn_gen/norm_nlr/14/distortion.png +0 -0
- fn_gen/norm_nlr/14/expressions.txt +2 -0
- fn_gen/norm_nlr/14/fn.py +470 -0
- fn_gen/norm_nlr/14/loss.png +0 -0
- fn_gen/norm_nlr/14/quantization.png +0 -0
- fn_gen/norm_nlr/15/__pycache__/fn.cpython-311.pyc +0 -0
- fn_gen/norm_nlr/15/distortion.png +0 -0
- fn_gen/norm_nlr/15/expressions.txt +2 -0
- fn_gen/norm_nlr/15/fn.py +469 -0
- fn_gen/norm_nlr/15/loss.png +0 -0
- fn_gen/norm_nlr/15/quantization.png +0 -0
- fn_gen/norm_nlr/16/__pycache__/fn.cpython-311.pyc +0 -0
- fn_gen/norm_nlr/16/distortion.png +0 -0
fn_gen/norm_nlr/0/__pycache__/fn.cpython-311.pyc
ADDED
Binary file (24.7 kB). View file
|
|
fn_gen/norm_nlr/0/distortion.png
ADDED
fn_gen/norm_nlr/0/expressions.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
cos(_0*x)/_s
|
2 |
+
acos(_s*x)/_0
|
fn_gen/norm_nlr/0/fn.py
ADDED
@@ -0,0 +1,469 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import amin # Necessary for arcsin
|
5 |
+
import copy
|
6 |
+
import torch.nn as nn
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
from scipy.optimize import curve_fit
|
10 |
+
from typing import Dict, Any, Tuple, List, Callable
|
11 |
+
|
12 |
+
|
13 |
+
def quantization(x, **params):
    """Apply the learned forward mapping ``cos(_0 * x) / _s``.

    ``params`` must contain ``'_0'`` and ``'_s'``. Entries of ``_s`` equal
    to 0 are replaced by 10000 before the reciprocal is taken.
    """
    safe_scale = replace_num(params['_s'], num=0, to=10000)
    inv_scale = torch.div(1, safe_scale)
    return inv_scale * torch.cos(params['_0'] * x)
|
15 |
+
|
16 |
+
|
17 |
+
def dequantization(x, **params):
    """Apply the learned inverse mapping ``acos(_s * x) / _0``.

    ``params`` must contain ``'_0'`` and ``'_s'``. The ``acos`` argument is
    clamped to [-0.99999, 0.99999] with NaNs mapped to 0, and entries of
    ``_0`` equal to 0 are replaced by 10000 before the reciprocal is taken.
    """
    safe_freq = replace_num(params['_0'], num=0, to=10000)
    inv_freq = torch.div(1, safe_freq)
    acos_arg = domain_guard(params['_s'] * x, min=-0.99999, max=0.99999, nan=0)
    return inv_freq * torch.acos(acos_arg)
|
19 |
+
|
20 |
+
|
21 |
+
def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]:
    """Build the initial ``'_0'`` and ``'_s'`` parameters for ``x``.

    ``'_0'`` comes from ``init_space_search``, ``'_s'`` from
    ``init_linear_scale``; both are then refined together by
    ``init_non_linear_regression_fit`` and wrapped as frozen
    ``nn.Parameter`` objects.

    Optional callables in ``kwargs`` (``post_init_hook``,
    ``post_method_hook``, ``post_train_hook``) are each invoked with a
    ``parameters=`` keyword at the matching stage. All ``kwargs`` are
    forwarded to the helper initialisers.
    """

    def _fire(hook_name, parameters):
        # Hooks are optional; call only the ones the caller supplied.
        if hook_name in kwargs:
            kwargs[hook_name](parameters=parameters)

    base_p0 = {}
    base_p0['_0'] = init_space_search(
        x,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_s'],
        param='_0',
        **kwargs,
    )
    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    _fire('post_init_hook', base_p0)

    fitted = init_non_linear_regression_fit(
        x,
        p0=base_p0,
        np_fit_func=fit_func,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_s'],
        **kwargs,
    )
    params = {name: nn.Parameter(value, requires_grad=False) for name, value in fitted.items()}
    _fire('post_method_hook', params)
    _fire('post_train_hook', params)

    return params
|
40 |
+
|
41 |
+
|
42 |
+
############### Numpy Qtz ###############
|
43 |
+
|
44 |
+
|
45 |
+
def np_quantization(x, _0, _s):
    """NumPy form of the forward mapping ``cos(_0 * x) / _s``.

    Values of ``_s`` equal to 0 are replaced by 10000 before the reciprocal.
    """
    safe_scale = np_replace_num(_s, num=0, to=10000)
    inv_scale = np.divide(1, safe_scale)
    return inv_scale * np.cos(_0 * x)
|
47 |
+
|
48 |
+
|
49 |
+
def np_dequantization(x, _0, _s):
    """NumPy form of the inverse mapping ``arccos(_s * x) / _0``.

    The ``arccos`` argument is clipped to [-0.99999, 0.99999] with NaNs
    mapped to 0; values of ``_0`` equal to 0 are replaced by 10000.
    """
    safe_freq = np_replace_num(_0, num=0, to=10000)
    inv_freq = np.divide(1, safe_freq)
    arccos_arg = np_domain_guard(_s * x, min=-0.99999, max=0.99999, nan=0)
    return inv_freq * np.arccos(arccos_arg)
|
51 |
+
|
52 |
+
|
53 |
+
def fit_func(x, _0, _s):
    """Round-trip ``x`` through ``np_quantization`` then ``np_dequantization``.

    This is the model handed to the curve fit in ``init_params``.
    """
    return np_dequantization(np_quantization(x, _0, _s), _0, _s)
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
+
############### HELPERS ###############
|
61 |
+
|
62 |
+
def domain_guard(
    x: torch.Tensor,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> torch.Tensor:
    """Sanitise a tensor so it lies inside a function's valid domain.

    NaN and +/-inf entries are replaced through ``torch.nan_to_num`` using
    ``nan``, ``posinf`` and ``neginf``; the result is then clamped to
    ``[min, max]`` when at least one bound is supplied.
    """
    cleaned = torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
    if min is None and max is None:
        return cleaned
    return torch.clamp(cleaned, min=min, max=max)
|
75 |
+
|
76 |
+
|
77 |
+
def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Return ``x`` with every element equal to ``num`` swapped for ``to``.

    Args:
        x (torch.Tensor): The input tensor.
        num (float): The value to look for.
        to (float): The value written wherever ``num`` was found.

    Returns:
        torch.Tensor: A tensor like ``x`` with the matches replaced.
    """
    matches = x == num
    return torch.where(matches, to, x)
|
89 |
+
|
90 |
+
|
91 |
+
def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Raise ``x`` to ``exp``, zeroing negative bases when ``exp`` is below 1.

    For ``exp >= 1`` the power is applied to ``x`` directly; otherwise ``x``
    is first passed through ``relu``.
    """
    if exp >= 1:
        return torch.pow(x, exp)
    non_negative = torch.relu(x)
    return torch.pow(non_negative, exp)
|
94 |
+
|
95 |
+
|
96 |
+
def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return a float32 tensor of ones shaped like ``x`` reduced over dim 1.

    The reduction is only used to obtain the output shape; ``kwargs`` are
    accepted and ignored.
    """
    shape_template = x.amin(dim=1)
    return torch.ones_like(shape_template, dtype=torch.float32, device=x.device)
|
99 |
+
|
100 |
+
|
101 |
+
def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 standard-normal samples shaped like ``x`` reduced over dim 1.

    The reduction is only used to obtain the output shape; ``kwargs`` are
    accepted and ignored.
    """
    shape_template = x.amin(dim=1)
    return torch.randn_like(shape_template, dtype=torch.float32, device=x.device)
|
104 |
+
|
105 |
+
|
106 |
+
def init_space_search(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Randomly search for a starting value of one parameter.

    Candidates for ``param`` are scored by the MSE between ``x`` and
    ``deqtz_func(qtz_func(x))`` while every other name in ``params_list``
    is held at ones. Each run keeps the ``n_best_to_pick`` lowest-loss
    candidates and samples the next generation around their mean. The
    lowest finite-loss candidate seen in any run is finally compared with
    an all-ones and a fresh random initialisation.

    Required ``kwargs``: ``qtz_func``, ``deqtz_func``, ``params_list``,
    ``param``. All ``kwargs`` are forwarded to ``init_ones``/``init_rand``.

    Returns:
        torch.Tensor: The candidate with the lowest finite loss. The search
        result wins ties; if no loss is finite the all-ones tensor is
        returned.
    """

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int):
        """Yield the first generation, which is 10x larger than later ones."""
        for _ in range(n_params * 10):
            # Standard-normal draws scaled by max_initial (unbounded, std = max_initial)
            yield init_rand(tensor) * max_initial

    def _search_param(tensors: List[torch.Tensor], n_params: int):
        """Yield candidates sampled around the mean of the given tensors."""
        stacked = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(stacked, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(stacked, dim=0)
        for _ in range(n_params):
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        """Round-trip ``x`` through ``qtz_func`` and ``deqtz_func``."""
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        return x_.transpose(0, 1)

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50  # Number of runs to try to find the best parameters
    n_random_params = 50  # Number of random parameters to generate
    n_best_to_pick = 5  # Number of best parameters to pick after each run
    max_initial = 10000  # Scale applied to the first generation of candidates

    mse = nn.MSELoss()

    # Every parameter except the searched one is held at ones
    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}
    candidates = _build_initial_param(x, max_initial, n_random_params)

    # (tensor, loss) with the lowest finite loss seen in any run
    overall_best = None

    for _ in range(n_runs):
        best_params = []
        for param_ in candidates:
            try:
                x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: param_})
                loss = mse(x, x_).item()
            except Exception:  # The parameters might not be valid for the function's domain
                continue

            # NaN/inf losses cannot be ranked and would poison the top-k list
            if not np.isfinite(loss):
                continue

            if len(best_params) < n_best_to_pick:
                best_params.append((param_, loss))
            elif loss < best_params[-1][1]:
                best_params[-1] = (param_, loss)
            else:
                continue
            best_params.sort(key=lambda entry: entry[1])

        if not best_params:
            # No usable candidate in this generation: restart from a fresh random one
            candidates = _build_initial_param(x, max_initial, n_random_params)
            continue

        if overall_best is None or best_params[0][1] < overall_best[1]:
            overall_best = best_params[0]

        # Generates new parameters around the mean of this run's best ones
        candidates = _search_param([p for p, _ in best_params], n_random_params)

    # Baselines the search result has to beat
    p_ones = init_ones(x, **kwargs)
    loss_ones = mse(x, _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_ones})).item()

    p_rand = init_rand(x, **kwargs)
    loss_rand = mse(x, _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_rand})).item()

    # The search result is listed first so that it wins ties
    finalists = [] if overall_best is None else [overall_best]
    finalists.extend(
        (tensor, loss)
        for tensor, loss in ((p_ones, loss_ones), (p_rand, loss_rand))
        if np.isfinite(loss)
    )
    if not finalists:
        return p_ones
    return min(finalists, key=lambda entry: entry[1])[0]
|
189 |
+
|
190 |
+
|
191 |
+
def init_linear_scale(  # Symmetric scale. From the study folder
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Compute a symmetric linear scale from the range of ``qtz_func(x)``.

    ``qtz_func`` is evaluated with ``params`` plus ``_s`` set to ones; the
    largest absolute value along dim 1 of the result (with 0 always included
    in the range) is divided by half the signed ``bits`` integer range.

    Required ``kwargs``: ``bits``, ``params``, ``qtz_func``.

    Returns:
        torch.Tensor: float32 scale, clamped below at float32 epsilon.
    """
    for required in ("bits", "params", "qtz_func"):
        assert required in kwargs, f"{required} must be provided."

    bits = kwargs['bits']
    params = kwargs['params']
    qtz_func = kwargs['qtz_func']

    unit_scale = init_ones(x, **kwargs)
    mapped = qtz_func(x=x.transpose(0, 1), **params, _s=unit_scale).transpose(0, 1)

    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    lows, highs = torch.aminmax(mapped, dim=1)
    # Make sure zero is always part of the represented range
    lows = torch.min(lows, torch.zeros_like(lows))
    highs = torch.max(highs, torch.zeros_like(highs))

    abs_max_val_per_ch = torch.max(-lows, highs)
    half_range = float(quant_max - quant_min) / 2
    scale = abs_max_val_per_ch / half_range

    # Noise injection into the scale is intentionally left disabled
    eps = torch.finfo(torch.float32).eps
    return torch.clamp(scale, min=eps).to(dtype=torch.float32, device=lows.device)
|
223 |
+
|
224 |
+
|
225 |
+
def init_non_linear_regression_fit(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> Dict[str, torch.Tensor]:
    """Refine the initial parameters by fitting ``np_fit_func`` to the identity.

    For each row of ``x`` (sorted along the last axis) ``curve_fit`` looks
    for parameters such that ``np_fit_func(row, *params)`` reproduces
    ``row``, starting from the matching entries of ``p0``.

    Required ``kwargs``:
        params_list: Parameter names. They are sorted before use, so the
            positional arguments of ``np_fit_func`` after ``x`` must follow
            sorted-name order.
        np_fit_func: NumPy model ``f(x, *params)``.
        p0: Dict mapping each parameter name to a tensor of initial values
            indexed by row.

    Returns:
        Dict[str, torch.Tensor]: float32 tensors on ``x.device``, one per
        parameter name. If any fit raises ``ValueError`` or ``RuntimeError``
        (the latter is how ``curve_fit`` reports a failed least-squares
        minimisation), the initial values from ``p0`` are returned instead.
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs.get('np_fit_func')
    params_list = kwargs.get('params_list')
    p0 = kwargs.get('p0')

    def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]):
        """Run one Levenberg-Marquardt fit and return the optimal parameters."""
        popt, _ = curve_fit(
            func,
            xdata,
            ydata,
            maxfev=1000,
            p0=p0,
            method='lm'
        )
        return popt

    # 1. Converts the torch tensors to numpy (detached so autograd state never matters)
    xdata = x.detach().cpu().numpy()

    # 2. Sorts the data so that it makes it easier to fit to it
    sorted_xdata = np.sort(xdata, axis=-1)

    p0 = {k: v.detach().cpu().numpy() for k, v in p0.items()}
    params_list = sorted(params_list)  # Must match the numpy fit func argument order

    # 3. Finds the best parameters for each channel
    try:
        params = [
            _fit(channel, channel, np_fit_func, [p0[p][i] for p in params_list])
            for i, channel in enumerate(sorted_xdata)
        ]
    except (ValueError, RuntimeError) as e:
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }

    # 4. Builds the parameters: one tensor per name, one entry per channel
    return {
        p: torch.tensor([p_[i] for p_ in params], dtype=torch.float32).to(x.device)
        for i, p in enumerate(params_list)
    }
|
279 |
+
|
280 |
+
|
281 |
+
def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return a float32 tensor of zeros shaped like ``x`` reduced over dim 1.

    The reduction is only used to obtain the output shape; ``kwargs`` are
    accepted and ignored.
    """
    shape_template = x.amin(dim=1)
    return torch.zeros_like(shape_template, dtype=torch.float32, device=x.device)
|
284 |
+
|
285 |
+
|
286 |
+
def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Return the factor that maps the tensor's range onto ``[_min, _max]``.

    The range is taken along the last dimension and always includes 0.

    Args:
        tensor: Input values.
        _min: Lower end of the target range.
        _max: Upper end of the target range. Positive infinity (the default)
            means no scaling is wanted.

    Returns:
        torch.Tensor: ``(_max - _min) / (x_max - x_min)`` per slice, or all
        ones when ``_max`` is infinite.
    """
    # Calculate the original minimum and maximum values (zero always included)
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    # Compare by value: an identity test only matches the default object and
    # misses an infinity the caller passes explicitly.
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    # Calculate the scale factor
    return (_max - _min) / (x_max - x_min)
|
298 |
+
|
299 |
+
|
300 |
+
|
301 |
+
############## Quant ###############
|
302 |
+
|
303 |
+
@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: nn.Module,
    deqtz_func: nn.Module,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]:
    """Tune ``params`` with Adam to minimise the quantize/dequantize MSE on ``x``.

    The entries of ``params`` are optimised in place and are left with
    ``requires_grad=True``. The returned parameters are deep copies taken at
    the step with the lowest loss, with gradients disabled.

    Args:
        x: Data to reconstruct.
        params: Parameters passed to both ``qtz_func`` and ``deqtz_func``.
        qtz_func: Forward mapping handed to ``quantize``.
        deqtz_func: Inverse mapping handed to ``dequantize``.
        bits: Bit width forwarded to ``quantize``/``dequantize``.
        target_dtype: dtype of the quantized tensor.
        epochs: Maximum number of optimisation steps.
        early_stop: Stop once the loss changed by less than 1e-7 over the
            last 10% of ``epochs``.
        do_report: Print the loss every 10 steps and also return the history.

    Returns:
        The best parameters, or ``(best parameters, list of per-step losses)``
        when ``do_report`` is true.

    Raises:
        Exception: If the loss becomes NaN or infinite during optimisation.
    """
    loss_fn = nn.MSELoss()

    def _round_trip_loss() -> torch.Tensor:
        """MSE between ``x`` and its quantize -> dequantize reconstruction."""
        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        return loss_fn(x, dequant)

    # The initial learning rate is base_lr scaled by half the order of
    # magnitude of the initial loss.
    initial_loss = _round_trip_loss().item()
    base_lr = 0.1
    if np.isfinite(initial_loss) and initial_loss > 0:
        exponent = int(np.floor(np.log10(initial_loss)))
        lr = base_lr * (10 ** (exponent // 2))
    else:
        # log10 is undefined for a zero or non-finite loss; keep the base rate
        lr = base_lr

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    # Defines optimizer and learning-rate schedule
    optimizer = torch.optim.Adam(list(params.values()), lr=lr)
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=epochs // 10)

    # Contains the best loss and the best parameters
    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    percent_epochs_before_stop = 0.1
    epochs_before_stop = int(epochs * percent_epochs_before_stop)

    for i in range(epochs):
        optimizer.zero_grad()

        loss = _round_trip_loss()
        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")
        loss_value = loss.item()

        # Snapshot BEFORE the optimizer step so the stored parameters are the
        # ones that actually produced best_loss.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = copy.deepcopy(dict(params))

        loss.backward()
        optimizer.step()
        scheduler.step()

        acc_loss.append(loss_value)

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {loss_value}")

        # Stop if the loss has not changed considerably during the last 10% of epochs
        if early_stop:
            if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta:
                break

    if best_params is None:
        # epochs == 0: nothing was evaluated, so hand back a copy of the inputs
        best_params = copy.deepcopy(dict(params))

    # No longer requires gradients in the returned parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        return best_params, acc_loss
    return best_params
|
390 |
+
|
391 |
+
|
392 |
+
def quantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    """Map ``x`` through ``func``, round, clamp to the signed range and cast.

    ``func`` is applied to ``x`` with dims 0 and 1 swapped and the result is
    swapped back. Rounding goes through ``round_func_BPDA`` and the clamp
    bounds come from ``get_min_max_from_bits_signed(bits)``.
    """
    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    # Transposed around the call so the parameters align with x's shape
    mapped = func(x=x.transpose(0, 1), **params).transpose(0, 1)
    rounded = round_func_BPDA(mapped)
    return torch.clamp(rounded, quant_min, quant_max).to(target_dtype)
|
405 |
+
|
406 |
+
|
407 |
+
def dequantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    out_dtype: torch.dtype
) -> torch.Tensor:
    """Cast ``x`` to ``out_dtype`` and map it through ``func``.

    ``func`` is applied with dims 0 and 1 swapped and the result is swapped
    back. ``bits`` is accepted but not used here.
    """
    as_out_dtype = x.to(dtype=out_dtype)
    restored = func(x=as_out_dtype.transpose(0, 1), **params)
    return restored.transpose(0, 1)
|
419 |
+
|
420 |
+
|
421 |
+
def round_func_BPDA(input):
    """Round ``input`` in the forward pass while backpropagating as identity.

    A clone of ``input`` keeps the autograd link to it; its data is then
    overwritten with the rounded values, so the non-differentiable round
    never enters the graph.
    """
    rounded = torch.round(input)
    passthrough = input.clone()
    passthrough.data = rounded.data
    return passthrough
|
428 |
+
|
429 |
+
|
430 |
+
def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    """Return ``(min, max)`` of a two's-complement integer of ``bit_width`` bits."""
    half_span = 2 ** (bit_width - 1)
    return -half_span, half_span - 1
|
432 |
+
|
433 |
+
|
434 |
+
|
435 |
+
############## Numpy ###############
|
436 |
+
|
437 |
+
def np_domain_guard(
    x: np.ndarray,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> np.ndarray:
    """Sanitise an array so it lies inside a function's valid domain.

    NaN and +/-inf entries are replaced through ``np.nan_to_num``; the
    result is then clipped to ``[min, max]`` when at least one bound is
    supplied.

    Args:
        x: Input array.
        min: Optional lower bound.
        max: Optional upper bound.
        posinf: Replacement for +inf (``None`` keeps NumPy's default).
        neginf: Replacement for -inf (``None`` keeps NumPy's default).
        nan: Replacement for NaN. ``None`` means 0.0, mirroring the torch
            ``domain_guard``; ``np.nan_to_num`` itself does not accept
            ``None`` as a fill value.
    """
    nan_fill = 0.0 if nan is None else nan
    x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan_fill)
    if min is not None or max is not None:
        x = np.clip(x, min, max)
    return x
|
450 |
+
|
451 |
+
|
452 |
+
def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray:
    """Return ``x`` with every element equal to ``num`` swapped for ``to``.

    Args:
        x (np.ndarray): The input array.
        num (float): The value to look for.
        to (float): The value written wherever ``num`` was found.

    Returns:
        np.ndarray: An array like ``x`` with the matches replaced.
    """
    matches = x == num
    return np.where(matches, to, x)
|
464 |
+
|
465 |
+
|
466 |
+
def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Raise ``x`` to ``exp``, zeroing negative bases when ``exp`` is below 1.

    For ``exp >= 1`` the power is applied to ``x`` directly; otherwise
    negative entries of ``x`` are first replaced by 0.
    """
    if exp >= 1:
        return np.power(x, exp)
    non_negative = np.maximum(x, 0)
    return np.power(non_negative, exp)
|
469 |
+
|
fn_gen/norm_nlr/0/loss.png
ADDED
fn_gen/norm_nlr/0/quantization.png
ADDED
fn_gen/norm_nlr/1/__pycache__/fn.cpython-311.pyc
ADDED
Binary file (24 kB). View file
|
|
fn_gen/norm_nlr/1/distortion.png
ADDED
fn_gen/norm_nlr/1/expressions.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
x**3/_s
|
2 |
+
(_s*x)**(1/3)
|
fn_gen/norm_nlr/1/fn.py
ADDED
@@ -0,0 +1,468 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import amin # Necessary for arcsin
|
5 |
+
import copy
|
6 |
+
import torch.nn as nn
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
from scipy.optimize import curve_fit
|
10 |
+
from typing import Dict, Any, Tuple, List, Callable
|
11 |
+
|
12 |
+
|
13 |
+
def quantization(x, **params):
    """Apply the learned forward mapping ``x**3 / _s``.

    ``params`` must contain ``'_s'``. Entries of ``_s`` equal to 0 are
    replaced by 10000 before the reciprocal is taken.
    """
    safe_scale = replace_num(params['_s'], num=0, to=10000)
    inv_scale = torch.div(1, safe_scale)
    return inv_scale * guarded_torch_power(x, torch.tensor(3))
|
15 |
+
|
16 |
+
|
17 |
+
def dequantization(x, **params):
    """Apply the learned inverse mapping ``(_s * x) ** (1/3)``.

    ``params`` must contain ``'_s'``. The power goes through
    ``guarded_torch_power``, so negative products are zeroed first.
    """
    rescaled = params['_s'] * x
    return guarded_torch_power(rescaled, 1 / 3)
|
19 |
+
|
20 |
+
|
21 |
+
def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]:
    """Build the initial ``'_s'`` parameter for ``x``.

    ``'_s'`` is initialised by ``init_linear_scale``, refined by
    ``init_non_linear_regression_fit`` and wrapped as a frozen
    ``nn.Parameter``.

    Optional callables in ``kwargs`` (``post_init_hook``,
    ``post_method_hook``, ``post_train_hook``) are each invoked with a
    ``parameters=`` keyword at the matching stage. All ``kwargs`` are
    forwarded to the helper initialisers.
    """

    def _fire(hook_name, parameters):
        # Hooks are optional; call only the ones the caller supplied.
        if hook_name in kwargs:
            kwargs[hook_name](parameters=parameters)

    base_p0 = {}
    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    _fire('post_init_hook', base_p0)

    fitted = init_non_linear_regression_fit(
        x,
        p0=base_p0,
        np_fit_func=fit_func,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_s'],
        **kwargs,
    )
    params = {name: nn.Parameter(value, requires_grad=False) for name, value in fitted.items()}
    _fire('post_method_hook', params)
    _fire('post_train_hook', params)

    return params
|
39 |
+
|
40 |
+
|
41 |
+
############### Numpy Qtz ###############
|
42 |
+
|
43 |
+
|
44 |
+
def np_quantization(x, _s):
    """NumPy form of the forward mapping ``x**3 / _s``.

    Values of ``_s`` equal to 0 are replaced by 10000 before the reciprocal.
    """
    safe_scale = np_replace_num(_s, num=0, to=10000)
    inv_scale = np.divide(1, safe_scale)
    return inv_scale * np_guarded_power(x, np.array(3))
|
46 |
+
|
47 |
+
|
48 |
+
def np_dequantization(x, _s):
    """NumPy form of the inverse mapping ``(_s * x) ** (1/3)``.

    The power goes through ``np_guarded_power``, so negative products are
    zeroed first.
    """
    rescaled = _s * x
    return np_guarded_power(rescaled, 1 / 3)
|
50 |
+
|
51 |
+
|
52 |
+
def fit_func(x, _s):
    """Round-trip ``x`` through ``np_quantization`` then ``np_dequantization``.

    This is the model handed to the curve fit in ``init_params``.
    """
    return np_dequantization(np_quantization(x, _s), _s)
|
56 |
+
|
57 |
+
|
58 |
+
|
59 |
+
############### HELPERS ###############
|
60 |
+
|
61 |
+
def domain_guard(
    x: torch.Tensor,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> torch.Tensor:
    """Sanitise a tensor so it lies inside a function's valid domain.

    NaN and +/-inf entries are replaced through ``torch.nan_to_num`` using
    ``nan``, ``posinf`` and ``neginf``; the result is then clamped to
    ``[min, max]`` when at least one bound is supplied.
    """
    cleaned = torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
    if min is None and max is None:
        return cleaned
    return torch.clamp(cleaned, min=min, max=max)
|
74 |
+
|
75 |
+
|
76 |
+
def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Return ``x`` with every element equal to ``num`` swapped for ``to``.

    Args:
        x (torch.Tensor): The input tensor.
        num (float): The value to look for.
        to (float): The value written wherever ``num`` was found.

    Returns:
        torch.Tensor: A tensor like ``x`` with the matches replaced.
    """
    matches = x == num
    return torch.where(matches, to, x)
|
88 |
+
|
89 |
+
|
90 |
+
def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Raise ``x`` to ``exp``, zeroing negative bases when ``exp`` is below 1.

    For ``exp >= 1`` the power is applied to ``x`` directly; otherwise ``x``
    is first passed through ``relu``.
    """
    if exp >= 1:
        return torch.pow(x, exp)
    non_negative = torch.relu(x)
    return torch.pow(non_negative, exp)
|
93 |
+
|
94 |
+
|
95 |
+
def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return a float32 tensor of ones shaped like ``x`` reduced over dim 1.

    The reduction is only used to obtain the output shape; ``kwargs`` are
    accepted and ignored.
    """
    shape_template = x.amin(dim=1)
    return torch.ones_like(shape_template, dtype=torch.float32, device=x.device)
|
98 |
+
|
99 |
+
|
100 |
+
def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 standard-normal samples shaped like ``x`` reduced over dim 1.

    The reduction is only used to obtain the output shape; ``kwargs`` are
    accepted and ignored.
    """
    shape_template = x.amin(dim=1)
    return torch.randn_like(shape_template, dtype=torch.float32, device=x.device)
|
103 |
+
|
104 |
+
|
105 |
+
def init_space_search(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Randomly search for a starting value of one parameter.

    Candidates for ``param`` are scored by the MSE between ``x`` and
    ``deqtz_func(qtz_func(x))`` while every other name in ``params_list``
    is held at ones. Each run keeps the ``n_best_to_pick`` lowest-loss
    candidates and samples the next generation around their mean. The
    lowest finite-loss candidate seen in any run is finally compared with
    an all-ones and a fresh random initialisation.

    Required ``kwargs``: ``qtz_func``, ``deqtz_func``, ``params_list``,
    ``param``. All ``kwargs`` are forwarded to ``init_ones``/``init_rand``.

    Returns:
        torch.Tensor: The candidate with the lowest finite loss. The search
        result wins ties; if no loss is finite the all-ones tensor is
        returned.
    """

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int):
        """Yield the first generation, which is 10x larger than later ones."""
        for _ in range(n_params * 10):
            # Standard-normal draws scaled by max_initial (unbounded, std = max_initial)
            yield init_rand(tensor) * max_initial

    def _search_param(tensors: List[torch.Tensor], n_params: int):
        """Yield candidates sampled around the mean of the given tensors."""
        stacked = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(stacked, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(stacked, dim=0)
        for _ in range(n_params):
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        """Round-trip ``x`` through ``qtz_func`` and ``deqtz_func``."""
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        return x_.transpose(0, 1)

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50  # Number of runs to try to find the best parameters
    n_random_params = 50  # Number of random parameters to generate
    n_best_to_pick = 5  # Number of best parameters to pick after each run
    max_initial = 10000  # Scale applied to the first generation of candidates

    mse = nn.MSELoss()

    # Every parameter except the searched one is held at ones
    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}
    candidates = _build_initial_param(x, max_initial, n_random_params)

    # (tensor, loss) with the lowest finite loss seen in any run
    overall_best = None

    for _ in range(n_runs):
        best_params = []
        for param_ in candidates:
            try:
                x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: param_})
                loss = mse(x, x_).item()
            except Exception:  # The parameters might not be valid for the function's domain
                continue

            # NaN/inf losses cannot be ranked and would poison the top-k list
            if not np.isfinite(loss):
                continue

            if len(best_params) < n_best_to_pick:
                best_params.append((param_, loss))
            elif loss < best_params[-1][1]:
                best_params[-1] = (param_, loss)
            else:
                continue
            best_params.sort(key=lambda entry: entry[1])

        if not best_params:
            # No usable candidate in this generation: restart from a fresh random one
            candidates = _build_initial_param(x, max_initial, n_random_params)
            continue

        if overall_best is None or best_params[0][1] < overall_best[1]:
            overall_best = best_params[0]

        # Generates new parameters around the mean of this run's best ones
        candidates = _search_param([p for p, _ in best_params], n_random_params)

    # Baselines the search result has to beat
    p_ones = init_ones(x, **kwargs)
    loss_ones = mse(x, _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_ones})).item()

    p_rand = init_rand(x, **kwargs)
    loss_rand = mse(x, _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_rand})).item()

    # The search result is listed first so that it wins ties
    finalists = [] if overall_best is None else [overall_best]
    finalists.extend(
        (tensor, loss)
        for tensor, loss in ((p_ones, loss_ones), (p_rand, loss_rand))
        if np.isfinite(loss)
    )
    if not finalists:
        return p_ones
    return min(finalists, key=lambda entry: entry[1])[0]
|
188 |
+
|
189 |
+
|
190 |
+
def init_linear_scale(  # Symmetric scale. From the study folder
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Compute a symmetric linear scale from the transformed input.

    ``x`` is pushed through ``qtz_func`` with ``_s`` set to ones, and the
    largest absolute value found along ``dim=1`` of the result (zero is always
    part of the range) is divided by half of the signed integer range for
    ``bits``.

    Args:
        x: Tensor the scale is derived from.
        **kwargs: Must contain ``bits``, ``qtz_func`` and ``params`` (keyword
            arguments forwarded to ``qtz_func``; it must not contain ``_s``
            because ``_s`` is supplied here).

    Returns:
        A float32 tensor of scales, bounded below by float32 epsilon.
    """
    for required in ("bits", "params", "qtz_func"):
        assert required in kwargs, f"{required} must be provided."

    num_bits = kwargs.get('bits')
    extra_params = kwargs.get('params')
    transform = kwargs.get('qtz_func')

    # The transform works on the transposed layout; undo the transpose afterwards.
    transposed = x.transpose(0, 1)
    transformed = transform(x=transposed, **extra_params, _s=init_ones(x, **kwargs))
    transformed = transformed.transpose(0, 1)

    q_low, q_high = get_min_max_from_bits_signed(num_bits)
    lows, highs = torch.aminmax(transformed, dim=1)
    # Force the observed range to include zero.
    lows = torch.min(lows, torch.zeros_like(lows))
    highs = torch.max(highs, torch.zeros_like(highs))

    largest_abs = torch.max(-lows, highs)
    half_range = float(q_high - q_low) / 2
    scale = largest_abs / half_range

    # NOTE: adding random noise to the scale was tried here and is disabled.
    smallest = torch.finfo(torch.float32).eps
    return torch.clamp(scale, min=smallest).to(dtype=torch.float32, device=lows.device)
|
222 |
+
|
223 |
+
|
224 |
+
def init_non_linear_regression_fit(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> Dict[str, torch.Tensor]:
    """Fit the parameters of ``np_fit_func`` row by row with least squares.

    Every row of ``x`` is sorted and ``np_fit_func`` is fitted so that it maps
    the row onto itself, starting from the initial guess ``p0``.

    Args:
        x: Tensor whose first dimension indexes the rows to fit.
        **kwargs: Must contain ``params_list`` (parameter names),
            ``np_fit_func`` (numpy callable ``f(x, *params)`` whose parameters
            are ordered like the sorted ``params_list``) and ``p0`` (dict of
            tensors holding one initial value per row for each parameter).

    Returns:
        A dict mapping each parameter name to a float32 tensor on ``x.device``.
        When the fit fails, the initial guess ``p0`` is returned instead.
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs.get('np_fit_func')
    params_list = kwargs.get('params_list')
    p0 = kwargs.get('p0')

    # 1. curve_fit works on numpy data, so leave torch first
    xdata = x.cpu().numpy()

    # 2. Sorts the data so that it makes it easier to fit to it
    sorted_xdata = np.sort(xdata, axis=-1)

    p0 = {k: v.cpu().numpy() for k, v in p0.items()}
    params_list = sorted(params_list)  # Must match the numpy fit func argument order

    # 3. Finds the best parameters for each row
    try:
        fitted = []
        for i, row in enumerate(sorted_xdata):
            guess = [p0[name][i] for name in params_list]
            popt, _ = curve_fit(
                np_fit_func,
                row,
                row,
                maxfev=1000,
                p0=guess,
                method='lm',
            )
            fitted.append(popt)
    except (ValueError, RuntimeError) as e:
        # curve_fit signals a failed minimization (e.g. maxfev reached) with
        # RuntimeError, so it has to trigger the fallback as well.
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }

    # 4. Regroups the per-row results by parameter name
    return {
        name: torch.tensor([row_params[i] for row_params in fitted], dtype=torch.float32).to(x.device)
        for i, name in enumerate(params_list)
    }
|
278 |
+
|
279 |
+
|
280 |
+
def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 zeros shaped like the ``dim=1`` reduction of ``x``.

    Args:
        x: Tensor with at least two dimensions.
        **kwargs: Accepted for signature compatibility; not used.

    Returns:
        A float32 zero tensor on ``x.device``.
    """
    template = x.amin(dim=1)  # only its shape matters
    return torch.zeros_like(template, dtype=torch.float32, device=x.device)
|
283 |
+
|
284 |
+
|
285 |
+
def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Compute the factor that rescales each row's range to ``[_min, _max]``.

    The range of a row is taken along the last dimension and always includes
    zero.

    Args:
        tensor: Input values.
        _min: Lower bound of the target range.
        _max: Upper bound of the target range. Positive infinity means "no
            rescaling" and yields a tensor of ones.

    Returns:
        A tensor holding ``(_max - _min) / (row_max - row_min)`` per row, or
        ones when ``_max`` is infinite.
    """
    # Calculate the original minimum and maximum values
    lows, highs = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(lows, torch.zeros_like(lows))
    x_max = torch.max(highs, torch.zeros_like(highs))

    # Compare by value: an identity test only matches the default object, so
    # an explicitly passed float("inf") used to produce an infinite scale.
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    # NOTE(review): a row containing only zeros divides by zero here - confirm
    # whether callers can pass such rows.
    return (_max - _min) / (x_max - x_min)
|
297 |
+
|
298 |
+
|
299 |
+
|
300 |
+
############## Quant ###############
|
301 |
+
|
302 |
+
@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: nn.Module,
    deqtz_func: nn.Module,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]:
    """Tune ``params`` with Adam to minimize the quantize/dequantize MSE.

    Args:
        x: Tensor to reconstruct.
        params: Parameters forwarded to ``qtz_func`` and ``deqtz_func``; they
            are updated in place and left with ``requires_grad=True``.
        qtz_func: Forward transform used by ``quantize``.
        deqtz_func: Inverse transform used by ``dequantize``.
        bits: Bit width of the quantized values.
        target_dtype: Dtype of the quantized tensor.
        epochs: Maximum number of optimization steps.
        early_stop: Stop once the loss moved less than ``1e-7`` over the last
            10% of ``epochs``.
        do_report: Print the loss every 10 steps and also return the losses.

    Returns:
        A deep copy of the parameters that produced the lowest loss, with
        gradients disabled. When ``do_report`` is true, a tuple of that dict
        and the list of per-epoch losses.

    Raises:
        Exception: If the loss becomes NaN or infinite during the search.
    """
    loss_fn = nn.MSELoss()

    # The initial learning rate is derived from the order of magnitude of the
    # initial loss: lr = 0.1 * 10 ** (exponent // 2).
    quant = quantize(x, params, qtz_func, bits, target_dtype)
    dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
    initial_loss = loss_fn(x, dequant).item()

    base_lr = 0.1
    # log10(0) is -inf, which int() cannot convert; floor a zero loss at the
    # smallest positive normal float32 instead of crashing.
    safe_loss = max(initial_loss, float(np.finfo(np.float32).tiny))
    exponent = int(np.floor(np.log10(safe_loss)))
    lr = base_lr * (10 ** (exponent // 2))

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    # Defines optimizer and learning-rate schedule
    optimizer = torch.optim.Adam(list(params.values()), lr=lr)
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=epochs // 10)

    # Contains the best loss and the best parameters
    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    percent_epochs_before_stop = 0.1
    epochs_before_stop = int(epochs * percent_epochs_before_stop)

    for i in range(epochs):
        optimizer.zero_grad()

        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        loss = loss_fn(x, dequant)

        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")

        loss_value = loss.item()

        # Snapshot BEFORE the optimizer step so that the stored parameters are
        # the ones that actually produced ``best_loss``.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = copy.deepcopy(dict(params))

        loss.backward()
        optimizer.step()
        scheduler.step()

        acc_loss.append(loss_value)

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {loss_value}")

        # Stops when the loss has not changed considerably during the last 10% epochs
        if early_stop:
            if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - loss_value) < min_delta:
                break

    # With epochs <= 0 the loop never runs; fall back to the incoming values.
    if best_params is None:
        best_params = copy.deepcopy(dict(params))

    # No longer requires gradients in the parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        return best_params, acc_loss
    return best_params
|
389 |
+
|
390 |
+
|
391 |
+
def quantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    """Apply ``func`` to ``x`` and round/clamp the result to a signed integer range.

    Args:
        x: Tensor to quantize.
        params: Keyword arguments forwarded to ``func``.
        func: Forward transform, called as ``func(x=..., **params)`` on the
            transposed input.
        bits: Bit width that defines the clamping range.
        target_dtype: Dtype the clamped values are cast to.

    Returns:
        The rounded, clamped tensor in ``target_dtype`` with the layout of ``x``.
    """
    # NOTE(review): casting to an integer dtype presumably cuts the autograd
    # graph, so gradients would not reach ``func`` - confirm.
    lower, upper = get_min_max_from_bits_signed(bits)
    # Aligns shapes for func, then restores the original layout
    mapped = func(x=x.transpose(0, 1), **params).transpose(0, 1)
    rounded = round_func_BPDA(mapped)
    return torch.clamp(rounded, lower, upper).to(target_dtype)
|
404 |
+
|
405 |
+
|
406 |
+
def dequantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    out_dtype: torch.dtype
) -> torch.Tensor:
    """Cast ``x`` to ``out_dtype`` and apply the inverse transform ``func``.

    Args:
        x: Quantized tensor.
        params: Keyword arguments forwarded to ``func``.
        func: Inverse transform, called as ``func(x=..., **params)`` on the
            transposed input.
        bits: Not used by this function.
        out_dtype: Dtype the input is cast to before ``func`` is applied.

    Returns:
        The transformed tensor with the layout of ``x``.
    """
    converted = x.to(dtype=out_dtype)
    restored = func(x=converted.transpose(0, 1), **params)
    return restored.transpose(0, 1)
|
418 |
+
|
419 |
+
|
420 |
+
def round_func_BPDA(input):
    """Round ``input`` in the forward pass while backpropagating as identity.

    ``torch.round`` is not usefully differentiable, so the rounded values are
    written into a clone of ``input``; the backward pass then sees only the
    clone.

    Args:
        input: Tensor to round.

    Returns:
        A tensor holding the rounded values.
    """
    rounded = torch.round(input)
    passthrough = input.clone()
    # Swapping .data replaces the values without recording an autograd op.
    passthrough.data = rounded.data
    return passthrough
|
427 |
+
|
428 |
+
|
429 |
+
def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    """Return the ``(min, max)`` values of a signed integer with ``bit_width`` bits."""
    half_range = 2 ** (bit_width - 1)
    return -half_range, half_range - 1
|
431 |
+
|
432 |
+
|
433 |
+
|
434 |
+
############## Numpy ###############
|
435 |
+
|
436 |
+
def np_domain_guard(
    x: np.ndarray,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> np.ndarray:
    """Guard an array to a valid domain.

    Args:
        x: Input array.
        min: Lower clipping bound, or None for no lower bound.
        max: Upper clipping bound, or None for no upper bound.
        posinf: Replacement for positive infinity; None keeps numpy's default.
        neginf: Replacement for negative infinity; None keeps numpy's default.
        nan: Replacement for NaN; None means 0.0.

    Returns:
        The array with non-finite values replaced and, when a bound is given,
        clipped to ``[min, max]``.
    """
    # Unlike posinf/neginf, numpy's ``nan`` argument does not accept None, so
    # translate None to the 0.0 default explicitly.
    nan_fill = 0.0 if nan is None else nan
    x = np.nan_to_num(x, nan=nan_fill, posinf=posinf, neginf=neginf)
    if min is not None or max is not None:
        x = np.clip(x, min, max)
    return x
|
449 |
+
|
450 |
+
|
451 |
+
def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray:
    """Swap every occurrence of one value in an array for another.

    Args:
        x (np.ndarray): The input array.
        num (float): The value to look for.
        to (float): The value written where ``num`` was found.

    Returns:
        np.ndarray: A new array with the matches replaced.
    """
    matches = np.equal(x, num)
    return np.where(matches, to, x)
|
463 |
+
|
464 |
+
|
465 |
+
def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Raise ``x`` to ``exp``, clamping negative bases to zero when ``exp < 1``.

    Args:
        x: Base values.
        exp: Exponent.

    Returns:
        ``x ** exp`` for ``exp >= 1``, otherwise ``maximum(x, 0) ** exp``.
    """
    if exp >= 1:
        return np.power(x, exp)
    non_negative = np.maximum(x, 0)
    return np.power(non_negative, exp)
|
468 |
+
|
fn_gen/norm_nlr/1/loss.png
ADDED
fn_gen/norm_nlr/1/quantization.png
ADDED
fn_gen/norm_nlr/10/__pycache__/fn.cpython-311.pyc
ADDED
Binary file (25.1 kB). View file
|
|
fn_gen/norm_nlr/10/distortion.png
ADDED
fn_gen/norm_nlr/10/expressions.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
(_0*(-_1 + x))**(1/3)/_s
|
2 |
+
_1 + _s**3*x**3/_0
|
fn_gen/norm_nlr/10/fn.py
ADDED
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import amin # Necessary for arcsin
|
5 |
+
import copy
|
6 |
+
import torch.nn as nn
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
from scipy.optimize import curve_fit
|
10 |
+
from typing import Dict, Any, Tuple, List, Callable
|
11 |
+
|
12 |
+
|
13 |
+
def quantization(x, **params):
    """Forward transform ``(_0 * (x - _1)) ** (1/3) / _s``.

    A zero ``_s`` is replaced by 10000 before dividing, and the power is
    computed with ``guarded_torch_power``.

    Args:
        x: Input tensor.
        **params: Must contain ``_0``, ``_1`` and ``_s``.

    Returns:
        The transformed tensor.
    """
    inv_scale = torch.div(1, replace_num(params['_s'], num=0, to=10000))
    shifted = x + (torch.tensor(-1) * params['_1'])
    root = guarded_torch_power((params['_0'] * shifted), 1 / 3)
    return (inv_scale * root)
|
15 |
+
|
16 |
+
|
17 |
+
def dequantization(x, **params):
    """Inverse transform ``_1 + _s**3 * x**3 / _0``.

    A zero ``_0`` is replaced by 10000 before dividing.

    Args:
        x: Quantized tensor.
        **params: Must contain ``_0``, ``_1`` and ``_s``.

    Returns:
        The reconstructed tensor.
    """
    inv_inner = torch.div(1, replace_num(params['_0'], num=0, to=10000))
    scale_cubed = guarded_torch_power(params['_s'], torch.tensor(3))
    x_cubed = guarded_torch_power(x, torch.tensor(3))
    return (params['_1'] + (inv_inner * scale_cubed * x_cubed))
|
19 |
+
|
20 |
+
|
21 |
+
def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]:
    """Build the initial ``_0``, ``_1`` and ``_s`` parameters for ``x``.

    The initial guess comes from ``init_inner_scale``, ``amin`` and
    ``init_linear_scale`` and is then refined by
    ``init_non_linear_regression_fit``.

    Args:
        x: Tensor the parameters are derived from.
        **kwargs: Forwarded to the initializers. The optional callables
            ``post_init_hook``, ``post_method_hook`` and ``post_train_hook``
            are each invoked with ``parameters=...`` when present.

    Returns:
        A dict of ``nn.Parameter`` objects created with ``requires_grad=False``.
    """
    base_p0 = {
        '_0': init_inner_scale(x, _min=0),
        '_1': amin(x, dim=-1),
    }

    # ``_s`` is added only after this call because init_linear_scale supplies
    # its own ``_s`` to the quantization function.
    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    if 'post_init_hook' in kwargs:
        kwargs['post_init_hook'](parameters=base_p0)

    fitted = init_non_linear_regression_fit(
        x,
        p0=base_p0,
        np_fit_func=fit_func,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_1', '_s'],
        **kwargs,
    )
    params = {name: nn.Parameter(value, requires_grad=False) for name, value in fitted.items()}
    if 'post_method_hook' in kwargs:
        kwargs['post_method_hook'](parameters=params)

    if 'post_train_hook' in kwargs:
        kwargs['post_train_hook'](parameters=params)

    return params
|
41 |
+
|
42 |
+
|
43 |
+
############### Numpy Qtz ###############
|
44 |
+
|
45 |
+
|
46 |
+
def np_quantization(x, _0, _1, _s):
    """Numpy version of the forward transform ``(_0 * (x - _1)) ** (1/3) / _s``.

    A zero ``_s`` is replaced by 10000 before dividing.
    """
    inv_scale = np.divide(1, np_replace_num(_s, num=0, to=10000))
    shifted = x + (np.array(-1) * _1)
    root = np_guarded_power((_0 * shifted), 1 / 3)
    return (inv_scale * root)
|
48 |
+
|
49 |
+
|
50 |
+
def np_dequantization(x, _0, _1, _s):
    """Numpy version of the inverse transform ``_1 + _s**3 * x**3 / _0``.

    A zero ``_0`` is replaced by 10000 before dividing.
    """
    inv_inner = np.divide(1, np_replace_num(_0, num=0, to=10000))
    scale_cubed = np_guarded_power(_s, np.array(3))
    x_cubed = np_guarded_power(x, np.array(3))
    return (_1 + (inv_inner * scale_cubed * x_cubed))
|
52 |
+
|
53 |
+
|
54 |
+
def fit_func(x, _0, _1, _s):
    """Round-trip ``x`` through the numpy quantization and dequantization.

    This is the model function handed to the curve fit; its parameters are
    ordered ``_0``, ``_1``, ``_s``.
    """
    quantized = np_quantization(x, _0, _1, _s)
    return np_dequantization(quantized, _0, _1, _s)
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
############### HELPERS ###############
|
62 |
+
|
63 |
+
def domain_guard(
    x: torch.Tensor,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> torch.Tensor:
    """Guard a tensor to a valid domain.

    Args:
        x: Input tensor.
        min: Lower clamping bound, or None for no lower bound.
        max: Upper clamping bound, or None for no upper bound.
        posinf: Replacement for positive infinity, forwarded to ``torch.nan_to_num``.
        neginf: Replacement for negative infinity, forwarded to ``torch.nan_to_num``.
        nan: Replacement for NaN, forwarded to ``torch.nan_to_num``.

    Returns:
        The tensor with non-finite values replaced and, when a bound is given,
        clamped to ``[min, max]``.
    """
    cleaned = torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
    if min is None and max is None:
        return cleaned
    return torch.clamp(cleaned, min=min, max=max)
|
76 |
+
|
77 |
+
|
78 |
+
def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Swap every occurrence of one value in a tensor for another.

    Args:
        x (torch.Tensor): The input tensor.
        num (float): The value to look for.
        to (float): The value written where ``num`` was found.

    Returns:
        torch.Tensor: A new tensor with the matches replaced.
    """
    matches = x == num
    return torch.where(matches, to, x)
|
90 |
+
|
91 |
+
|
92 |
+
def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Raise ``x`` to ``exp``, clamping negative bases to zero when ``exp < 1``.

    Args:
        x: Base values.
        exp: Exponent.

    Returns:
        ``x ** exp`` for ``exp >= 1``, otherwise ``relu(x) ** exp``.
    """
    if exp >= 1:
        return torch.pow(x, exp)
    non_negative = torch.relu(x)
    return torch.pow(non_negative, exp)
|
95 |
+
|
96 |
+
|
97 |
+
def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 ones shaped like the ``dim=1`` reduction of ``x``.

    Args:
        x: Tensor with at least two dimensions.
        **kwargs: Accepted for signature compatibility; not used.

    Returns:
        A float32 tensor of ones on ``x.device``.
    """
    template = x.amin(dim=1)  # only its shape matters
    return torch.ones_like(template, dtype=torch.float32, device=x.device)
|
100 |
+
|
101 |
+
|
102 |
+
def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return standard-normal float32 samples shaped like the ``dim=1`` reduction of ``x``.

    Args:
        x: Tensor with at least two dimensions.
        **kwargs: Accepted for signature compatibility; not used.

    Returns:
        A float32 tensor of random values on ``x.device``.
    """
    template = x.amin(dim=1)  # only its shape matters
    return torch.randn_like(template, dtype=torch.float32, device=x.device)
|
105 |
+
|
106 |
+
|
107 |
+
def init_space_search(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Random search for the value of one parameter that minimizes the MSE.

    All parameters other than ``param`` are fixed to ones. Candidates for
    ``param`` are sampled, scored by the mean squared error between ``x`` and
    its quantize/dequantize round trip, and the best ones seed the sampling of
    the next run. The best candidate seen in any run is finally compared with
    an all-ones and a random baseline.

    Args:
        x: Tensor to reconstruct.
        **kwargs: Must contain ``qtz_func``, ``deqtz_func``, ``params_list``
            (all parameter names) and ``param`` (the name being searched).

    Returns:
        The candidate with the lowest finite loss. Ties prefer the search
        result, then ones, then random. If no candidate yields a finite loss,
        the all-ones tensor is returned.
    """

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int):
        """Generates the initial set of parameters. The first iteration generates 10 times more parameters."""
        for _ in range(n_params * 10):
            # Normal samples with standard deviation max_initial (unbounded)
            yield init_rand(tensor) * max_initial

    def _search_param(tensors: List[torch.tensor], n_params):
        """Takes the best parameters and generates new parameters around the mean of the best parameters."""
        torch_tensors = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(torch_tensors, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(torch_tensors, dim=0)
        for _ in range(n_params):
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        """Runs the quantize/dequantize round trip on the transposed input."""
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        x_ = x_.transpose(0, 1)
        return x_

    def _loss_of(candidate):
        """Returns the round-trip MSE of a candidate, or None when it is unusable."""
        try:
            x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: candidate})
            value = nn.MSELoss()(x, x_).item()
        except Exception:  # The parameters might not be valid for the function's domain
            return None
        # NaN/Inf losses must never be ranked as "best"
        return value if np.isfinite(value) else None

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50            # Number of runs to try to find the best parameters
    n_random_params = 50   # Number of random parameters to generate
    n_best_to_pick = 5     # Number of best parameters to pick after each run
    max_initial = 10000    # Scale of the initial random parameters

    # Initializes the parameters
    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}
    params = _build_initial_param(x, max_initial, n_random_params)

    # Best (candidate, loss) seen in ANY run; later runs may be worse
    best_overall = None

    # Performs the search
    for _ in range(n_runs):
        best_params = []
        for param_ in params:
            loss_ = _loss_of(param_)
            if loss_ is None:
                continue
            if len(best_params) < n_best_to_pick:
                best_params.append((param_, loss_))
            elif loss_ < best_params[-1][1]:
                best_params[-1] = (param_, loss_)
            else:
                continue
            best_params.sort(key=lambda item: item[1])

        if not best_params:
            # Nothing usable in this run, so there is nothing to sample around
            break

        if best_overall is None or best_params[0][1] < best_overall[1]:
            best_overall = best_params[0]

        # Generates new parameters around the mean
        params = _search_param([p for p, _ in best_params], n_random_params)

    # Compares the search result with the init_ones and init_rand baselines
    p_ones = init_ones(x, **kwargs)
    p_rand = init_rand(x, **kwargs)

    finalists = [] if best_overall is None else [best_overall]
    for baseline in (p_ones, p_rand):
        loss_ = _loss_of(baseline)
        if loss_ is not None:
            finalists.append((baseline, loss_))

    if not finalists:
        return p_ones
    # min() keeps the first of equal losses: search result, then ones, then rand
    return min(finalists, key=lambda item: item[1])[0]
|
190 |
+
|
191 |
+
|
192 |
+
def init_linear_scale(  # Symmetric scale. From the study folder
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Compute a symmetric linear scale from the transformed input.

    ``x`` is pushed through ``qtz_func`` with ``_s`` set to ones, and the
    largest absolute value found along ``dim=1`` of the result (zero is always
    part of the range) is divided by half of the signed integer range for
    ``bits``.

    Args:
        x: Tensor the scale is derived from.
        **kwargs: Must contain ``bits``, ``qtz_func`` and ``params`` (keyword
            arguments forwarded to ``qtz_func``; it must not contain ``_s``
            because ``_s`` is supplied here).

    Returns:
        A float32 tensor of scales, bounded below by float32 epsilon.
    """
    for required in ("bits", "params", "qtz_func"):
        assert required in kwargs, f"{required} must be provided."

    num_bits = kwargs.get('bits')
    extra_params = kwargs.get('params')
    transform = kwargs.get('qtz_func')

    # The transform works on the transposed layout; undo the transpose afterwards.
    transposed = x.transpose(0, 1)
    transformed = transform(x=transposed, **extra_params, _s=init_ones(x, **kwargs))
    transformed = transformed.transpose(0, 1)

    q_low, q_high = get_min_max_from_bits_signed(num_bits)
    lows, highs = torch.aminmax(transformed, dim=1)
    # Force the observed range to include zero.
    lows = torch.min(lows, torch.zeros_like(lows))
    highs = torch.max(highs, torch.zeros_like(highs))

    largest_abs = torch.max(-lows, highs)
    half_range = float(q_high - q_low) / 2
    scale = largest_abs / half_range

    # NOTE: adding random noise to the scale was tried here and is disabled.
    smallest = torch.finfo(torch.float32).eps
    return torch.clamp(scale, min=smallest).to(dtype=torch.float32, device=lows.device)
|
224 |
+
|
225 |
+
|
226 |
+
def init_non_linear_regression_fit(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> Dict[str, torch.Tensor]:
    """Fit the parameters of ``np_fit_func`` row by row with least squares.

    Every row of ``x`` is sorted and ``np_fit_func`` is fitted so that it maps
    the row onto itself, starting from the initial guess ``p0``.

    Args:
        x: Tensor whose first dimension indexes the rows to fit.
        **kwargs: Must contain ``params_list`` (parameter names),
            ``np_fit_func`` (numpy callable ``f(x, *params)`` whose parameters
            are ordered like the sorted ``params_list``) and ``p0`` (dict of
            tensors holding one initial value per row for each parameter).

    Returns:
        A dict mapping each parameter name to a float32 tensor on ``x.device``.
        When the fit fails, the initial guess ``p0`` is returned instead.
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs.get('np_fit_func')
    params_list = kwargs.get('params_list')
    p0 = kwargs.get('p0')

    # 1. curve_fit works on numpy data, so leave torch first
    xdata = x.cpu().numpy()

    # 2. Sorts the data so that it makes it easier to fit to it
    sorted_xdata = np.sort(xdata, axis=-1)

    p0 = {k: v.cpu().numpy() for k, v in p0.items()}
    params_list = sorted(params_list)  # Must match the numpy fit func argument order

    # 3. Finds the best parameters for each row
    try:
        fitted = []
        for i, row in enumerate(sorted_xdata):
            guess = [p0[name][i] for name in params_list]
            popt, _ = curve_fit(
                np_fit_func,
                row,
                row,
                maxfev=1000,
                p0=guess,
                method='lm',
            )
            fitted.append(popt)
    except (ValueError, RuntimeError) as e:
        # curve_fit signals a failed minimization (e.g. maxfev reached) with
        # RuntimeError, so it has to trigger the fallback as well.
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }

    # 4. Regroups the per-row results by parameter name
    return {
        name: torch.tensor([row_params[i] for row_params in fitted], dtype=torch.float32).to(x.device)
        for i, name in enumerate(params_list)
    }
|
280 |
+
|
281 |
+
|
282 |
+
def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 zeros shaped like the ``dim=1`` reduction of ``x``.

    Args:
        x: Tensor with at least two dimensions.
        **kwargs: Accepted for signature compatibility; not used.

    Returns:
        A float32 zero tensor on ``x.device``.
    """
    template = x.amin(dim=1)  # only its shape matters
    return torch.zeros_like(template, dtype=torch.float32, device=x.device)
|
285 |
+
|
286 |
+
|
287 |
+
def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Compute the factor that rescales each row's range to ``[_min, _max]``.

    The range of a row is taken along the last dimension and always includes
    zero.

    Args:
        tensor: Input values.
        _min: Lower bound of the target range.
        _max: Upper bound of the target range. Positive infinity means "no
            rescaling" and yields a tensor of ones.

    Returns:
        A tensor holding ``(_max - _min) / (row_max - row_min)`` per row, or
        ones when ``_max`` is infinite.
    """
    # Calculate the original minimum and maximum values
    lows, highs = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(lows, torch.zeros_like(lows))
    x_max = torch.max(highs, torch.zeros_like(highs))

    # Compare by value: an identity test only matches the default object, so
    # an explicitly passed float("inf") used to produce an infinite scale.
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    # NOTE(review): a row containing only zeros divides by zero here - confirm
    # whether callers can pass such rows.
    return (_max - _min) / (x_max - x_min)
|
299 |
+
|
300 |
+
|
301 |
+
|
302 |
+
############## Quant ###############
|
303 |
+
|
304 |
+
@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: nn.Module,
    deqtz_func: nn.Module,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]:
    """Tune ``params`` with Adam to minimize the quantize/dequantize MSE.

    Args:
        x: Tensor to reconstruct.
        params: Parameters forwarded to ``qtz_func`` and ``deqtz_func``; they
            are updated in place and left with ``requires_grad=True``.
        qtz_func: Forward transform used by ``quantize``.
        deqtz_func: Inverse transform used by ``dequantize``.
        bits: Bit width of the quantized values.
        target_dtype: Dtype of the quantized tensor.
        epochs: Maximum number of optimization steps.
        early_stop: Stop once the loss moved less than ``1e-7`` over the last
            10% of ``epochs``.
        do_report: Print the loss every 10 steps and also return the losses.

    Returns:
        A deep copy of the parameters that produced the lowest loss, with
        gradients disabled. When ``do_report`` is true, a tuple of that dict
        and the list of per-epoch losses.

    Raises:
        Exception: If the loss becomes NaN or infinite during the search.
    """
    loss_fn = nn.MSELoss()

    # The initial learning rate is derived from the order of magnitude of the
    # initial loss: lr = 0.1 * 10 ** (exponent // 2).
    quant = quantize(x, params, qtz_func, bits, target_dtype)
    dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
    initial_loss = loss_fn(x, dequant).item()

    base_lr = 0.1
    # log10(0) is -inf, which int() cannot convert; floor a zero loss at the
    # smallest positive normal float32 instead of crashing.
    safe_loss = max(initial_loss, float(np.finfo(np.float32).tiny))
    exponent = int(np.floor(np.log10(safe_loss)))
    lr = base_lr * (10 ** (exponent // 2))

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    # Defines optimizer and learning-rate schedule
    optimizer = torch.optim.Adam(list(params.values()), lr=lr)
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=epochs // 10)

    # Contains the best loss and the best parameters
    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    percent_epochs_before_stop = 0.1
    epochs_before_stop = int(epochs * percent_epochs_before_stop)

    for i in range(epochs):
        optimizer.zero_grad()

        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        loss = loss_fn(x, dequant)

        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")

        loss_value = loss.item()

        # Snapshot BEFORE the optimizer step so that the stored parameters are
        # the ones that actually produced ``best_loss``.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = copy.deepcopy(dict(params))

        loss.backward()
        optimizer.step()
        scheduler.step()

        acc_loss.append(loss_value)

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {loss_value}")

        # Stops when the loss has not changed considerably during the last 10% epochs
        if early_stop:
            if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - loss_value) < min_delta:
                break

    # With epochs <= 0 the loop never runs; fall back to the incoming values.
    if best_params is None:
        best_params = copy.deepcopy(dict(params))

    # No longer requires gradients in the parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        return best_params, acc_loss
    return best_params
|
391 |
+
|
392 |
+
|
393 |
+
def quantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    """Apply ``func`` to ``x`` and round/clamp the result to a signed integer range.

    Args:
        x: Tensor to quantize.
        params: Keyword arguments forwarded to ``func``.
        func: Forward transform, called as ``func(x=..., **params)`` on the
            transposed input.
        bits: Bit width that defines the clamping range.
        target_dtype: Dtype the clamped values are cast to.

    Returns:
        The rounded, clamped tensor in ``target_dtype`` with the layout of ``x``.
    """
    # NOTE(review): casting to an integer dtype presumably cuts the autograd
    # graph, so gradients would not reach ``func`` - confirm.
    lower, upper = get_min_max_from_bits_signed(bits)
    # Aligns shapes for func, then restores the original layout
    mapped = func(x=x.transpose(0, 1), **params).transpose(0, 1)
    rounded = round_func_BPDA(mapped)
    return torch.clamp(rounded, lower, upper).to(target_dtype)
|
406 |
+
|
407 |
+
|
408 |
+
def dequantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    out_dtype: torch.dtype
) -> torch.Tensor:
    """Cast ``x`` to ``out_dtype`` and apply the inverse transform ``func``.

    Args:
        x: Quantized tensor.
        params: Keyword arguments forwarded to ``func``.
        func: Inverse transform, called as ``func(x=..., **params)`` on the
            transposed input.
        bits: Not used by this function.
        out_dtype: Dtype the input is cast to before ``func`` is applied.

    Returns:
        The transformed tensor with the layout of ``x``.
    """
    converted = x.to(dtype=out_dtype)
    restored = func(x=converted.transpose(0, 1), **params)
    return restored.transpose(0, 1)
|
420 |
+
|
421 |
+
|
422 |
+
def round_func_BPDA(input):
    """Round ``input`` in the forward pass while backpropagating as identity.

    ``torch.round`` is not usefully differentiable, so the rounded values are
    written into a clone of ``input``; the backward pass then sees only the
    clone.

    Args:
        input: Tensor to round.

    Returns:
        A tensor holding the rounded values.
    """
    rounded = torch.round(input)
    passthrough = input.clone()
    # Swapping .data replaces the values without recording an autograd op.
    passthrough.data = rounded.data
    return passthrough
|
429 |
+
|
430 |
+
|
431 |
+
def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    """Return the ``(min, max)`` values of a signed integer with ``bit_width`` bits."""
    half_range = 2 ** (bit_width - 1)
    return -half_range, half_range - 1
|
433 |
+
|
434 |
+
|
435 |
+
|
436 |
+
############## Numpy ###############
|
437 |
+
|
438 |
+
def np_domain_guard(
    x: np.ndarray,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> np.ndarray:
    """Guard an array to a valid domain.

    Args:
        x: Input array.
        min: Lower clipping bound, or None for no lower bound.
        max: Upper clipping bound, or None for no upper bound.
        posinf: Replacement for positive infinity; None keeps numpy's default.
        neginf: Replacement for negative infinity; None keeps numpy's default.
        nan: Replacement for NaN; None means 0.0.

    Returns:
        The array with non-finite values replaced and, when a bound is given,
        clipped to ``[min, max]``.
    """
    # Unlike posinf/neginf, numpy's ``nan`` argument does not accept None, so
    # translate None to the 0.0 default explicitly.
    nan_fill = 0.0 if nan is None else nan
    x = np.nan_to_num(x, nan=nan_fill, posinf=posinf, neginf=neginf)
    if min is not None or max is not None:
        x = np.clip(x, min, max)
    return x
|
451 |
+
|
452 |
+
|
453 |
+
def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray:
    """Swap every occurrence of one value in an array for another.

    Args:
        x (np.ndarray): The input array.
        num (float): The value to look for.
        to (float): The value written where ``num`` was found.

    Returns:
        np.ndarray: A new array with the matches replaced.
    """
    matches = np.equal(x, num)
    return np.where(matches, to, x)
|
465 |
+
|
466 |
+
|
467 |
+
def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Raise ``x`` to ``exp``, clamping negative bases to zero when ``exp < 1``.

    Args:
        x: Base values.
        exp: Exponent.

    Returns:
        ``x ** exp`` for ``exp >= 1``, otherwise ``maximum(x, 0) ** exp``.
    """
    if exp >= 1:
        return np.power(x, exp)
    non_negative = np.maximum(x, 0)
    return np.power(non_negative, exp)
|
470 |
+
|
fn_gen/norm_nlr/10/loss.png
ADDED
fn_gen/norm_nlr/10/quantization.png
ADDED
fn_gen/norm_nlr/11/__pycache__/fn.cpython-311.pyc
ADDED
Binary file (24.6 kB). View file
|
|
fn_gen/norm_nlr/11/distortion.png
ADDED
fn_gen/norm_nlr/11/expressions.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
tan(_0*x)/_s
|
2 |
+
atan(_s*x)/_0
|
fn_gen/norm_nlr/11/fn.py
ADDED
@@ -0,0 +1,469 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import amin # Necessary for arcsin
|
5 |
+
import copy
|
6 |
+
import torch.nn as nn
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
from scipy.optimize import curve_fit
|
10 |
+
from typing import Dict, Any, Tuple, List, Callable
|
11 |
+
|
12 |
+
|
13 |
+
def quantization(x, **params):
    """Apply the forward transform ``tan(_0 * x) / _s``.

    Reads ``params['_s']`` and ``params['_0']``.
    """
    # A scale of exactly 0 is swapped for 10000 before taking the reciprocal.
    inv_scale = torch.div(1, replace_num(params['_s'], num=0, to=10000))
    # Non-finite products are mapped to +1 / -1 / 0 before the tangent.
    angle = domain_guard(params['_0'] * x, posinf=1, neginf=-1, nan=0)
    return inv_scale * torch.tan(angle)
|
15 |
+
|
16 |
+
|
17 |
+
def dequantization(x, **params):
    """Apply the inverse transform ``atan(_s * x) / _0``.

    Reads ``params['_0']`` and ``params['_s']``.
    """
    # A zero `_0` is swapped for 10000 before taking the reciprocal.
    inv_inner = torch.div(1, replace_num(params['_0'], num=0, to=10000))
    return inv_inner * torch.atan(params['_s'] * x)
|
19 |
+
|
20 |
+
|
21 |
+
def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]:
    """Build the initial ``_0`` / ``_s`` parameters for ``x``.

    ``_0`` comes from the random space search, ``_s`` from the linear scale
    initializer, and both are then refined by the non-linear regression fit.
    Optional callables under the ``post_init_hook``, ``post_method_hook`` and
    ``post_train_hook`` keys of ``kwargs`` are invoked with ``parameters=...``.
    """

    def _run_hook(hook_name, parameters):
        # Hooks are optional: only call the ones the caller supplied.
        if hook_name in kwargs:
            kwargs[hook_name](parameters=parameters)

    base_p0 = {}
    base_p0['_0'] = init_space_search(
        x,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_s'],
        param='_0',
        **kwargs,
    )
    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    _run_hook('post_init_hook', base_p0)

    fitted = init_non_linear_regression_fit(
        x,
        p0=base_p0,
        np_fit_func=fit_func,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_s'],
        **kwargs,
    )
    params = {}
    for name, value in fitted.items():
        params[name] = nn.Parameter(value, requires_grad=False)

    _run_hook('post_method_hook', params)
    _run_hook('post_train_hook', params)
    return params
|
40 |
+
|
41 |
+
|
42 |
+
############### Numpy Qtz ###############
|
43 |
+
|
44 |
+
|
45 |
+
def np_quantization(x, _0, _s):
    """NumPy version of the forward transform ``tan(_0 * x) / _s``."""
    # A zero scale is swapped for 10000 before taking the reciprocal.
    inv_scale = np.divide(1, np_replace_num(_s, num=0, to=10000))
    angle = np_domain_guard(_0 * x, posinf=1, neginf=-1, nan=0)
    return inv_scale * np.tan(angle)
|
47 |
+
|
48 |
+
|
49 |
+
def np_dequantization(x, _0, _s):
    """NumPy version of the inverse transform ``arctan(_s * x) / _0``."""
    # A zero `_0` is swapped for 10000 before taking the reciprocal.
    inv_inner = np.divide(1, np_replace_num(_0, num=0, to=10000))
    return inv_inner * np.arctan(_s * x)
|
51 |
+
|
52 |
+
|
53 |
+
def fit_func(x, _0, _s):
    """Quantize then dequantize ``x`` with NumPy; the model fitted by ``curve_fit``."""
    return np_dequantization(np_quantization(x, _0, _s), _0, _s)
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
+
############### HELPERS ###############
|
61 |
+
|
62 |
+
def domain_guard(
    x: torch.Tensor,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> torch.Tensor:
    """Replace non-finite entries of ``x`` and optionally clamp the result.

    ``nan``, ``posinf`` and ``neginf`` are forwarded to ``torch.nan_to_num``;
    clamping happens only when at least one of ``min`` / ``max`` is given.
    """
    guarded = torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
    if min is None and max is None:
        return guarded
    return torch.clamp(guarded, min=min, max=max)
|
75 |
+
|
76 |
+
|
77 |
+
def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Swap every occurrence of one value in a tensor for another.

    Args:
        x (torch.Tensor): The input tensor.
        num (float): The value to look for.
        to (float): The value written where ``num`` was found.

    Returns:
        torch.Tensor: ``x`` with every entry equal to ``num`` replaced by ``to``.
    """
    matches = x == num
    return torch.where(matches, to, x)
|
89 |
+
|
90 |
+
|
91 |
+
def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Raise ``x`` to ``exp``, clamping negative bases to zero when ``exp < 1``."""
    if exp >= 1:
        return torch.pow(x, exp)
    # For exponents below one the base goes through relu (negatives -> 0) first.
    non_negative = torch.relu(x)
    return torch.pow(non_negative, exp)
|
94 |
+
|
95 |
+
|
96 |
+
def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 ones shaped like ``x`` reduced over dim 1, on ``x``'s device."""
    # The reduction is only used as a shape template for the result.
    template = torch.amin(x, dim=1)
    return torch.ones_like(template, dtype=torch.float32, device=x.device)
|
99 |
+
|
100 |
+
|
101 |
+
def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 standard-normal samples shaped like ``x`` reduced over dim 1."""
    # The reduction is only used as a shape template for the result.
    template = torch.amin(x, dim=1)
    return torch.randn_like(template, dtype=torch.float32, device=x.device)
|
104 |
+
|
105 |
+
|
106 |
+
def init_space_search(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Randomly search an initial value for one parameter of the transform.

    Candidates for ``param`` are scored by the MSE between ``x`` and its
    quantize/dequantize round trip while every other parameter is held at
    ones. Each run keeps the few best candidates and samples the next
    generation around them.

    Required kwargs:
        qtz_func: Quantization callable, called as ``qtz_func(x=..., **params)``.
        deqtz_func: Dequantization callable with the same calling convention.
        params_list: Names of all parameters the two callables expect.
        param: Name of the parameter being searched.

    Returns:
        The lowest-loss candidate seen across all runs, or the all-ones /
        random baseline when one of them scores strictly better.
    """

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int):
        """Generates the initial set of parameters. The first iteration generates 10 times more parameters."""
        for _ in range(n_params * 10):
            # Standard-normal samples scaled by max_initial (not a hard bound).
            yield init_rand(tensor) * max_initial

    def _search_param(tensors: List[torch.Tensor], n_params):
        """Takes the best parameters and generates new parameters around the mean of the best parameters."""
        torch_tensors = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(torch_tensors, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(torch_tensors, dim=0)
        for _ in range(n_params):
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        """Round-trips x through quantization and dequantization."""
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        x_ = x_.transpose(0, 1)
        return x_

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50  # Number of runs to try to find the best parameters
    n_random_params = 50  # Number of random parameters to generate
    n_best_to_pick = 5  # Number of best parameters to pick after each run
    max_initial = 10000  # Scale applied to the first generation of candidates

    # Every parameter other than the searched one is held at ones.
    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}

    def _score(candidate):
        """Returns the round-trip MSE of a candidate, or None if it is unusable."""
        try:
            x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: candidate})
            loss = nn.MSELoss()(x, x_).item()
        except Exception:  # The parameters might not be valid for the function's domain
            return None
        # NaN/inf losses cannot be ranked, so they are treated as invalid.
        return loss if np.isfinite(loss) else None

    params = _build_initial_param(x, max_initial, n_random_params)
    overall_best = None  # (candidate, loss) with the lowest loss over all runs

    for _ in range(n_runs):
        best_params = []
        for param_ in params:
            loss = _score(param_)
            if loss is None:
                continue
            if len(best_params) < n_best_to_pick:
                best_params.append((param_, loss))
            elif loss < best_params[-1][1]:
                best_params[-1] = (param_, loss)
            else:
                continue
            best_params.sort(key=lambda entry: entry[1])

        if not best_params:
            # No usable candidate in this generation: nothing to sample around.
            break

        # A later generation may be worse than an earlier one, so the best
        # candidate is tracked across runs instead of taken from the last run.
        if overall_best is None or best_params[0][1] < overall_best[1]:
            overall_best = best_params[0]

        # Generates new parameters around the mean
        params = _search_param([p for p, _ in best_params], n_random_params)

    # Compare the searched value against the all-ones and random baselines.
    # The searched value is listed first so that it wins ties.
    candidates = [] if overall_best is None else [overall_best]
    for baseline in (init_ones(x, **kwargs), init_rand(x, **kwargs)):
        loss = _score(baseline)
        if loss is not None:
            candidates.append((baseline, loss))

    if not candidates:
        # Nothing could be scored at all; fall back to ones.
        return init_ones(x, **kwargs)
    return min(candidates, key=lambda entry: entry[1])[0]
|
189 |
+
|
190 |
+
|
191 |
+
def init_linear_scale(  # Symmetric scale. From the study folder
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Compute a symmetric linear scale from the transformed range of ``x``.

    ``x`` is passed through ``qtz_func`` with ``_s`` set to ones. The largest
    absolute value along dim 1 of the result is divided by half the number of
    integer steps available for ``bits``, and the scale is floored at float32
    epsilon.

    Required kwargs: ``bits``, ``params`` (the other parameters of
    ``qtz_func``) and ``qtz_func``.
    """
    assert "bits" in kwargs, "bits must be provided."
    assert "params" in kwargs, "params must be provided."
    assert "qtz_func" in kwargs, "qtz_func must be provided."

    bits = kwargs.get('bits')
    params = kwargs.get('params')
    qtz_func = kwargs.get('qtz_func')

    unit_scale = init_ones(x, **kwargs)
    transformed = qtz_func(x=x.transpose(0, 1), **params, _s=unit_scale).transpose(0, 1)

    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    half_levels = float(quant_max - quant_min) / 2

    # The observed range is widened, if needed, so that it always contains zero.
    lowest, highest = torch.aminmax(transformed, dim=1)
    lowest = torch.minimum(lowest, torch.zeros_like(lowest))
    highest = torch.maximum(highest, torch.zeros_like(highest))

    largest_magnitude = torch.maximum(-lowest, highest)
    scale = largest_magnitude / half_levels

    # NOTE: the original author experimented with adding random noise to the
    # scale at this point; that step is disabled and no noise is added.
    eps = torch.finfo(torch.float32).eps
    return torch.clamp(scale, min=eps).to(dtype=torch.float32, device=lowest.device)
|
223 |
+
|
224 |
+
|
225 |
+
def init_non_linear_regression_fit(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> Dict[str, torch.Tensor]:
    """Refine initial parameters row by row with ``scipy.optimize.curve_fit``.

    Each row of ``x`` is sorted and fitted against itself, so the fitted
    function is driven towards an identity round trip on that row.

    Required kwargs:
        params_list: Parameter names. They are sorted before use and must then
            match the positional argument order of ``np_fit_func``.
        np_fit_func: NumPy callable ``f(x, *params)`` handed to ``curve_fit``.
        p0: Dict of initial parameter tensors, indexed by row.

    Returns:
        Dict mapping each parameter name to a float32 tensor on ``x``'s device.
        If the fit raises ``ValueError`` or ``RuntimeError`` (``curve_fit``
        raises the latter when the minimization fails), the initial values
        from ``p0`` are returned instead.
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs.get('np_fit_func')
    params_list = kwargs.get('params_list')
    p0 = kwargs.get('p0')

    def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]):
        """Runs a Levenberg-Marquardt fit and returns the optimal parameters."""
        popt, _ = curve_fit(
            func,
            xdata,
            ydata,
            maxfev=1000,
            p0=p0,
            method='lm'
        )
        return popt

    # 1. Converts the torch tensor to numpy (detached so tensors that track
    #    gradients can be converted too)
    xdata = x.detach().cpu().numpy()

    # 2. Sorts the data so that it makes it easier to fit to it
    sorted_xdata = np.sort(xdata, axis=-1)

    p0 = {k: v.detach().cpu().numpy() for k, v in p0.items()}
    params_list = sorted(params_list)  # We need to make sure that it matches the numpy fit func arg order

    # 3. Finds the best parameters for each channel
    try:
        params = [
            _fit(row, row, np_fit_func, [p0[p][i] for p in params_list])
            for i, row in enumerate(sorted_xdata)
        ]
    except (ValueError, RuntimeError) as e:
        # curve_fit signals a failed minimization with RuntimeError, so it
        # must reach the fallback as well.
        print(f"Could not fit the function with error: {e}")
        print(f"Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }

    # 4. Builds the parameters
    return {
        p: torch.tensor([p_[i] for p_ in params], dtype=torch.float32).to(x.device)
        for i, p in enumerate(params_list)
    }
|
279 |
+
|
280 |
+
|
281 |
+
def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 zeros shaped like ``x`` reduced over dim 1, on ``x``'s device."""
    # The reduction is only used as a shape template for the result.
    template = torch.amin(x, dim=1)
    return torch.zeros_like(template, dtype=torch.float32, device=x.device)
|
284 |
+
|
285 |
+
|
286 |
+
def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Compute the factor that maps each slice's value range onto ``[_min, _max]``.

    The range of ``tensor`` is taken along its last dimension and widened, if
    needed, so that it contains zero.

    Args:
        tensor: Input values.
        _min: Lower end of the target range.
        _max: Upper end of the target range. Positive infinity (the default)
            means no scaling is needed and ones are returned.

    Returns:
        ``(_max - _min) / (x_max - x_min)`` along the last dimension, or ones
        when ``_max`` is infinite.
    """
    # Calculate the original minimum and maximum values
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    # Compare by value: an identity check (`is`) only matches the one
    # `torch.inf` object and misses e.g. float('inf').
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    # Calculate the scale factor
    # NOTE(review): an all-zero slice has x_max == x_min and yields an
    # infinite scale here; confirm that callers guard against it.
    scale = (_max - _min) / (x_max - x_min)
    return scale
|
298 |
+
|
299 |
+
|
300 |
+
|
301 |
+
############## Quant ###############
|
302 |
+
|
303 |
+
@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: nn.Module,
    deqtz_func: nn.Module,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]:
    """Tune the parameters by minimizing the quantize/dequantize round-trip MSE.

    Args:
        x: Tensor to reconstruct.
        params: Parameters to optimize. They are switched to
            ``requires_grad=True`` and updated in place by the optimizer.
        qtz_func: Quantization callable passed to ``quantize``.
        deqtz_func: Dequantization callable passed to ``dequantize``.
        bits: Bit width of the quantized representation.
        target_dtype: Dtype produced by ``quantize``.
        epochs: Maximum number of optimization steps.
        early_stop: Stop when the loss changed by less than ``1e-7`` over the
            last 10% of ``epochs``.
        do_report: Print the loss every 10 steps and also return the loss
            history.

    Returns:
        A deep copy of the parameters that produced the lowest loss (with
        ``requires_grad=False``). When ``do_report`` is true, a tuple of that
        dict and the list of per-epoch losses.

    Raises:
        Exception: If the loss becomes NaN or infinite during training.
    """
    loss_fn = nn.MSELoss()

    # Determines the initial learning rate by computing the initial loss and multiplying it by
    # the order of magnitude of the loss divided by 2
    quant = quantize(x, params, qtz_func, bits, target_dtype)
    dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
    loss = loss_fn(x, dequant)

    base_lr = 0.1
    initial_loss = loss.item()
    if initial_loss > 0 and np.isfinite(initial_loss):
        exponent = int(np.floor(np.log10(initial_loss)))
    else:
        # log10 is undefined for a zero or non-finite loss; use the base rate.
        exponent = 0
    lr = base_lr * (10 ** (exponent // 2))

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    param_values = list(params.values())

    # Defines optimizer and learning-rate schedule
    optimizer = torch.optim.Adam(param_values, lr=lr)
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=epochs // 10)

    # Contains the best loss and the best parameters
    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    percent_epochs_before_stop = 0.1
    epochs_before_stop = int(epochs * percent_epochs_before_stop)

    for i in range(epochs):
        optimizer.zero_grad()

        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        loss = loss_fn(x, dequant)

        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")

        current_loss = loss.item()

        # Snapshot BEFORE the optimizer step: this loss was computed with the
        # current parameter values, not with the ones the step produces.
        if current_loss < best_loss:
            best_loss = current_loss
            best_params = copy.deepcopy(dict(params))

        loss.backward()
        optimizer.step()
        scheduler.step()

        acc_loss.append(current_loss)

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {current_loss}")

        # We also stop the search if the loss has not changed considerably during the last 10% epochs
        if early_stop:
            if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta:
                break

    if best_params is None:
        # No epoch ran (epochs == 0): return the parameters unchanged.
        best_params = copy.deepcopy(dict(params))

    # No longer requires gradients in the parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        return best_params, acc_loss
    else:
        return best_params
|
390 |
+
|
391 |
+
|
392 |
+
def quantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    """Transform ``x`` with ``func``, round it, clamp to the signed range and cast.

    ``func`` is applied to ``x`` with dims 0 and 1 swapped and the swap is
    undone afterwards. Rounding goes through ``round_func_BPDA``; the result is
    clamped to the signed ``bits``-wide range and converted to ``target_dtype``.
    """
    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    # Dims 0 and 1 are swapped around the call to align shapes with the parameters.
    transformed = func(x=x.transpose(0, 1), **params).transpose(0, 1)
    rounded = round_func_BPDA(transformed)
    return torch.clamp(rounded, quant_min, quant_max).to(target_dtype)
|
405 |
+
|
406 |
+
|
407 |
+
def dequantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    out_dtype: torch.dtype
) -> torch.Tensor:
    """Cast ``x`` to ``out_dtype`` and apply ``func`` to it.

    ``func`` is applied with dims 0 and 1 swapped and the swap is undone
    afterwards. ``bits`` is accepted but not used here.
    """
    swapped = x.to(dtype=out_dtype).transpose(0, 1)
    return func(x=swapped, **params).transpose(0, 1)
|
419 |
+
|
420 |
+
|
421 |
+
def round_func_BPDA(input):
    """Round ``input`` in the forward pass while backpropagating as identity.

    The returned tensor is a clone of ``input`` (so it stays attached to the
    autograd graph through an identity-like op) whose stored values are
    overwritten with the rounded ones.
    """
    rounded = torch.round(input)
    passthrough = input.clone()
    # Overwriting `.data` swaps the values without autograd recording the round.
    passthrough.data = rounded.data
    return passthrough
|
428 |
+
|
429 |
+
|
430 |
+
def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    """Return the ``(min, max)`` pair of a signed integer with ``bit_width`` bits."""
    half_range = 2 ** (bit_width - 1)
    return -half_range, half_range - 1
|
432 |
+
|
433 |
+
|
434 |
+
|
435 |
+
############## Numpy ###############
|
436 |
+
|
437 |
+
def np_domain_guard(
    x: np.ndarray,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> np.ndarray:
    """Guard an array to a valid domain.

    Non-finite entries are replaced first, then the result is optionally
    clipped.

    Args:
        x: The input array.
        min: Lower clipping bound; ``None`` leaves the lower side unbounded.
        max: Upper clipping bound; ``None`` leaves the upper side unbounded.
        posinf: Replacement for ``+inf``; ``None`` keeps NumPy's default.
        neginf: Replacement for ``-inf``; ``None`` keeps NumPy's default.
        nan: Replacement for NaN; ``None`` means 0.0.

    Returns:
        The guarded array.
    """
    # np.nan_to_num expects a numeric fill value for `nan` (its default is
    # 0.0), so the None default of this wrapper is mapped explicitly.
    nan_fill = 0.0 if nan is None else nan
    x = np.nan_to_num(x, nan=nan_fill, posinf=posinf, neginf=neginf)
    if min is not None or max is not None:
        x = np.clip(x, min, max)
    return x
|
450 |
+
|
451 |
+
|
452 |
+
def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray:
    """Swap every occurrence of one value in an array for another.

    Args:
        x (np.ndarray): The input array.
        num (float): The value to look for.
        to (float): The value written where ``num`` was found.

    Returns:
        np.ndarray: ``x`` with every entry equal to ``num`` replaced by ``to``.
    """
    matches = x == num
    return np.where(matches, to, x)
|
464 |
+
|
465 |
+
|
466 |
+
def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Raise ``x`` to ``exp``, clamping negative bases to zero when ``exp < 1``."""
    if exp >= 1:
        return np.power(x, exp)
    # For exponents below one the base is clamped to be non-negative first.
    non_negative = np.maximum(x, 0)
    return np.power(non_negative, exp)
|
469 |
+
|
fn_gen/norm_nlr/11/loss.png
ADDED
fn_gen/norm_nlr/11/quantization.png
ADDED
fn_gen/norm_nlr/12/__pycache__/fn.cpython-311.pyc
ADDED
Binary file (25.4 kB). View file
|
|
fn_gen/norm_nlr/12/distortion.png
ADDED
fn_gen/norm_nlr/12/expressions.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
acos(_0*(-_1 + x) - 0.99999)/_s
|
2 |
+
(_0*_1 + cos(_s*x) + 0.99999)/_0
|
fn_gen/norm_nlr/12/fn.py
ADDED
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import amin # Necessary for arcsin
|
5 |
+
import copy
|
6 |
+
import torch.nn as nn
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
from scipy.optimize import curve_fit
|
10 |
+
from typing import Dict, Any, Tuple, List, Callable
|
11 |
+
|
12 |
+
|
13 |
+
def quantization(x, **params):
    """Apply the forward transform ``acos(_0 * (x - _1) - 0.99999) / _s``.

    Reads ``params['_s']``, ``params['_0']`` and ``params['_1']``.
    """
    # A scale of exactly 0 is swapped for 10000 before taking the reciprocal.
    inv_scale = torch.div(1, replace_num(params['_s'], num=0, to=10000))
    inner_scale, offset = params['_0'], params['_1']
    centered = x + (torch.tensor(-1) * offset)
    argument = torch.tensor(-0.99999) + (inner_scale * centered)
    # The argument is clamped to [-0.99999, 0.99999] (NaN -> 0) before acos.
    guarded = domain_guard(argument, min=-0.99999, max=0.99999, nan=0)
    return inv_scale * torch.acos(guarded)
|
15 |
+
|
16 |
+
|
17 |
+
def dequantization(x, **params):
    """Apply the inverse transform ``(_0 * _1 + cos(_s * x) + 0.99999) / _0``.

    Reads ``params['_0']``, ``params['_1']`` and ``params['_s']``.
    """
    # A zero `_0` is swapped for 10000 before taking the reciprocal.
    inv_inner = torch.div(1, replace_num(params['_0'], num=0, to=10000))
    restored = torch.tensor(0.99999) + (params['_0'] * params['_1']) + torch.cos(params['_s'] * x)
    return inv_inner * restored
|
19 |
+
|
20 |
+
|
21 |
+
def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]:
    """Build the initial ``_0`` / ``_1`` / ``_s`` parameters for ``x``.

    ``_0`` is the inner scale towards ``[-0.99999, 0.99999]``, ``_1`` the
    minimum of ``x`` along its last dimension, and ``_s`` the linear scale.
    All three are then refined by the non-linear regression fit. Optional
    callables under the ``post_init_hook``, ``post_method_hook`` and
    ``post_train_hook`` keys of ``kwargs`` are invoked with ``parameters=...``.
    """

    def _run_hook(hook_name, parameters):
        # Hooks are optional: only call the ones the caller supplied.
        if hook_name in kwargs:
            kwargs[hook_name](parameters=parameters)

    base_p0 = {}
    base_p0['_0'] = init_inner_scale(x, **{'_min': -0.99999, '_max': 0.99999})
    base_p0['_1'] = amin(x, **{'dim': -1})
    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    _run_hook('post_init_hook', base_p0)

    fitted = init_non_linear_regression_fit(
        x,
        p0=base_p0,
        np_fit_func=fit_func,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_1', '_s'],
        **kwargs,
    )
    params = {}
    for name, value in fitted.items():
        params[name] = nn.Parameter(value, requires_grad=False)

    _run_hook('post_method_hook', params)
    _run_hook('post_train_hook', params)
    return params
|
41 |
+
|
42 |
+
|
43 |
+
############### Numpy Qtz ###############
|
44 |
+
|
45 |
+
|
46 |
+
def np_quantization(x, _0, _1, _s):
    """NumPy version of the forward transform ``arccos(_0 * (x - _1) - 0.99999) / _s``."""
    # A zero scale is swapped for 10000 before taking the reciprocal.
    inv_scale = np.divide(1, np_replace_num(_s, num=0, to=10000))
    centered = x + (np.array(-1) * _1)
    argument = np.array(-0.99999) + (_0 * centered)
    # The argument is clipped to [-0.99999, 0.99999] (NaN -> 0) before arccos.
    guarded = np_domain_guard(argument, min=-0.99999, max=0.99999, nan=0)
    return inv_scale * np.arccos(guarded)
|
48 |
+
|
49 |
+
|
50 |
+
def np_dequantization(x, _0, _1, _s):
    """NumPy version of the inverse transform ``(_0 * _1 + cos(_s * x) + 0.99999) / _0``."""
    # A zero `_0` is swapped for 10000 before taking the reciprocal.
    inv_inner = np.divide(1, np_replace_num(_0, num=0, to=10000))
    restored = np.array(0.99999) + (_0 * _1) + np.cos(_s * x)
    return inv_inner * restored
|
52 |
+
|
53 |
+
|
54 |
+
def fit_func(x, _0, _1, _s):
    """Quantize then dequantize ``x`` with NumPy; the model fitted by ``curve_fit``."""
    return np_dequantization(np_quantization(x, _0, _1, _s), _0, _1, _s)
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
############### HELPERS ###############
|
62 |
+
|
63 |
+
def domain_guard(
    x: torch.Tensor,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> torch.Tensor:
    """Replace non-finite entries of ``x`` and optionally clamp the result.

    ``nan``, ``posinf`` and ``neginf`` are forwarded to ``torch.nan_to_num``;
    clamping happens only when at least one of ``min`` / ``max`` is given.
    """
    guarded = torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
    if min is None and max is None:
        return guarded
    return torch.clamp(guarded, min=min, max=max)
|
76 |
+
|
77 |
+
|
78 |
+
def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Swap every occurrence of one value in a tensor for another.

    Args:
        x (torch.Tensor): The input tensor.
        num (float): The value to look for.
        to (float): The value written where ``num`` was found.

    Returns:
        torch.Tensor: ``x`` with every entry equal to ``num`` replaced by ``to``.
    """
    matches = x == num
    return torch.where(matches, to, x)
|
90 |
+
|
91 |
+
|
92 |
+
def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Raise ``x`` to ``exp``, clamping negative bases to zero when ``exp < 1``."""
    if exp >= 1:
        return torch.pow(x, exp)
    # For exponents below one the base goes through relu (negatives -> 0) first.
    non_negative = torch.relu(x)
    return torch.pow(non_negative, exp)
|
95 |
+
|
96 |
+
|
97 |
+
def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 ones shaped like ``x`` reduced over dim 1, on ``x``'s device."""
    # The reduction is only used as a shape template for the result.
    template = torch.amin(x, dim=1)
    return torch.ones_like(template, dtype=torch.float32, device=x.device)
|
100 |
+
|
101 |
+
|
102 |
+
def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 standard-normal samples shaped like ``x`` reduced over dim 1."""
    # The reduction is only used as a shape template for the result.
    template = torch.amin(x, dim=1)
    return torch.randn_like(template, dtype=torch.float32, device=x.device)
|
105 |
+
|
106 |
+
|
107 |
+
def init_space_search(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Randomly search an initial value for one parameter of the transform.

    Candidates for ``param`` are scored by the MSE between ``x`` and its
    quantize/dequantize round trip while every other parameter is held at
    ones. Each run keeps the few best candidates and samples the next
    generation around them.

    Required kwargs:
        qtz_func: Quantization callable, called as ``qtz_func(x=..., **params)``.
        deqtz_func: Dequantization callable with the same calling convention.
        params_list: Names of all parameters the two callables expect.
        param: Name of the parameter being searched.

    Returns:
        The lowest-loss candidate seen across all runs, or the all-ones /
        random baseline when one of them scores strictly better.
    """

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int):
        """Generates the initial set of parameters. The first iteration generates 10 times more parameters."""
        for _ in range(n_params * 10):
            # Standard-normal samples scaled by max_initial (not a hard bound).
            yield init_rand(tensor) * max_initial

    def _search_param(tensors: List[torch.Tensor], n_params):
        """Takes the best parameters and generates new parameters around the mean of the best parameters."""
        torch_tensors = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(torch_tensors, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(torch_tensors, dim=0)
        for _ in range(n_params):
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        """Round-trips x through quantization and dequantization."""
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        x_ = x_.transpose(0, 1)
        return x_

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50  # Number of runs to try to find the best parameters
    n_random_params = 50  # Number of random parameters to generate
    n_best_to_pick = 5  # Number of best parameters to pick after each run
    max_initial = 10000  # Scale applied to the first generation of candidates

    # Every parameter other than the searched one is held at ones.
    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}

    def _score(candidate):
        """Returns the round-trip MSE of a candidate, or None if it is unusable."""
        try:
            x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: candidate})
            loss = nn.MSELoss()(x, x_).item()
        except Exception:  # The parameters might not be valid for the function's domain
            return None
        # NaN/inf losses cannot be ranked, so they are treated as invalid.
        return loss if np.isfinite(loss) else None

    params = _build_initial_param(x, max_initial, n_random_params)
    overall_best = None  # (candidate, loss) with the lowest loss over all runs

    for _ in range(n_runs):
        best_params = []
        for param_ in params:
            loss = _score(param_)
            if loss is None:
                continue
            if len(best_params) < n_best_to_pick:
                best_params.append((param_, loss))
            elif loss < best_params[-1][1]:
                best_params[-1] = (param_, loss)
            else:
                continue
            best_params.sort(key=lambda entry: entry[1])

        if not best_params:
            # No usable candidate in this generation: nothing to sample around.
            break

        # A later generation may be worse than an earlier one, so the best
        # candidate is tracked across runs instead of taken from the last run.
        if overall_best is None or best_params[0][1] < overall_best[1]:
            overall_best = best_params[0]

        # Generates new parameters around the mean
        params = _search_param([p for p, _ in best_params], n_random_params)

    # Compare the searched value against the all-ones and random baselines.
    # The searched value is listed first so that it wins ties.
    candidates = [] if overall_best is None else [overall_best]
    for baseline in (init_ones(x, **kwargs), init_rand(x, **kwargs)):
        loss = _score(baseline)
        if loss is not None:
            candidates.append((baseline, loss))

    if not candidates:
        # Nothing could be scored at all; fall back to ones.
        return init_ones(x, **kwargs)
    return min(candidates, key=lambda entry: entry[1])[0]
|
190 |
+
|
191 |
+
|
192 |
+
def init_linear_scale(  # Symmetric scale. From the study folder
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Compute a symmetric linear scale from the transformed range of ``x``.

    ``x`` is passed through ``qtz_func`` with ``_s`` set to ones. The largest
    absolute value along dim 1 of the result is divided by half the number of
    integer steps available for ``bits``, and the scale is floored at float32
    epsilon.

    Required kwargs: ``bits``, ``params`` (the other parameters of
    ``qtz_func``) and ``qtz_func``.
    """
    assert "bits" in kwargs, "bits must be provided."
    assert "params" in kwargs, "params must be provided."
    assert "qtz_func" in kwargs, "qtz_func must be provided."

    bits = kwargs.get('bits')
    params = kwargs.get('params')
    qtz_func = kwargs.get('qtz_func')

    unit_scale = init_ones(x, **kwargs)
    transformed = qtz_func(x=x.transpose(0, 1), **params, _s=unit_scale).transpose(0, 1)

    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    half_levels = float(quant_max - quant_min) / 2

    # The observed range is widened, if needed, so that it always contains zero.
    lowest, highest = torch.aminmax(transformed, dim=1)
    lowest = torch.minimum(lowest, torch.zeros_like(lowest))
    highest = torch.maximum(highest, torch.zeros_like(highest))

    largest_magnitude = torch.maximum(-lowest, highest)
    scale = largest_magnitude / half_levels

    # NOTE: the original author experimented with adding random noise to the
    # scale at this point; that step is disabled and no noise is added.
    eps = torch.finfo(torch.float32).eps
    return torch.clamp(scale, min=eps).to(dtype=torch.float32, device=lowest.device)
|
224 |
+
|
225 |
+
|
226 |
+
def init_non_linear_regression_fit(
        x: torch.Tensor,
        **kwargs: Dict[str, Any],
) -> Dict[str, torch.Tensor]:
    """Fit the parameters row by row with ``scipy.optimize.curve_fit``.

    Each row of ``x`` is sorted and ``kwargs['np_fit_func']`` is fitted so that it
    maps the row onto itself, starting from the per-row values in ``kwargs['p0']``.

    Required kwargs:
        params_list: parameter names; they are sorted so that their order matches
            the positional arguments of ``np_fit_func``.
        np_fit_func: callable ``f(x, *params)`` handed to ``curve_fit``.
        p0: mapping of parameter name to a tensor holding one start value per row.

    Returns:
        Mapping of parameter name to a float32 tensor on ``x.device``. If the fit
        fails, the start values ``p0`` are returned instead.
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs.get('np_fit_func')
    # We need to make sure that it matches the numpy fit func arg order
    params_list = sorted(kwargs.get('params_list'))
    # detach() so that tensors which require grad can still be exported to numpy
    p0 = {k: v.detach().cpu().numpy() for k, v in kwargs.get('p0').items()}

    # Sorting each row makes the curve easier to fit.
    sorted_xdata = np.sort(x.detach().cpu().numpy(), axis=-1)

    try:
        fitted = []
        for i, row in enumerate(sorted_xdata):
            start = [p0[p][i] for p in params_list]
            popt, _ = curve_fit(np_fit_func, row, row, maxfev=1000, p0=start, method='lm')
            fitted.append(popt)
    except (ValueError, RuntimeError) as e:
        # curve_fit raises ValueError for invalid data/options and RuntimeError
        # when the least-squares minimisation does not converge.
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }

    return {
        p: torch.tensor([ch[j] for ch in fitted], dtype=torch.float32).to(x.device)
        for j, p in enumerate(params_list)
    }
|
280 |
+
|
281 |
+
|
282 |
+
def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 zeros shaped like ``x`` reduced over dim 1, on ``x.device``."""
    reduced = x.amin(dim=1)  # only used as a shape template
    return torch.zeros_like(reduced, dtype=torch.float32, device=x.device)
|
285 |
+
|
286 |
+
|
287 |
+
def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Return the factor that rescales the value range of ``tensor`` to ``[_min, _max]``.

    The range is measured along the last dim and always extended to contain zero.

    Args:
        tensor: input values.
        _min: lower end of the target range.
        _max: upper end of the target range; an infinite value means
            "do not rescale" and yields a tensor of ones.

    Returns:
        ``(_max - _min) / (x_max - x_min)`` per slice, or ones when ``_max`` is infinite.
    """
    # Calculate the original minimum and maximum values
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    # Compare by value: ``is`` only matches the very same float object, so an
    # equal-valued ``float("inf")`` supplied by a caller would be treated as finite.
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    # NOTE(review): a slice whose values are all zero gives x_max == x_min and
    # therefore a division by zero here.
    scale = (_max - _min) / (x_max - x_min)
    return scale
|
299 |
+
|
300 |
+
|
301 |
+
|
302 |
+
############## Quant ###############
|
303 |
+
|
304 |
+
@torch.enable_grad()
def learn_parameters(
        x: torch.Tensor,
        params: Dict[str, nn.Parameter],
        qtz_func: nn.Module,
        deqtz_func: nn.Module,
        bits: int,
        target_dtype: torch.dtype,
        epochs: int = 1000,
        early_stop: bool = True,
        do_report: bool = False
) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]:
    """Tune ``params`` with Adam to minimise the quantize/dequantize MSE on ``x``.

    Args:
        x: tensor to reconstruct.
        params: parameters optimised in place; gradients are enabled on them.
        qtz_func / deqtz_func: transforms forwarded to ``quantize`` / ``dequantize``.
        bits: bit width forwarded to ``quantize`` / ``dequantize``.
        target_dtype: dtype of the quantized tensor.
        epochs: maximum number of optimisation steps.
        early_stop: stop once the loss moved by less than 1e-7 over 10% of ``epochs``.
        do_report: print the loss every 10 steps and also return the loss history.

    Returns:
        A deep copy of the parameters with the lowest loss (gradients disabled).
        With ``do_report=True`` a tuple ``(best_params, losses)`` is returned,
        where ``losses`` is a list of floats.

    Raises:
        Exception: if the loss becomes NaN or Inf during the search.
    """
    loss_fn = nn.MSELoss()

    # The initial learning rate is derived from the order of magnitude of the
    # initial loss (halved).
    quant = quantize(x, params, qtz_func, bits, target_dtype)
    dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
    initial_loss = loss_fn(x, dequant).item()

    base_lr = 0.1
    # log10 only makes sense for a positive, finite loss; otherwise keep base_lr
    # instead of crashing on int(-inf) / int(nan).
    if np.isfinite(initial_loss) and initial_loss > 0:
        exponent = int(np.floor(np.log10(initial_loss)))
    else:
        exponent = 0
    lr = base_lr * (10 ** (exponent // 2))

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    optimizer = torch.optim.Adam(list(params.values()), lr=lr)
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=epochs // 10)

    # Contains the best loss and the best parameters
    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    epochs_before_stop = int(epochs * 0.1)

    for i in range(epochs):
        optimizer.zero_grad()

        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        loss = loss_fn(x, dequant)

        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")

        loss_value = loss.item()
        acc_loss.append(loss_value)

        # Snapshot BEFORE the optimizer step: loss_value was produced by the
        # current parameter values, not by the updated ones.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = copy.deepcopy(dict(params))

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {loss_value}")

        # Stop if the loss has not changed considerably during the last 10% epochs
        if early_stop and i > epochs_before_stop \
                and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta:
            break

    if best_params is None:
        # No step was run (epochs == 0): return the untouched starting point.
        best_params = copy.deepcopy(dict(params))

    # No longer requires gradients in the parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        return best_params, acc_loss
    return best_params
|
391 |
+
|
392 |
+
|
393 |
+
def quantize(
        x: torch.Tensor,
        params: Dict[str, nn.Parameter],
        func: nn.Module,
        bits: int,
        target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    """Apply ``func`` to ``x``, round, clamp to the signed ``bits`` range and cast.

    ``func`` is evaluated on ``x`` with dims 0 and 1 swapped and the result is
    swapped back before rounding with ``round_func_BPDA``.
    """
    low, high = get_min_max_from_bits_signed(bits)
    # Aligns shapes
    projected = func(x=x.transpose(0, 1), **params).transpose(0, 1)
    rounded = round_func_BPDA(projected)
    return torch.clamp(rounded, low, high).to(target_dtype)
|
406 |
+
|
407 |
+
|
408 |
+
def dequantize(
        x: torch.Tensor,
        params: Dict[str, nn.Parameter],
        func: nn.Module,
        bits: int,
        out_dtype: torch.dtype
) -> torch.Tensor:
    """Cast ``x`` to ``out_dtype`` and map it back through ``func``.

    ``func`` is evaluated on the tensor with dims 0 and 1 swapped and the result
    is swapped back. ``bits`` is accepted but not used by this function.
    """
    as_float = x.to(dtype=out_dtype)
    restored = func(x=as_float.transpose(0, 1), **params)
    return restored.transpose(0, 1)
|
420 |
+
|
421 |
+
|
422 |
+
def round_func_BPDA(input):
    """Round ``input`` in the forward pass while behaving like identity in backward.

    The rounded values are written into the storage of a clone of ``input``, so
    autograd only sees the (differentiable) clone.
    """
    passthrough = input.clone()
    passthrough.data = torch.round(input).data
    return passthrough
|
429 |
+
|
430 |
+
|
431 |
+
def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    """Return ``(min, max)`` of a signed two's-complement integer of ``bit_width`` bits."""
    magnitude = 2 ** (bit_width - 1)
    return -magnitude, magnitude - 1
|
433 |
+
|
434 |
+
|
435 |
+
|
436 |
+
############## Numpy ###############
|
437 |
+
|
438 |
+
def np_domain_guard(
        x: np.ndarray,
        min: float = None,
        max: float = None,
        posinf: float = None,
        neginf: float = None,
        nan: float = None
) -> np.ndarray:
    """Guard a tensor to a valid domain.

    Args:
        x: input array.
        min / max: optional clipping bounds; clipping is skipped when both are None.
        posinf / neginf: replacements for +/-inf (numpy's defaults when None).
        nan: replacement for NaN; ``None`` means 0.0.

    Returns:
        The array with non-finite values replaced and, if requested, clipped.
    """
    # np.nan_to_num needs a number for ``nan`` (its own default is 0.0); passing
    # None through would fail, unlike torch.nan_to_num which accepts None.
    x = np.nan_to_num(x, nan=0.0 if nan is None else nan, posinf=posinf, neginf=neginf)
    if min is not None or max is not None:
        x = np.clip(x, min, max)
    return x
|
451 |
+
|
452 |
+
|
453 |
+
def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray:
    """Swap every occurrence of one value for another.

    Args:
        x (np.ndarray): The input tensor.
        num (float): The value to look for.
        to (float): The value written wherever ``x == num``.

    Returns:
        np.ndarray: ``x`` with the matching entries replaced.
    """
    matches = x == num
    return np.where(matches, to, x)
|
465 |
+
|
466 |
+
|
467 |
+
def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Raise ``x`` to ``exp``; for ``exp < 1`` negative bases are clamped to 0 first."""
    if exp >= 1:
        return np.power(x, exp)
    non_negative = np.maximum(x, 0)
    return np.power(non_negative, exp)
|
470 |
+
|
fn_gen/norm_nlr/12/loss.png
ADDED
fn_gen/norm_nlr/12/quantization.png
ADDED
fn_gen/norm_nlr/13/__pycache__/fn.cpython-311.pyc
ADDED
Binary file (25.3 kB). View file
|
|
fn_gen/norm_nlr/13/distortion.png
ADDED
fn_gen/norm_nlr/13/expressions.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
log(_0*(-_1 + x) + 1.0e-5)/_s
|
2 |
+
(_0*_1 + exp(_s*x) - 1.0e-5)/_0
|
fn_gen/norm_nlr/13/fn.py
ADDED
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import amin # Necessary for arcsin
|
5 |
+
import copy
|
6 |
+
import torch.nn as nn
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
from scipy.optimize import curve_fit
|
10 |
+
from typing import Dict, Any, Tuple, List, Callable
|
11 |
+
|
12 |
+
|
13 |
+
def quantization(x, **params):
    """Forward transform ``log(_0 * (x - _1) + 1e-5) / _s``.

    A zero ``_s`` is replaced by 10000 before dividing, and the log argument is
    clamped to at least 1e-5 (NaN becomes 1e-5).
    """
    inv_scale = torch.div(1, replace_num(params['_s'], num=0, to=10000))
    inner_scale = params['_0']
    shifted = x + (torch.tensor(-1) * params['_1'])
    log_arg = torch.tensor(1e-05) + (inner_scale * shifted)
    guarded = domain_guard(log_arg, min=1e-5, nan=1e-5)
    return inv_scale * torch.log(guarded)
|
15 |
+
|
16 |
+
|
17 |
+
def dequantization(x, **params):
    """Inverse transform ``(_0 * _1 + exp(_s * x) - 1e-5) / _0``.

    A zero ``_0`` in the divisor is replaced by 10000.
    """
    inv_inner = torch.div(1, replace_num(params['_0'], num=0, to=10000))
    numerator = torch.tensor(-1e-05) + (params['_0'] * params['_1']) + torch.exp(params['_s'] * x)
    return inv_inner * numerator
|
19 |
+
|
20 |
+
|
21 |
+
def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]:
    """Build the initial ``_0``, ``_1`` and ``_s`` parameters for ``x``.

    Start values come from ``init_inner_scale``, ``amin`` and ``init_linear_scale``
    and are then refined by ``init_non_linear_regression_fit``. The optional
    callables ``post_init_hook``, ``post_method_hook`` and ``post_train_hook`` in
    ``kwargs`` are invoked with ``parameters=...`` at the matching stage.
    """

    def _run_hook(name, parameters):
        # Hooks are optional; call one only when the caller supplied it.
        if name in kwargs:
            kwargs[name](parameters=parameters)

    initial = {}
    initial['_0'] = init_inner_scale(x, _min=1e-05)
    initial['_1'] = amin(x, dim=-1)
    initial['_s'] = init_linear_scale(x, qtz_func=quantization, params=initial, **kwargs)
    _run_hook('post_init_hook', initial)

    fitted = init_non_linear_regression_fit(
        x,
        p0=initial,
        np_fit_func=fit_func,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_1', '_s'],
        **kwargs,
    )
    learned = {name: nn.Parameter(value, requires_grad=False) for name, value in fitted.items()}

    for hook_name in ('post_method_hook', 'post_train_hook'):
        _run_hook(hook_name, learned)

    return learned
|
41 |
+
|
42 |
+
|
43 |
+
############### Numpy Qtz ###############
|
44 |
+
|
45 |
+
|
46 |
+
def np_quantization(x, _0, _1, _s):
    """NumPy forward transform ``log(_0 * (x - _1) + 1e-5) / _s``.

    A zero ``_s`` is replaced by 10000 and the log argument is clamped to at
    least 1e-5 (NaN becomes 1e-5).
    """
    inv_scale = np.divide(1, np_replace_num(_s, num=0, to=10000))
    shifted = x + (np.array(-1) * _1)
    log_arg = np.array(1e-05) + (_0 * shifted)
    guarded = np_domain_guard(log_arg, min=1e-5, nan=1e-5)
    return inv_scale * np.log(guarded)
|
48 |
+
|
49 |
+
|
50 |
+
def np_dequantization(x, _0, _1, _s):
    """NumPy inverse transform ``(_0 * _1 + exp(_s * x) - 1e-5) / _0``.

    A zero ``_0`` in the divisor is replaced by 10000.
    """
    inv_inner = np.divide(1, np_replace_num(_0, num=0, to=10000))
    numerator = np.array(-1e-05) + (_0 * _1) + np.exp(_s * x)
    return inv_inner * numerator
|
52 |
+
|
53 |
+
|
54 |
+
def fit_func(x, _0, _1, _s):
    """Round-trip ``x`` through ``np_quantization`` and ``np_dequantization``."""
    encoded = np_quantization(x, _0, _1, _s)
    return np_dequantization(encoded, _0, _1, _s)
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
############### HELPERS ###############
|
62 |
+
|
63 |
+
def domain_guard(
        x: torch.Tensor,
        min: float = None,
        max: float = None,
        posinf: float = None,
        neginf: float = None,
        nan: float = None
) -> torch.Tensor:
    """Replace non-finite entries of ``x`` and optionally clamp it to ``[min, max]``.

    ``nan``, ``posinf`` and ``neginf`` are forwarded to ``torch.nan_to_num``;
    clamping is skipped when both bounds are ``None``.
    """
    cleaned = torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
    if min is None and max is None:
        return cleaned
    return torch.clamp(cleaned, min=min, max=max)
|
76 |
+
|
77 |
+
|
78 |
+
def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Swap every occurrence of one value for another.

    Args:
        x (torch.Tensor): The input tensor.
        num (float): The value to look for.
        to (float): The value written wherever ``x == num``.

    Returns:
        torch.Tensor: ``x`` with the matching entries replaced.
    """
    matches = x == num
    return torch.where(matches, to, x)
|
90 |
+
|
91 |
+
|
92 |
+
def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Raise ``x`` to ``exp``; for ``exp < 1`` negative bases are clamped to 0 first."""
    if exp >= 1:
        return torch.pow(x, exp)
    non_negative = torch.relu(x)
    return torch.pow(non_negative, exp)
|
95 |
+
|
96 |
+
|
97 |
+
def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 ones shaped like ``x`` reduced over dim 1, on ``x.device``."""
    reduced = x.amin(dim=1)  # only used as a shape template
    return torch.ones_like(reduced, dtype=torch.float32, device=x.device)
|
100 |
+
|
101 |
+
|
102 |
+
def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 standard-normal noise shaped like ``x`` reduced over dim 1."""
    reduced = x.amin(dim=1)  # only used as a shape template
    return torch.randn_like(reduced, dtype=torch.float32, device=x.device)
|
105 |
+
|
106 |
+
|
107 |
+
def init_space_search(
        x: torch.Tensor,
        **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Random search for a good start value of the parameter ``kwargs['param']``.

    All other names in ``kwargs['params_list']`` are fixed to ones. Candidates are
    scored by the MSE between ``x`` and its ``qtz_func`` -> ``deqtz_func`` round
    trip. Each run keeps the best few candidates and samples the next batch around
    them; the winner of the last run is finally compared against an all-ones and
    a random candidate.

    Required kwargs: ``qtz_func``, ``deqtz_func``, ``params_list``, ``param``.

    Returns:
        The candidate tensor with the lowest loss.
    """

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int):
        """Yield the first batch (10x larger): normal draws scaled by ``max_initial``."""
        for _ in range(n_params * 10):
            yield init_rand(tensor) * max_initial

    def _search_param(tensors: List[torch.Tensor], n_params):
        """Yield new candidates sampled around the mean of the given best ones."""
        torch_tensors = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(torch_tensors, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(torch_tensors, dim=0)
        for _ in range(n_params):
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        """Round-trip ``x`` through both functions with dims 0 and 1 swapped."""
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        x_ = x_.transpose(0, 1)
        return x_

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50  # Number of runs to try to find the best parameters
    n_random_params = 50  # Number of random parameters to generate
    n_best_to_pick = 5  # Number of best parameters to pick after each run
    max_initial = 10000  # Maximum value to initialize the parameters

    # Initializes the parameters
    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}
    params = _build_initial_param(x, max_initial, n_random_params)

    def _loss(candidate: torch.Tensor) -> float:
        """Reconstruction MSE of ``x`` when ``param`` takes the value ``candidate``."""
        x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: candidate})
        return nn.MSELoss()(x, x_).item()

    # Performs the search
    best_params = []
    for _ in range(n_runs):

        best_params = []
        for param_ in params:
            try:
                loss = _loss(param_)
            except Exception:  # The parameters might not be valid for the function's domain
                continue

            # A NaN/Inf loss can never be the best one and would make the
            # ordering of the list below meaningless.
            if not np.isfinite(loss):
                continue

            if len(best_params) < n_best_to_pick:
                best_params.append((param_, loss))
            elif loss < best_params[-1][1]:
                best_params[-1] = (param_, loss)
            else:
                continue
            best_params.sort(key=lambda entry: entry[1])

        if best_params:
            # Generates new parameters around the mean
            params = _search_param([p for p, _ in best_params], n_random_params)
        else:
            # Nothing usable in this batch: start over from fresh random values.
            params = _build_initial_param(x, max_initial, n_random_params)

    # Checks if the best parameter is better than the init_ones
    p_ones = init_ones(x, **kwargs)
    loss_ones = _loss(p_ones)

    # Checks if the best parameter is better than the init_rand
    p_rand = init_rand(x, **kwargs)
    loss_rand = _loss(p_rand)

    best_loss = best_params[0][1] if best_params else float("inf")
    if loss_rand < best_loss and loss_rand < loss_ones:
        return p_rand
    if loss_ones < best_loss and loss_ones < loss_rand:
        return p_ones
    if best_params:
        return best_params[0][0]
    # The search produced no valid candidate at all: fall back to ones.
    return p_ones
|
190 |
+
|
191 |
+
|
192 |
+
def init_linear_scale(  # Symmetric scale. From the study folder
        x: torch.Tensor,
        **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Derive a symmetric linear scale from the transformed input.

    ``x`` is passed through ``kwargs['qtz_func']`` with ``kwargs['params']`` and a
    unit ``_s``; the largest magnitude along dim 1 of the result (zero is always
    included in the range) is divided by half of the signed integer range for
    ``kwargs['bits']``.

    Returns:
        A float32 tensor of scales, clamped from below to float32 machine epsilon.
    """
    assert "bits" in kwargs, "bits must be provided."
    assert "params" in kwargs, "params must be provided."
    assert "qtz_func" in kwargs, "qtz_func must be provided."

    transform = kwargs.get('qtz_func')
    unit_scale = init_ones(x, **kwargs)
    transformed = transform(
        x=x.transpose(0, 1), **kwargs.get('params'), _s=unit_scale
    ).transpose(0, 1)

    q_lo, q_hi = get_min_max_from_bits_signed(kwargs.get('bits'))
    half_range = float(q_hi - q_lo) / 2

    # Extend the observed range so that it always contains zero.
    lo, hi = torch.aminmax(transformed, dim=1)
    lo = torch.min(lo, torch.zeros_like(lo))
    hi = torch.max(hi, torch.zeros_like(hi))
    peak = torch.max(-lo, hi)

    tiny = torch.finfo(torch.float32).eps
    return torch.clamp(peak / half_range, min=tiny).to(dtype=torch.float32, device=lo.device)
|
224 |
+
|
225 |
+
|
226 |
+
def init_non_linear_regression_fit(
        x: torch.Tensor,
        **kwargs: Dict[str, Any],
) -> Dict[str, torch.Tensor]:
    """Fit the parameters row by row with ``scipy.optimize.curve_fit``.

    Each row of ``x`` is sorted and ``kwargs['np_fit_func']`` is fitted so that it
    maps the row onto itself, starting from the per-row values in ``kwargs['p0']``.

    Required kwargs:
        params_list: parameter names; they are sorted so that their order matches
            the positional arguments of ``np_fit_func``.
        np_fit_func: callable ``f(x, *params)`` handed to ``curve_fit``.
        p0: mapping of parameter name to a tensor holding one start value per row.

    Returns:
        Mapping of parameter name to a float32 tensor on ``x.device``. If the fit
        fails, the start values ``p0`` are returned instead.
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs.get('np_fit_func')
    # We need to make sure that it matches the numpy fit func arg order
    params_list = sorted(kwargs.get('params_list'))
    # detach() so that tensors which require grad can still be exported to numpy
    p0 = {k: v.detach().cpu().numpy() for k, v in kwargs.get('p0').items()}

    # Sorting each row makes the curve easier to fit.
    sorted_xdata = np.sort(x.detach().cpu().numpy(), axis=-1)

    try:
        fitted = []
        for i, row in enumerate(sorted_xdata):
            start = [p0[p][i] for p in params_list]
            popt, _ = curve_fit(np_fit_func, row, row, maxfev=1000, p0=start, method='lm')
            fitted.append(popt)
    except (ValueError, RuntimeError) as e:
        # curve_fit raises ValueError for invalid data/options and RuntimeError
        # when the least-squares minimisation does not converge.
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }

    return {
        p: torch.tensor([ch[j] for ch in fitted], dtype=torch.float32).to(x.device)
        for j, p in enumerate(params_list)
    }
|
280 |
+
|
281 |
+
|
282 |
+
def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 zeros shaped like ``x`` reduced over dim 1, on ``x.device``."""
    reduced = x.amin(dim=1)  # only used as a shape template
    return torch.zeros_like(reduced, dtype=torch.float32, device=x.device)
|
285 |
+
|
286 |
+
|
287 |
+
def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Return the factor that rescales the value range of ``tensor`` to ``[_min, _max]``.

    The range is measured along the last dim and always extended to contain zero.

    Args:
        tensor: input values.
        _min: lower end of the target range.
        _max: upper end of the target range; an infinite value means
            "do not rescale" and yields a tensor of ones.

    Returns:
        ``(_max - _min) / (x_max - x_min)`` per slice, or ones when ``_max`` is infinite.
    """
    # Calculate the original minimum and maximum values
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    # Compare by value: ``is`` only matches the very same float object, so an
    # equal-valued ``float("inf")`` supplied by a caller would be treated as finite.
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    # NOTE(review): a slice whose values are all zero gives x_max == x_min and
    # therefore a division by zero here.
    scale = (_max - _min) / (x_max - x_min)
    return scale
|
299 |
+
|
300 |
+
|
301 |
+
|
302 |
+
############## Quant ###############
|
303 |
+
|
304 |
+
@torch.enable_grad()
def learn_parameters(
        x: torch.Tensor,
        params: Dict[str, nn.Parameter],
        qtz_func: nn.Module,
        deqtz_func: nn.Module,
        bits: int,
        target_dtype: torch.dtype,
        epochs: int = 1000,
        early_stop: bool = True,
        do_report: bool = False
) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]:
    """Tune ``params`` with Adam to minimise the quantize/dequantize MSE on ``x``.

    Args:
        x: tensor to reconstruct.
        params: parameters optimised in place; gradients are enabled on them.
        qtz_func / deqtz_func: transforms forwarded to ``quantize`` / ``dequantize``.
        bits: bit width forwarded to ``quantize`` / ``dequantize``.
        target_dtype: dtype of the quantized tensor.
        epochs: maximum number of optimisation steps.
        early_stop: stop once the loss moved by less than 1e-7 over 10% of ``epochs``.
        do_report: print the loss every 10 steps and also return the loss history.

    Returns:
        A deep copy of the parameters with the lowest loss (gradients disabled).
        With ``do_report=True`` a tuple ``(best_params, losses)`` is returned,
        where ``losses`` is a list of floats.

    Raises:
        Exception: if the loss becomes NaN or Inf during the search.
    """
    loss_fn = nn.MSELoss()

    # The initial learning rate is derived from the order of magnitude of the
    # initial loss (halved).
    quant = quantize(x, params, qtz_func, bits, target_dtype)
    dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
    initial_loss = loss_fn(x, dequant).item()

    base_lr = 0.1
    # log10 only makes sense for a positive, finite loss; otherwise keep base_lr
    # instead of crashing on int(-inf) / int(nan).
    if np.isfinite(initial_loss) and initial_loss > 0:
        exponent = int(np.floor(np.log10(initial_loss)))
    else:
        exponent = 0
    lr = base_lr * (10 ** (exponent // 2))

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    optimizer = torch.optim.Adam(list(params.values()), lr=lr)
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=epochs // 10)

    # Contains the best loss and the best parameters
    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    epochs_before_stop = int(epochs * 0.1)

    for i in range(epochs):
        optimizer.zero_grad()

        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        loss = loss_fn(x, dequant)

        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")

        loss_value = loss.item()
        acc_loss.append(loss_value)

        # Snapshot BEFORE the optimizer step: loss_value was produced by the
        # current parameter values, not by the updated ones.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = copy.deepcopy(dict(params))

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {loss_value}")

        # Stop if the loss has not changed considerably during the last 10% epochs
        if early_stop and i > epochs_before_stop \
                and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta:
            break

    if best_params is None:
        # No step was run (epochs == 0): return the untouched starting point.
        best_params = copy.deepcopy(dict(params))

    # No longer requires gradients in the parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        return best_params, acc_loss
    return best_params
|
391 |
+
|
392 |
+
|
393 |
+
def quantize(
        x: torch.Tensor,
        params: Dict[str, nn.Parameter],
        func: nn.Module,
        bits: int,
        target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    """Apply ``func`` to ``x``, round, clamp to the signed ``bits`` range and cast.

    ``func`` is evaluated on ``x`` with dims 0 and 1 swapped and the result is
    swapped back before rounding with ``round_func_BPDA``.
    """
    low, high = get_min_max_from_bits_signed(bits)
    # Aligns shapes
    projected = func(x=x.transpose(0, 1), **params).transpose(0, 1)
    rounded = round_func_BPDA(projected)
    return torch.clamp(rounded, low, high).to(target_dtype)
|
406 |
+
|
407 |
+
|
408 |
+
def dequantize(
        x: torch.Tensor,
        params: Dict[str, nn.Parameter],
        func: nn.Module,
        bits: int,
        out_dtype: torch.dtype
) -> torch.Tensor:
    """Cast ``x`` to ``out_dtype`` and map it back through ``func``.

    ``func`` is evaluated on the tensor with dims 0 and 1 swapped and the result
    is swapped back. ``bits`` is accepted but not used by this function.
    """
    as_float = x.to(dtype=out_dtype)
    restored = func(x=as_float.transpose(0, 1), **params)
    return restored.transpose(0, 1)
|
420 |
+
|
421 |
+
|
422 |
+
def round_func_BPDA(input):
    """Round ``input`` in the forward pass while behaving like identity in backward.

    The rounded values are written into the storage of a clone of ``input``, so
    autograd only sees the (differentiable) clone.
    """
    passthrough = input.clone()
    passthrough.data = torch.round(input).data
    return passthrough
|
429 |
+
|
430 |
+
|
431 |
+
def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    """Return ``(min, max)`` of a signed two's-complement integer of ``bit_width`` bits."""
    magnitude = 2 ** (bit_width - 1)
    return -magnitude, magnitude - 1
|
433 |
+
|
434 |
+
|
435 |
+
|
436 |
+
############## Numpy ###############
|
437 |
+
|
438 |
+
def np_domain_guard(
        x: np.ndarray,
        min: float = None,
        max: float = None,
        posinf: float = None,
        neginf: float = None,
        nan: float = None
) -> np.ndarray:
    """Guard a tensor to a valid domain.

    Args:
        x: input array.
        min / max: optional clipping bounds; clipping is skipped when both are None.
        posinf / neginf: replacements for +/-inf (numpy's defaults when None).
        nan: replacement for NaN; ``None`` means 0.0.

    Returns:
        The array with non-finite values replaced and, if requested, clipped.
    """
    # np.nan_to_num needs a number for ``nan`` (its own default is 0.0); passing
    # None through would fail, unlike torch.nan_to_num which accepts None.
    x = np.nan_to_num(x, nan=0.0 if nan is None else nan, posinf=posinf, neginf=neginf)
    if min is not None or max is not None:
        x = np.clip(x, min, max)
    return x
|
451 |
+
|
452 |
+
|
453 |
+
def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray:
    """Swap every occurrence of one value for another.

    Args:
        x (np.ndarray): The input tensor.
        num (float): The value to look for.
        to (float): The value written wherever ``x == num``.

    Returns:
        np.ndarray: ``x`` with the matching entries replaced.
    """
    matches = x == num
    return np.where(matches, to, x)
|
465 |
+
|
466 |
+
|
467 |
+
def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Raise ``x`` to ``exp``; for ``exp < 1`` negative bases are clamped to 0 first."""
    if exp >= 1:
        return np.power(x, exp)
    non_negative = np.maximum(x, 0)
    return np.power(non_negative, exp)
|
470 |
+
|
fn_gen/norm_nlr/13/loss.png
ADDED
fn_gen/norm_nlr/13/quantization.png
ADDED
fn_gen/norm_nlr/14/__pycache__/fn.cpython-311.pyc
ADDED
Binary file (25.3 kB). View file
|
|
fn_gen/norm_nlr/14/distortion.png
ADDED
fn_gen/norm_nlr/14/expressions.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
sqrt(_0*(-_1 + x))/_s
|
2 |
+
_1 + _s**2*x**2/_0
|
fn_gen/norm_nlr/14/fn.py
ADDED
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import amin # Necessary for arcsin
|
5 |
+
import copy
|
6 |
+
import torch.nn as nn
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
from scipy.optimize import curve_fit
|
10 |
+
from typing import Dict, Any, Tuple, List, Callable
|
11 |
+
|
12 |
+
|
13 |
+
def quantization(x, **params):
    """Map ``x`` into the quantized domain: ``sqrt(_0 * (x - _1)) / _s``.

    ``params`` must provide ``_0``, ``_1`` and ``_s``. A zero ``_s`` is swapped
    for 10000 before the reciprocal is taken, and the square-root argument is
    passed through ``domain_guard`` with ``min=0.1`` and ``nan=0.1``.
    """
    inv_scale = torch.div(1, replace_num(params['_s'], num=0, to=10000))
    shifted = x + (torch.tensor(-1) * params['_1'])
    radicand = domain_guard((params['_0'] * shifted), min=0.1, nan=0.1)
    return (inv_scale * torch.sqrt(radicand))
|
15 |
+
|
16 |
+
|
17 |
+
def dequantization(x, **params):
    """Map quantized values back: ``_1 + (_s ** 2) * (x ** 2) / _0``.

    ``params`` must provide ``_0``, ``_1`` and ``_s``. A zero ``_0`` is swapped
    for 10000 before the reciprocal is taken.
    """
    inv_inner = torch.div(1, replace_num(params['_0'], num=0, to=10000))
    scale_sq = guarded_torch_power(params['_s'], torch.tensor(2))
    x_sq = guarded_torch_power(x, torch.tensor(2))
    return (params['_1'] + (inv_inner * scale_sq * x_sq))
|
19 |
+
|
20 |
+
|
21 |
+
def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]:
    """Build the initial ``_0``, ``_1`` and ``_s`` parameters for ``x``.

    Starting values come from ``init_inner_scale``, the minimum over the last
    dimension and ``init_linear_scale``; they are then refined with
    ``init_non_linear_regression_fit`` and wrapped as frozen ``nn.Parameter``s.

    Args:
        x: Tensor the parameters are initialised from.
        **kwargs: Forwarded to ``init_linear_scale`` and
            ``init_non_linear_regression_fit``. Optional callables
            ``post_init_hook``, ``post_method_hook`` and ``post_train_hook``
            are invoked with ``parameters=<dict>`` when present.

    Returns:
        Mapping from parameter name to ``nn.Parameter`` (``requires_grad=False``).
    """

    def _fire(hook_name, parameters):
        # Hooks are optional; only call the ones the caller supplied.
        if hook_name in kwargs:
            kwargs[hook_name](parameters=parameters)

    base_p0 = {}
    base_p0['_0'] = init_inner_scale(x, _min=0)
    base_p0['_1'] = amin(x, dim=-1)
    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    _fire('post_init_hook', base_p0)

    fitted = init_non_linear_regression_fit(
        x,
        p0=base_p0,
        np_fit_func=fit_func,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_1', '_s'],
        **kwargs,
    )
    params = {name: nn.Parameter(value, requires_grad=False) for name, value in fitted.items()}
    _fire('post_method_hook', params)
    _fire('post_train_hook', params)
    return params
|
41 |
+
|
42 |
+
|
43 |
+
############### Numpy Qtz ###############
|
44 |
+
|
45 |
+
|
46 |
+
def np_quantization(x, _0, _1, _s):
    """NumPy form of ``quantization``: ``sqrt(_0 * (x - _1)) / _s``.

    A zero ``_s`` is swapped for 10000 and the square-root argument goes
    through ``np_domain_guard`` with ``min=0.1`` and ``nan=0.1``.
    """
    inv_scale = np.divide(1, np_replace_num(_s, num=0, to=10000))
    shifted = x + (np.array(-1) * _1)
    radicand = np_domain_guard((_0 * shifted), min=0.1, nan=0.1)
    return (inv_scale * np.sqrt(radicand))
|
48 |
+
|
49 |
+
|
50 |
+
def np_dequantization(x, _0, _1, _s):
    """NumPy form of ``dequantization``: ``_1 + (_s ** 2) * (x ** 2) / _0``.

    A zero ``_0`` is swapped for 10000 before the reciprocal is taken.
    """
    inv_inner = np.divide(1, np_replace_num(_0, num=0, to=10000))
    scale_sq = np_guarded_power(_s, np.array(2))
    x_sq = np_guarded_power(x, np.array(2))
    return (_1 + (inv_inner * scale_sq * x_sq))
|
52 |
+
|
53 |
+
|
54 |
+
def fit_func(x, _0, _1, _s):
    """Round-trip ``x`` through ``np_quantization`` then ``np_dequantization``.

    This is the model function handed to the curve fit; the parameters are
    positional in the order ``_0``, ``_1``, ``_s``.
    """
    return np_dequantization(np_quantization(x, _0, _1, _s), _0, _1, _s)
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
############### HELPERS ###############
|
62 |
+
|
63 |
+
def domain_guard(
    x: torch.Tensor,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> torch.Tensor:
    """Replace non-finite entries of ``x`` and optionally clamp the result.

    Args:
        x: Tensor to sanitise.
        min: Lower clamp bound, or ``None`` for no lower bound.
        max: Upper clamp bound, or ``None`` for no upper bound.
        posinf: Replacement for ``+inf`` (passed to ``torch.nan_to_num``).
        neginf: Replacement for ``-inf`` (passed to ``torch.nan_to_num``).
        nan: Replacement for NaN (passed to ``torch.nan_to_num``).

    Returns:
        The sanitised tensor; clamped only if ``min`` or ``max`` is given.
    """
    cleaned = torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
    if min is None and max is None:
        return cleaned
    return torch.clamp(cleaned, min=min, max=max)
|
76 |
+
|
77 |
+
|
78 |
+
def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Return ``x`` with every element equal to ``num`` swapped for ``to``.

    Args:
        x (torch.Tensor): The input tensor.
        num (float): The value to look for.
        to (float): The value written wherever ``x == num``.

    Returns:
        torch.Tensor: A tensor with the matches substituted.
    """
    matches = x == num
    return torch.where(matches, to, x)
|
90 |
+
|
91 |
+
|
92 |
+
def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Raise ``x`` to ``exp``, zeroing negative bases first when ``exp < 1``.

    For ``exp >= 1`` this is plain ``torch.pow``; otherwise ``x`` goes through
    ``torch.relu`` before the power is taken.
    """
    if exp >= 1:
        return torch.pow(x, exp)
    return torch.pow(torch.relu(x), exp)
|
95 |
+
|
96 |
+
|
97 |
+
def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 ones shaped like ``x`` with dimension 1 reduced away.

    ``kwargs`` are accepted for signature compatibility and ignored.
    """
    reduced = x.amin(dim=1)  # only used for its shape
    return torch.ones_like(reduced, dtype=torch.float32, device=x.device)
|
100 |
+
|
101 |
+
|
102 |
+
def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 standard-normal samples shaped like ``x`` with dimension 1 reduced away.

    ``kwargs`` are accepted for signature compatibility and ignored.
    """
    reduced = x.amin(dim=1)  # only used for its shape
    return torch.randn_like(reduced, dtype=torch.float32, device=x.device)
|
105 |
+
|
106 |
+
|
107 |
+
def init_space_search(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Search an initial value for one parameter by iterated random sampling.

    Every run scores a batch of candidate tensors by the MSE between ``x`` and
    ``deqtz_func(qtz_func(x))``, keeps the few best of that run and draws the
    next batch around their mean. All other parameters in ``params_list`` are
    held at ``init_ones``. The best candidate seen over *all* runs is finally
    compared against a plain ``init_ones`` and a plain ``init_rand`` candidate.

    Args:
        x: Data the round trip is evaluated on.
        **kwargs: Must contain ``qtz_func``, ``deqtz_func``, ``params_list``
            and ``param`` (the name of the parameter being searched). The
            kwargs are also forwarded to ``init_ones`` / ``init_rand``.

    Returns:
        The candidate with the lowest finite loss. If no candidate produced a
        finite loss, the ``init_ones`` candidate is returned.
    """

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int):
        """Yield the first batch of candidates (10x larger than later batches)."""
        for _ in range(n_params * 10):
            # Standard-normal samples scaled by max_initial (not a hard range).
            yield init_rand(tensor) * max_initial

    def _search_param(tensors: List[torch.tensor], n_params):
        """Yield new candidates sampled around the mean of the given tensors."""
        torch_tensors = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(torch_tensors, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(torch_tensors, dim=0)
        for _ in range(n_params):
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        """Quantize then dequantize ``x`` with dims 0 and 1 swapped for the call."""
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        x_ = x_.transpose(0, 1)
        return x_

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50  # Number of runs to try to find the best parameters
    n_random_params = 50  # Number of random parameters to generate
    n_best_to_pick = 5  # Number of best parameters to pick after each run
    max_initial = 10000  # Scale of the first batch of candidates

    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}

    def _score(candidate):
        """Round-trip MSE (as a Python float) for one candidate value of ``param``."""
        x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: candidate})
        return nn.MSELoss()(x, x_).item()

    candidates = _build_initial_param(x, max_initial, n_random_params)
    overall_best = None  # (candidate, loss) with the lowest loss seen in any run

    for _ in range(n_runs):
        run_best = []
        for candidate in candidates:
            try:
                loss = _score(candidate)
            except Exception:  # The parameters might not be valid for the function's domain
                continue

            # NaN/inf losses cannot be ranked; keeping them would corrupt the sort.
            if not np.isfinite(loss):
                continue

            if len(run_best) < n_best_to_pick:
                run_best.append((candidate, loss))
            elif loss < run_best[-1][1]:
                run_best[-1] = (candidate, loss)
            else:
                continue
            run_best.sort(key=lambda item: item[1])

        if not run_best:
            # Nothing valid to sample around; keep what earlier runs found.
            break

        # A later run can be worse than an earlier one, so remember the best overall.
        if overall_best is None or run_best[0][1] < overall_best[1]:
            overall_best = run_best[0]

        candidates = _search_param([p for p, _ in run_best], n_random_params)

    # Baselines the search result has to beat.
    p_ones = init_ones(x, **kwargs)
    loss_ones = _score(p_ones)
    p_rand = init_rand(x, **kwargs)
    loss_rand = _score(p_rand)

    # Order matters: on ties the searched value wins, then ones, then rand.
    contenders = [] if overall_best is None else [overall_best]
    contenders.extend(
        (cand, loss)
        for cand, loss in ((p_ones, loss_ones), (p_rand, loss_rand))
        if np.isfinite(loss)
    )
    if not contenders:
        return p_ones
    return min(contenders, key=lambda item: item[1])[0]
|
190 |
+
|
191 |
+
|
192 |
+
def init_linear_scale(  # Symmetric scale. From the study folder
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Compute a symmetric linear scale from the transformed range of ``x``.

    ``x`` is passed through ``qtz_func`` (dims 0 and 1 swapped for the call)
    with ``_s`` set to ones; the largest absolute value along dim 1, with zero
    always included in the range, is divided by half the signed integer range
    for ``bits``.

    Args:
        x: Input tensor.
        **kwargs: Must contain ``bits``, ``params`` (parameters passed to
            ``qtz_func``; must not already contain ``_s``) and ``qtz_func``.

    Returns:
        float32 scale tensor, clamped from below at float32 machine epsilon.
    """
    for required in ("bits", "params", "qtz_func"):
        assert required in kwargs, f"{required} must be provided."

    bits = kwargs['bits']
    params = kwargs['params']
    qtz_func = kwargs['qtz_func']

    unit_scale = init_ones(x, **kwargs)
    transformed = qtz_func(x=x.transpose(0, 1), **params, _s=unit_scale).transpose(0, 1)

    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    lo, hi = torch.aminmax(transformed, dim=1)
    # Force the range to include zero.
    lo = torch.minimum(lo, torch.zeros_like(lo))
    hi = torch.maximum(hi, torch.zeros_like(hi))

    half_range = float(quant_max - quant_min) / 2
    scale = torch.maximum(-lo, hi) / half_range

    # No noise is added to the scale here.
    eps = torch.finfo(torch.float32).eps
    return torch.clamp(scale, min=eps).to(dtype=torch.float32, device=lo.device)
|
224 |
+
|
225 |
+
|
226 |
+
def init_non_linear_regression_fit(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> Dict[str, torch.Tensor]:
    """Refine initial parameters with a per-row non-linear least-squares fit.

    For every row of ``x`` (index 0), the sorted row is fitted against itself
    with ``scipy.optimize.curve_fit`` so that ``np_fit_func`` approximates the
    identity on the data.

    Args:
        x: Input tensor; each ``x[i]`` is fitted independently.
        **kwargs: Must contain
            ``params_list`` (parameter names; sorted alphabetically to match
            the positional order of ``np_fit_func``),
            ``np_fit_func`` (callable ``f(xdata, *params)``) and
            ``p0`` (dict of initial tensors indexed by row).
            Other entries are ignored.

    Returns:
        Dict mapping each name in ``params_list`` to a float32 tensor on
        ``x.device``. If the fit fails, the initial ``p0`` values are returned
        instead (for every key of ``p0``).
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs.get('np_fit_func')
    params_list = kwargs.get('params_list')
    p0 = kwargs.get('p0')

    def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]):
        popt, _ = curve_fit(
            func,
            xdata,
            ydata,
            maxfev=1000,
            p0=p0,
            method='lm'
        )
        return popt

    # 1. Convert to numpy; detach so tensors that require grad can be converted too.
    xdata = x.detach().cpu().numpy()

    # 2. Sort the data so that it is easier to fit
    sorted_xdata = np.sort(xdata, axis=-1)

    p0 = {k: v.detach().cpu().numpy() for k, v in p0.items()}
    params_list = sorted(params_list)  # Must match the numpy fit func arg order

    # 3. Find the best parameters for each row
    try:
        params = []
        for i in range(sorted_xdata.shape[0]):
            xdata_ = sorted_xdata[i]
            p0_ = [p0[p][i] for p in params_list]
            params.append(_fit(xdata_, xdata_, np_fit_func, p0_))

        # 4. Regroup the per-row results by parameter name
        return {
            p: torch.tensor([row[i] for row in params], dtype=torch.float32).to(x.device)
            for i, p in enumerate(params_list)
        }

    # curve_fit raises ValueError for bad input (e.g. NaNs) and RuntimeError
    # when the least-squares minimisation does not converge.
    except (ValueError, RuntimeError) as e:
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }
|
280 |
+
|
281 |
+
|
282 |
+
def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 zeros shaped like ``x`` with dimension 1 reduced away.

    ``kwargs`` are accepted for signature compatibility and ignored.
    """
    reduced = x.amin(dim=1)  # only used for its shape
    return torch.zeros_like(reduced, dtype=torch.float32, device=x.device)
|
285 |
+
|
286 |
+
|
287 |
+
def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Return the factor that maps the range of ``tensor`` onto ``[_min, _max]``.

    The range is taken along the last dimension and always includes zero.

    Args:
        tensor: Input tensor.
        _min: Lower end of the target range.
        _max: Upper end of the target range. An infinite value means that no
            scaling is wanted.

    Returns:
        ``(_max - _min) / (x_max - x_min)`` per reduced row, or ones when
        ``_max`` is infinite.
    """
    # Calculate the original minimum and maximum values
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    # Compare by value: ``is`` only matched the default object itself, so an
    # equal infinity such as float("inf") fell through to an infinite scale.
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    # Calculate the scale factor
    return (_max - _min) / (x_max - x_min)
|
299 |
+
|
300 |
+
|
301 |
+
|
302 |
+
############## Quant ###############
|
303 |
+
|
304 |
+
@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: nn.Module,
    deqtz_func: nn.Module,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]:
    """Tune ``params`` with Adam to minimise the quantize/dequantize MSE on ``x``.

    Args:
        x: Tensor to reconstruct.
        params: Parameters to optimise. They are updated in place and left
            with ``requires_grad=True``; the returned dict holds copies.
        qtz_func: Callable passed to ``quantize``.
        deqtz_func: Callable passed to ``dequantize``.
        bits: Bit width of the quantized representation.
        target_dtype: dtype of the quantized tensor.
        epochs: Maximum number of optimisation steps.
        early_stop: Stop once the loss changed by less than ``1e-7`` over the
            last 10% of ``epochs``.
        do_report: Print the loss every 10 steps and also return the loss history.

    Returns:
        Copies (``requires_grad=False``) of the parameters that produced the
        lowest loss. With ``do_report=True`` a tuple ``(best_params, losses)``
        is returned, where ``losses`` is a list of floats.

    Raises:
        Exception: If the loss becomes NaN or infinite during the search.
    """
    loss_fn = nn.MSELoss()

    # The initial learning rate is 0.1 scaled by ten to the power of half the
    # order of magnitude of the initial loss.
    quant = quantize(x, params, qtz_func, bits, target_dtype)
    dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
    initial_loss = loss_fn(x, dequant).item()

    base_lr = 0.1
    if np.isfinite(initial_loss) and initial_loss > 0:
        exponent = int(np.floor(np.log10(initial_loss)))
    else:
        # log10 is undefined here; use base_lr. A non-finite loss is reported
        # by the check inside the training loop.
        exponent = 0
    lr = base_lr * (10 ** (exponent // 2))

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    param_values = list(params.values())

    optimizer = torch.optim.Adam(param_values, lr=lr)
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=epochs // 10)

    # Contains the best loss and the best parameters
    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    percent_epochs_before_stop = 0.1
    epochs_before_stop = int(epochs * percent_epochs_before_stop)

    for i in range(epochs):
        optimizer.zero_grad()

        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        loss = loss_fn(x, dequant)

        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")

        loss_value = loss.item()

        # Snapshot BEFORE the optimizer step: `loss` was computed with the
        # current values, and the step below overwrites them.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = copy.deepcopy({k: v for k, v in params.items()})

        loss.backward()
        optimizer.step()
        scheduler.step()

        acc_loss.append(loss_value)

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {loss_value}")

        # Stop if the loss has not changed considerably during the last 10% of epochs
        if early_stop and i > epochs_before_stop:
            if abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta:
                break

    if best_params is None:
        # No step was run (epochs <= 0): hand back copies of the inputs.
        best_params = copy.deepcopy({k: v for k, v in params.items()})

    # No longer requires gradients in the parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        return best_params, acc_loss
    return best_params
|
391 |
+
|
392 |
+
|
393 |
+
def quantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    """Apply ``func`` to ``x``, round, clamp to the signed ``bits`` range and cast.

    ``func`` is called as ``func(x=..., **params)`` with dims 0 and 1 of ``x``
    swapped; the swap is undone before rounding.
    """
    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    transformed = func(x=x.transpose(0, 1), **params).transpose(0, 1)
    rounded = round_func_BPDA(transformed)
    return torch.clamp(rounded, quant_min, quant_max).to(target_dtype)
|
406 |
+
|
407 |
+
|
408 |
+
def dequantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    out_dtype: torch.dtype
) -> torch.Tensor:
    """Cast ``x`` to ``out_dtype`` and apply ``func`` to it.

    ``func`` is called as ``func(x=..., **params)`` with dims 0 and 1 swapped;
    the swap is undone on the result. ``bits`` is not used.
    """
    swapped = x.to(dtype=out_dtype).transpose(0, 1)
    return func(x=swapped, **params).transpose(0, 1)
|
420 |
+
|
421 |
+
|
422 |
+
def round_func_BPDA(input):
    """Round ``input`` in the forward pass while back-propagating as identity.

    The result is a clone of ``input`` (so gradients flow through unchanged)
    whose stored values are overwritten with the rounded ones.
    """
    passthrough = input.clone()
    passthrough.data = torch.round(input).data
    return passthrough
|
429 |
+
|
430 |
+
|
431 |
+
def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    """Return ``(min, max)`` of a signed two's-complement integer of ``bit_width`` bits."""
    half = 2 ** (bit_width - 1)
    return -half, half - 1
|
433 |
+
|
434 |
+
|
435 |
+
|
436 |
+
############## Numpy ###############
|
437 |
+
|
438 |
+
def np_domain_guard(
|
439 |
+
x: np.ndarray,
|
440 |
+
min: float = None,
|
441 |
+
max: float = None,
|
442 |
+
posinf: float = None,
|
443 |
+
neginf: float = None,
|
444 |
+
nan: float = None
|
445 |
+
) -> np.ndarray:
|
446 |
+
"""Guard a tensor to a valid domain."""
|
447 |
+
x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan)
|
448 |
+
if min is not None or max is not None:
|
449 |
+
x = np.clip(x, min, max)
|
450 |
+
return x
|
451 |
+
|
452 |
+
|
453 |
+
def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray:
    """Return ``x`` with every element equal to ``num`` swapped for ``to``.

    Args:
        x (np.ndarray): The input array.
        num (float): The value to look for.
        to (float): The value written wherever ``x == num``.

    Returns:
        np.ndarray: An array with the matches substituted.
    """
    matches = x == num
    return np.where(matches, to, x)
|
465 |
+
|
466 |
+
|
467 |
+
def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Raise ``x`` to ``exp``, zeroing negative bases first when ``exp < 1``.

    For ``exp >= 1`` this is plain ``np.power``; otherwise ``x`` is floored at
    zero with ``np.maximum`` before the power is taken.
    """
    if exp >= 1:
        return np.power(x, exp)
    return np.power(np.maximum(x, 0), exp)
|
470 |
+
|
fn_gen/norm_nlr/14/loss.png
ADDED
fn_gen/norm_nlr/14/quantization.png
ADDED
fn_gen/norm_nlr/15/__pycache__/fn.cpython-311.pyc
ADDED
Binary file (24.7 kB). View file
|
|
fn_gen/norm_nlr/15/distortion.png
ADDED
fn_gen/norm_nlr/15/expressions.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
sin(_0*x)/_s
|
2 |
+
asin(_s*x)/_0
|
fn_gen/norm_nlr/15/fn.py
ADDED
@@ -0,0 +1,469 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import amin # Necessary for arcsin
|
5 |
+
import copy
|
6 |
+
import torch.nn as nn
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
from scipy.optimize import curve_fit
|
10 |
+
from typing import Dict, Any, Tuple, List, Callable
|
11 |
+
|
12 |
+
|
13 |
+
def quantization(x, **params):
    """Map ``x`` into the quantized domain: ``sin(_0 * x) / _s``.

    ``params`` must provide ``_0`` and ``_s``. A zero ``_s`` is swapped for
    10000 before the reciprocal is taken.
    """
    inv_scale = torch.div(1, replace_num(params['_s'], num=0, to=10000))
    wave = torch.sin((params['_0'] * x))
    return (inv_scale * wave)
|
15 |
+
|
16 |
+
|
17 |
+
def dequantization(x, **params):
    """Map quantized values back: ``asin(_s * x) / _0``.

    ``params`` must provide ``_0`` and ``_s``. A zero ``_0`` is swapped for
    10000 before the reciprocal is taken, and the arcsine argument goes through
    ``domain_guard`` with bounds ``[-0.99999, 0.99999]`` and ``nan=0``.
    """
    inv_inner = torch.div(1, replace_num(params['_0'], num=0, to=10000))
    bounded = domain_guard((params['_s'] * x), min=-0.99999, max=0.99999, nan=0)
    return (inv_inner * torch.asin(bounded))
|
19 |
+
|
20 |
+
|
21 |
+
def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]:
    """Build the initial ``_0`` and ``_s`` parameters for ``x``.

    ``_0`` comes from ``init_space_search`` and ``_s`` from
    ``init_linear_scale``; both are then refined with
    ``init_non_linear_regression_fit`` and wrapped as frozen ``nn.Parameter``s.

    Args:
        x: Tensor the parameters are initialised from.
        **kwargs: Forwarded to the initialisers. Optional callables
            ``post_init_hook``, ``post_method_hook`` and ``post_train_hook``
            are invoked with ``parameters=<dict>`` when present.

    Returns:
        Mapping from parameter name to ``nn.Parameter`` (``requires_grad=False``).
    """

    def _fire(hook_name, parameters):
        # Hooks are optional; only call the ones the caller supplied.
        if hook_name in kwargs:
            kwargs[hook_name](parameters=parameters)

    base_p0 = {}
    base_p0['_0'] = init_space_search(
        x,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_s'],
        param='_0',
        **kwargs,
    )
    base_p0['_s'] = init_linear_scale(x, qtz_func=quantization, params=base_p0, **kwargs)
    _fire('post_init_hook', base_p0)

    fitted = init_non_linear_regression_fit(
        x,
        p0=base_p0,
        np_fit_func=fit_func,
        qtz_func=quantization,
        deqtz_func=dequantization,
        params_list=['_0', '_s'],
        **kwargs,
    )
    params = {name: nn.Parameter(value, requires_grad=False) for name, value in fitted.items()}
    _fire('post_method_hook', params)
    _fire('post_train_hook', params)
    return params
|
40 |
+
|
41 |
+
|
42 |
+
############### Numpy Qtz ###############
|
43 |
+
|
44 |
+
|
45 |
+
def np_quantization(x, _0, _s):
    """NumPy form of ``quantization``: ``sin(_0 * x) / _s``.

    A zero ``_s`` is swapped for 10000 before the reciprocal is taken.
    """
    inv_scale = np.divide(1, np_replace_num(_s, num=0, to=10000))
    wave = np.sin((_0 * x))
    return (inv_scale * wave)
|
47 |
+
|
48 |
+
|
49 |
+
def np_dequantization(x, _0, _s):
    """NumPy form of ``dequantization``: ``arcsin(_s * x) / _0``.

    A zero ``_0`` is swapped for 10000, and the arcsine argument goes through
    ``np_domain_guard`` with bounds ``[-0.99999, 0.99999]`` and ``nan=0``.
    """
    inv_inner = np.divide(1, np_replace_num(_0, num=0, to=10000))
    bounded = np_domain_guard((_s * x), min=-0.99999, max=0.99999, nan=0)
    return (inv_inner * np.arcsin(bounded))
|
51 |
+
|
52 |
+
|
53 |
+
def fit_func(x, _0, _s):
    """Round-trip ``x`` through ``np_quantization`` then ``np_dequantization``.

    This is the model function handed to the curve fit; the parameters are
    positional in the order ``_0``, ``_s``.
    """
    return np_dequantization(np_quantization(x, _0, _s), _0, _s)
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
+
############### HELPERS ###############
|
61 |
+
|
62 |
+
def domain_guard(
    x: torch.Tensor,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> torch.Tensor:
    """Replace non-finite entries of ``x`` and optionally clamp the result.

    Args:
        x: Tensor to sanitise.
        min: Lower clamp bound, or ``None`` for no lower bound.
        max: Upper clamp bound, or ``None`` for no upper bound.
        posinf: Replacement for ``+inf`` (passed to ``torch.nan_to_num``).
        neginf: Replacement for ``-inf`` (passed to ``torch.nan_to_num``).
        nan: Replacement for NaN (passed to ``torch.nan_to_num``).

    Returns:
        The sanitised tensor; clamped only if ``min`` or ``max`` is given.
    """
    cleaned = torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
    if min is None and max is None:
        return cleaned
    return torch.clamp(cleaned, min=min, max=max)
|
75 |
+
|
76 |
+
|
77 |
+
def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Return ``x`` with every element equal to ``num`` swapped for ``to``.

    Args:
        x (torch.Tensor): The input tensor.
        num (float): The value to look for.
        to (float): The value written wherever ``x == num``.

    Returns:
        torch.Tensor: A tensor with the matches substituted.
    """
    matches = x == num
    return torch.where(matches, to, x)
|
89 |
+
|
90 |
+
|
91 |
+
def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Raise ``x`` to ``exp``, zeroing negative bases first when ``exp < 1``.

    For ``exp >= 1`` this is plain ``torch.pow``; otherwise ``x`` goes through
    ``torch.relu`` before the power is taken.
    """
    if exp >= 1:
        return torch.pow(x, exp)
    return torch.pow(torch.relu(x), exp)
|
94 |
+
|
95 |
+
|
96 |
+
def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 ones shaped like ``x`` with dimension 1 reduced away.

    ``kwargs`` are accepted for signature compatibility and ignored.
    """
    reduced = x.amin(dim=1)  # only used for its shape
    return torch.ones_like(reduced, dtype=torch.float32, device=x.device)
|
99 |
+
|
100 |
+
|
101 |
+
def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return float32 standard-normal samples shaped like ``x`` with dimension 1 reduced away.

    ``kwargs`` are accepted for signature compatibility and ignored.
    """
    reduced = x.amin(dim=1)  # only used for its shape
    return torch.randn_like(reduced, dtype=torch.float32, device=x.device)
|
104 |
+
|
105 |
+
|
106 |
+
def init_space_search(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Search an initial value for one parameter by iterated random sampling.

    Every run scores a batch of candidate tensors by the MSE between ``x`` and
    ``deqtz_func(qtz_func(x))``, keeps the few best of that run and draws the
    next batch around their mean. All other parameters in ``params_list`` are
    held at ``init_ones``. The best candidate seen over *all* runs is finally
    compared against a plain ``init_ones`` and a plain ``init_rand`` candidate.

    Args:
        x: Data the round trip is evaluated on.
        **kwargs: Must contain ``qtz_func``, ``deqtz_func``, ``params_list``
            and ``param`` (the name of the parameter being searched). The
            kwargs are also forwarded to ``init_ones`` / ``init_rand``.

    Returns:
        The candidate with the lowest finite loss. If no candidate produced a
        finite loss, the ``init_ones`` candidate is returned.
    """

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int):
        """Yield the first batch of candidates (10x larger than later batches)."""
        for _ in range(n_params * 10):
            # Standard-normal samples scaled by max_initial (not a hard range).
            yield init_rand(tensor) * max_initial

    def _search_param(tensors: List[torch.tensor], n_params):
        """Yield new candidates sampled around the mean of the given tensors."""
        torch_tensors = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(torch_tensors, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(torch_tensors, dim=0)
        for _ in range(n_params):
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        """Quantize then dequantize ``x`` with dims 0 and 1 swapped for the call."""
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        x_ = x_.transpose(0, 1)
        return x_

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50  # Number of runs to try to find the best parameters
    n_random_params = 50  # Number of random parameters to generate
    n_best_to_pick = 5  # Number of best parameters to pick after each run
    max_initial = 10000  # Scale of the first batch of candidates

    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}

    def _score(candidate):
        """Round-trip MSE (as a Python float) for one candidate value of ``param``."""
        x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: candidate})
        return nn.MSELoss()(x, x_).item()

    candidates = _build_initial_param(x, max_initial, n_random_params)
    overall_best = None  # (candidate, loss) with the lowest loss seen in any run

    for _ in range(n_runs):
        run_best = []
        for candidate in candidates:
            try:
                loss = _score(candidate)
            except Exception:  # The parameters might not be valid for the function's domain
                continue

            # NaN/inf losses cannot be ranked; keeping them would corrupt the sort.
            if not np.isfinite(loss):
                continue

            if len(run_best) < n_best_to_pick:
                run_best.append((candidate, loss))
            elif loss < run_best[-1][1]:
                run_best[-1] = (candidate, loss)
            else:
                continue
            run_best.sort(key=lambda item: item[1])

        if not run_best:
            # Nothing valid to sample around; keep what earlier runs found.
            break

        # A later run can be worse than an earlier one, so remember the best overall.
        if overall_best is None or run_best[0][1] < overall_best[1]:
            overall_best = run_best[0]

        candidates = _search_param([p for p, _ in run_best], n_random_params)

    # Baselines the search result has to beat.
    p_ones = init_ones(x, **kwargs)
    loss_ones = _score(p_ones)
    p_rand = init_rand(x, **kwargs)
    loss_rand = _score(p_rand)

    # Order matters: on ties the searched value wins, then ones, then rand.
    contenders = [] if overall_best is None else [overall_best]
    contenders.extend(
        (cand, loss)
        for cand, loss in ((p_ones, loss_ones), (p_rand, loss_rand))
        if np.isfinite(loss)
    )
    if not contenders:
        return p_ones
    return min(contenders, key=lambda item: item[1])[0]
|
189 |
+
|
190 |
+
|
191 |
+
def init_linear_scale(  # Symmetric scale. From the study folder
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    """Compute a symmetric linear scale from the transformed range of ``x``.

    ``x`` is passed through ``qtz_func`` (dims 0 and 1 swapped for the call)
    with ``_s`` set to ones; the largest absolute value along dim 1, with zero
    always included in the range, is divided by half the signed integer range
    for ``bits``.

    Args:
        x: Input tensor.
        **kwargs: Must contain ``bits``, ``params`` (parameters passed to
            ``qtz_func``; must not already contain ``_s``) and ``qtz_func``.

    Returns:
        float32 scale tensor, clamped from below at float32 machine epsilon.
    """
    for required in ("bits", "params", "qtz_func"):
        assert required in kwargs, f"{required} must be provided."

    bits = kwargs['bits']
    params = kwargs['params']
    qtz_func = kwargs['qtz_func']

    unit_scale = init_ones(x, **kwargs)
    transformed = qtz_func(x=x.transpose(0, 1), **params, _s=unit_scale).transpose(0, 1)

    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    lo, hi = torch.aminmax(transformed, dim=1)
    # Force the range to include zero.
    lo = torch.minimum(lo, torch.zeros_like(lo))
    hi = torch.maximum(hi, torch.zeros_like(hi))

    half_range = float(quant_max - quant_min) / 2
    scale = torch.maximum(-lo, hi) / half_range

    # No noise is added to the scale here.
    eps = torch.finfo(torch.float32).eps
    return torch.clamp(scale, min=eps).to(dtype=torch.float32, device=lo.device)
|
223 |
+
|
224 |
+
|
225 |
+
def init_non_linear_regression_fit(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> Dict[str, torch.Tensor]:
    """Refine initial parameters with a per-row non-linear least-squares fit.

    For every row of ``x`` (index 0), the sorted row is fitted against itself
    with ``scipy.optimize.curve_fit`` so that ``np_fit_func`` approximates the
    identity on the data.

    Args:
        x: Input tensor; each ``x[i]`` is fitted independently.
        **kwargs: Must contain
            ``params_list`` (parameter names; sorted alphabetically to match
            the positional order of ``np_fit_func``),
            ``np_fit_func`` (callable ``f(xdata, *params)``) and
            ``p0`` (dict of initial tensors indexed by row).
            Other entries are ignored.

    Returns:
        Dict mapping each name in ``params_list`` to a float32 tensor on
        ``x.device``. If the fit fails, the initial ``p0`` values are returned
        instead (for every key of ``p0``).
    """
    assert "params_list" in kwargs, "params list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."
    np_fit_func = kwargs.get('np_fit_func')
    params_list = kwargs.get('params_list')
    p0 = kwargs.get('p0')

    def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]):
        popt, _ = curve_fit(
            func,
            xdata,
            ydata,
            maxfev=1000,
            p0=p0,
            method='lm'
        )
        return popt

    # 1. Convert to numpy; detach so tensors that require grad can be converted too.
    xdata = x.detach().cpu().numpy()

    # 2. Sort the data so that it is easier to fit
    sorted_xdata = np.sort(xdata, axis=-1)

    p0 = {k: v.detach().cpu().numpy() for k, v in p0.items()}
    params_list = sorted(params_list)  # Must match the numpy fit func arg order

    # 3. Find the best parameters for each row
    try:
        params = []
        for i in range(sorted_xdata.shape[0]):
            xdata_ = sorted_xdata[i]
            p0_ = [p0[p][i] for p in params_list]
            params.append(_fit(xdata_, xdata_, np_fit_func, p0_))

        # 4. Regroup the per-row results by parameter name
        return {
            p: torch.tensor([row[i] for row in params], dtype=torch.float32).to(x.device)
            for i, p in enumerate(params_list)
        }

    # curve_fit raises ValueError for bad input (e.g. NaNs) and RuntimeError
    # when the least-squares minimisation does not converge.
    except (ValueError, RuntimeError) as e:
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device) for k, v in p0.items()
        }
|
279 |
+
|
280 |
+
|
281 |
+
def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    """Return a float32 zero tensor shaped like ``x`` reduced over dim 1.

    Args:
        x: Input tensor; must have a dimension 1 to reduce over.
        **kwargs: Accepted but not read by this initializer.

    Returns:
        Zeros with the shape of ``torch.amin(x, dim=1)``, dtype float32,
        placed on the same device as ``x``.
    """
    # The reduction is only used as a template for the output shape.
    reduced = x.amin(dim=1)
    zeros = torch.zeros_like(reduced, dtype=torch.float32, device=x.device)
    return zeros
|
284 |
+
|
285 |
+
|
286 |
+
def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    """Compute a scale that maps each row's zero-including range onto ``[_min, _max]``.

    Args:
        tensor: Input tensor; the range is taken over its last dimension.
        _min: Lower bound of the target range.
        _max: Upper bound of the target range. An infinite value means
            "no scaling requested".

    Returns:
        A tensor with the shape of ``tensor`` reduced over its last
        dimension. It is all ones when ``_max`` is infinite, otherwise
        ``(_max - _min) / (x_max - x_min)`` where ``x_min``/``x_max`` are the
        per-row extrema widened so that they always include 0.
    """
    # Calculate the original minimum and maximum values, widened to include 0
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    # Compare by value, not identity: ``float("inf")`` passed by a caller is a
    # different object from ``torch.inf`` and would slip past an ``is`` test,
    # producing an infinite scale instead of the intended "no scaling".
    if _max == torch.inf:  # We do not need to scale the tensor. Just need to move it
        return torch.ones_like(x_min)

    # Calculate the scale factor.
    # NOTE(review): a row made only of zeros gives x_max - x_min == 0 and thus
    # a non-finite scale; left as-is because callers may rely on it.
    scale = (_max - _min) / (x_max - x_min)
    return scale
|
298 |
+
|
299 |
+
|
300 |
+
|
301 |
+
############## Quant ###############
|
302 |
+
|
303 |
+
@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: nn.Module,
    deqtz_func: nn.Module,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]:
    """Fit ``params`` with Adam so that dequantize(quantize(x)) approximates ``x``.

    The objective is the MSE between ``x`` and its quantize/dequantize
    round trip. The parameters that produced the lowest observed loss are
    returned as deep copies with ``requires_grad`` disabled.

    Args:
        x: Tensor to be reconstructed.
        params: Named parameters forwarded to ``qtz_func`` and ``deqtz_func``.
            They are optimized in place and are left with
            ``requires_grad=True`` when the function returns.
        qtz_func: Callable used by ``quantize``.
        deqtz_func: Callable used by ``dequantize``.
        bits: Bit width forwarded to ``quantize`` / ``dequantize``.
        target_dtype: Dtype of the quantized tensor.
        epochs: Maximum number of optimization steps.
        early_stop: Stop once the loss changed by less than ``1e-7`` over the
            last 10% of ``epochs``.
        do_report: Print the loss every 10 steps and also return the loss
            history.

    Returns:
        ``best_params`` when ``do_report`` is False, otherwise the tuple
        ``(best_params, acc_loss)`` where ``acc_loss`` is the list of
        per-step loss values (Python floats).

    Raises:
        Exception: If the loss becomes NaN or Inf during the search.
    """
    loss_fn = nn.MSELoss()

    # Determines the initial learning rate from the order of magnitude of the
    # initial loss: lr = 0.1 * 10 ** (floor(log10(loss)) // 2)
    quant = quantize(x, params, qtz_func, bits, target_dtype)
    dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
    initial_loss = loss_fn(x, dequant).item()

    base_lr = 0.1
    # log10(0) is -inf, which cannot be converted to int; an exact
    # reconstruction therefore falls back to the base learning rate.
    exponent = int(np.floor(np.log10(initial_loss))) if initial_loss != 0 else 0
    lr = base_lr * (10 ** (exponent // 2))

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    # Defines optimizer and learning-rate schedule
    optimizer = torch.optim.Adam(list(params.values()), lr=lr)
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=epochs // 10)

    # Contains the best loss and the best parameters
    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    percent_epochs_before_stop = 0.1
    epochs_before_stop = int(epochs * percent_epochs_before_stop)

    for i in range(epochs):
        optimizer.zero_grad()

        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        loss = loss_fn(x, dequant)

        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")

        loss_value = loss.item()

        # Snapshot BEFORE the optimizer step: ``loss`` was computed with the
        # current parameters, so these are the values that achieved it.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = copy.deepcopy(dict(params.items()))

        loss.backward()
        optimizer.step()
        scheduler.step()

        acc_loss.append(loss_value)

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {loss_value}")

        # We also stop the search if the loss has not changed considerably
        # during the last 10% of the epochs
        if (
            early_stop
            and i > epochs_before_stop
            and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta
        ):
            break

    # With epochs == 0 the loop never runs; fall back to the initial values.
    if best_params is None:
        best_params = copy.deepcopy(dict(params.items()))

    # No longer requires gradients in the returned parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        return best_params, acc_loss
    return best_params
|
390 |
+
|
391 |
+
|
392 |
+
def quantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    """Apply ``func`` to ``x``, round, clamp to the signed ``bits`` range and cast.

    Args:
        x: Tensor to quantize.
        params: Keyword arguments forwarded to ``func``.
        func: Callable invoked as ``func(x=..., **params)``.
        bits: Bit width that determines the clamp range.
        target_dtype: Dtype of the returned tensor.

    Returns:
        The quantized tensor, with the same dimension order as ``x``.
    """
    lower, upper = get_min_max_from_bits_signed(bits)

    # ``func`` sees the tensor with its first two dims swapped (the original
    # author's note: "aligns shapes"); the swap is undone right after.
    transformed = func(x=x.transpose(0, 1), **params).transpose(0, 1)

    # Rounding goes through the BPDA helper rather than plain torch.round.
    rounded = round_func_BPDA(transformed)
    return torch.clamp(rounded, lower, upper).to(target_dtype)
|
405 |
+
|
406 |
+
|
407 |
+
def dequantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    out_dtype: torch.dtype
) -> torch.Tensor:
    """Cast ``x`` to ``out_dtype`` and map it back through ``func``.

    Args:
        x: Quantized tensor.
        params: Keyword arguments forwarded to ``func``.
        func: Callable invoked as ``func(x=..., **params)``.
        bits: Not read by this function.
        out_dtype: Dtype the input is cast to before ``func`` is applied.

    Returns:
        The output of ``func`` with the same dimension order as ``x``.
    """
    casted = x.to(dtype=out_dtype)
    # ``func`` operates on the tensor with its first two dims swapped; the
    # swap is reverted on the result.
    restored = func(x=casted.transpose(0, 1), **params)
    return restored.transpose(0, 1)
|
419 |
+
|
420 |
+
|
421 |
+
def round_func_BPDA(input):
    """Round ``input`` in the forward pass while backpropagating as identity.

    The returned tensor is a clone of ``input`` (so autograd treats it as an
    identity of ``input``) whose stored values are overwritten with the
    rounded ones.
    """
    passthrough = input.clone()
    # Overwriting ``.data`` swaps the values without recording an autograd op,
    # so the non-differentiable round never enters the graph.
    passthrough.data = torch.round(input).data
    return passthrough
|
428 |
+
|
429 |
+
|
430 |
+
def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    """Return ``(min, max)`` of a two's-complement integer with ``bit_width`` bits."""
    half_range = 2 ** (bit_width - 1)
    return -half_range, half_range - 1
|
432 |
+
|
433 |
+
|
434 |
+
|
435 |
+
############## Numpy ###############
|
436 |
+
|
437 |
+
def np_domain_guard(
    x: np.ndarray,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> np.ndarray:
    """Guard a tensor to a valid domain.

    Args:
        x: The input array.
        min: Optional lower clip bound.
        max: Optional upper clip bound.
        posinf: Replacement for ``+inf``; ``None`` keeps NumPy's default.
        neginf: Replacement for ``-inf``; ``None`` keeps NumPy's default.
        nan: Replacement for NaN; ``None`` means ``0.0``.

    Returns:
        A copy of ``x`` with non-finite values replaced and, when a bound is
        given, clipped to ``[min, max]``.
    """
    # Unlike ``posinf``/``neginf``, ``np.nan_to_num`` does not accept ``None``
    # for ``nan`` (it tries to write the value into the array), so substitute
    # NumPy's documented default explicitly.
    nan_value = 0.0 if nan is None else nan
    x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan_value)
    if min is not None or max is not None:
        x = np.clip(x, min, max)
    return x
|
450 |
+
|
451 |
+
|
452 |
+
def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray:
    """Swap every occurrence of one value in an array for another.

    Args:
        x (np.ndarray): The input tensor.
        num (float): The value to look for (compared with ``==``).
        to (float): The value written where ``num`` was found.

    Returns:
        np.ndarray: A new array with the matching entries replaced.
    """
    matches = x == num
    return np.where(matches, to, x)
|
464 |
+
|
465 |
+
|
466 |
+
def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Raise ``x`` to ``exp``, clamping negative bases to 0 when ``exp < 1``."""
    if exp >= 1:
        return np.power(x, exp)

    # For exponents below 1 the base is floored at zero before the power.
    non_negative = np.maximum(x, 0)
    return np.power(non_negative, exp)
|
469 |
+
|
fn_gen/norm_nlr/15/loss.png
ADDED
fn_gen/norm_nlr/15/quantization.png
ADDED
fn_gen/norm_nlr/16/__pycache__/fn.cpython-311.pyc
ADDED
Binary file (25.3 kB). View file
|
|
fn_gen/norm_nlr/16/distortion.png
ADDED