from __future__ import annotations

import copy
from typing import Dict, Any, Tuple, List, Callable

import numpy as np
import torch
import torch.nn as nn
from scipy.optimize import curve_fit


def quantization(x, **params):
    return (torch.div(1, replace_num(params['_s'], num=0, to=10000)) * torch.tanh((params['_0'] * x)))


def dequantization(x, **params):
    return (torch.div(1, replace_num(params['_0'], num=0, to=10000)) * torch.log(domain_guard(
        (torch.div(1, replace_num((torch.tensor(-1) + (params['_s'] * x)), num=0, to=10000))
         * (torch.tensor(-1) + (torch.tensor(-1) * params['_s'] * x))),
        min=1e-5, nan=1e-5)))
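
# Illustrative sketch (added for documentation; not part of the generated artifact and never
# called by the module): `quantization` maps x to tanh(_0 * x) / _s and `dequantization`
# maps a quantized value q back through log((1 + _s * q) / (1 - _s * q)) / _0, i.e.
# 2 * atanh(_s * q) / _0. Both expect the per-channel parameters to broadcast against the
# last dimension, which is why callers below transpose to (features, channels) first.
def _example_fake_quant_round_trip(n_channels: int = 4, n_features: int = 16) -> torch.Tensor:
    x = torch.randn(n_channels, n_features)
    params = {'_0': torch.ones(n_channels), '_s': torch.ones(n_channels)}  # one value per channel
    q = quantization(x=x.transpose(0, 1), **params)
    x_hat = dequantization(x=q, **params).transpose(0, 1)
    # With untuned parameters the composition is not the identity (it is approximately 2 * x
    # here); the search and training routines below fit _0 and _s so that it approximates x.
    return (x_hat - x).abs().max()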
def init_space_search(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:

    def _build_initial_param(tensor: torch.Tensor, max_initial: int, n_params: int):
        """Generates the initial set of parameters. The first iteration generates 10 times more parameters."""
        for _ in range(n_params * 10):  # The first iteration generates 10 times more parameters
            yield init_rand(tensor) * max_initial  # Generates n_params in range [-max_initial, max_initial]

    def _search_param(tensors: List[torch.Tensor], n_params):
        """Takes the best parameters and generates new parameters around the mean of the best parameters."""
        torch_tensors = torch.stack(tensors)
        min_vals, max_vals = torch.aminmax(torch_tensors, dim=0)
        abs_max_val_per_ch = torch.max(-min_vals, max_vals)
        mean = torch.mean(torch_tensors, dim=0)
        for _ in range(n_params):  # Generates n_params around the mean of the tensors
            yield torch.randn_like(min_vals) * abs_max_val_per_ch + mean

    def _calc(x, qtz_func, deqtz_func, **params):
        x_ = x.transpose(0, 1)
        x_ = qtz_func(x=x_, **params)
        x_ = deqtz_func(x=x_, **params)
        x_ = x_.transpose(0, 1)
        return x_

    assert "qtz_func" in kwargs, "qtz_func must be provided."
    assert "deqtz_func" in kwargs, "deqtz_func must be provided."
    assert "params_list" in kwargs, "params_list must be provided."
    assert "param" in kwargs, "param must be provided."

    qtz_func = kwargs.get('qtz_func')
    deqtz_func = kwargs.get('deqtz_func')
    params_list = kwargs.get('params_list')
    param = kwargs.get('param')

    n_runs = 50           # Number of runs to try to find the best parameters
    n_random_params = 50  # Number of random parameters to generate
    n_best_to_pick = 5    # Number of best parameters to pick after each run
    max_initial = 10000   # Maximum value to initialize the parameters

    # Initializes the parameters
    base_params = {p: init_ones(x, **kwargs) for p in params_list if p != param}
    params = _build_initial_param(x, max_initial, n_random_params)

    # Performs the search
    for _ in range(n_runs):
        best_params = []
        for param_ in params:
            try:
                x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: param_})
                loss_ones = nn.MSELoss()(x, x_)

                if len(best_params) < n_best_to_pick:
                    best_params.append((param_, loss_ones.item()))
                    best_params = sorted(best_params, key=lambda x: x[1])
                elif loss_ones < best_params[-1][1]:
                    best_params[-1] = (param_, loss_ones.item())
                    best_params = sorted(best_params, key=lambda x: x[1])

            except Exception:  # The parameters might not be valid for the function's domain
                continue

        # Generates new parameters around the mean
        params = _search_param([p for p, _ in best_params], n_random_params)

    # Checks if the best parameter is better than the init_ones
    p_ones = init_ones(x, **kwargs)
    x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_ones})
    loss_ones = nn.MSELoss()(x, x_)

    # Checks if the best parameter is better than the init_rand
    p_rand = init_rand(x, **kwargs)
    x_ = _calc(x, qtz_func, deqtz_func, **base_params, **{param: p_rand})
    loss_rand = nn.MSELoss()(x, x_)

    if loss_rand < best_params[0][1] and loss_rand < loss_ones:
        return p_rand
    elif loss_ones < best_params[0][1] and loss_ones < loss_rand:
        return p_ones
    else:
        return best_params[0][0]


def init_linear_scale(  # Symmetric scale. From the study folder
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    assert "bits" in kwargs, "bits must be provided."
    assert "params" in kwargs, "params must be provided."
    assert "qtz_func" in kwargs, "qtz_func must be provided."

    bits = kwargs.get('bits')
    params = kwargs.get('params')
    qtz_func = kwargs.get('qtz_func')

    x_ = x.transpose(0, 1)
    x_ = qtz_func(x=x_, **params, _s=init_ones(x, **kwargs))
    x_ = x_.transpose(0, 1)

    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    min_vals, max_vals = torch.aminmax(x_, dim=1)
    min_vals = torch.min(min_vals, torch.zeros_like(min_vals))
    max_vals = torch.max(max_vals, torch.zeros_like(max_vals))

    eps = torch.finfo(torch.float32).eps

    abs_max_val_per_ch = torch.max(-min_vals, max_vals)
    scale = abs_max_val_per_ch / (float(quant_max - quant_min) / 2)

    scale = torch.clamp(scale, min=eps).to(dtype=torch.float32, device=min_vals.device)

    # Introduces some noise in scale.
    # The original claim was that without this noise accuracy would stay at 0.0 and nothing
    # would be learned.
    # NOTE(diogo): This has been disproven. The noise does not help the learning process, but the
    # line is kept here for future reference and will be removed later.
    # scale = scale + 0.01 * torch.randn_like(scale)

    return scale
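
# Worked example (illustrative; added for documentation and never called by the module):
# for bits=8 the signed range is [-128, 127], so a channel whose quantized output has an
# absolute maximum of 0.9 gets scale = 0.9 / ((127 - (-128)) / 2) = 0.9 / 127.5 ≈ 7.06e-3.
# Dividing by this scale inside `quantization` stretches the channel to roughly the full
# int8 range before rounding in `quantize` below. The channel maxima used here are
# hypothetical.
def _example_symmetric_scale() -> torch.Tensor:
    quant_min, quant_max = get_min_max_from_bits_signed(8)  # (-128, 127)
    abs_max_val_per_ch = torch.tensor([0.9, 0.5])           # hypothetical per-channel maxima
    return abs_max_val_per_ch / (float(quant_max - quant_min) / 2)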
def init_params(x: torch.Tensor, **kwargs: Dict[str, Any]) -> Dict[str, nn.Parameter]:
    params = {
        '_0': init_space_search(x, qtz_func=quantization, deqtz_func=dequantization, param='_0', params_list=['_0', '_s'], **kwargs),
    }
    params['_s'] = init_linear_scale(x, params=params, qtz_func=quantization, **kwargs)
    params = {k: nn.Parameter(v, requires_grad=False) for k, v in params.items()}

    if 'post_init_hook' in kwargs:
        kwargs['post_init_hook'](parameters=params)

    params = learn_parameters(x, params,
                              qtz_func=quantization,
                              deqtz_func=dequantization,
                              bits=kwargs['bits'],
                              target_dtype=torch.int8,
                              epochs=500,
                              early_stop=False,
                              )
    if 'post_train_hook' in kwargs:
        kwargs['post_train_hook'](parameters=params)

    return params


############### Numpy Qtz ###############


def np_quantization(x, _0, _s):
    return (np.divide(1, np_replace_num(_s, num=0, to=10000)) * np.tanh((_0 * x)))


def np_dequantization(x, _0, _s):
    return (np.divide(1, np_replace_num(_0, num=0, to=10000)) * np.log(np_domain_guard(
        (np.divide(1, np_replace_num((np.array(-1) + (_s * x)), num=0, to=10000))
         * (np.array(-1) + (np.array(-1) * _s * x))),
        min=1e-5, nan=1e-5)))


def fit_func(x, _0, _s):
    x_ = np_quantization(x, _0, _s)
    x_ = np_dequantization(x_, _0, _s)
    return x_


############### HELPERS ###############


def domain_guard(
    x: torch.Tensor,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> torch.Tensor:
    """Guard a tensor to a valid domain."""
    x = torch.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan)
    if min is not None or max is not None:
        x = torch.clamp(x, min=min, max=max)
    return x


def replace_num(x: torch.Tensor, num: float, to: float) -> torch.Tensor:
    """Replace a number in a tensor with another number.

    Args:
        x (torch.Tensor): The input tensor.
        num (float): The number to replace.
        to (float): The number to replace with.

    Returns:
        torch.Tensor: The tensor with the number replaced.
    """
    return torch.where(x == num, to, x)


def guarded_torch_power(x: torch.Tensor, exp: float) -> torch.Tensor:
    """Guard the power operation to a valid domain."""
    return torch.pow(x, exp) if exp >= 1 else torch.pow(torch.relu(x), exp)


def init_ones(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    val = torch.amin(x, dim=1)
    return torch.ones_like(val, dtype=torch.float32, device=x.device)


def init_rand(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    val = torch.amin(x, dim=1)
    return torch.randn_like(val, dtype=torch.float32, device=x.device)
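
# Usage sketch (illustrative; added for documentation, never called anywhere, and slow to run
# because `init_params` trains for 500 epochs): how the entry point above is expected to be
# driven for a 2-D weight tensor laid out as (out_channels, in_features). `bits` is the only
# required keyword; the hook names match the checks inside `init_params` itself. The tensor
# shape is a made-up example.
def _example_init_params() -> Dict[str, nn.Parameter]:
    weight = torch.randn(8, 32)           # hypothetical (out_channels, in_features) weight
    params = init_params(weight, bits=8)  # space search for _0, linear scale for _s, then training
    q = quantize(weight, params, quantization, bits=8, target_dtype=torch.int8)
    weight_hat = dequantize(q, params, dequantization, bits=8, out_dtype=weight.dtype)
    print("max reconstruction error:", (weight - weight_hat).abs().max().item())
    return params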
def init_non_linear_regression_fit(
    x: torch.Tensor,
    **kwargs: Dict[str, Any],
) -> torch.Tensor:
    assert "params_list" in kwargs, "params_list must be provided."
    assert "np_fit_func" in kwargs, "np_fit_func must be provided."
    assert "p0" in kwargs, "p0 must be provided."

    np_fit_func = kwargs.get('np_fit_func')
    params_list = kwargs.get('params_list')
    p0 = kwargs.get('p0')

    def _fit(xdata: np.ndarray, ydata: np.ndarray, func: Callable, p0: List[float]):
        popt, _ = curve_fit(
            func,
            xdata,
            ydata,
            maxfev=1000,
            p0=p0,
            method='lm'
        )
        return popt

    # 1. Converts the torch tensor to a numpy array
    xdata = x.cpu().numpy()

    # 2. Sorts the data so that it is easier to fit
    sorted_xdata = np.sort(xdata, axis=-1)

    p0 = {k: v.cpu().numpy() for k, v in p0.items()}
    params_list = sorted(params_list)  # We need to make sure that it matches the numpy fit func arg order

    # 3. Finds the best parameters for each channel
    try:
        params = []
        for i in range(sorted_xdata.shape[0]):
            xdata_ = sorted_xdata[i]
            p0_ = [p0[p][i] for p in params_list]
            ch_params = _fit(xdata_, xdata_, np_fit_func, p0_)
            params.append(ch_params)

        # 4. Builds the parameters
        result = {}
        for i, p in enumerate(params_list):
            result[p] = torch.tensor([p_[i] for p_ in params], dtype=torch.float32).to(x.device)

        return result

    except ValueError as e:
        print(f"Could not fit the function with error: {e}")
        print("Using fallback result...")
        return {
            k: torch.tensor(v, dtype=torch.float32).to(x.device)
            for k, v in p0.items()
        }


def init_zeros(x: torch.Tensor, **kwargs: Dict[str, Any]) -> torch.Tensor:
    val = torch.amin(x, dim=1)
    return torch.zeros_like(val, dtype=torch.float32, device=x.device)


def init_inner_scale(tensor: torch.Tensor, _min: float = torch.inf, _max: float = torch.inf) -> torch.Tensor:
    # Calculates the original minimum and maximum values
    min_vals, max_vals = torch.aminmax(tensor, dim=-1)
    x_min = torch.min(min_vals, torch.zeros_like(min_vals))
    x_max = torch.max(max_vals, torch.zeros_like(max_vals))

    if _max is torch.inf:  # We do not need to scale the tensor, just move it
        return torch.ones_like(x_min)

    # Calculates the scale factor
    scale = (_max - _min) / (x_max - x_min)
    return scale
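
# Usage sketch (illustrative; added for documentation, never called): the regression-based
# initializer above expects the numpy round-trip function `fit_func` plus a per-channel
# starting guess in `p0` for every entry of `params_list`. Shapes follow the rest of the
# module: rows are channels, columns are samples. The tensor shape and starting guesses are
# made-up examples.
def _example_regression_init() -> Dict[str, torch.Tensor]:
    weight = torch.randn(4, 64)
    p0 = {'_0': torch.ones(4), '_s': torch.ones(4)}  # hypothetical starting guesses
    return init_non_linear_regression_fit(
        weight,
        np_fit_func=fit_func,
        params_list=['_0', '_s'],
        p0=p0,
    )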
############## Quant ###############


@torch.enable_grad()
def learn_parameters(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    qtz_func: nn.Module,
    deqtz_func: nn.Module,
    bits: int,
    target_dtype: torch.dtype,
    epochs: int = 1000,
    early_stop: bool = True,
    do_report: bool = False
) -> Tuple[Dict[str, nn.Parameter], torch.Tensor]:
    loss_fn = nn.MSELoss()

    # Determines the initial learning rate by computing the initial loss and multiplying it by
    # the order of magnitude of the loss divided by 2
    quant = quantize(x, params, qtz_func, bits, target_dtype)
    dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
    loss = loss_fn(x, dequant)

    base_lr = 0.1
    exponent = int(np.floor(np.log10(loss.item())))
    lr = base_lr * (10 ** (exponent // 2))

    # Requires gradients in the parameters
    for p in params.values():
        p.requires_grad = True
        p.grad = None

    param_keys = list(params.keys())
    param_values = list(params.values())

    # Defines optimizer and loss function
    optimizer = torch.optim.Adam(param_values, lr=lr)

    # Contains the best loss and the best parameters
    best_loss = float("inf")
    best_params = None

    # Used to stop the search early
    min_delta = 1e-7
    acc_loss = []
    percent_epochs_before_stop = 0.1

    for i in range(epochs):
        optimizer.zero_grad()

        quant = quantize(x, params, qtz_func, bits, target_dtype)
        dequant = dequantize(quant, params, deqtz_func, bits, x.dtype)
        loss = loss_fn(x, dequant)

        if loss.isnan() or loss.isinf():
            raise Exception("Loss is NaN or Inf. Stopping the search.")

        loss.backward()
        optimizer.step()

        acc_loss.append(loss.item())

        # Reports loss every 10 steps
        if i % 10 == 0 and do_report:
            print(f"Epoch {i}: Loss {loss.item()}")

        # Optimizes the parameter search by storing the best loss and the parameters
        if loss.item() < best_loss:
            best_loss = loss.item()
            best_params = copy.deepcopy({
                k: v for k, v in params.items() if k in param_keys
            })

        # Also stops the search if the loss has not changed considerably during the last 10% of the epochs
        if early_stop:
            epochs_before_stop = int(epochs * percent_epochs_before_stop)
            if i > epochs_before_stop and abs(acc_loss[i - epochs_before_stop] - acc_loss[i]) < min_delta:
                break

    # No longer requires gradients in the parameters
    for p in best_params.values():
        p.requires_grad = False
        p.grad = None

    if do_report:
        print(f"Best loss: {best_loss}")
        return best_params, acc_loss
    else:
        return best_params


def quantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    target_dtype: torch.dtype = torch.int8
) -> torch.Tensor:
    quant_min, quant_max = get_min_max_from_bits_signed(bits)
    x = x.transpose(0, 1)  # Aligns shapes
    x = func(x=x, **params)
    x = x.transpose(0, 1)
    x = torch.clamp(round_func_BPDA(x), quant_min, quant_max).to(target_dtype)
    return x


def dequantize(
    x: torch.Tensor,
    params: Dict[str, nn.Parameter],
    func: nn.Module,
    bits: int,
    out_dtype: torch.dtype
) -> torch.Tensor:
    x = x.to(dtype=out_dtype)
    x = x.transpose(0, 1)
    x = func(x=x, **params)
    x = x.transpose(0, 1)
    return x


def round_func_BPDA(input):
    # This is equivalent to replacing the round function (non-differentiable) with
    # an identity function (differentiable) in the backward pass only.
    forward_value = torch.round(input)
    out = input.clone()
    out.data = forward_value.data
    return out
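
# Illustrative sketch (added for documentation; hypothetical helper, never called):
# `round_func_BPDA` acts as a straight-through estimator. The forward pass returns the
# rounded values, while gradients flow as if the op were the identity, which is what lets
# `learn_parameters` optimize _0 and _s through the otherwise non-differentiable rounding.
def _example_ste_gradient() -> torch.Tensor:
    x = torch.randn(5, requires_grad=True)
    y = round_func_BPDA(x).sum()
    y.backward()
    return x.grad  # all ones, i.e. the identity gradient of the straight-through estimator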
def get_min_max_from_bits_signed(bit_width: int) -> Tuple[int, int]:
    return -2 ** (bit_width - 1), 2 ** (bit_width - 1) - 1


############## Numpy ###############


def np_domain_guard(
    x: np.ndarray,
    min: float = None,
    max: float = None,
    posinf: float = None,
    neginf: float = None,
    nan: float = None
) -> np.ndarray:
    """Guard an array to a valid domain."""
    x = np.nan_to_num(x, posinf=posinf, neginf=neginf, nan=nan)
    if min is not None or max is not None:
        x = np.clip(x, min, max)
    return x


def np_replace_num(x: np.ndarray, num: float, to: float) -> np.ndarray:
    """Replace a number in an array with another number.

    Args:
        x (np.ndarray): The input array.
        num (float): The number to replace.
        to (float): The number to replace with.

    Returns:
        np.ndarray: The array with the number replaced.
    """
    return np.where(x == num, to, x)


def np_guarded_power(x: np.ndarray, exp: float) -> np.ndarray:
    """Guard the power operation to a valid domain."""
    return np.power(x, exp) if exp >= 1 else np.power(np.maximum(x, 0), exp)
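
# Optional smoke test (illustrative; added for documentation). Running the module directly
# exercises the lightweight sketches above; the heavier `_example_init_params` and
# `_example_regression_init` are left out on purpose because they are slow.
if __name__ == "__main__":
    print("signed int8 range:", get_min_max_from_bits_signed(8))
    print("fake-quant round-trip error with untuned params:", _example_fake_quant_round_trip().item())
    print("per-channel scales:", _example_symmetric_scale())
    print("STE gradient:", _example_ste_gradient())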