import torch
import torch.nn as nn
import torch.optim as optim
from ding.torch_utils.optimizer_helper import Adam, RMSprop, calculate_grad_norm, \
    calculate_grad_norm_without_bias_two_norm, PCGrad, configure_weight_decay
import pytest
import time


class LinearNet(nn.Module):

    def __init__(self, features_in=1, features_out=1):
        super().__init__()
        self.linear = nn.Linear(features_in, features_out)
        self._init_weight()

    def forward(self, x):
        return self.linear(x)

    def _init_weight(self):
        nn.init.constant_(self.linear.weight, val=1)
        nn.init.constant_(self.linear.bias, val=0)


def try_optim_with(tname, t, optim_t):
    """Run a short training loop with the requested grad-clip/grad-ignore variant and return the final weight."""
    net = LinearNet()
    mse_fn = nn.L1Loss()  # note: an L1 loss despite the variable name
    if tname == 'grad_clip':
        if optim_t == 'rmsprop':
            optimizer = RMSprop(
                net.parameters(),
                grad_clip_type=t,
                clip_value=0.000001,
                clip_norm_type=1.2,
                lr=0.1,
                clip_momentum_timestep=2,
                ignore_momentum_timestep=2,
                clip_coef=0.5
            )
        else:
            optimizer = Adam(
                net.parameters(),
                grad_clip_type=t,
                clip_value=0.000001,
                clip_norm_type=1.2,
                lr=0.1,
                optim_type=optim_t,
                clip_momentum_timestep=2,
                ignore_momentum_timestep=2,
                clip_coef=0.5
            )
    if tname == 'grad_ignore':
        if optim_t == 'rmsprop':
            optimizer = RMSprop(
                net.parameters(),
                grad_ignore_type=t,
                clip_value=0.000001,
                ignore_value=0.000001,
                ignore_norm_type=1.2,
                lr=0.1,
                clip_momentum_timestep=2,
                ignore_momentum_timestep=2,
            )
        else:
            optimizer = Adam(
                net.parameters(),
                grad_ignore_type=t,
                clip_value=0.000001,
                ignore_value=0.000001,
                ignore_norm_type=1.2,
                lr=0.1,
                optim_type=optim_t,
                clip_momentum_timestep=2,
                ignore_momentum_timestep=2,
                ignore_coef=0.01
            )
    # network input and target label
    x = torch.FloatTensor([120])
    x.requires_grad = True
    target_value = torch.FloatTensor([2])
    target_value.requires_grad = True
    # loss computation
    for _ in range(10):
        predict = net(x)
        loss = mse_fn(predict, target_value)
        loss.backward()
        optimizer.step()
    if t is not None and 'ignore' not in t:
        assert optimizer.get_grad() != 0.
    for _ in range(10):
        target_value = torch.FloatTensor([_ ** 2])
        target_value.requires_grad = True
        predict = net(x)
        loss = mse_fn(predict, target_value)
        loss.backward()
        optimizer.step()
    if t is None:
        print("weight without optimizer clip: " + str(net.linear.weight))
    else:
        print("weight with optimizer {} of type: {} is ".format(tname, t) + str(net.linear.weight))
    weight = net.linear.weight
    return weight


@pytest.mark.unittest
class TestAdam:

    def test_naive(self):
        support_type = {
            'optim': ['adam', 'adamw'],
            'grad_clip': [None, 'clip_momentum', 'clip_value', 'clip_norm', 'clip_momentum_norm'],
            'grad_norm': [None],
            'grad_ignore': [None, 'ignore_momentum', 'ignore_value', 'ignore_norm', 'ignore_momentum_norm'],
        }
        for optim_t in support_type['optim']:
            for tname in ['grad_clip', 'grad_ignore']:
                for t in support_type[tname]:
                    try_optim_with(tname=tname, t=t, optim_t=optim_t)


@pytest.mark.unittest
class TestRMSprop:

    def test_naive(self):
        support_type = {
            'grad_clip': [None, 'clip_momentum', 'clip_value', 'clip_norm', 'clip_momentum_norm'],
            'grad_norm': [None],
            'grad_ignore': [None, 'ignore_momentum', 'ignore_value', 'ignore_norm', 'ignore_momentum_norm'],
        }
        for tname in ['grad_clip', 'grad_ignore']:
            for t in support_type[tname]:
                try_optim_with(tname=tname, t=t, optim_t='rmsprop')


@pytest.mark.unittest
class Test_calculate_grad_norm_with_without_bias:

    def test_two_functions(self):
        net = LinearNet()
        mse_fn = nn.L1Loss()
        optimizer = Adam(net.parameters())
        x = torch.FloatTensor([120])
        x.requires_grad = True
        target_value = torch.FloatTensor([2])
        target_value.requires_grad = True
        for _ in range(10):
            predict = net(x)
            loss = mse_fn(predict, target_value)
            loss.backward()
            optimizer.step()
        inf_norm = calculate_grad_norm(model=net, norm_type='inf')
        two_norm = calculate_grad_norm(model=net)
        two_norm_nobias = float(calculate_grad_norm_without_bias_two_norm(model=net))
        one_norm = calculate_grad_norm(model=net, norm_type=1)
        assert isinstance(two_norm, float)
        assert isinstance(inf_norm, float)
        assert isinstance(one_norm, float)
        assert isinstance(two_norm_nobias, float)


@pytest.mark.unittest
class TestPCGrad:

    def test_naive(self):
        x, y = torch.randn(2, 3), torch.randn(2, 4)
        net = LinearNet(3, 4)
        y_pred = net(x)
        pc_adam = PCGrad(optim.Adam(net.parameters()))
        pc_adam.zero_grad()
        loss1_fn, loss2_fn = nn.L1Loss(), nn.MSELoss()
        loss1, loss2 = loss1_fn(y_pred, y), loss2_fn(y_pred, y)
        pc_adam.pc_backward([loss1, loss2])
        for p in net.parameters():
            assert isinstance(p, torch.Tensor)


@pytest.mark.unittest
class TestWeightDecay:

    def test_wd(self):
        net = nn.Sequential(nn.Linear(3, 4), nn.LayerNorm(4))
        x = torch.randn(1, 3)
        group_params = configure_weight_decay(model=net, weight_decay=1e-4)
        # the Linear weight gets weight decay; the Linear bias and both LayerNorm parameters do not
        assert group_params[0]['weight_decay'] == 1e-4
        assert group_params[1]['weight_decay'] == 0
        assert len(group_params[0]['params']) == 1
        assert len(group_params[1]['params']) == 3
        opt = Adam(group_params, lr=1e-2)
        opt.zero_grad()
        y = torch.sum(net(x))
        y.backward()
        opt.step()
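

# A minimal convenience entry point, not part of the original test classes above; it is only
# a sketch (assuming pytest is installed) so the module can also be run directly as a script.
if __name__ == '__main__':
    pytest.main(['-sv', __file__])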