Spaces:

zjowowen
/

gomoku

Sleeping

File size: 6,878 Bytes

079c32c

import math
from typing import List, Dict, Any, Tuple
from collections import namedtuple

import torch
import torch.nn as nn
from torch.optim import Adam, SGD, AdamW
from torch.optim.lr_scheduler import LambdaLR

from ding.policy import Policy
from ding.model import model_wrap
from ding.torch_utils import to_device
from ding.utils import EasyTimer
from ding.utils import POLICY_REGISTRY


@POLICY_REGISTRY.register('pc_bfs')
class ProcedureCloningBFSPolicy(Policy):

    def default_model(self) -> Tuple[str, List[str]]:
        return 'pc_bfs', ['ding.model.template.procedure_cloning']

    config = dict(
        type='pc',
        cuda=False,
        on_policy=False,
        continuous=False,
        max_bfs_steps=100,
        learn=dict(
            update_per_collect=1,
            batch_size=32,
            learning_rate=1e-5,
            lr_decay=False,
            decay_epoch=30,
            decay_rate=0.1,
            warmup_lr=1e-4,
            warmup_epoch=3,
            optimizer='SGD',
            momentum=0.9,
            weight_decay=1e-4,
        ),
        collect=dict(
            unroll_len=1,
            noise=False,
            noise_sigma=0.2,
            noise_range=dict(
                min=-0.5,
                max=0.5,
            ),
        ),
        eval=dict(),
        other=dict(replay_buffer=dict(replay_buffer_size=10000)),
    )

    def _init_learn(self):
        assert self._cfg.learn.optimizer in ['SGD', 'Adam']
        if self._cfg.learn.optimizer == 'SGD':
            self._optimizer = SGD(
                self._model.parameters(),
                lr=self._cfg.learn.learning_rate,
                weight_decay=self._cfg.learn.weight_decay,
                momentum=self._cfg.learn.momentum
            )
        elif self._cfg.learn.optimizer == 'Adam':
            if self._cfg.learn.weight_decay is None:
                self._optimizer = Adam(
                    self._model.parameters(),
                    lr=self._cfg.learn.learning_rate,
                )
            else:
                self._optimizer = AdamW(
                    self._model.parameters(),
                    lr=self._cfg.learn.learning_rate,
                    weight_decay=self._cfg.learn.weight_decay
                )
        if self._cfg.learn.lr_decay:

            def lr_scheduler_fn(epoch):
                if epoch <= self._cfg.learn.warmup_epoch:
                    return self._cfg.learn.warmup_lr / self._cfg.learn.learning_rate
                else:
                    ratio = (epoch - self._cfg.learn.warmup_epoch) // self._cfg.learn.decay_epoch
                    return math.pow(self._cfg.learn.decay_rate, ratio)

            self._lr_scheduler = LambdaLR(self._optimizer, lr_scheduler_fn)
        self._timer = EasyTimer(cuda=True)
        self._learn_model = model_wrap(self._model, 'base')
        self._learn_model.reset()
        self._max_bfs_steps = self._cfg.max_bfs_steps
        self._maze_size = self._cfg.maze_size
        self._num_actions = self._cfg.num_actions

        self._loss = nn.CrossEntropyLoss()

    def process_states(self, observations, maze_maps):
        """Returns [B, W, W, 3] binary values. Channels are (wall; goal; obs)"""
        loc = torch.nn.functional.one_hot(
            (observations[:, 0] * self._maze_size + observations[:, 1]).long(),
            self._maze_size * self._maze_size,
        ).long()
        loc = torch.reshape(loc, [observations.shape[0], self._maze_size, self._maze_size])
        states = torch.cat([maze_maps, loc], dim=-1).long()
        return states

    def _forward_learn(self, data):
        if self._cuda:
            collated_data = to_device(data, self._device)
        else:
            collated_data = data
        observations = collated_data['obs'],
        bfs_input_maps, bfs_output_maps = collated_data['bfs_in'].long(), collated_data['bfs_out'].long()
        states = observations
        bfs_input_onehot = torch.nn.functional.one_hot(bfs_input_maps, self._num_actions + 1).float()

        bfs_states = torch.cat([
            states,
            bfs_input_onehot,
        ], dim=-1)
        logits = self._model(bfs_states)['logit']
        logits = logits.flatten(0, -2)
        labels = bfs_output_maps.flatten(0, -1)

        loss = self._loss(logits, labels)
        preds = torch.argmax(logits, dim=-1)
        acc = torch.sum((preds == labels)) / preds.shape[0]

        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()
        pred_loss = loss.item()

        cur_lr = [param_group['lr'] for param_group in self._optimizer.param_groups]
        cur_lr = sum(cur_lr) / len(cur_lr)
        return {'cur_lr': cur_lr, 'total_loss': pred_loss, 'acc': acc}

    def _monitor_vars_learn(self):
        return ['cur_lr', 'total_loss', 'acc']

    def _init_eval(self):
        self._eval_model = model_wrap(self._model, wrapper_name='base')
        self._eval_model.reset()

    def _forward_eval(self, data):
        if self._cuda:
            data = to_device(data, self._device)
        max_len = self._max_bfs_steps
        data_id = list(data.keys())
        output = {}

        for ii in data_id:
            states = data[ii].unsqueeze(0)
            bfs_input_maps = self._num_actions * torch.ones([1, self._maze_size, self._maze_size]).long()
            if self._cuda:
                bfs_input_maps = to_device(bfs_input_maps, self._device)
            xy = torch.where(states[:, :, :, -1] == 1)
            observation = (xy[1][0].item(), xy[2][0].item())

            i = 0
            while bfs_input_maps[0, observation[0], observation[1]].item() == self._num_actions and i < max_len:
                bfs_input_onehot = torch.nn.functional.one_hot(bfs_input_maps, self._num_actions + 1).long()

                bfs_states = torch.cat([
                    states,
                    bfs_input_onehot,
                ], dim=-1)
                logits = self._model(bfs_states)['logit']
                bfs_input_maps = torch.argmax(logits, dim=-1)
                i += 1
            output[ii] = bfs_input_maps[0, observation[0], observation[1]]
            if self._cuda:
                output[ii] = {'action': to_device(output[ii], 'cpu'), 'info': {}}
            if output[ii]['action'].item() == self._num_actions:
                output[ii]['action'] = torch.randint(low=0, high=self._num_actions, size=[1])[0]
        return output

    def _init_collect(self) -> None:
        raise NotImplementedError

    def _forward_collect(self, data: Dict[int, Any], **kwargs) -> Dict[int, Any]:
        raise NotImplementedError

    def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict:
        raise NotImplementedError

    def _get_train_sample(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        raise NotImplementedError