File size: 3,733 Bytes
2d5f249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# This script is borrowed and extended from https://github.com/nkolot/SPIN/blob/master/utils/base_trainer.py
from __future__ import division
import logging
from utils import CheckpointSaver
from tensorboardX import SummaryWriter

import torch
from tqdm import tqdm

# Set tqdm's class-level monitor interval to 0.
# NOTE(review): per the tqdm docs this disables its background monitor
# thread -- confirm against the installed tqdm version.
tqdm.monitor_interval = 0


# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class BaseTrainer(object):
    """Base class for Trainer objects.

    Takes care of checkpointing/logging/resuming training.

    Subclasses are expected to implement ``init_fn``, ``train``,
    ``train_step``, ``train_summaries`` and ``visualize``. Because
    ``__init__`` and ``load_pretrained`` read ``self.models_dict`` (and
    ``__init__`` also reads ``self.optimizers_dict``), those attributes have
    to be defined by the subclass, typically inside ``init_fn``.
    """

    def __init__(self, options):
        """Set up device, checkpoint saver, summary writer and resume state.

        Args:
            options: Configuration object. The attributes read here are
                ``multiprocessing_distributed``, ``gpu``, ``checkpoint_dir``,
                ``overwrite``, ``rank``, ``summary_dir`` and ``resume``.
        """
        self.options = options
        if options.multiprocessing_distributed:
            self.device = torch.device('cuda', options.gpu)
        else:
            self.device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu')
        self.saver = CheckpointSaver(save_dir=options.checkpoint_dir,
                                     overwrite=options.overwrite)
        # Only the rank-0 process owns a summary writer; on other ranks the
        # attribute is intentionally left undefined (original behaviour).
        if options.rank == 0:
            self.summary_writer = SummaryWriter(self.options.summary_dir)
        # Override init_fn to define your model, optimizers etc.
        self.init_fn()

        self.checkpoint = None
        if options.resume and self.saver.exists_checkpoint():
            self.checkpoint = self.saver.load_checkpoint(
                self.models_dict, self.optimizers_dict)

        # Restore the training counters from the checkpoint, or start fresh.
        if self.checkpoint is None:
            self.epoch_count = 0
            self.step_count = 0
            self.checkpoint_batch_idx = 0
        else:
            self.epoch_count = self.checkpoint['epoch']
            self.step_count = self.checkpoint['total_step_count']
            self.checkpoint_batch_idx = self.checkpoint['batch_idx']

        self.best_performance = float('inf')

    def load_pretrained(self, checkpoint_file=None):
        """Load a pretrained checkpoint.

        This is different from resuming training using --resume.

        Args:
            checkpoint_file: Path to a checkpoint readable by ``torch.load``.
                When ``None`` nothing is loaded. Entries of
                ``self.models_dict`` whose key is missing from the checkpoint
                are left untouched.
        """
        if checkpoint_file is None:
            return
        # Deserialize onto the CPU so that a checkpoint saved from a GPU can
        # be loaded on a machine without that device; load_state_dict then
        # copies the values into each model's existing parameters.
        # NOTE: torch.load unpickles the file -- only use trusted checkpoints.
        checkpoint = torch.load(checkpoint_file, map_location='cpu')
        for model in self.models_dict:
            if model in checkpoint:
                self.models_dict[model].load_state_dict(checkpoint[model],
                                                        strict=True)
                print(f'Checkpoint {model} loaded')

    def move_dict_to_device(self, dict, device, tensor2float=False):
        """Move every tensor value of ``dict`` to ``device`` in place.

        Non-tensor values are left unchanged. Nothing is returned.

        Args:
            dict: Mapping that is modified in place.
            device: Target device handed to ``Tensor.to``.
            tensor2float: If true, tensors are cast with ``.float()`` before
                being moved.
        """
        # The parameter name shadows the builtin but is kept for
        # backward compatibility with keyword callers.
        for k, v in dict.items():
            if isinstance(v, torch.Tensor):
                # Re-assigning an existing key does not resize the mapping,
                # so it is safe while iterating over items().
                dict[k] = (v.float() if tensor2float else v).to(device)

    # The following methods (with the possible exception of test) have to be
    # implemented in the derived classes.
    def train(self, epoch):
        """Run training for one epoch; called by ``fit`` with the epoch index."""
        raise NotImplementedError('You need to provide a train method')

    def init_fn(self):
        """Define models, optimizers etc.; called once from ``__init__``."""
        raise NotImplementedError('You need to provide an init_fn method')

    def train_step(self, input_batch):
        """Hook for a single training step on ``input_batch``."""
        raise NotImplementedError('You need to provide a train_step method')

    def train_summaries(self, input_batch):
        """Hook for writing training summaries for ``input_batch``."""
        raise NotImplementedError(
            'You need to provide a train_summaries method')

    def visualize(self, input_batch):
        """Hook for visualizing ``input_batch``."""
        raise NotImplementedError('You need to provide a visualize method')

    def validate(self):
        """Optional validation hook; does nothing by default."""
        pass

    def test(self):
        """Optional test hook; does nothing by default."""
        pass

    def evaluate(self):
        """Optional evaluation hook; does nothing by default."""
        pass

    def fit(self):
        """Run training from ``self.epoch_count`` up to ``options.num_epochs``.

        ``self.epoch_count`` is updated to the current epoch index before
        each call to ``train``.
        """
        # `initial` makes the progress bar start at the resumed epoch.
        for epoch in tqdm(range(self.epoch_count, self.options.num_epochs),
                          total=self.options.num_epochs,
                          initial=self.epoch_count):
            self.epoch_count = epoch
            self.train(epoch)