|
|
|
|
|
from __future__ import print_function |
|
import json |
|
import os |
|
import struct |
|
import sys |
|
import platform |
|
import re |
|
import time |
|
import traceback |
|
import requests |
|
import socket |
|
import random |
|
import math |
|
import numpy as np |
|
import torch |
|
import logging |
|
import datetime |
|
from torch.optim.lr_scheduler import _LRScheduler |
|
from torch import nn |
|
import torch.nn.functional as F |
|
from torch.nn.modules.loss import _WeightedLoss |
|
|
|
|
|
|
|
def seed_all(seed_value, cuda_deterministic=False): |
|
""" |
|
    Set all random seeds (Python hash seed, random, NumPy, PyTorch CPU and CUDA).
    If cuda_deterministic is True, cuDNN runs deterministically at the cost of speed.
|
""" |
|
random.seed(seed_value) |
|
os.environ['PYTHONHASHSEED'] = str(seed_value) |
|
np.random.seed(seed_value) |
|
torch.manual_seed(seed_value) |
|
if torch.cuda.is_available(): |
|
torch.cuda.manual_seed(seed_value) |
|
torch.cuda.manual_seed_all(seed_value) |
|
|
|
if cuda_deterministic: |
|
torch.backends.cudnn.deterministic = True |
|
torch.backends.cudnn.benchmark = False |
|
else: |
|
torch.backends.cudnn.deterministic = False |
|
torch.backends.cudnn.benchmark = True |
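# Usage sketch (the seed value is illustrative). Call once per process before
# building datasets and models:
#
#     seed_all(42, cuda_deterministic=True)   # fully reproducible, slower cuDNN
#     seed_all(42)                            # faster, non-deterministic cuDNN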
|
|
|
|
|
def set_log(logfileName, rank=-1): |
|
""" |
|
    The master node (rank -1 or 0) logs at INFO level; other ranks only log WARNING and above.
|
""" |
|
log_file_folder = os.path.dirname(logfileName) |
|
time_now = datetime.datetime.now() |
|
logfileName = f'{logfileName}_{time_now.year}_{time_now.month}_{time_now.day}_{time_now.hour}_{time_now.minute}.log' |
|
    if log_file_folder:
        os.makedirs(log_file_folder, exist_ok=True)
|
|
|
logging.basicConfig(level=logging.INFO if rank in [-1, 0] else logging.WARN, |
|
format='[%(asctime)s %(levelname)s %(filename)s line %(lineno)d %(process)d] %(message)s', |
|
datefmt='[%X]', |
|
handlers=[logging.FileHandler(logfileName), logging.StreamHandler()] |
|
) |
|
logger = logging.getLogger() |
|
return logger |
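# Usage sketch (path and rank variable are illustrative):
#
#     logger = set_log('./logs/train', rank=local_rank)
#     logger.info('written by rank -1/0 only; other ranks log WARNING and above')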
|
|
|
|
|
def save_ckpt(epoch, model, optimizer, scheduler, losses, model_name, ckpt_folder): |
|
""" |
|
    Save a full training checkpoint (epoch, model, optimizer, scheduler, losses).
    Assumes `model` is wrapped in DistributedDataParallel (uses `model.module`).
|
""" |
|
if not os.path.exists(ckpt_folder): |
|
os.makedirs(ckpt_folder) |
|
torch.save( |
|
{ |
|
'epoch': epoch, |
|
'model_state_dict': model.module.state_dict(), |
|
'optimizer_state_dict': optimizer.state_dict(), |
|
'scheduler_state_dict': scheduler.state_dict(), |
|
'losses': losses, |
|
}, |
|
f'{ckpt_folder}{model_name}_{epoch}.pth' |
|
) |
|
|
|
def save_simple_ckpt(model, model_name, ckpt_folder): |
|
""" |
|
    Save only the model weights. Assumes `model` is wrapped in DistributedDataParallel.
|
""" |
|
if not os.path.exists(ckpt_folder): |
|
os.makedirs(ckpt_folder) |
|
torch.save( |
|
{ |
|
'model_state_dict': model.module.state_dict() |
|
}, |
|
f'{ckpt_folder}{model_name}.pth' |
|
) |
|
|
|
def save_best_ckpt(epoch, model, optimizer, scheduler, losses, model_name, ckpt_folder): |
|
""" |
|
    Save the current best checkpoint (epoch, model, optimizer, scheduler, losses) as `<model_name>_best.pth`.
|
""" |
|
if not os.path.exists(ckpt_folder): |
|
os.makedirs(ckpt_folder) |
|
torch.save( |
|
{ |
|
'epoch': epoch, |
|
'model_state_dict': model.module.state_dict(), |
|
'optimizer_state_dict': optimizer.state_dict(), |
|
'scheduler_state_dict': scheduler.state_dict(), |
|
'losses': losses, |
|
}, |
|
f'{ckpt_folder}{model_name}_best.pth' |
|
) |
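# Usage sketch for the three checkpoint helpers above (names are illustrative).
# `ckpt_folder` is concatenated directly with the file name, so it should end
# with a path separator:
#
#     save_ckpt(epoch, ddp_model, optimizer, scheduler, losses, 'my_model', './ckpts/')
#     save_best_ckpt(epoch, ddp_model, optimizer, scheduler, losses, 'my_model', './ckpts/')
#     save_simple_ckpt(ddp_model, 'my_model', './ckpts/')
#
# All three access `model.module.state_dict()`, so the model is assumed to be
# wrapped in torch.nn.parallel.DistributedDataParallel.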
|
|
|
def get_reduced(tensor, current_device, dest_device, world_size): |
|
""" |
|
    Reduce a scalar value or tensor from all GPUs onto the destination device and return its mean over world_size.
|
""" |
|
tensor = tensor.clone().detach() if torch.is_tensor(tensor) else torch.tensor(tensor) |
|
tensor = tensor.to(current_device) |
|
torch.distributed.reduce(tensor, dst=dest_device) |
|
tensor_mean = tensor.item() / world_size |
|
return tensor_mean |
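# Usage sketch (assumes torch.distributed is initialized; `local_rank` and
# `world_size` are illustrative):
#
#     loss_mean = get_reduced(loss, local_rank, 0, world_size)
#
# `torch.distributed.reduce` sums into the destination rank, so only that rank
# is guaranteed to hold the correct mean.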
|
|
|
def get_ndtensor_reduced(tensor, current_device, dest_device, world_size): |
|
""" |
|
    Reduce a 1-D or 2-D tensor from all GPUs onto the destination device and return its element-wise mean over world_size.
|
""" |
|
tensor = tensor.clone().detach() if torch.is_tensor(tensor) else torch.tensor(tensor) |
|
tensor = tensor.to(current_device) |
|
torch.distributed.reduce(tensor, dst=dest_device) |
|
tensor_mean = torch.zeros(tensor.shape) |
|
if len(tensor.shape) == 2: |
|
for i in range(tensor.shape[0]): |
|
for j in range(tensor.shape[1]): |
|
tensor_mean[i,j] = tensor[i,j].item() / world_size |
|
elif len(tensor.shape) == 1: |
|
for i in range(tensor.shape[0]): |
|
tensor_mean[i] = tensor[i].item() / world_size |
|
return tensor_mean |
|
|
|
def numel(m: torch.nn.Module, only_trainable: bool = False): |
|
""" |
|
returns the total number of parameters used by `m` (only counting |
|
shared parameters once); if `only_trainable` is True, then only |
|
includes parameters with `requires_grad = True` |
|
""" |
|
parameters = m.parameters() |
|
if only_trainable: |
|
parameters = list(p for p in parameters if p.requires_grad) |
|
unique = dict((p.data_ptr(), p) for p in parameters).values() |
|
return sum(p.numel() for p in unique) |
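# Usage sketch:
#
#     total_params = numel(model)
#     trainable_params = numel(model, only_trainable=True)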
|
|
|
|
|
def label_smooth(y, K, epsilon=0.1): |
|
""" |
|
    Label smoothing for multiclass labels.
    One-hot encodes labels `y` over `K` classes with smoothing `epsilon`;
    `y` is expected to contain 1-indexed class labels, e.g. [1, 6, 3].
|
""" |
|
m = len(y) |
|
out = np.ones((m, K)) * epsilon / K |
|
for index in range(m): |
|
out[index][y[index] - 1] += 1 - epsilon |
|
return torch.tensor(out) |
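# Worked example: with y = [1, 3], K = 3 and epsilon = 0.1, every entry starts
# at 0.1 / 3 ≈ 0.0333 and the (1-indexed) true class gains 0.9, so
# label_smooth([1, 3], 3) ≈ tensor([[0.9333, 0.0333, 0.0333],
#                                   [0.0333, 0.0333, 0.9333]])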
|
|
|
|
|
class SequentialDistributedSampler(torch.utils.data.sampler.Sampler): |
|
""" |
|
    Distributed Sampler that subsamples indices sequentially,
|
making it easier to collate all results at the end. |
|
Even though we only use this sampler for eval and predict (no training), |
|
which means that the model params won't have to be synced (i.e. will not hang |
|
for synchronization even if varied number of forward passes), we still add extra |
|
samples to the sampler to make it evenly divisible (like in `DistributedSampler`) |
|
to make it easy to `gather` or `reduce` resulting tensors at the end of the loop. |
|
""" |
|
|
|
def __init__(self, dataset, batch_size, world_size, rank=None, num_replicas=None): |
|
if num_replicas is None: |
|
if not torch.distributed.is_available(): |
|
raise RuntimeError("Requires distributed package to be available") |
|
num_replicas = world_size |
|
if rank is None: |
|
if not torch.distributed.is_available(): |
|
raise RuntimeError("Requires distributed package to be available") |
|
rank = torch.distributed.get_rank() |
|
self.dataset = dataset |
|
self.num_replicas = num_replicas |
|
self.rank = rank |
|
self.batch_size = batch_size |
|
self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.batch_size / self.num_replicas)) * self.batch_size |
|
self.total_size = self.num_samples * self.num_replicas |
|
|
|
def __iter__(self): |
|
indices = list(range(len(self.dataset))) |
|
|
|
indices += [indices[-1]] * (self.total_size - len(indices)) |
|
|
|
indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples] |
|
return iter(indices) |
|
|
|
def __len__(self): |
|
return self.num_samples |
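    # Usage sketch for evaluation (dataset and batch size are illustrative):
    #
    #     sampler = SequentialDistributedSampler(eval_dataset, batch_size=32,
    #                                            world_size=world_size)
    #     loader = torch.utils.data.DataLoader(eval_dataset, batch_size=32,
    #                                          sampler=sampler)
    #
    # Each rank receives a contiguous, equally sized slice of the padded index
    # list, which pairs with `distributed_concat` below.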
|
|
|
|
|
def distributed_concat(tensor, num_total_examples, world_size): |
|
""" |
|
    Gather inference results from all processes and concatenate them, truncating the padding added by the sampler.
|
""" |
|
output_tensors = [tensor.clone() for _ in range(world_size)] |
|
torch.distributed.all_gather(output_tensors, tensor) |
|
concat = torch.cat(output_tensors, dim=0) |
|
|
|
return concat[:num_total_examples] |
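# Usage sketch, paired with SequentialDistributedSampler (names illustrative):
#
#     preds = [model(batch.to(device)) for batch in loader]
#     preds = distributed_concat(torch.cat(preds, dim=0),
#                                num_total_examples=len(eval_dataset),
#                                world_size=world_size)
#
# `all_gather` requires every rank to contribute a tensor of the same shape,
# which is why the sampler pads its indices to an evenly divisible total.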
|
|
|
|
|
class CosineAnnealingWarmupRestarts(_LRScheduler): |
|
""" |
|
    Cosine annealing learning-rate scheduler with linear warmup and warm restarts.

    optimizer (Optimizer): Wrapped optimizer.

    first_cycle_steps (int): First cycle step size.

    cycle_mult(float): Cycle steps magnification. Default: 1.
|
max_lr(float): First cycle's max learning rate. Default: 0.1. |
|
min_lr(float): Min learning rate. Default: 0.001. |
|
warmup_steps(int): Linear warmup step size. Default: 0. |
|
gamma(float): Decrease rate of max learning rate by cycle. Default: 1. |
|
last_epoch (int): The index of last epoch. Default: -1. |
|
""" |
|
|
|
def __init__(self, |
|
optimizer : torch.optim.Optimizer, |
|
first_cycle_steps : int, |
|
cycle_mult : float = 1., |
|
max_lr : float = 0.1, |
|
min_lr : float = 0.001, |
|
warmup_steps : int = 0, |
|
gamma : float = 1., |
|
last_epoch : int = -1 |
|
): |
|
assert warmup_steps < first_cycle_steps |
|
|
|
self.first_cycle_steps = first_cycle_steps |
|
self.cycle_mult = cycle_mult |
|
self.base_max_lr = max_lr |
|
self.max_lr = max_lr |
|
self.min_lr = min_lr |
|
self.warmup_steps = warmup_steps |
|
self.gamma = gamma |
|
|
|
self.cur_cycle_steps = first_cycle_steps |
|
self.cycle = 0 |
|
self.step_in_cycle = last_epoch |
|
|
|
super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch) |
|
|
|
|
|
self.init_lr() |
|
|
|
def init_lr(self): |
|
self.base_lrs = [] |
|
for param_group in self.optimizer.param_groups: |
|
param_group['lr'] = self.min_lr |
|
self.base_lrs.append(self.min_lr) |
|
|
|
def get_lr(self): |
|
if self.step_in_cycle == -1: |
|
return self.base_lrs |
|
elif self.step_in_cycle < self.warmup_steps: |
|
return [(self.max_lr - base_lr)*self.step_in_cycle / self.warmup_steps + base_lr for base_lr in self.base_lrs] |
|
else: |
|
return [base_lr + (self.max_lr - base_lr) \ |
|
* (1 + math.cos(math.pi * (self.step_in_cycle-self.warmup_steps) \ |
|
/ (self.cur_cycle_steps - self.warmup_steps))) / 2 |
|
for base_lr in self.base_lrs] |
|
|
|
def step(self, epoch=None): |
|
if epoch is None: |
|
epoch = self.last_epoch + 1 |
|
self.step_in_cycle = self.step_in_cycle + 1 |
|
if self.step_in_cycle >= self.cur_cycle_steps: |
|
self.cycle += 1 |
|
self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps |
|
self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps |
|
else: |
|
if epoch >= self.first_cycle_steps: |
|
if self.cycle_mult == 1.: |
|
self.step_in_cycle = epoch % self.first_cycle_steps |
|
self.cycle = epoch // self.first_cycle_steps |
|
else: |
|
n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult)) |
|
self.cycle = n |
|
self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1)) |
|
self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (n) |
|
else: |
|
self.cur_cycle_steps = self.first_cycle_steps |
|
self.step_in_cycle = epoch |
|
|
|
self.max_lr = self.base_max_lr * (self.gamma**self.cycle) |
|
self.last_epoch = math.floor(epoch) |
|
for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): |
|
param_group['lr'] = lr |
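    # Usage sketch (hyperparameters are illustrative):
    #
    #     optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    #     scheduler = CosineAnnealingWarmupRestarts(optimizer,
    #                                               first_cycle_steps=1000,
    #                                               warmup_steps=100,
    #                                               max_lr=1e-3, min_lr=1e-6)
    #     for step in range(total_steps):
    #         train_one_step(...)   # hypothetical training step
    #         optimizer.step()
    #         scheduler.step()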
|
|
|
|
|
class DistanceLoss(_WeightedLoss): |
|
""" |
|
    Cross-entropy loss weighted by the ordinal distance between the predicted and target class.
|
""" |
|
def __init__(self, weight=None, reduction='mean', ignore_index = None): |
|
super().__init__(weight=weight, reduction=reduction) |
|
self.weight = weight |
|
self.reduction = reduction |
|
self.ignore_index = ignore_index |
|
def forward(self, inputs, targets): |
|
if len(inputs.shape) > 2: |
|
inputs = inputs.reshape(-1, inputs.size(-1)) |
|
if len(targets.shape) > 1: |
|
targets = targets.reshape(-1) |
|
if self.ignore_index is not None: |
|
keep_index = (targets != self.ignore_index).nonzero(as_tuple=True)[0] |
|
targets = torch.index_select(targets, 0, keep_index) |
|
inputs = torch.index_select(inputs, 0, keep_index) |
|
lsm = F.log_softmax(inputs, -1) |
|
targets = torch.empty(size=(targets.size(0), inputs.size(-1)), device=targets.device).fill_(0).scatter_(1, targets.data.unsqueeze(1), 1) |
|
if self.weight is not None: |
|
lsm = lsm * self.weight.unsqueeze(0) |
|
loss = -(targets * lsm).sum(-1) |
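        # Distance weighting: recover predicted and target class indices by an
        # argmax that excludes the first and last classes (presumably special /
        # padding classes), then scale each sample's cross entropy by
        # |predicted - target| + 1e-2.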
|
inputs = nn.Softmax(dim=-1)(inputs)[..., 1:-1].argmax(dim=-1) + 1 |
|
|
|
targets = nn.Softmax(dim=-1)(targets)[..., 1:-1].argmax(dim=-1) + 1 |
|
|
|
distance = abs(inputs - targets) + 1e-2 |
|
|
|
|
|
loss = loss * distance |
|
if self.reduction == 'sum': |
|
loss = loss.sum() |
|
elif self.reduction == 'mean': |
|
loss = loss.mean() |
|
return loss |
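    # Usage sketch (ignore value is illustrative). Misclassifications far from
    # the true class index are penalised more heavily:
    #
    #     criterion = DistanceLoss(ignore_index=0)
    #     loss = criterion(logits, labels)   # logits: (N, C) or (B, T, C); labels: (N,) or (B, T)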
|
|
|
|
|
class LabelSmoothCrossEntropyLoss(_WeightedLoss): |
|
""" |
|
    CrossEntropyLoss with Label Smoothing
|
""" |
|
def __init__(self, weight=None, reduction='mean', smoothing=0.0): |
|
super().__init__(weight=weight, reduction=reduction) |
|
self.smoothing = smoothing |
|
self.weight = weight |
|
self.reduction = reduction |
|
|
|
@staticmethod |
|
def _smooth_one_hot(targets: torch.Tensor, n_classes: int, smoothing=0.0): |
|
assert 0 <= smoothing < 1 |
|
with torch.no_grad(): |
|
targets = torch.empty(size=(targets.size(0), n_classes), |
|
device=targets.device) \ |
|
.fill_(smoothing / (n_classes - 1)) \ |
|
.scatter_(1, targets.data.unsqueeze(1), 1. - smoothing) |
|
return targets |
|
|
|
def forward(self, inputs, targets): |
|
targets = LabelSmoothCrossEntropyLoss._smooth_one_hot(targets, inputs.size(-1), |
|
self.smoothing) |
|
lsm = F.log_softmax(inputs, -1) |
|
|
|
if self.weight is not None: |
|
lsm = lsm * self.weight.unsqueeze(0) |
|
|
|
loss = -(targets * lsm).sum(-1) |
|
|
|
if self.reduction == 'sum': |
|
loss = loss.sum() |
|
elif self.reduction == 'mean': |
|
loss = loss.mean() |
|
|
|
return loss |
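    # Usage sketch (smoothing value is illustrative):
    #
    #     criterion = LabelSmoothCrossEntropyLoss(smoothing=0.1)
    #     loss = criterion(logits, labels)   # logits: (N, C); labels: (N,) class indices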