""" Optimizer Factory w/ Custom Weight Decay Hacked together by / Copyright 2020 Ross Wightman """ import re import torch from torch import optim as optim from utils.distributed import is_main_process import logging logger = logging.getLogger(__name__) try: from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD has_apex = True except ImportError: has_apex = False def add_weight_decay(model, weight_decay, no_decay_list=(), filter_bias_and_bn=True): named_param_tuples = [] for name, param in model.named_parameters(): if not param.requires_grad: continue # frozen weights if filter_bias_and_bn and (len(param.shape) == 1 or name.endswith(".bias")): named_param_tuples.append([name, param, 0]) elif name in no_decay_list: named_param_tuples.append([name, param, 0]) else: named_param_tuples.append([name, param, weight_decay]) return named_param_tuples def add_different_lr(named_param_tuples_or_model, diff_lr_names, diff_lr, default_lr): """use lr=diff_lr for modules named found in diff_lr_names, otherwise use lr=default_lr Args: named_param_tuples_or_model: List([name, param, weight_decay]), or nn.Module diff_lr_names: List(str) diff_lr: float default_lr: float Returns: named_param_tuples_with_lr: List([name, param, weight_decay, lr]) """ named_param_tuples_with_lr = [] logger.info(f"diff_names: {diff_lr_names}, diff_lr: {diff_lr}") for name, p, wd in named_param_tuples_or_model: use_diff_lr = False for diff_name in diff_lr_names: # if diff_name in name: if re.search(diff_name, name) is not None: logger.info(f"param {name} use different_lr: {diff_lr}") use_diff_lr = True break named_param_tuples_with_lr.append( [name, p, wd, diff_lr if use_diff_lr else default_lr] ) if is_main_process(): for name, _, wd, diff_lr in named_param_tuples_with_lr: logger.info(f"param {name}: wd: {wd}, lr: {diff_lr}") return named_param_tuples_with_lr def create_optimizer_params_group(named_param_tuples_with_lr): """named_param_tuples_with_lr: List([name, param, weight_decay, lr])""" group = {} for name, p, wd, lr in named_param_tuples_with_lr: if wd not in group: group[wd] = {} if lr not in group[wd]: group[wd][lr] = [] group[wd][lr].append(p) optimizer_params_group = [] for wd, lr_groups in group.items(): for lr, p in lr_groups.items(): optimizer_params_group.append(dict( params=p, weight_decay=wd, lr=lr )) logger.info(f"optimizer -- lr={lr} wd={wd} len(p)={len(p)}") return optimizer_params_group def create_optimizer(args, model, filter_bias_and_bn=True, return_group=False): opt_lower = args.opt.lower() weight_decay = args.weight_decay # check for modules that requires different lr if hasattr(args, "different_lr") and args.different_lr.enable: diff_lr_module_names = args.different_lr.module_names diff_lr = args.different_lr.lr else: diff_lr_module_names = [] diff_lr = None no_decay = {} if hasattr(model, 'no_weight_decay'): no_decay = model.no_weight_decay() if isinstance(model, torch.nn.parallel.DistributedDataParallel): if hasattr(model.module, 'no_weight_decay'): no_decay = model.module.no_weight_decay() no_decay = {"module." 


def create_optimizer(args, model, filter_bias_and_bn=True, return_group=False):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay

    # check for modules that require a different lr
    if hasattr(args, "different_lr") and args.different_lr.enable:
        diff_lr_module_names = args.different_lr.module_names
        diff_lr = args.different_lr.lr
    else:
        diff_lr_module_names = []
        diff_lr = None

    # collect parameter names the model itself excludes from weight decay
    no_decay = set()
    if hasattr(model, 'no_weight_decay'):
        no_decay = model.no_weight_decay()
    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
        if hasattr(model.module, 'no_weight_decay'):
            no_decay = model.module.no_weight_decay()
            # DDP prefixes parameter names with "module."
            no_decay = {"module." + k for k in no_decay}

    named_param_tuples = add_weight_decay(
        model, weight_decay, no_decay, filter_bias_and_bn)
    named_param_tuples = add_different_lr(
        named_param_tuples, diff_lr_module_names, diff_lr, args.lr)
    parameters = create_optimizer_params_group(named_param_tuples)

    if return_group:
        return parameters

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), \
            'APEX and CUDA required for fused optimizers'

    opt_args = dict(lr=args.lr, weight_decay=weight_decay)
    if hasattr(args, 'opt_eps') and args.opt_eps is not None:
        opt_args['eps'] = args.opt_eps
    if hasattr(args, 'opt_betas') and args.opt_betas is not None:
        opt_args['betas'] = args.opt_betas
    if hasattr(args, 'opt_args') and args.opt_args is not None:
        opt_args.update(args.opt_args)

    # strip an optional prefix, e.g. "fused_adam" -> "adam"
    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args)
    elif opt_lower == 'momentum':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, **opt_args)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, **opt_args)
    else:
        raise ValueError(f"Invalid optimizer: {opt_lower}")
    return optimizer
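

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. It assumes the
    # config object exposes the attribute names probed above (opt, lr,
    # weight_decay, optional opt_eps/opt_betas/different_lr) and that
    # utils.distributed.is_main_process() works without torch.distributed
    # being initialized.
    from types import SimpleNamespace
    from torch import nn

    logging.basicConfig(level=logging.INFO)
    demo_model = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8))
    demo_args = SimpleNamespace(opt="adamw", lr=1e-4, weight_decay=0.02,
                                opt_eps=1e-8, opt_betas=(0.9, 0.999))
    demo_optimizer = create_optimizer(demo_args, demo_model)
    # Linear.weight lands in the weight_decay=0.02 group; the bias and
    # LayerNorm parameters (1-d) land in the weight_decay=0 group.
    print(demo_optimizer)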