LiangRenjie
/

CONQUER_RVMR

Model card Files Files and versions Community

CONQUER_RVMR / optim /adamw.py

Liangrj5

init

a638e43 6 months ago

history blame contribute delete

4.54 kB

	"""
	AdamW optimizer (weight decay fix)
	originally from Hugging Face (https://github.com/huggingface/transformers).

	Copied from UNITER
	(https://github.com/ChenRocks/UNITER)
	"""
	import math

	import torch
	from torch.optim import Optimizer


	class AdamW(Optimizer):
		"""Implements the Adam algorithm with the weight decay fix.

		Weight decay is applied directly to the parameters after the Adam
		update rather than being folded into the gradient, so it does not
		interact with the first/second moment estimates.

		Parameters:
			params (iterable): parameters to optimize, or dicts defining
				parameter groups; forwarded to ``torch.optim.Optimizer``.
			lr (float): learning rate. Default 1e-3.
			betas (tuple of 2 floats): Adam's beta parameters (b1, b2).
				Default: (0.9, 0.999)
			eps (float): Adam's epsilon. Default: 1e-6
			weight_decay (float): Weight decay. Default: 0.0
			correct_bias (bool): can be set to False to avoid correcting bias
				in Adam (e.g. like in Bert TF repository). Default True.

		Raises:
			ValueError: if ``lr`` or ``eps`` is negative, or if either beta
				lies outside ``[0.0, 1.0[``.
		"""

		def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6,
					 weight_decay=0.0, correct_bias=True):
			if lr < 0.0:
				raise ValueError(
					f"Invalid learning rate: {lr} - should be >= 0.0")
			if not 0.0 <= betas[0] < 1.0:
				raise ValueError(f"Invalid beta parameter: {betas[0]} - "
								 "should be in [0.0, 1.0[")
			if not 0.0 <= betas[1] < 1.0:
				raise ValueError(f"Invalid beta parameter: {betas[1]} - "
								 "should be in [0.0, 1.0[")
			if not 0.0 <= eps:
				raise ValueError(f"Invalid epsilon value: {eps} - "
								 "should be >= 0.0")
			defaults = dict(lr=lr, betas=betas, eps=eps,
							weight_decay=weight_decay,
							correct_bias=correct_bias)
			super().__init__(params, defaults)

		def step(self, closure=None):
			"""Performs a single optimization step.

			Arguments:
				closure (callable, optional): A closure that reevaluates the
					model and returns the loss.

			Returns:
				The value returned by ``closure``, or ``None`` when no closure
				is supplied.

			Raises:
				RuntimeError: if a parameter's gradient is sparse.
			"""
			loss = None
			if closure is not None:
				# The closure typically runs a backward pass, so autograd must
				# be enabled even when the caller invoked ``step`` inside a
				# ``torch.no_grad()`` block.
				with torch.enable_grad():
					loss = closure()

			for group in self.param_groups:
				# Hyper-parameters are constant for every parameter of a group.
				lr = group['lr']
				eps = group['eps']
				beta1, beta2 = group['betas']
				weight_decay = group['weight_decay']
				correct_bias = group['correct_bias']

				for p in group['params']:
					if p.grad is None:
						continue
					grad = p.grad.data
					if grad.is_sparse:
						raise RuntimeError(
							'Adam does not support sparse '
							'gradients, please consider SparseAdam instead')

					state = self.state[p]

					# State initialization
					if not state:
						state['step'] = 0
						# Exponential moving average of gradient values
						state['exp_avg'] = torch.zeros_like(p.data)
						# Exponential moving average of squared gradient values
						state['exp_avg_sq'] = torch.zeros_like(p.data)

					exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

					state['step'] += 1

					# Decay the first and second moment running average
					# coefficient. In-place operations update the stored
					# averages at the same time.
					exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1)
					exp_avg_sq.mul_(beta2).addcmul_(grad, grad,
													value=1.0 - beta2)
					denom = exp_avg_sq.sqrt().add_(eps)

					step_size = lr
					if correct_bias:  # No bias correction for Bert
						bias_correction1 = 1.0 - beta1 ** state['step']
						bias_correction2 = 1.0 - beta2 ** state['step']
						step_size = (step_size * math.sqrt(bias_correction2)
									 / bias_correction1)

					p.data.addcdiv_(exp_avg, denom, value=-step_size)

					# Just adding the square of the weights to the loss
					# function is not the correct way of using L2
					# regularization/weight decay with Adam, since that will
					# interact with the m and v parameters in strange ways.
					#
					# Instead we want to decay the weights in a manner that
					# doesn't interact with the m/v parameters. This is
					# equivalent to adding the square of the weights to the
					# loss with plain (non-momentum) SGD.
					# Add weight decay at the end (fixed version)
					if weight_decay > 0.0:
						p.data.add_(p.data, alpha=-lr * weight_decay)

			return loss