Zerx966
/

Gpt

Text Generation

malicious-content

Model card Files Files and versions Community

Gpt / utils.py

Zerx966's picture

Upload 10 files

3ef28b3 verified 17 days ago

2.14 kB

	from operator import itemgetter
	from typing import Any, Dict, Iterable, Optional, Tuple, Union
	import math
	import torch


	def attention_mask_func(attention_scores, attention_mask):
	    """Overwrite masked entries of ``attention_scores`` with -10000.0, in place.

	    Entries where ``attention_mask`` is true are replaced; the same (mutated)
	    ``attention_scores`` tensor is returned.
	    """
	    # masked_fill_ mutates its receiver and returns it, so the result can be
	    # handed back directly.
	    return attention_scores.masked_fill_(attention_mask, -10000.0)


	@torch.jit.script
	def gelu_impl(x):
	    """Tanh approximation of GELU (OpenAI's formulation).

	    Evaluates ``0.5 * x * (1 + tanh(0.7978845608028654 * x * (1 + 0.044715 * x * x)))``.
	    """
	    cubic_factor = 1.0 + 0.044715 * x * x
	    tanh_arg = 0.7978845608028654 * x * cubic_factor
	    return 0.5 * x * (1.0 + torch.tanh(tanh_arg))


	def openai_gelu(x):
	    """Apply ``gelu_impl`` to ``x`` and return the result."""
	    activated = gelu_impl(x)
	    return activated


	@torch.jit.script
	def bias_gelu(bias, y):
	    """Add ``bias`` to ``y`` and apply the tanh approximation of GELU to the sum."""
	    pre_act = bias + y
	    tanh_arg = 0.79788456 * pre_act * (1 + 0.044715 * pre_act * pre_act)
	    return pre_act * 0.5 * (1.0 + torch.tanh(tanh_arg))


	# Gradient of the tanh approximation of gelu.
	# For reference, the gradient of the exact (erf-based) gelu is:
	# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
	@torch.jit.script
	def bias_gelu_back(g, bias, y):
	    """Scale ``g`` by the derivative of tanh-approximated GELU at ``bias + y``."""
	    pre_act = bias + y
	    t = torch.tanh(0.79788456 * pre_act * (1 + 0.044715 * pre_act * pre_act))
	    # d/dx tanh(u) = (1 - tanh(u)^2) * du/dx
	    one_minus_t_sq = 1 - t * t
	    # du/dx, where sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
	    du_dx = 0.79788456 + 0.1070322243 * pre_act * pre_act
	    local_grad = 0.5 * pre_act * (one_minus_t_sq * du_dx) + 0.5 * (1 + t)
	    return local_grad * g


	class GeLUFunction(torch.autograd.Function):
	    """Autograd function pairing ``bias_gelu`` with its hand-written gradient."""

	    @staticmethod
	    def forward(ctx, input, bias):
	        """Return ``bias_gelu(bias, input)`` and save both tensors for backward."""
	        # NOTE(review): bias is passed unconditionally to bias_gelu, which
	        # adds it to input; there is no handling for a missing bias here.
	        ctx.save_for_backward(input, bias)
	        activated = bias_gelu(bias, input)
	        return activated

	    @staticmethod
	    def backward(ctx, grad_output):
	        """Return the gradients for ``input`` and ``bias`` (the same tensor for both)."""
	        saved_input, saved_bias = ctx.saved_tensors
	        grad = bias_gelu_back(grad_output, saved_bias, saved_input)
	        # One gradient per forward argument, in order: (input, bias).
	        return grad, grad


	# Functional entry point for GeLUFunction; arguments follow
	# GeLUFunction.forward, i.e. ``bias_gelu_impl(input, bias)``.
	bias_gelu_impl = GeLUFunction.apply



	# Python equivalent of torch.nn.functional.gelu() (exact, erf-based form),
	# with explicit dtype casts kept for the ONNX exporter.
	@torch.jit.script
	def erf_gelu(x):
	    """Exact GELU: ``x * 0.5 * (1 + erf(x / sqrt(2)))``.

	    Both the erf term and the constant-one tensor are cast to ``x.dtype``
	    before being combined.
	    """
	    # sqrt(2) at full double precision. The previous literal 1.41421 was
	    # truncated to six digits, which shifted results by about 1e-6 relative
	    # to torch.nn.functional.gelu().
	    erf_term = torch.erf(x / 1.4142135623730951).to(dtype=x.dtype)
	    one = torch.ones_like(x).to(dtype=x.dtype)
	    return x * 0.5 * (erf_term + one)


	def init_method_normal(sigma):
	    """Build an initializer that fills a tensor in place from a normal distribution.

	    The returned callable draws values with mean 0.0 and standard deviation
	    ``sigma`` via ``torch.nn.init.normal_`` and returns that call's result.
	    """

	    def init_(tensor):
	        # mean is spelled out explicitly rather than left to normal_'s default.
	        return torch.nn.init.normal_(tensor, std=sigma, mean=0.0)

	    return init_


	def scaled_init_method_normal(sigma, num_layers):
	    """Build an in-place normal initializer with std ``sigma / sqrt(2 * num_layers)``.

	    The returned callable draws values with mean 0.0 and the scaled standard
	    deviation via ``torch.nn.init.normal_`` and returns that call's result.
	    """
	    # Computed once here, so every call of the returned initializer reuses it.
	    scaled_std = sigma / math.sqrt(2.0 * num_layers)

	    def init_(tensor):
	        return torch.nn.init.normal_(tensor, std=scaled_std, mean=0.0)

	    return init_