|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""This code is adapted from the T5 code in the Hugging Face Transformers library."""
|
""" PyTorch GLM T5 model.""" |
|
|
|
|
|
import copy |
|
import math |
|
import os |
|
import warnings |
|
from typing import Optional, Tuple, Union |
|
import logging |
|
|
|
import torch |
|
from torch import nn |
|
from torch.nn import CrossEntropyLoss |
|
from torch.utils.checkpoint import checkpoint |
|
|
|
from transformers.activations import ACT2FN |
|
from transformers.modeling_outputs import ( |
|
BaseModelOutput, |
|
BaseModelOutputWithPastAndCrossAttentions, |
|
Seq2SeqLMOutput, |
|
Seq2SeqModelOutput, |
|
) |
|
from transformers.modeling_utils import PreTrainedModel |
|
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer |
|
from transformers.utils import ( |
|
DUMMY_INPUTS, |
|
DUMMY_MASK, |
|
add_start_docstrings, |
|
add_start_docstrings_to_model_forward, |
|
is_torch_fx_proxy, |
|
replace_return_docstrings, |
|
) |
|
from transformers.utils import logging as transformers_logging |
|
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map |
|
from .configuration_t5 import T5Config |
|
from .wrapper_functions import DataProcessor |
|
|
|
|
|
# Module-level logger, obtained through the transformers logging utilities.
logger = transformers_logging.get_logger(__name__)

# String constants naming the config class and a reference checkpoint.
# NOTE(review): presumably consumed by the docstring decorators imported above -- confirm at the usage sites.
_CONFIG_FOR_DOC = "T5Config"
_CHECKPOINT_FOR_DOC = "t5-small"

# Names of the T5 checkpoints listed for this model family.
T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "t5-small",
    "t5-base",
    "t5-large",
    "t5-3b",
    "t5-11b",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model.

    The function is currently disabled: its first statement raises
    ``NotImplementedError``, so none of the conversion code below is executed.
    The remaining body is kept as the starting point for a future GLM port.

    Args:
        model: PyTorch model whose attributes are walked to find the target parameters.
        config: Not used by the body of this function.
        tf_checkpoint_path: Path to the TensorFlow checkpoint.

    Returns:
        The ``model`` that was passed in (only reachable once the guard is removed).

    Raises:
        NotImplementedError: Always, until the conversion is validated for GLM.
    """
    # Hard guard: everything after this line is unreachable on purpose.
    raise NotImplementedError("NOT TESTED; might need adjustments for GLM")
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")

    # Read every variable stored in the checkpoint into memory.
    init_vars = tf.train.list_variables(tf_path)
    names = []
    tf_weights = {}
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        tf_weights[name] = array

    for txt_name in names:
        name = txt_name.split("/")

        # Skip variables whose path contains one of these names
        # (presumably optimizer state / bookkeeping rather than model weights).
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            tf_weights.pop(txt_name, None)
            continue
        if "_slot_" in name[-1]:
            logger.info(f"Skipping {'/'.join(name)}")
            tf_weights.pop(txt_name, None)
            continue
        pointer = model
        array = tf_weights[txt_name]

        # Walk the "/"-separated TF scope path, moving `pointer` down the PyTorch module tree.
        for m_name in name:
            # "<letters>_<digits>" scopes are split into the name and a numeric index.
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] in ["kernel", "scale", "embedding"]:
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "self_attention":
                pointer = getattr(pointer, "layer")
                pointer = pointer[0]
            elif scope_names[0] == "enc_dec_attention":
                pointer = getattr(pointer, "layer")
                pointer = pointer[1]
            elif scope_names[0] == "dense_relu_dense":
                pointer = getattr(pointer, "layer")
                pointer = pointer[2]
            elif scope_names[0] == "rms_norm":
                if hasattr(pointer, "layer_norm"):
                    pointer = getattr(pointer, "layer_norm")
                elif hasattr(pointer, "final_layer_norm"):
                    pointer = getattr(pointer, "final_layer_norm")
            # NOTE: this branch can never be taken -- "scale" is already matched by the first branch.
            elif scope_names[0] == "scale":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            elif scope_names[0] == "decoder" and name[1] == "logits":
                continue
            elif scope_names[0] == "logits":
                pointer = getattr(pointer, "lm_head")
            elif scope_names[0] == "wi" and len(scope_names) > 1 and scope_names[1].isdigit():
                # "wi_<n>" maps onto the attribute of the same name; the numeric
                # indexing step below is skipped for it via `continue`.
                pointer = getattr(pointer, f"wi_{scope_names[1]}")
                continue
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        # `scope_names` here refers to the LAST path component of the variable.
        if scope_names[0] not in ["kernel", "scale", "embedding"]:
            pointer = getattr(pointer, "weight")
        if scope_names[0] != "embedding":
            logger.info(f"Transposing numpy weight of shape {array.shape} for {name}")
            array = np.transpose(array)
        try:
            assert (
                pointer.shape == array.shape
            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
        except AssertionError as e:
            # Attach both shapes to the exception before re-raising it.
            e.args += (pointer.shape, array.shape)
            raise
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array.astype(np.float32))
        # Entries left in `tf_weights` at the end are the ones that were not copied.
        tf_weights.pop(txt_name, None)

    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.")
    return model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Documentation text describing model parallelism via a device map.
# NOTE(review): presumably attached to `parallelize` methods with `add_start_docstrings` -- confirm at the usage sites.
PARALLELIZE_DOCSTRING = r"""
    This is an experimental feature and is a subject to change at a moment's notice.

    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
    it will evenly distribute blocks across all devices.

    Args:
        device_map (`Dict[int, list]`, optional, defaults to None):
            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
            automatically mapped to the first device (for esoteric reasons). That means that the first device should
            have fewer attention modules mapped to it than other devices. For reference, the t5 models have the
            following number of attention modules:

                - t5-small: 6
                - t5-base: 12
                - t5-large: 24
                - t5-3b: 24
                - t5-11b: 24

    Example:

    ```python
    # Here is an example of a device map on a machine with 4 GPUs using t5-3b, which has a total of 24 attention modules:
    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
    device_map = {
        0: [0, 1, 2],
        1: [3, 4, 5, 6, 7, 8, 9],
        2: [10, 11, 12, 13, 14, 15, 16],
        3: [17, 18, 19, 20, 21, 22, 23],
    }
    model.parallelize(device_map)
    ```
"""
# Documentation text describing how a parallelized model is moved back to the CPU.
# NOTE(review): presumably attached to `deparallelize` methods -- confirm at the usage sites.
DEPARALLELIZE_DOCSTRING = r"""
    Moves the model to cpu from a model parallel state.

    Example:

    ```python
    # On a 4 GPU machine with t5-3b:
    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
    device_map = {
        0: [0, 1, 2],
        1: [3, 4, 5, 6, 7, 8, 9],
        2: [10, 11, 12, 13, 14, 15, 16],
        3: [17, 18, 19, 20, 21, 22, 23],
    }
    model.parallelize(device_map)  # Splits the model across several devices
    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
    ```
"""
|
|
|
|
|
class T5LayerNorm(nn.Module):
    """Scale-only normalisation layer in the T5 style.

    The input is divided by the root of its mean square over the last
    dimension and multiplied by a learned per-feature weight. No mean is
    subtracted and there is no bias term.
    """

    def __init__(self, hidden_size, eps=1e-6):
        """
        Args:
            hidden_size: size of the last (feature) dimension; one weight per feature.
            eps: constant added to the mean square before taking the reciprocal root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Return `hidden_states` rescaled by its inverse RMS and the learned weight."""
        # The mean square is always accumulated in float32, whatever the input dtype is.
        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        inverse_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        normalised = hidden_states * inverse_rms

        # With half-precision weights, cast back so the final product stays in that dtype.
        if self.weight.dtype in (torch.float16, torch.bfloat16):
            normalised = normalised.to(self.weight.dtype)

        return self.weight * normalised
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Add T5LayerNorm to the list of layer-norm classes imported from transformers.pytorch_utils.
# NOTE(review): presumably so utilities that special-case layer norms also recognise this class -- confirm.
ALL_LAYERNORM_LAYERS.append(T5LayerNorm)
|
|
|
|
|
class T5DenseActDense(nn.Module):
    """Feed-forward sub-module: linear -> activation -> dropout -> linear (no biases)."""

    def __init__(self, config: T5Config):
        """Build both projections from `config.d_model` / `config.d_ff` and look up the activation."""
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        """Apply the feed-forward transformation to `hidden_states`."""
        inner = self.dropout(self.act(self.wi(hidden_states)))

        # Align the activation dtype with the output projection, unless that
        # projection holds int8 weights (in which case no cast is performed).
        out_weight = self.wo.weight
        needs_cast = (
            isinstance(out_weight, torch.Tensor)
            and inner.dtype != out_weight.dtype
            and out_weight.dtype != torch.int8
        )
        if needs_cast:
            inner = inner.to(out_weight.dtype)

        return self.wo(inner)
|
|
|
|
|
class T5DenseGatedActDense(nn.Module):
    """Gated feed-forward sub-module: act(wi_0(x)) * wi_1(x) -> dropout -> wo (no biases)."""

    def __init__(self, config: T5Config):
        """Build the two input projections, the output projection and look up the activation."""
        super().__init__()
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        """Apply the gated feed-forward transformation to `hidden_states`."""
        gate = self.act(self.wi_0(hidden_states))
        gated = self.dropout(gate * self.wi_1(hidden_states))

        # Align the activation dtype with the output projection, unless that
        # projection holds int8 weights (in which case no cast is performed).
        out_weight = self.wo.weight
        needs_cast = (
            isinstance(out_weight, torch.Tensor)
            and gated.dtype != out_weight.dtype
            and out_weight.dtype != torch.int8
        )
        if needs_cast:
            gated = gated.to(out_weight.dtype)

        return self.wo(gated)
|
|
|
|
|
class T5LayerFF(nn.Module):
    """Pre-norm residual wrapper around the feed-forward sub-module."""

    def __init__(self, config: T5Config):
        """Pick the gated or plain feed-forward variant according to `config.is_gated_act`."""
        super().__init__()
        feed_forward_cls = T5DenseGatedActDense if config.is_gated_act else T5DenseActDense
        self.DenseReluDense = feed_forward_cls(config)

        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        """Return `hidden_states` plus the (dropped-out) feed-forward output of its normalised form."""
        update = self.DenseReluDense(self.layer_norm(hidden_states))
        return hidden_states + self.dropout(update)
|
|
|
|
|
class T5Attention(nn.Module):
    """Multi-head attention in the T5 style with GLM extensions.

    On top of the standard T5 attention this module accepts, per call:

    * ``relative_position``: explicit relative positions instead of the
      default ``key_index - query_index`` grid,
    * ``use_additional_bucket``: a boolean selector that routes entries to
      extra relative-position buckets appended after the standard ones,
    * ``sparsity_mask``: a boolean mask of allowed query/key pairs.
    """

    def __init__(self, config: T5Config, has_relative_attention_bias=False):
        """
        Args:
            config: model configuration; the attributes read here are listed below.
            has_relative_attention_bias: whether this layer owns the relative-position embedding.
        """
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias

        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        # Configurations that do not define additional buckets fall back to none.
        self.relative_attention_num_additional_buckets = getattr(
            config, "relative_attention_num_additional_buckets", 0
        )
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            # Only the non-decoder (bidirectional) case gets the additional buckets.
            if self.is_decoder:
                num_buckets = self.relative_attention_num_buckets
            else:
                num_buckets = self.relative_attention_num_buckets + self.relative_attention_num_additional_buckets
            self.relative_attention_bias = nn.Embedding(num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    def prune_heads(self, heads):
        """Remove the given attention heads from the q/k/v/o projections."""
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
        )

        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)

        # Keep the bookkeeping in sync with the shrunken projections.
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.key_value_proj_dim * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @staticmethod
    def _og_relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        This is the original implementation from T5 which is called by `_relative_position_bucket`.

        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0

        if bidirectional:
            # Half of the buckets encode positive offsets, the other half non-positive ones.
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # From here on relative_position is >= 0.

        # The first half of the (remaining) buckets covers exact offsets.
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The second half covers logarithmically growing ranges up to max_distance.
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    @staticmethod
    def _relative_position_bucket(
        relative_position, bidirectional=True, num_buckets=32, max_distance=128, use_additional_bucket=None
    ):
        """
        Wrapper around `_og_relative_position_bucket` that supports additional buckets.

        Entries where `use_additional_bucket` is False get the conventional T5 bucket of their relative position.
        Entries where it is True get bucket `num_buckets + relative_position`, i.e. the additional buckets are
        appended after the standard ones (indices [0, num_buckets)): a relative position of 0 selects the first
        additional bucket, 5 selects bucket `num_buckets + 5`, and so on. Entries of `relative_position` where
        `use_additional_bucket` is True should therefore be non-negative integers; all other entries can be any
        integer.

        The additional buckets can be used to encode relations such as Graph to Graph for long-ranged connections
        (e.g. in the global GLM), Graph to Text, or Text to Graph.

        If `use_additional_bucket` is None (or a tensor where all entries are False), this function is equivalent
        to `_og_relative_position_bucket`.

        Note that the embeddings of additional buckets are not initialized automatically; this can be done by
        calling `init_relative_position_bias`.

        Args:
            relative_position: integer tensor of relative positions
            bidirectional, num_buckets, max_distance: forwarded to `_og_relative_position_bucket`
            use_additional_bucket: None, or a torch.bool tensor with the same shape as relative_position

        Returns:
            a tensor with the same shape as relative_position, containing values in the range
            [0, num_buckets + max(relative_position[use_additional_bucket]) + 1)
        """
        relative_buckets = T5Attention._og_relative_position_bucket(
            relative_position, bidirectional, num_buckets, max_distance
        )

        if use_additional_bucket is None:
            return relative_buckets

        # `relative_buckets` is a freshly created tensor, so overwriting entries does not touch the caller's input.
        relative_buckets[use_additional_bucket] = relative_position[use_additional_bucket] + num_buckets

        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None, relative_position=None, use_additional_bucket=None):
        """Compute binned relative position bias.

        :param query_length: number of query positions (used only when `relative_position` is None)
        :param key_length: number of key positions (used only when `relative_position` is None)
        :param device: device for the position grid; defaults to the device of the bias embedding
        :param relative_position: [MP] torch.long tensor in shape [query_length, key_length]. If `None`, the
            standard grid `key_index - query_index` is used.
        :param use_additional_bucket: None, or a torch.bool tensor shaped like `relative_position`
        :return: tensor of shape [1, n_heads, query_length, key_length]
        """
        if device is None:
            device = self.relative_attention_bias.weight.device

        if relative_position is None:
            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
            memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
            relative_position = memory_position - context_position
        else:
            assert relative_position.dtype == torch.long, f"{relative_position.dtype} should be torch.long"
            assert relative_position.device == device, f"{relative_position.device} should be {device}"

        relative_position_bucket = self._relative_position_bucket(
            relative_position,
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
            use_additional_bucket=use_additional_bucket,
        )

        # Embedding lookup yields [query_length, key_length, n_heads].
        values = self.relative_attention_bias(relative_position_bucket)
        # Move heads to the front and add a leading batch axis.
        values = values.permute([2, 0, 1]).unsqueeze(0)

        return values

    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
        relative_position=None,
        sparsity_mask=None,
        use_additional_bucket=None,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).

        :param relative_position: [MP] relative position for the attention. If `None`, it will be computed as in a standard sequence-to-sequence model. If not `None`, it will be used as the relative position for the attention. It is a tensor of shape [batch_size, query_length, key_length].
        :param sparsity_mask: [MP] boolean sparsity mask for the attention of shape [batch_size, query_length, key_length]. If `None`, no sparsity is applied. A value of True means that the corresponding attention weight is not masked, and a value of False means that it is masked. Queries whose keys are all masked receive all-zero attention weights.
        :param use_additional_bucket: [MP] whether to use additional buckets for the attention. If `None`, only standard positional encodings will be used. If not `None`, it is a boolean tensor of shape [batch_size, query_length, key_length]. A value of False means that the corresponding position is a standard relative position, and a value of True means that the corresponding additional bucket should be used. Requires `relative_position`.
        :return: tuple `(attn_output, present_key_value_state, position_bias)`, followed by the attention weights if `output_attentions` is set.
        """
        # hidden_states: (batch_size, seq_length, dim); `mask` is added to the scores and must broadcast to
        # (batch_size, n_heads, seq_length, key_length).
        batch_size, seq_length = hidden_states.shape[:2]

        real_seq_length = seq_length

        if past_key_value is not None:
            assert (
                len(past_key_value) == 2
            ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
            real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length

        key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]

        def shape(states):
            """projection"""
            return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        def unshape(states):
            """reshape"""
            return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)

        def project(hidden_states, proj_layer, key_value_states, past_key_value):
            """projects hidden states correctly to key/query states"""
            if key_value_states is None:
                # self-attention: (batch_size, n_heads, seq_length, dim_per_head)
                hidden_states = shape(proj_layer(hidden_states))
            elif past_key_value is None:
                # cross-attention: (batch_size, n_heads, seq_length, dim_per_head)
                hidden_states = shape(proj_layer(key_value_states))

            if past_key_value is not None:
                if key_value_states is None:
                    # self-attention: append the new states to the cached ones along the key axis
                    hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
                elif past_key_value.shape[2] != key_value_states.shape[1]:
                    # cross-attention whose cache length differs from the given source length:
                    # the cache cannot be reused, so project the source again
                    hidden_states = shape(proj_layer(key_value_states))
                else:
                    # cross-attention: reuse the cache
                    hidden_states = past_key_value
            return hidden_states

        # (batch_size, n_heads, seq_length, dim_per_head)
        query_states = shape(self.q(hidden_states))

        key_states = project(
            hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None
        )
        value_states = project(
            hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None
        )

        # (batch_size, n_heads, seq_length, key_length)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            elif relative_position is None:
                assert use_additional_bucket is None
                position_bias = self.compute_bias(
                    real_seq_length, key_length, device=scores.device, use_additional_bucket=None
                )
            else:
                # Without a selector every sample uses standard buckets only.
                if use_additional_bucket is None:
                    use_additional_bucket = [None] * len(relative_position)
                # One bias per sample, stacked along the batch axis.
                position_bias = torch.cat(
                    tuple(
                        self.compute_bias(
                            real_seq_length,
                            key_length,
                            device=scores.device,
                            relative_position=r,
                            use_additional_bucket=u,
                        )
                        for r, u in zip(relative_position, use_additional_bucket)
                    ),
                    dim=0,
                )

            # With cached keys/values only the rows of the newly added queries are needed.
            if past_key_value is not None:
                position_bias = position_bias[:, :, -hidden_states.size(1) :, :]

            logging.debug("position_bias = %s", position_bias.shape)
            logging.debug("mask = %s", mask.shape if mask is not None else mask)
            if mask is not None:
                position_bias = position_bias + mask

        if self.pruned_heads:
            # Drop the bias rows that belong to pruned heads.
            kept_heads = torch.ones(position_bias.shape[1], dtype=torch.bool, device=position_bias.device)
            kept_heads[list(self.pruned_heads)] = False
            position_bias_masked = position_bias[:, kept_heads]
        else:
            position_bias_masked = position_bias

        blocked = None
        if sparsity_mask is not None:
            assert sparsity_mask.dtype == torch.bool, f"{sparsity_mask.dtype} should be torch.bool"
            # True where attention is NOT allowed; the singleton head axis broadcasts over all heads.
            blocked = ~sparsity_mask.unsqueeze(1)

        scores += position_bias_masked

        if blocked is not None:
            scores.masked_fill_(blocked, float("-inf"))

        # Softmax in float32 for numerical stability, then back to the score dtype.
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)

        # Rows in which every key is masked turn into NaN after the softmax; treat them as "attend to nothing".
        attn_weights = torch.nan_to_num(attn_weights, nan=0.0)

        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = unshape(torch.matmul(attn_weights, value_states))
        attn_output = self.o(attn_output)

        present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs

    def init_relative_position_bias(
        self,
        modelsize: str,
        is_decoder: bool = False,
        init_additional_buckets_from: Optional[Union[int, list]] = None,
    ):
        """
        Initializes parameters for relative position bias. This is necessary, if additional buckets are used, as then the weights are not initialized automatically when calling `from_pretrained`.

        :param modelsize: the model size of the model from which the relative position bias should be inherited (passed to `T5EncoderModel.from_pretrained`)
        :param is_decoder: whether the relative position bias is taken from the parent's decoder (True) or encoder (False). NOTE(review): the parent is loaded as a `T5EncoderModel`; whether it exposes `.decoder` is not visible here -- confirm before using True.
        :param init_additional_buckets_from: relative positions whose standard bucket is used to initialize the additional buckets. Each value is mapped to a standard bucket index via `_relative_position_bucket` and the parent embedding of that bucket is copied. If this is a single value, all additional buckets are initialized from it. If this is a list, it must have one entry per additional bucket, and the i-th entry determines the source of the i-th additional bucket. `None` (or a `None` entry in the list) leaves the corresponding additional bucket(s) unchanged.
        :raises NotImplementedError: if this attention module belongs to a decoder.
        """
        if self.is_decoder:
            raise NotImplementedError("Decoder is not tested.")

        logging.debug('Loading model from which relative position bias should be inherited')
        parent_model = T5EncoderModel.from_pretrained(modelsize)

        logging.debug('Get relative position bias from parent model')
        if is_decoder:
            parent_bias = parent_model.decoder.block[0].layer[0].SelfAttention.relative_attention_bias.weight
        else:
            parent_bias = parent_model.encoder.block[0].layer[0].SelfAttention.relative_attention_bias.weight
        # Only the bias table is needed; release the rest of the parent model.
        del parent_model

        own_bias = self.relative_attention_bias.weight
        num_standard_buckets = parent_bias.shape[0]
        assert parent_bias.shape[1] == own_bias.shape[1], f"{parent_bias.shape[1]} should be {own_bias.shape[1]}"
        assert num_standard_buckets <= own_bias.shape[0], f"{num_standard_buckets} should be <= {own_bias.shape[0]}"

        logging.debug('init normal buckets')
        with torch.no_grad():
            own_bias[:num_standard_buckets, :] = parent_bias

        logging.debug('get parent buckets for additional buckets')
        if init_additional_buckets_from is None:
            return
        num_additional_buckets = own_bias.shape[0] - num_standard_buckets
        if num_additional_buckets == 0:
            return
        if not isinstance(init_additional_buckets_from, list):
            init_additional_buckets_from = [init_additional_buckets_from] * num_additional_buckets
        assert len(init_additional_buckets_from) == num_additional_buckets, f"{len(init_additional_buckets_from)} should be {num_additional_buckets}"

        # Remember which entries to leave untouched; they get a placeholder position so the tensor can be built.
        skip_bucket = [idx is None for idx in init_additional_buckets_from]
        source_positions = torch.tensor(
            [0 if idx is None else idx for idx in init_additional_buckets_from], dtype=torch.long
        )
        # Translate the relative positions into indices of standard buckets.
        source_buckets = self._relative_position_bucket(
            relative_position=source_positions,
            bidirectional=(not is_decoder),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
            use_additional_bucket=None,
        )

        logging.debug('Initialize relative position bias')
        with torch.no_grad():
            for i, (skip, idx) in enumerate(zip(skip_bucket, source_buckets), start=num_standard_buckets):
                if skip:
                    continue
                own_bias[i, :] = parent_bias[idx, :]
|
|
|
|
|
class T5LayerSelfAttention(nn.Module):
    """Pre-norm residual wrapper around a self-attention module."""

    def __init__(self, config, has_relative_attention_bias=False):
        """Create the attention module, its layer norm and the residual dropout from `config`."""
        super().__init__()
        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        relative_position=None,
        sparsity_mask=None,
        use_additional_bucket=None,
    ):
        """Normalise, attend, and add the (dropped-out) result back onto `hidden_states`.

        All keyword arguments are forwarded unchanged to the attention module
        (`attention_mask` is passed on under the name `mask`).

        Returns:
            A tuple whose first element is the updated hidden states, followed by
            every remaining element of the attention module's output.
        """
        logging.debug('### SelfAttention ###')
        attention_kwargs = {
            "mask": attention_mask,
            "position_bias": position_bias,
            "layer_head_mask": layer_head_mask,
            "past_key_value": past_key_value,
            "use_cache": use_cache,
            "output_attentions": output_attentions,
            "relative_position": relative_position,
            "sparsity_mask": sparsity_mask,
            "use_additional_bucket": use_additional_bucket,
        }
        attention_output = self.SelfAttention(self.layer_norm(hidden_states), **attention_kwargs)

        # Residual connection around the attention output.
        residual_sum = hidden_states + self.dropout(attention_output[0])
        return (residual_sum,) + attention_output[1:]
|
|
|
|
|
class T5LayerCrossAttention(nn.Module):
    """Pre-norm residual wrapper around an encoder-decoder attention module.

    Construction is currently disabled: `__init__` raises `NotImplementedError`
    before anything is set up.
    """

    def __init__(self, config):
        # Guard kept from the original: the layer has not been adapted for GLM yet,
        # so the set-up code below is intentionally unreachable.
        raise NotImplementedError("might need adjustments for GLM")
        super().__init__()
        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
    ):
        """Normalise, attend over `key_value_states`, and add the result back onto `hidden_states`.

        Returns:
            A tuple whose first element is the updated hidden states, followed by
            every remaining element of the attention module's output.
        """
        logging.debug('### CrossAttention ###')
        attention_kwargs = {
            "mask": attention_mask,
            "key_value_states": key_value_states,
            "position_bias": position_bias,
            "layer_head_mask": layer_head_mask,
            "past_key_value": past_key_value,
            "use_cache": use_cache,
            "query_length": query_length,
            "output_attentions": output_attentions,
        }
        attention_output = self.EncDecAttention(self.layer_norm(hidden_states), **attention_kwargs)

        # Residual connection around the attention output.
        layer_output = hidden_states + self.dropout(attention_output[0])
        return (layer_output,) + attention_output[1:]
|
|
|
|
|
class T5Block(nn.Module): |
|
def __init__(self, config, has_relative_attention_bias=False): |
|
super().__init__() |
|
self.is_decoder = config.is_decoder |
|
self.layer = nn.ModuleList() |
|
self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) |
|
if self.is_decoder: |
|
self.layer.append(T5LayerCrossAttention(config)) |
|
|
|
self.layer.append(T5LayerFF(config)) |
|
|
|
def forward( |
|
self, |
|
hidden_states, |
|
attention_mask=None, |
|
position_bias=None, |
|
encoder_hidden_states=None, |
|
encoder_attention_mask=None, |
|
encoder_decoder_position_bias=None, |
|
layer_head_mask=None, |
|
cross_attn_layer_head_mask=None, |
|
past_key_value=None, |
|
use_cache=False, |
|
output_attentions=False, |
|
return_dict=True, |
|
relative_position=None, |
|
sparsity_mask=None, |
|
use_additional_bucket=None, |
|
): |
|
if past_key_value is not None: |
|
if not self.is_decoder: |
|
logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") |
|
expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 |
|
|
|
if len(past_key_value) != expected_num_past_key_values: |
|
raise ValueError( |
|
f"There should be {expected_num_past_key_values} past states. " |
|
f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" |
|
f"Got {len(past_key_value)} past key / value states" |
|
) |
|
|
|
self_attn_past_key_value = past_key_value[:2] |
|
cross_attn_past_key_value = past_key_value[2:] |
|
else: |
|
self_attn_past_key_value, cross_attn_past_key_value = None, None |
|
|
|
self_attention_outputs = self.layer[0]( |
|
hidden_states, |
|
attention_mask=attention_mask, |
|
position_bias=position_bias, |
|
layer_head_mask=layer_head_mask, |
|
past_key_value=self_attn_past_key_value, |
|
use_cache=use_cache, |
|
output_attentions=output_attentions, |
|
relative_position=relative_position, |
|
sparsity_mask=sparsity_mask, |
|
use_additional_bucket=use_additional_bucket, |
|
) |
|
hidden_states, present_key_value_state = self_attention_outputs[:2] |
|
attention_outputs = self_attention_outputs[2:] |
|
|
|
|
|
if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): |
|
clamp_value = torch.finfo(hidden_states.dtype).max - 1000 |
|
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) |
|
|
|
do_cross_attention = self.is_decoder and encoder_hidden_states is not None |
|
if do_cross_attention: |
|
|
|
|
|
if present_key_value_state is not None: |
|
query_length = present_key_value_state[0].shape[2] |
|
else: |
|
query_length = None |
|
|
|
cross_attention_outputs = self.layer[1]( |
|
hidden_states, |
|
key_value_states=encoder_hidden_states, |
|
attention_mask=encoder_attention_mask, |
|
position_bias=encoder_decoder_position_bias, |
|
layer_head_mask=cross_attn_layer_head_mask, |
|
past_key_value=cross_attn_past_key_value, |
|
query_length=query_length, |
|
use_cache=use_cache, |
|
output_attentions=output_attentions, |
|
) |
|
hidden_states = cross_attention_outputs[0] |
|
|
|
|
|
if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): |
|
clamp_value = torch.finfo(hidden_states.dtype).max - 1000 |
|
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) |
|
|
|
|
|
if present_key_value_state is not None: |
|
present_key_value_state = present_key_value_state + cross_attention_outputs[1] |
|
|
|
|
|
attention_outputs = attention_outputs + cross_attention_outputs[2:] |
|
|
|
|
|
hidden_states = self.layer[-1](hidden_states) |
|
|
|
|
|
if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): |
|
clamp_value = torch.finfo(hidden_states.dtype).max - 1000 |
|
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) |
|
|
|
outputs = (hidden_states,) |
|
|
|
if use_cache: |
|
outputs = outputs + (present_key_value_state,) + attention_outputs |
|
else: |
|
outputs = outputs + attention_outputs |
|
|
|
return outputs |
|
|
|
|
|
class T5PreTrainedModel(PreTrainedModel):
    """
    Shared base class for the T5 models in this file. It declares the class-level loading settings and provides
    weight initialization, the gradient-checkpointing switch and the decoder-input shifting helper.
    """

    config_class = T5Config
    load_tf_weights = load_tf_weights_in_t5
    base_model_prefix = "transformer"
    is_parallelizable = True
    supports_gradient_checkpointing = True
    _no_split_modules = ["T5Block"]
    _keep_in_fp32_modules = ["wo"]
    data_processor = DataProcessor

    @property
    def dummy_inputs(self):
        """Return a dict of tensors built from `DUMMY_INPUTS` (ids) and `DUMMY_MASK` (decoder mask)."""
        token_ids = torch.tensor(DUMMY_INPUTS)
        decoder_mask = torch.tensor(DUMMY_MASK)
        return {
            "decoder_input_ids": token_ids,
            "input_ids": token_ids,
            "decoder_attention_mask": decoder_mask,
        }

    def _init_weights(self, module):
        """Initialize the weights of `module` according to its type; unknown module types are left untouched."""
        scale = self.config.initializer_factor

        def reset_linear(linear, std):
            # Normal-initialize the weight and zero the bias when the layer actually has one.
            linear.weight.data.normal_(mean=0.0, std=std)
            if getattr(linear, "bias", None) is not None:
                linear.bias.data.zero_()

        if isinstance(module, T5LayerNorm):
            module.weight.data.fill_(scale * 1.0)
            return

        if isinstance(module, (T5Model, T5ForConditionalGeneration, T5EncoderModel)):
            module.shared.weight.data.normal_(mean=0.0, std=scale * 1.0)
            # The LM head only gets its own initialization when it is not tied to the shared embedding.
            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
                module.lm_head.weight.data.normal_(mean=0.0, std=scale * 1.0)
            return

        if isinstance(module, T5DenseActDense):
            reset_linear(module.wi, scale * ((self.config.d_model) ** -0.5))
            reset_linear(module.wo, scale * ((self.config.d_ff) ** -0.5))
            return

        if isinstance(module, T5DenseGatedActDense):
            for input_projection in (module.wi_0, module.wi_1):
                reset_linear(input_projection, scale * ((self.config.d_model) ** -0.5))
            reset_linear(module.wo, scale * ((self.config.d_ff) ** -0.5))
            return

        if isinstance(module, T5Attention):
            d_model = self.config.d_model
            head_dim = self.config.d_kv
            head_count = self.config.num_heads
            module.q.weight.data.normal_(mean=0.0, std=scale * ((d_model * head_dim) ** -0.5))
            for projection in (module.k, module.v):
                projection.weight.data.normal_(mean=0.0, std=scale * (d_model**-0.5))
            module.o.weight.data.normal_(mean=0.0, std=scale * ((head_count * head_dim) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=scale * ((d_model) ** -0.5))

    def _set_gradient_checkpointing(self, module, value=False):
        """Set `gradient_checkpointing` on `module` when it is a `T5Attention` or a `T5Stack`."""
        if not isinstance(module, (T5Attention, T5Stack)):
            return
        module.gradient_checkpointing = value

    def _shift_right(self, input_ids):
        """Shift `input_ids` one position to the right along the last dimension.

        The first position is filled with `config.decoder_start_token_id`, the last input token is dropped, and
        any `-100` left in the result is replaced by `config.pad_token_id`.

        Raises:
            AssertionError: If `decoder_start_token_id` or `pad_token_id` is not set in the config.
        """
        start_id = self.config.decoder_start_token_id
        pad_id = self.config.pad_token_id

        assert start_id is not None, (
            "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id."
            " See T5 docs for more information"
        )

        if not is_torch_fx_proxy(input_ids):
            shifted = input_ids.new_zeros(input_ids.shape)
            shifted[..., 1:] = input_ids[..., :-1].clone()
            shifted[..., 0] = start_id
        else:
            # Proxy inputs take the concatenation route instead of slice assignment.
            first_column = torch.full(input_ids.shape[:-1] + (1,), start_id)
            shifted = torch.cat([first_column, input_ids[..., :-1]], dim=-1)

        assert pad_id is not None, "self.model.config.pad_token_id has to be defined."

        shifted.masked_fill_(shifted == -100, pad_id)

        return shifted
|
|
|
|
|
class T5Stack(T5PreTrainedModel):
    """A stack of `config.num_layers` `T5Block`s followed by a final layer norm and dropout.

    Only the first block is built with `has_relative_attention_bias=True`; the position bias it returns is
    handed to the following blocks by `forward`. The same class is used as encoder or decoder depending on
    `config.is_decoder`.
    """

    def __init__(self, config, embed_tokens=None):
        super().__init__(config)

        self.embed_tokens = embed_tokens
        self.is_decoder = config.is_decoder

        self.block = nn.ModuleList(
            [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
        )
        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        self.post_init()

        # Model-parallel state; populated by `parallelize()`.
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model"
            " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
            " 'block.1': 1, ...}",
            FutureWarning,
        )

        # Fall back to a map built over all visible CUDA devices when none is supplied.
        self.device_map = (
            get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map
        )
        assert_device_map(self.device_map, len(self.block))
        self.model_parallel = True
        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
        self.last_device = "cuda:" + str(max(self.device_map.keys()))

        # Move every block to the device it is mapped to.
        for k, v in self.device_map.items():
            for layer in v:
                cuda_device = "cuda:" + str(k)
                self.block[layer] = self.block[layer].to(cuda_device)

        # Embeddings go with the first device, the final layer norm with the last one.
        self.embed_tokens = self.embed_tokens.to(self.first_device)
        self.final_layer_norm = self.final_layer_norm.to(self.last_device)

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.model_parallel = False
        self.device_map = None
        self.first_device = "cpu"
        self.last_device = "cpu"
        for i in range(len(self.block)):
            self.block[i] = self.block[i].to("cpu")
        self.embed_tokens = self.embed_tokens.to("cpu")
        self.final_layer_norm = self.final_layer_norm.to("cpu")
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        """Return the token embedding module used by this stack."""
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        """Replace the token embedding module used by this stack."""
        self.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        relative_position=None,
        sparsity_mask=None,
        use_additional_bucket=None,
    ):
        """Embed the inputs (if needed) and run them through every block of the stack.

        Args:
            input_ids: Token ids; reshaped to `(-1, last_dim)` and embedded with `embed_tokens`. Mutually
                exclusive with `inputs_embeds`.
            attention_mask: Defaults to an all-ones mask of shape `(batch_size, mask_seq_length)`.
            encoder_hidden_states: Only used when the stack is a decoder; enables cross-attention.
            encoder_attention_mask: Defaults to an all-ones mask over `encoder_hidden_states` for a decoder.
            inputs_embeds: Pre-computed embeddings used instead of `input_ids`.
            head_mask, cross_attn_head_mask: Expanded with `get_head_mask` and indexed per layer.
            past_key_values: Per-block cached states; defaults to `None` for every block.
            use_cache, output_attentions, output_hidden_states, return_dict: Default to the config values.
            relative_position, sparsity_mask, use_additional_bucket: Forwarded unchanged to every block.

        Returns:
            A `BaseModelOutputWithPastAndCrossAttentions`, or (when `return_dict` is falsy) a tuple of its
            non-`None` fields in the same order.

        Raises:
            ValueError: If both or neither of `input_ids` / `inputs_embeds` are given.
            AssertionError: If `embed_tokens` is missing when needed, or `use_cache` is `True` on an encoder.
        """
        if self.model_parallel:
            torch.cuda.set_device(self.first_device)
            self.embed_tokens = self.embed_tokens.to(self.first_device)
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")

        if inputs_embeds is None:
            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        # The mask has to cover the cached positions as well as the new ones.
        mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length

        if use_cache is True:
            assert self.is_decoder, f"`use_cache` can only be set to `True` if {self} is used as a decoder"

        if attention_mask is None:
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
        if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
            encoder_seq_length = encoder_hidden_states.shape[1]
            encoder_attention_mask = torch.ones(
                batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long
            )

        # One (empty) cache slot per block so the zip below always lines up.
        if past_key_values is None:
            past_key_values = [None] * len(self.block)

        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        if self.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            # The mask was already defaulted above for this case; the check is kept as a safeguard.
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        present_key_value_states = () if use_cache else None
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]

            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
                # Keep every per-layer input on the same device as the hidden states.
                if attention_mask is not None:
                    attention_mask = attention_mask.to(hidden_states.device)
                if position_bias is not None:
                    position_bias = position_bias.to(hidden_states.device)
                if encoder_hidden_states is not None:
                    encoder_hidden_states = encoder_hidden_states.to(hidden_states.device)
                if encoder_extended_attention_mask is not None:
                    encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device)
                if encoder_decoder_position_bias is not None:
                    encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device)
                if layer_head_mask is not None:
                    layer_head_mask = layer_head_mask.to(hidden_states.device)
                if cross_attn_layer_head_mask is not None:
                    cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # The GLM extras are bound here by keyword instead of being handed to `checkpoint`:
                        # reentrant checkpointing rejects extra keyword arguments, and this wrapper only
                        # accepts positional inputs.
                        # NOTE(review): assumes these extras do not need gradients of their own - confirm.
                        return tuple(
                            module(
                                *inputs,
                                use_cache,
                                output_attentions,
                                relative_position=relative_position,
                                sparsity_mask=sparsity_mask,
                                use_additional_bucket=use_additional_bucket,
                            )
                        )

                    return custom_forward

                layer_outputs = checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    extended_attention_mask,
                    position_bias,
                    encoder_hidden_states,
                    encoder_extended_attention_mask,
                    encoder_decoder_position_bias,
                    layer_head_mask,
                    cross_attn_layer_head_mask,
                    None,  # past_key_value: no cache is used on the checkpointing path
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask=extended_attention_mask,
                    position_bias=position_bias,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_extended_attention_mask,
                    encoder_decoder_position_bias=encoder_decoder_position_bias,
                    layer_head_mask=layer_head_mask,
                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                    past_key_value=past_key_value,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    relative_position=relative_position,
                    sparsity_mask=sparsity_mask,
                    use_additional_bucket=use_additional_bucket,
                )

            # Without a cache the block omits the key/value slot; insert a placeholder so the indices
            # used below are the same in both modes.
            if use_cache is False:
                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]

            hidden_states, present_key_value_state = layer_outputs[:2]

            # The position biases returned by one block are fed to the next one.
            position_bias = layer_outputs[2]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]

            if use_cache:
                present_key_value_states = present_key_value_states + (present_key_value_state,)

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[3],)
                if self.is_decoder:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)

            # Model parallel: after the last block of a device, hand the activations to the next device.
            if self.model_parallel:
                for k, v in self.device_map.items():
                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
                        hidden_states = hidden_states.to("cuda:" + str(k + 1))

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Record the final (normed) hidden states as well.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    present_key_value_states,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=present_key_value_states,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )
|
|
|
|
|
T5_START_DOCSTRING = r""" |
|
|
|
The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text |
|
Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan |
|
Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a |
|
text-to-text denoising generative setting. |
|
|
|
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the |
|
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
|
etc.) |
|
|
|
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. |
|
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
|
and behavior. |
|
|
|
Parameters: |
|
config ([`T5Config`]): Model configuration class with all the parameters of the model. |
|
Initializing with a config file does not load the weights associated with the model, only the |
|
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. |
|
""" |
|
|
|
T5_INPUTS_DOCSTRING = r""" |
|
Args: |
|
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): |
|
Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you |
|
should be able to pad the inputs on both the right and the left. |
|
|
|
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and |
|
[`PreTrainedTokenizer.__call__`] for details.
|
|
|
[What are input IDs?](../glossary#input-ids) |
|
|
|
To know more on how to prepare `input_ids` for pretraining take a look at [T5 Training](./t5#training).
|
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): |
|
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: |
|
|
|
- 1 for tokens that are **not masked**, |
|
- 0 for tokens that are **masked**. |
|
|
|
[What are attention masks?](../glossary#attention-mask) |
|
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): |
|
Indices of decoder input sequence tokens in the vocabulary. |
|
|
|
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and |
|
[`PreTrainedTokenizer.__call__`] for details. |
|
|
|
[What are decoder input IDs?](../glossary#decoder-input-ids) |
|
|
|
T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` |
|
is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). |
|
|
|
To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5 |
|
Training](./t5#training). |
|
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): |
|
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also |
|
be used by default. |
|
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): |
|
Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0, |
|
1]`: |
|
|
|
- 1 indicates the head is **not masked**, |
|
- 0 indicates the head is **masked**. |
|
|
|
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): |
|
Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0, |
|
1]`: |
|
|
|
- 1 indicates the head is **not masked**, |
|
- 0 indicates the head is **masked**. |
|
|
|
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): |
|
Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in |
|
`[0, 1]`: |
|
|
|
- 1 indicates the head is **not masked**, |
|
- 0 indicates the head is **masked**. |
|
|
|
encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
|
Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*) |
|
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at |
|
the output of the last layer of the encoder. Used in the cross-attention of the decoder. |
|
past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): |
|
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. |
|
|
|
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that |
|
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all |
|
`decoder_input_ids` of shape `(batch_size, sequence_length)`. |
|
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): |
|
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This |
|
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the |
|
model's internal embedding lookup matrix. |
|
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): |
|
Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded |
|
representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be |
|
input (see `past_key_values`). This is useful if you want more control over how to convert |
|
`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. |
|
|
|
If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value |
|
of `inputs_embeds`. |
|
|
|
use_cache (`bool`, *optional*): |
|
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see |
|
`past_key_values`). |
|
|
|
output_attentions (`bool`, *optional*): |
|
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned |
|
tensors for more detail. |
|
output_hidden_states (`bool`, *optional*): |
|
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for |
|
more detail. |
|
return_dict (`bool`, *optional*): |
|
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. |
|
""" |
|
|
|
T5_ENCODER_INPUTS_DOCSTRING = r""" |
|
Args: |
|
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): |
|
Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you |
|
should be able to pad the inputs on both the right and the left. |
|
|
|
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and |
|
[`PreTrainedTokenizer.__call__`] for details.
|
|
|
To know more on how to prepare `input_ids` for pretraining take a look at [T5 Training](./t5#training).
|
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): |
|
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: |
|
|
|
- 1 for tokens that are **not masked**, |
|
- 0 for tokens that are **masked**. |
|
|
|
[What are attention masks?](../glossary#attention-mask) |
|
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): |
|
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: |
|
|
|
- 1 indicates the head is **not masked**, |
|
- 0 indicates the head is **masked**. |
|
|
|
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): |
|
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This |
|
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the |
|
model's internal embedding lookup matrix. |
|
output_attentions (`bool`, *optional*): |
|
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned |
|
tensors for more detail. |
|
output_hidden_states (`bool`, *optional*): |
|
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for |
|
more detail. |
|
return_dict (`bool`, *optional*): |
|
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. |
|
""" |
|
|
|
|
|
__HEAD_MASK_WARNING_MSG = """ |
|
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently, |
|
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions. |
|
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, |
|
num_heads)`. |
|
""" |
|
|
|
|
|
@add_start_docstrings( |
|
"The bare T5 Model transformer outputting raw hidden-states without any specific head on top.", |
|
T5_START_DOCSTRING, |
|
) |
|
class T5Model(T5PreTrainedModel): |
|
_keys_to_ignore_on_load_missing = [ |
|
r"encoder.embed_tokens.weight", |
|
r"decoder.embed_tokens.weight", |
|
] |
|
_keys_to_ignore_on_load_unexpected = [ |
|
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", |
|
] |
|
|
|
def __init__(self, config: T5Config): |
|
raise NotImplementedError("might need adjustments for GLM") |
|
super().__init__(config) |
|
self.shared = nn.Embedding(config.vocab_size, config.d_model) |
|
|
|
encoder_config = copy.deepcopy(config) |
|
encoder_config.is_decoder = False |
|
encoder_config.use_cache = False |
|
encoder_config.is_encoder_decoder = False |
|
self.encoder = T5Stack(encoder_config, self.shared) |
|
|
|
decoder_config = copy.deepcopy(config) |
|
decoder_config.is_decoder = True |
|
decoder_config.is_encoder_decoder = False |
|
decoder_config.num_layers = config.num_decoder_layers |
|
self.decoder = T5Stack(decoder_config, self.shared) |
|
|
|
|
|
self.post_init() |
|
|
|
|
|
self.model_parallel = False |
|
self.device_map = None |
|
|
|
@add_start_docstrings(PARALLELIZE_DOCSTRING) |
|
def parallelize(self, device_map=None): |
|
warnings.warn( |
|
"`T5Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model" |
|
" with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" |
|
" `device_map` but it needs to be a dictionary module_name to device, so for instance {'encoder.block.0':" |
|
" 0, 'encoder.block.1': 1, ...}", |
|
FutureWarning, |
|
) |
|
self.device_map = ( |
|
get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) |
|
if device_map is None |
|
else device_map |
|
) |
|
assert_device_map(self.device_map, len(self.encoder.block)) |
|
self.encoder.parallelize(self.device_map) |
|
self.decoder.parallelize(self.device_map) |
|
self.model_parallel = True |
|
|
|
@add_start_docstrings(DEPARALLELIZE_DOCSTRING) |
|
def deparallelize(self): |
|
warnings.warn( |
|
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", |
|
FutureWarning, |
|
) |
|
self.encoder.deparallelize() |
|
self.decoder.deparallelize() |
|
self.encoder = self.encoder.to("cpu") |
|
self.decoder = self.decoder.to("cpu") |
|
self.model_parallel = False |
|
self.device_map = None |
|
torch.cuda.empty_cache() |
|
|
|
def get_input_embeddings(self): |
|
return self.shared |
|
|
|
def set_input_embeddings(self, new_embeddings): |
|
self.shared = new_embeddings |
|
self.encoder.set_input_embeddings(new_embeddings) |
|
self.decoder.set_input_embeddings(new_embeddings) |
|
|
|
def get_encoder(self): |
|
return self.encoder |
|
|
|
def get_decoder(self): |
|
return self.decoder |
|
|
|
def _prune_heads(self, heads_to_prune): |
|
""" |
|
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base |
|
class PreTrainedModel |
|
""" |
|
for layer, heads in heads_to_prune.items(): |
|
self.encoder.layer[layer].attention.prune_heads(heads) |
|
|
|
@add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    decoder_input_ids: Optional[torch.LongTensor] = None,
    decoder_attention_mask: Optional[torch.BoolTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    decoder_head_mask: Optional[torch.FloatTensor] = None,
    cross_attn_head_mask: Optional[torch.Tensor] = None,
    encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    decoder_inputs_embeds: Optional[torch.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
    r"""
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, T5Model

    >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
    >>> model = T5Model.from_pretrained("t5-small")

    >>> input_ids = tokenizer(
    ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
    ... ).input_ids  # Batch size 1
    >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

    >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for T5Model.
    >>> # This is not needed for torch's T5ForConditionalGeneration as it does this internally using labels arg.
    >>> decoder_input_ids = model._shift_right(decoder_input_ids)

    >>> # forward pass
    >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
    >>> last_hidden_states = outputs.last_hidden_state
    ```"""
    # Fall back to the configuration for flags the caller left unset.
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # Deprecated behaviour: when only `head_mask` is given and both stacks have the same
    # depth, it is reused as `decoder_head_mask`.
    if head_mask is not None and decoder_head_mask is None:
        if self.config.num_layers == self.config.num_decoder_layers:
            # A bare `__HEAD_MASK_WARNING_MSG` inside a class body is name-mangled to
            # `_<ClassName>__HEAD_MASK_WARNING_MSG` and raises NameError, so the module-level
            # constant is fetched by its string key instead. The fallback text is used only
            # if the module does not define the constant.
            head_mask_warning = globals().get(
                "__HEAD_MASK_WARNING_MSG",
                "The input argument `head_mask` was split into two arguments `head_mask` and"
                " `decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`.",
            )
            warnings.warn(head_mask_warning, FutureWarning)
            decoder_head_mask = head_mask

    # Run the encoder unless the caller supplied its outputs already.
    if encoder_outputs is None:
        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
    elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
        # Wrap a plain tuple so the attribute access in the final return works.
        encoder_outputs = BaseModelOutput(
            last_hidden_state=encoder_outputs[0],
            hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
            attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
        )

    hidden_states = encoder_outputs[0]

    # With model parallelism, move everything the decoder consumes to its first device.
    if self.model_parallel:
        torch.cuda.set_device(self.decoder.first_device)
        hidden_states = hidden_states.to(self.decoder.first_device)
        if decoder_input_ids is not None:
            decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.decoder.first_device)
        if decoder_attention_mask is not None:
            decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

    decoder_outputs = self.decoder(
        input_ids=decoder_input_ids,
        attention_mask=decoder_attention_mask,
        inputs_embeds=decoder_inputs_embeds,
        past_key_values=past_key_values,
        encoder_hidden_states=hidden_states,
        encoder_attention_mask=attention_mask,
        head_mask=decoder_head_mask,
        cross_attn_head_mask=cross_attn_head_mask,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    if not return_dict:
        # Tuple mode: decoder outputs followed by encoder outputs.
        return decoder_outputs + encoder_outputs

    return Seq2SeqModelOutput(
        last_hidden_state=decoder_outputs.last_hidden_state,
        past_key_values=decoder_outputs.past_key_values,
        decoder_hidden_states=decoder_outputs.hidden_states,
        decoder_attentions=decoder_outputs.attentions,
        cross_attentions=decoder_outputs.cross_attentions,
        encoder_last_hidden_state=encoder_outputs.last_hidden_state,
        encoder_hidden_states=encoder_outputs.hidden_states,
        encoder_attentions=encoder_outputs.attentions,
    )
|
|
|
|
|
@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING)
class T5ForConditionalGeneration(T5PreTrainedModel):
    # Encoder/decoder stack with a linear LM head on top. Construction is currently
    # disabled: `__init__` raises NotImplementedError before doing anything else.
    _keys_to_ignore_on_load_missing = [
        r"encoder.embed_tokens.weight",
        r"decoder.embed_tokens.weight",
        r"lm_head.weight",
    ]
    _keys_to_ignore_on_load_unexpected = [
        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]

    def __init__(self, config: T5Config):
        # Deliberate guard: everything below it is unreachable until the guard is removed.
        raise NotImplementedError("might need adjustments for GLM")
        super().__init__(config)
        self.model_dim = config.d_model

        # One embedding table is handed to both stacks.
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        self.post_init()

        # Model-parallel bookkeeping, off by default.
        self.model_parallel = False
        self.device_map = None

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you"
            " should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also"
            " provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance"
            " {'encoder.block.0': 0, 'encoder.block.1': 1, ...}",
            FutureWarning,
        )
        # Without an explicit map, spread the encoder blocks over all visible CUDA devices.
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.decoder.parallelize(self.device_map)
        self.lm_head = self.lm_head.to(self.decoder.first_device)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.encoder.deparallelize()
        self.decoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.decoder = self.decoder.to("cpu")
        self.lm_head = self.lm_head.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        """Return the embedding module stored in `self.shared`."""
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        """Store `new_embeddings` in `self.shared` and pass it to the encoder and the decoder."""
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head (`self.lm_head`) with `new_embeddings`."""
        self.lm_head = new_embeddings

    def get_output_embeddings(self):
        """Return the LM head (`self.lm_head`)."""
        return self.lm_head

    def get_encoder(self):
        """Return the encoder stack."""
        return self.encoder

    def get_decoder(self):
        """Return the decoder stack."""
        return self.decoder

    @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        relative_position: Optional[torch.LongTensor] = None,
        sparsity_mask: Optional[torch.BoolTensor] = None,
        is_concept: Optional[torch.BoolTensor] = None,
        concept_indices: Optional[list[dict[str, list[tuple[int, int]]]]] = None,
        use_additional_bucket: Optional[torch.BoolTensor] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`
        relative_position, sparsity_mask, use_additional_bucket (*optional*):
            Passed through unchanged to the encoder. They are only used when `encoder_outputs` is not supplied.
        is_concept (`torch.BoolTensor`, *optional*):
            Multiplied into `attention_mask` before that mask is given to the decoder as the encoder attention
            mask. `attention_mask` must be provided when this is set.
        concept_indices (`list[dict[str, list[tuple[int, int]]]]`, *optional*):
            One dict per batch element. Each value lists the `(start, end)` token ranges of the occurrences of one
            concept. The encoder states of the occurrences are averaged position-wise, and every occurrence except
            the first is masked out of `attention_mask`.

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, T5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
        >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer(
        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        >>> # studies have shown that owning a dog is good for you.
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Deprecated behaviour: a lone `head_mask` is reused for the decoder when both
        # stacks have the same number of layers.
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                # A bare `__HEAD_MASK_WARNING_MSG` inside a class body is name-mangled to
                # `_T5ForConditionalGeneration__HEAD_MASK_WARNING_MSG` and raises NameError, so
                # the module-level constant is fetched by its string key instead. The fallback
                # text is used only if the module does not define the constant.
                head_mask_warning = globals().get(
                    "__HEAD_MASK_WARNING_MSG",
                    "The input argument `head_mask` was split into two arguments `head_mask` and"
                    " `decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`.",
                )
                warnings.warn(head_mask_warning, FutureWarning)
                decoder_head_mask = head_mask

        # Encode unless the caller already provides encoder outputs.
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                relative_position=relative_position,
                sparsity_mask=sparsity_mask,
                use_additional_bucket=use_additional_bucket,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # Derive the decoder inputs from the labels.
            decoder_input_ids = self._shift_right(labels)

        # With model parallelism, move everything the decoder consumes to its first device.
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

        if is_concept is not None:
            # Restrict what the decoder may attend to (requires a non-None attention_mask).
            attention_mask = attention_mask * is_concept

        if concept_indices is not None:
            # NOTE(review): `hidden_states` and `attention_mask` are modified in place below.
            for b, tmp_batch_concept_indices in enumerate(concept_indices):
                for tmp_concept_indices in tmp_batch_concept_indices.values():
                    # After the transpose, rows are token offsets within the concept and columns
                    # are its occurrences. Assumes all occurrences have equal length - TODO confirm.
                    tmp_indices = torch.tensor(
                        [[i for i in range(*one_concept_occurance)] for one_concept_occurance in tmp_concept_indices],
                        device=hidden_states.device,
                    ).T

                    # Replace each occurrence's state by the mean over all occurrences.
                    for tmp_tok_indices in tmp_indices:
                        hidden_states[b, tmp_tok_indices] = torch.index_select(
                            hidden_states[b], 0, tmp_tok_indices
                        ).mean(dim=0, keepdim=True)

                    # Keep only the first occurrence visible to the decoder.
                    attention_mask[b, tmp_indices[:, 1:]] = False

        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        if self.model_parallel:
            torch.cuda.set_device(self.encoder.first_device)
            self.lm_head = self.lm_head.to(self.encoder.first_device)
            sequence_output = sequence_output.to(self.lm_head.weight.device)

        if self.config.tie_word_embeddings:
            # Rescale the decoder output before projecting when embeddings are tied.
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """Build the keyword arguments for one decoding step; extra `kwargs` are ignored."""
        # With a cache present, only the newest token has to be fed to the decoder.
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past_key_values,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        """Return `self._shift_right(labels)`."""
        return self._shift_right(labels)

    def _reorder_cache(self, past_key_values, beam_idx):
        """Reorder every cached tensor along dim 0 according to `beam_idx`.

        Returns `past_key_values` unchanged (after logging a warning) when it is None.
        """
        if past_key_values is None:
            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
            return past_key_values

        reordered_decoder_past = ()
        for layer_past_states in past_key_values:
            reordered_layer_past_states = ()
            for layer_past_state in layer_past_states:
                # Select on the device the cached state lives on.
                reordered_layer_past_states = reordered_layer_past_states + (
                    layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
                )

            assert reordered_layer_past_states[0].shape == layer_past_states[0].shape
            assert len(reordered_layer_past_states) == len(layer_past_states)

            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
        return reordered_decoder_past
|
|
|
@add_start_docstrings(
    "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
    T5_START_DOCSTRING,
)
class T5EncoderModel(T5PreTrainedModel):
    # Encoder-only model: `__init__` creates just the shared embedding and one encoder stack.
    _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"]

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        # The encoder gets its own copy of the config with caching and
        # encoder-decoder mode switched off.
        stack_config = copy.deepcopy(config)
        stack_config.use_cache = False
        stack_config.is_encoder_decoder = False
        self.encoder = T5Stack(stack_config, self.shared)

        self.post_init()

        # Model-parallel bookkeeping, off by default.
        self.model_parallel = False
        self.device_map = None

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5EncoderModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
            " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
            " 'block.1': 1, ...}",
            FutureWarning,
        )
        # Default: distribute the encoder blocks over every visible CUDA device.
        if device_map is None:
            device_map = get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
        self.device_map = device_map
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        deprecation_note = "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers."
        warnings.warn(deprecation_note, FutureWarning)
        self.encoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.device_map = None
        self.model_parallel = False
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        """Return the embedding module stored in `self.shared`."""
        shared_embeddings = self.shared
        return shared_embeddings

    def set_input_embeddings(self, new_embeddings):
        """Store `new_embeddings` in `self.shared` and pass it on to the encoder."""
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        """Return the encoder stack."""
        encoder_stack = self.encoder
        return encoder_stack

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer_idx, head_ids in heads_to_prune.items():
            self_attention = self.encoder.block[layer_idx].layer[0].SelfAttention
            self_attention.prune_heads(head_ids)

    @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        relative_position: Optional[torch.LongTensor] = None,
        sparsity_mask: Optional[torch.BoolTensor] = None,
        use_additional_bucket: Optional[torch.BoolTensor] = None,
    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, T5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
        >>> model = T5EncoderModel.from_pretrained("t5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Everything, including the three GLM-specific arguments, goes straight to the
        # encoder stack; its result is returned untouched.
        return self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            relative_position=relative_position,
            sparsity_mask=sparsity_mask,
            use_additional_bucket=use_additional_bucket,
        )

    def init_relative_position_bias(self, modelsize: str, init_decoder: bool = False, init_additional_buckets_from: list[int] = None):
        """Delegate to `init_relative_position_bias` of the first block's self-attention.

        The call goes to `self.decoder` when `init_decoder` is true and to `self.encoder`
        otherwise. NOTE(review): `__init__` of this class creates no `decoder` attribute, so
        `init_decoder=True` looks unusable here - confirm.
        """
        stack = self.decoder if init_decoder else self.encoder
        first_self_attention = stack.block[0].layer[0].SelfAttention
        first_self_attention.init_relative_position_bias(
            modelsize=modelsize,
            is_decoder=init_decoder,
            init_additional_buckets_from=init_additional_buckets_from,
        )
|
|
|
|
|
class GraphT5Classifier(PreTrainedModel):
    """
    `T5EncoderModel` followed by one linear classification head applied to the encoder's first output.
    """
    config_class = T5Config

    def __init__(
        self,
        config: T5Config,
    ):
        """
        Load tokenizer and encoder from `config.modelsize` and create the classification head.

        `config` has to carry `modelsize`, `model_max_length` and `num_classes`, as set by `get_config`.
        """
        # `T5Tokenizer` is not among the module-level imports visible at the top of this file,
        # so it is imported here to keep construction from failing with a NameError.
        from transformers import T5Tokenizer

        super().__init__(config=config)
        self.config = config
        self.tokenizer = T5Tokenizer.from_pretrained(self.config.modelsize, model_max_length=self.config.model_max_length)

        self.t5model = T5EncoderModel.from_pretrained(self.config.modelsize, config=config, ignore_mismatched_sizes=True)
        self.hidden_size = self.t5model.config.d_model
        self.classification_head = nn.Linear(self.hidden_size, self.config.num_classes, bias=True)
        self.softmax = nn.Softmax(dim=-1)

    @staticmethod
    def get_config(num_classes: int, modelsize: str = "t5-base", num_additional_buckets: int = 0, model_max_length: int = 512) -> T5Config:
        """
        Load the `T5Config` of `modelsize` and attach the extra attributes this classifier reads.

        Sets `num_classes`, `modelsize`, `relative_attention_num_additional_buckets` and `model_max_length`.
        """
        config = T5Config.from_pretrained(modelsize)
        config.num_classes = int(num_classes)
        config.modelsize = str(modelsize)
        config.relative_attention_num_additional_buckets = int(num_additional_buckets)
        config.model_max_length = int(model_max_length)
        return config

    def forward(
        self,
        input_ids: torch.Tensor,
        relative_position: torch.Tensor,
        sparsity_mask: torch.Tensor,
        use_additional_bucket: torch.Tensor,
    ) -> torch.Tensor:
        """
        Run the encoder on the given inputs and return the logits of the classification head.

        The head is applied to `output[0]` of the encoder. No softmax is applied here; see `get_probabilities`.
        """
        logging.debug('t5 encoder model')
        output = self.t5model(input_ids=input_ids, relative_position=relative_position, sparsity_mask=sparsity_mask, use_additional_bucket=use_additional_bucket)
        logging.debug('classification head')
        logits = self.classification_head(output[0])

        return logits

    def get_probabilities(self, logits: torch.Tensor) -> torch.Tensor:
        """Return the softmax of `logits` over the last dimension."""
        return self.softmax(logits)

    def get_label(self, logits: torch.Tensor) -> torch.Tensor:
        """Return the argmax of `logits` over the last dimension."""
        return torch.argmax(logits, dim=-1)
|
|
|
|
|
class DualGraphT5Classifier(PreTrainedModel):
    """
    Same as GraphT5Classifier, but with two classification heads
    """
    config_class = T5Config

    def __init__(
        self,
        config: T5Config,
    ):
        """
        Load tokenizer and encoder from `config.modelsize` and create both classification heads.

        `config` has to carry `modelsize`, `model_max_length`, `num_classes1` and `num_classes2`, as set by
        `get_config`.
        """
        # `T5Tokenizer` is not among the module-level imports visible at the top of this file,
        # so it is imported here to keep construction from failing with a NameError.
        from transformers import T5Tokenizer

        super().__init__(config=config)
        self.config = config
        self.tokenizer = T5Tokenizer.from_pretrained(self.config.modelsize, model_max_length=self.config.model_max_length)

        self.t5model = T5EncoderModel.from_pretrained(self.config.modelsize, config=config, ignore_mismatched_sizes=True)
        self.hidden_size = self.t5model.config.d_model
        self.classification_head1 = nn.Linear(self.hidden_size, self.config.num_classes1, bias=True)
        self.classification_head2 = nn.Linear(self.hidden_size, self.config.num_classes2, bias=True)
        self.softmax = nn.Softmax(dim=-1)

    @staticmethod
    def get_config(num_classes1: int, num_classes2: int, modelsize: str = "t5-base", num_additional_buckets: int = 0, model_max_length: int = 512) -> T5Config:
        """
        Load the `T5Config` of `modelsize` and attach the extra attributes this classifier reads.

        Sets `num_classes1`, `num_classes2`, `modelsize`, `relative_attention_num_additional_buckets` and
        `model_max_length`.
        """
        config = T5Config.from_pretrained(modelsize)
        config.num_classes1 = int(num_classes1)
        config.num_classes2 = int(num_classes2)
        config.modelsize = str(modelsize)
        config.relative_attention_num_additional_buckets = int(num_additional_buckets)
        config.model_max_length = int(model_max_length)
        return config

    def forward(
        self,
        input_ids: torch.Tensor,
        relative_position: torch.Tensor,
        sparsity_mask: torch.Tensor,
        use_additional_bucket: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Run the encoder once and return the logits of both heads as `(logits1, logits2)`.

        Both heads are applied to the same `output[0]` of the encoder. No softmax is applied here.
        """
        logging.debug('t5 encoder model')
        output = self.t5model(input_ids=input_ids, relative_position=relative_position, sparsity_mask=sparsity_mask, use_additional_bucket=use_additional_bucket)
        logging.debug('classification head 1')
        logits1 = self.classification_head1(output[0])
        logging.debug('classification head 2')
        logits2 = self.classification_head2(output[0])

        return logits1, logits2

    def get_probabilities(self, logits: torch.Tensor) -> torch.Tensor:
        """Return the softmax of `logits` over the last dimension."""
        return self.softmax(logits)

    def get_label(self, logits: torch.Tensor) -> torch.Tensor:
        """Return the argmax of `logits` over the last dimension."""
        return torch.argmax(logits, dim=-1)