michael-guenther committed
Commit 95b4916 • 1 Parent(s): eb21270

add mlm model and adjust naming

README.md ADDED
@@ -0,0 +1,5 @@
+# Converting Weights
+
+```
+python3 -m "xlm-roberta-flash-implementation".convert_roberta_weights_to_flash --output pytorch_model_xlmr_flash.bin
+```
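The command writes a flash-attention-compatible state dict to `pytorch_model_xlmr_flash.bin`. A quick way to sanity-check the result is to inspect the remapped keys directly; this is only a sketch, where the key names come from the conversion script in this commit and the file path simply mirrors the `--output` argument above:

```python
# Sketch: inspect the converted checkpoint produced by the command above.
import torch

state_dict = torch.load("pytorch_model_xlmr_flash.bin", map_location="cpu")

# Per-layer q/k/v projections are fused into a single Wqkv tensor by the converter.
print(state_dict["roberta.encoder.layers.0.mixer.Wqkv.weight"].shape)
print(state_dict["roberta.embeddings.word_embeddings.weight"].shape)
```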
config.json CHANGED
@@ -1,9 +1,9 @@
 {
   "auto_map": {
-    "AutoConfig": "configuration_bert.XLMFlashConfig",
-    "AutoModel": "modeling_bert.BertModel",
-    "AutoModelForPreTraining": "modeling_bert.BertForPreTraining",
-    "AutoModelForMaskedLM": "modeling_bert.BertForPreTraining"
+    "AutoConfig": "configuration_xlm_roberta.XLMRobertaFlashConfig",
+    "AutoModel": "modeling_xlm_roberta.XLMRobertaModel",
+    "AutoModelForPreTraining": "modeling_xlm_roberta.XLMRobertaForPreTraining",
+    "AutoModelForMaskedLM": "modeling_xlm_roberta.XLMRobertaForMaskedLM"
   },
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
configuration_bert.py → configuration_xlm_roberta.py RENAMED
@@ -1,6 +1,6 @@
 from transformers import PretrainedConfig
 
-class XLMFlashConfig(PretrainedConfig):
+class XLMRobertaFlashConfig(PretrainedConfig):
     def __init__(
         self,
         vocab_size=30522,
convert_roberta_weights_to_flash.py CHANGED
@@ -1,9 +1,10 @@
 import re
 from collections import OrderedDict
-from transformers import BertConfig, PretrainedConfig
+from transformers import PretrainedConfig
 from transformers import XLMRobertaForMaskedLM
 
-from flash_attn.models.bert import BertModel
+from .configuration_xlm_roberta import XLMRobertaFlashConfig as BertConfig
+from .modeling_xlm_roberta import XLMRobertaForMaskedLM as BertModel
 import torch
 
 import click
@@ -16,12 +17,6 @@ def remap_state_dict(state_dict, config: PretrainedConfig):
     Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
     """
 
-    # Replace Roberta with Bert
-    def key_mapping_roberta(key):
-        return re.sub(r"^roberta.", "bert.", key)
-
-    state_dict = OrderedDict((key_mapping_roberta(k), v) for k, v in state_dict.items())
-
     # LayerNorm
     def key_mapping_ln_gamma_beta(key):
         key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key)
@@ -34,21 +29,21 @@ def remap_state_dict(state_dict, config: PretrainedConfig):
 
     # Layers
     def key_mapping_layers(key):
-        return re.sub(r"^bert.encoder.layer.", "bert.encoder.layers.", key)
+        return re.sub(r"^roberta.encoder.layer.", "roberta.encoder.layers.", key)
 
     state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
 
     # LayerNorm
     def key_mapping_ln(key):
-        key = re.sub(r"^bert.embeddings.LayerNorm.", "bert.emb_ln.", key)
+        key = re.sub(r"^roberta.embeddings.LayerNorm.", "roberta.emb_ln.", key)
         key = re.sub(
-            r"^bert.encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)",
-            r"bert.encoder.layers.\1.norm1.\2",
+            r"^roberta.encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)",
+            r"roberta.encoder.layers.\1.norm1.\2",
             key,
         )
         key = re.sub(
-            r"^bert.encoder.layers.(\d+).output.LayerNorm.(weight|bias)",
-            r"bert.encoder.layers.\1.norm2.\2",
+            r"^roberta.encoder.layers.(\d+).output.LayerNorm.(weight|bias)",
+            r"roberta.encoder.layers.\1.norm2.\2",
             key,
         )
         key = re.sub(
@@ -63,13 +58,13 @@ def remap_state_dict(state_dict, config: PretrainedConfig):
     # MLP
     def key_mapping_mlp(key):
         key = re.sub(
-            r"^bert.encoder.layers.(\d+).intermediate.dense.(weight|bias)",
-            r"bert.encoder.layers.\1.mlp.fc1.\2",
+            r"^roberta.encoder.layers.(\d+).intermediate.dense.(weight|bias)",
+            r"roberta.encoder.layers.\1.mlp.fc1.\2",
             key,
         )
         key = re.sub(
-            r"^bert.encoder.layers.(\d+).output.dense.(weight|bias)",
-            r"bert.encoder.layers.\1.mlp.fc2.\2",
+            r"^roberta.encoder.layers.(\d+).output.dense.(weight|bias)",
+            r"roberta.encoder.layers.\1.mlp.fc2.\2",
             key,
         )
         return key
@@ -79,33 +74,33 @@ def remap_state_dict(state_dict, config: PretrainedConfig):
     # Attention
     last_layer_subset = getattr(config, "last_layer_subset", False)
     for d in range(config.num_hidden_layers):
-        Wq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.weight")
-        Wk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.weight")
-        Wv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.weight")
-        bq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.bias")
-        bk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.bias")
-        bv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.bias")
+        Wq = state_dict.pop(f"roberta.encoder.layers.{d}.attention.self.query.weight")
+        Wk = state_dict.pop(f"roberta.encoder.layers.{d}.attention.self.key.weight")
+        Wv = state_dict.pop(f"roberta.encoder.layers.{d}.attention.self.value.weight")
+        bq = state_dict.pop(f"roberta.encoder.layers.{d}.attention.self.query.bias")
+        bk = state_dict.pop(f"roberta.encoder.layers.{d}.attention.self.key.bias")
+        bv = state_dict.pop(f"roberta.encoder.layers.{d}.attention.self.value.bias")
         if not (last_layer_subset and d == config.num_hidden_layers - 1):
-            state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.weight"] = torch.cat(
+            state_dict[f"roberta.encoder.layers.{d}.mixer.Wqkv.weight"] = torch.cat(
                 [Wq, Wk, Wv], dim=0
             )
-            state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.bias"] = torch.cat(
+            state_dict[f"roberta.encoder.layers.{d}.mixer.Wqkv.bias"] = torch.cat(
                 [bq, bk, bv], dim=0
            )
        else:
-            state_dict[f"bert.encoder.layers.{d}.mixer.Wq.weight"] = Wq
-            state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.weight"] = torch.cat(
+            state_dict[f"roberta.encoder.layers.{d}.mixer.Wq.weight"] = Wq
+            state_dict[f"roberta.encoder.layers.{d}.mixer.Wkv.weight"] = torch.cat(
                 [Wk, Wv], dim=0
             )
-            state_dict[f"bert.encoder.layers.{d}.mixer.Wq.bias"] = bq
-            state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.bias"] = torch.cat(
+            state_dict[f"roberta.encoder.layers.{d}.mixer.Wq.bias"] = bq
+            state_dict[f"roberta.encoder.layers.{d}.mixer.Wkv.bias"] = torch.cat(
                 [bk, bv], dim=0
             )
 
     def key_mapping_attn(key):
         return re.sub(
-            r"^bert.encoder.layers.(\d+).attention.output.dense.(weight|bias)",
-            r"bert.encoder.layers.\1.mixer.out_proj.\2",
+            r"^roberta.encoder.layers.(\d+).attention.output.dense.(weight|bias)",
+            r"roberta.encoder.layers.\1.mixer.out_proj.\2",
            key,
        )
 
@@ -121,8 +116,8 @@ def remap_state_dict(state_dict, config: PretrainedConfig):
     # Word embedding
     pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
     if pad_vocab_size_multiple > 1:
-        word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"]
-        state_dict["bert.embeddings.word_embeddings.weight"] = F.pad(
+        word_embeddings = state_dict["roberta.embeddings.word_embeddings.weight"]
+        state_dict["roberta.embeddings.word_embeddings.weight"] = F.pad(
            word_embeddings, (0, 0, 0, config.vocab_size - word_embeddings.shape[0])
        )
     decoder_weight = state_dict["cls.predictions.decoder.weight"]
@@ -137,16 +132,6 @@ def remap_state_dict(state_dict, config: PretrainedConfig):
        decoder_bias, (0, config.vocab_size - decoder_bias.shape[0]), value=-100.0
    )
 
-    # Embeddings
-    def key_remove_bert(key):
-        return re.sub(r"^bert.", "", key)
-
-    state_dict = OrderedDict(
-        (key_remove_bert(k), v)
-        for k, v in state_dict.items()
-        if not k.startswith('lm_head')
-    )
-
     return state_dict
 
 
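The remapping above is mostly mechanical key renaming (now targeting the `roberta.` prefix instead of `bert.`), plus one structural change: the separate query/key/value projections of each layer are fused into the single `Wqkv` parameter that the flash-attention `MHA` module expects. A self-contained sketch of that fusion on dummy tensors, with sizes chosen only for illustration:

```python
# Standalone illustration of the per-layer Wqkv fusion performed by the script above.
# hidden = 768 mirrors an XLM-R base-sized layer; the tensors are random stand-ins.
import torch

hidden = 768
Wq, Wk, Wv = (torch.randn(hidden, hidden) for _ in range(3))
bq, bk, bv = (torch.randn(hidden) for _ in range(3))

Wqkv = torch.cat([Wq, Wk, Wv], dim=0)   # (3 * hidden, hidden), as in the script
bqkv = torch.cat([bq, bk, bv], dim=0)   # (3 * hidden,)

# The fused projection applied once equals the three projections applied separately.
x = torch.randn(4, hidden)
q, k, v = (x @ Wqkv.T + bqkv).split(hidden, dim=-1)
assert torch.allclose(q, x @ Wq.T + bq, atol=1e-5)
assert torch.allclose(v, x @ Wv.T + bv, atol=1e-5)
```

The `last_layer_subset` branch of the script keeps `Wq` separate and fuses only `Wk`/`Wv` into `Wkv`, following the same pattern.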
embedding.py CHANGED
@@ -11,7 +11,7 @@ from torch import Tensor
 from transformers.models.xlm_roberta.modeling_xlm_roberta import create_position_ids_from_input_ids
 
 
-class BertEmbeddings(nn.Module):
+class XLMRobertaEmbeddings(nn.Module):
     def __init__(
         self,
         embed_dim,
modeling_bert.py → modeling_xlm_roberta.py RENAMED
@@ -13,28 +13,32 @@ import re
13
  from collections import OrderedDict
14
  from collections.abc import Sequence
15
  from functools import partial
16
- from typing import Any, Mapping
17
 
18
  import torch
19
  import torch.nn as nn
20
  import torch.nn.functional as F
21
  from einops import rearrange
22
- from transformers import BertConfig, PretrainedConfig
23
  from transformers.modeling_utils import PreTrainedModel
 
 
 
24
  from transformers.models.bert.modeling_bert import (
25
  BaseModelOutputWithPoolingAndCrossAttentions,
26
  BertForPreTrainingOutput,
27
  )
28
 
29
- from .bert_padding import (
 
 
30
  index_first_axis,
31
  index_first_axis_residual,
32
  pad_input,
33
  unpad_input,
34
  )
35
- from .configuration_bert import XLMFlashConfig
36
  from .block import Block
37
- from .embedding import BertEmbeddings
38
  from .mha import MHA
39
  from .mlp import FusedMLP, Mlp
40
 
@@ -155,8 +159,8 @@ def _init_weights(module, initializer_range=0.02):
155
  nn.init.zeros_(module.weight[module.padding_idx])
156
 
157
 
158
- class BertEncoder(nn.Module):
159
- def __init__(self, config: BertConfig):
160
  super().__init__()
161
  self.use_flash_attn = getattr(config, "use_flash_attn", False)
162
  self.layers = nn.ModuleList(
@@ -218,7 +222,7 @@ class BertEncoder(nn.Module):
218
  return hidden_states
219
 
220
 
221
- class BertPooler(nn.Module):
222
  def __init__(self, config):
223
  super().__init__()
224
  fused_bias_fc = getattr(config, "fused_bias_fc", False)
@@ -237,7 +241,7 @@ class BertPooler(nn.Module):
237
  return pooled_output
238
 
239
 
240
- class BertPredictionHeadTransform(nn.Module):
241
  def __init__(self, config):
242
  super().__init__()
243
  fused_bias_fc = getattr(config, "fused_bias_fc", False)
@@ -268,7 +272,7 @@ class BertPredictionHeadTransform(nn.Module):
268
  return hidden_states
269
 
270
 
271
- class BertLMPredictionHead(nn.Module):
272
  def __init__(self, config):
273
  super().__init__()
274
  fused_bias_fc = getattr(config, "fused_bias_fc", False)
@@ -276,7 +280,7 @@ class BertLMPredictionHead(nn.Module):
276
  raise ImportError("fused_dense is not installed")
277
  linear_cls = nn.Linear if not fused_bias_fc else FusedDense
278
 
279
- self.transform = BertPredictionHeadTransform(config)
280
 
281
  # The output weights are the same as the input embeddings, but there is
282
  # an output-only bias for each token.
@@ -288,10 +292,10 @@ class BertLMPredictionHead(nn.Module):
288
  return hidden_states
289
 
290
 
291
- class BertPreTrainingHeads(nn.Module):
292
  def __init__(self, config):
293
  super().__init__()
294
- self.predictions = BertLMPredictionHead(config)
295
  self.seq_relationship = nn.Linear(config.hidden_size, 2)
296
 
297
  def forward(self, sequence_output, pooled_output):
@@ -300,64 +304,22 @@ class BertPreTrainingHeads(nn.Module):
300
  return prediction_scores, seq_relationship_score
301
 
302
 
303
- # class BertPreTrainedModel(nn.Module):
304
- # """An abstract class to handle weights initialization and
305
- # a simple interface for dowloading and loading pretrained models.
306
- # """
307
- #
308
- # def __init__(self, config, *inputs, **kwargs):
309
- # super().__init__()
310
- # if not isinstance(config, BertConfig):
311
- # raise ValueError(
312
- # "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
313
- # "To create a model from a Google pretrained model use "
314
- # "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
315
- # self.__class__.__name__, self.__class__.__name__
316
- # )
317
- # )
318
- # self.config = config
319
- #
320
- # @classmethod
321
- # def from_pretrained(cls, model_name, config, *inputs, **kwargs):
322
- # """
323
- # Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
324
- # Download and cache the pre-trained model file if needed.
325
- #
326
- # Params:
327
- # pretrained_model_name_or_path: either:
328
- # - a path or url to a pretrained model archive containing:
329
- # . `bert_config.json` a configuration file for the model
330
- # . `pytorch_model.bin` a PyTorch dump of a BertForPretraining instance
331
- # - a path or url to a pretrained model archive containing:
332
- # . `bert_config.json` a configuration file for the model
333
- # . `model.chkpt` a TensorFlow checkpoint
334
- # *inputs, **kwargs: additional input for the specific Bert class
335
- # (ex: num_labels for BertForSequenceClassification)
336
- # """
337
- # # Instantiate model.
338
- # model = cls(config, *inputs, **kwargs)
339
- # load_return = model.load_state_dict(
340
- # remap_state_dict(state_dict_from_pretrained(model_name), config), strict=False
341
- # )
342
- # logger.info(load_return)
343
- # return model
344
-
345
- class BertPreTrainedModel(PreTrainedModel):
346
  """An abstract class to handle weights initialization and
347
  a simple interface for dowloading and loading pretrained models.
348
  """
349
- config_class = XLMFlashConfig
350
- base_model_prefix = "bert"
351
  supports_gradient_checkpointing = True
352
 
353
  def _set_gradient_checkpointing(self, module, value=False):
354
- if isinstance(module, BertEncoder):
355
  module.gradient_checkpointing = value
356
 
357
 
358
 
359
- class BertModel(BertPreTrainedModel):
360
- def __init__(self, config: BertConfig, add_pooling_layer=True):
361
  super().__init__(config)
362
  self.pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
363
  if config.vocab_size % self.pad_vocab_size_multiple != 0:
@@ -369,7 +331,7 @@ class BertModel(BertPreTrainedModel):
369
  raise ImportError("Triton is not installed")
370
  assert config.hidden_act in ["gelu", "gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
371
 
372
- self.embeddings = BertEmbeddings(
373
  config.hidden_size,
374
  config.vocab_size,
375
  config.max_position_embeddings,
@@ -378,11 +340,12 @@ class BertModel(BertPreTrainedModel):
378
  )
379
  self.emb_drop = nn.Dropout(config.hidden_dropout_prob)
380
  self.emb_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
381
- self.encoder = BertEncoder(config)
382
- self.pooler = BertPooler(config) if add_pooling_layer else None
383
 
384
  self.apply(partial(_init_weights, initializer_range=config.initializer_range))
385
 
 
386
  def forward(
387
  self,
388
  input_ids,
@@ -390,12 +353,22 @@ class BertModel(BertPreTrainedModel):
390
  token_type_ids=None,
391
  attention_mask=None,
392
  masked_tokens_mask=None,
 
 
393
  ):
394
- """If masked_tokens_mask is not None (i.e. last_layer_subset == True in BertForPreTraining),
395
  we only want the output for the masked tokens. This means that we only compute the last
396
  layer output for these tokens.
397
  masked_tokens_mask: (batch, seqlen), dtype=torch.bool
398
  """
 
 
 
399
  hidden_states = self.embeddings(
400
  input_ids, position_ids=position_ids, token_type_ids=token_type_ids
401
  )
@@ -437,111 +410,200 @@ class BertModel(BertPreTrainedModel):
437
  sequence_output = sequence_output[masked_tokens_mask[subset_mask]]
438
  pooled_output = self.pooler(pool_input, pool=False) if self.pooler is not None else None
439
 
 
 
 
440
  return BaseModelOutputWithPoolingAndCrossAttentions(
441
  last_hidden_state=sequence_output,
442
  pooler_output=pooled_output,
443
  )
444
 
445
 
446
- class BertForPreTraining(BertPreTrainedModel):
447
- def __init__(self, config: BertConfig):
448
- import pdb
449
- pdb.set_trace()
450
  super().__init__(config)
451
- # If dense_seq_output, we only need to pass the hidden states for the masked out tokens
452
- # (around 15%) to the classifier heads.
453
- self.dense_seq_output = getattr(config, "dense_seq_output", False)
454
- # If last_layer_subset, we only need the compute the last layer for a subset of tokens
455
- # (e.g., the tokens we need to compute the masked LM loss and the next-sentence prediction).
456
- self.last_layer_subset = getattr(config, "last_layer_subset", False)
457
- if self.last_layer_subset:
458
- assert self.dense_seq_output, "last_layer_subset requires dense_seq_output"
459
- use_xentropy = getattr(config, "use_xentropy", False)
460
- if use_xentropy and CrossEntropyLoss is None:
461
- raise ImportError("xentropy_cuda is not installed")
462
- loss_cls = (
463
- nn.CrossEntropyLoss
464
- if not use_xentropy
465
- else partial(CrossEntropyLoss, inplace_backward=True)
466
- )
467
 
468
- self.bert = BertModel(config)
469
- self.cls = BertPreTrainingHeads(config)
470
- self.mlm_loss = loss_cls(ignore_index=0)
471
- self.nsp_loss = loss_cls(ignore_index=-1)
 
 
 
 
472
 
473
  # Initialize weights and apply final processing
474
- self.apply(partial(_init_weights, initializer_range=config.initializer_range))
475
- self.tie_weights()
 
 
476
 
477
- def tie_weights(self):
478
- self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight
479
 
480
  def forward(
481
  self,
482
- input_ids,
483
- position_ids=None,
484
- token_type_ids=None,
485
- attention_mask=None,
486
- labels=None,
487
- next_sentence_label=None,
488
- ):
 
 
489
  """
490
- If labels are provided, they must be 0 for masked out tokens (as specified in the attention
491
- mask).
492
- Outputs:
493
- if `labels` and `next_sentence_label` are not `None`:
494
- Outputs the total_loss which is the sum of the masked language modeling loss and the next
495
- sentence classification loss.
496
- if `labels` or `next_sentence_label` is `None`:
497
- Outputs a tuple comprising
498
- - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
499
- - the next sentence classification logits of shape [batch_size, 2].
500
 
501
- """
502
- masked_tokens_mask = labels > 0 if (self.last_layer_subset and labels is not None) else None
503
- outputs = self.bert(
504
  input_ids,
505
- position_ids=position_ids,
506
  token_type_ids=token_type_ids,
507
- attention_mask=attention_mask.bool() if attention_mask is not None else None,
508
- masked_tokens_mask=masked_tokens_mask,
 
 
 
 
 
 
509
  )
510
- sequence_output, pooled_output = outputs.last_hidden_state, outputs.pooler_output
511
- if self.dense_seq_output and labels is not None:
512
- masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()
513
- if not self.last_layer_subset:
514
- sequence_output = index_first_axis(
515
- rearrange(sequence_output, "b s d -> (b s) d"), masked_token_idx
516
- )
517
- prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
518
-
519
- total_loss = None
520
- if labels is not None and next_sentence_label is not None:
521
- if (
522
- self.dense_seq_output and labels is not None
523
- ): # prediction_scores are already flattened
524
- masked_lm_loss = self.mlm_loss(
525
- prediction_scores, labels.flatten()[masked_token_idx]
526
- )
527
- else:
528
- masked_lm_loss = self.mlm_loss(
529
- rearrange(prediction_scores, "... v -> (...) v"),
530
- rearrange(labels, "... -> (...)"),
531
- )
532
- next_sentence_loss = self.nsp_loss(
533
- rearrange(seq_relationship_score, "... t -> (...) t"),
534
- rearrange(next_sentence_label, "... -> (...)"),
535
- )
536
- total_loss = masked_lm_loss.float() + next_sentence_loss.float()
537
-
538
- return BertForPreTrainingOutput(
539
- loss=total_loss,
540
- prediction_logits=prediction_scores,
541
- seq_relationship_logits=seq_relationship_score,
542
  )
543
 
544
 
 
 
545
  def remap_state_dict(state_dict, config: PretrainedConfig):
546
  """
547
  Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
 
13
  from collections import OrderedDict
14
  from collections.abc import Sequence
15
  from functools import partial
 
16
 
17
  import torch
18
  import torch.nn as nn
19
  import torch.nn.functional as F
20
  from einops import rearrange
21
+ from transformers import PretrainedConfig
22
  from transformers.modeling_utils import PreTrainedModel
23
+ from transformers.modeling_outputs import MaskedLMOutput
24
+ from transformers.models.xlm_roberta.modeling_xlm_roberta import XLMRobertaLMHead
25
+
26
  from transformers.models.bert.modeling_bert import (
27
  BaseModelOutputWithPoolingAndCrossAttentions,
28
  BertForPreTrainingOutput,
29
  )
30
 
31
+ from typing import Optional, Tuple, Union
32
+
33
+ from .xlm_padding import (
34
  index_first_axis,
35
  index_first_axis_residual,
36
  pad_input,
37
  unpad_input,
38
  )
39
+ from .configuration_xlm_roberta import XLMRobertaFlashConfig
40
  from .block import Block
41
+ from .embedding import XLMRobertaEmbeddings
42
  from .mha import MHA
43
  from .mlp import FusedMLP, Mlp
44
 
 
159
  nn.init.zeros_(module.weight[module.padding_idx])
160
 
161
 
162
+ class XLMRobertaEncoder(nn.Module):
163
+ def __init__(self, config: XLMRobertaFlashConfig):
164
  super().__init__()
165
  self.use_flash_attn = getattr(config, "use_flash_attn", False)
166
  self.layers = nn.ModuleList(
 
222
  return hidden_states
223
 
224
 
225
+ class XLMRobertaPooler(nn.Module):
226
  def __init__(self, config):
227
  super().__init__()
228
  fused_bias_fc = getattr(config, "fused_bias_fc", False)
 
241
  return pooled_output
242
 
243
 
244
+ class XLMRobertaPredictionHeadTransform(nn.Module):
245
  def __init__(self, config):
246
  super().__init__()
247
  fused_bias_fc = getattr(config, "fused_bias_fc", False)
 
272
  return hidden_states
273
 
274
 
275
+ class XLMRobertaLMPredictionHead(nn.Module):
276
  def __init__(self, config):
277
  super().__init__()
278
  fused_bias_fc = getattr(config, "fused_bias_fc", False)
 
280
  raise ImportError("fused_dense is not installed")
281
  linear_cls = nn.Linear if not fused_bias_fc else FusedDense
282
 
283
+ self.transform = XLMRobertaPredictionHeadTransform(config)
284
 
285
  # The output weights are the same as the input embeddings, but there is
286
  # an output-only bias for each token.
 
292
  return hidden_states
293
 
294
 
295
+ class XLMRobertaPreTrainingHeads(nn.Module):
296
  def __init__(self, config):
297
  super().__init__()
298
+ self.predictions = XLMRobertaLMPredictionHead(config)
299
  self.seq_relationship = nn.Linear(config.hidden_size, 2)
300
 
301
  def forward(self, sequence_output, pooled_output):
 
304
  return prediction_scores, seq_relationship_score
305
 
306
 
307
+ class XLMRobertaPreTrainedModel(PreTrainedModel):
 
 
308
  """An abstract class to handle weights initialization and
309
  a simple interface for dowloading and loading pretrained models.
310
  """
311
+ config_class = XLMRobertaFlashConfig
312
+ base_model_prefix = "roberta"
313
  supports_gradient_checkpointing = True
314
 
315
  def _set_gradient_checkpointing(self, module, value=False):
316
+ if isinstance(module, XLMRobertaEncoder):
317
  module.gradient_checkpointing = value
318
 
319
 
320
 
321
+ class XLMRobertaModel(XLMRobertaPreTrainedModel):
322
+ def __init__(self, config: XLMRobertaFlashConfig, add_pooling_layer=True):
323
  super().__init__(config)
324
  self.pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
325
  if config.vocab_size % self.pad_vocab_size_multiple != 0:
 
331
  raise ImportError("Triton is not installed")
332
  assert config.hidden_act in ["gelu", "gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
333
 
334
+ self.embeddings = XLMRobertaEmbeddings(
335
  config.hidden_size,
336
  config.vocab_size,
337
  config.max_position_embeddings,
 
340
  )
341
  self.emb_drop = nn.Dropout(config.hidden_dropout_prob)
342
  self.emb_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
343
+ self.encoder = XLMRobertaEncoder(config)
344
+ self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None
345
 
346
  self.apply(partial(_init_weights, initializer_range=config.initializer_range))
347
 
348
+
349
  def forward(
350
  self,
351
  input_ids,
 
353
  token_type_ids=None,
354
  attention_mask=None,
355
  masked_tokens_mask=None,
356
+ return_dict=None,
357
+ **kwargs,
358
  ):
359
+ """If masked_tokens_mask is not None (i.e. last_layer_subset == True in XLMForPreTraining),
360
  we only want the output for the masked tokens. This means that we only compute the last
361
  layer output for these tokens.
362
  masked_tokens_mask: (batch, seqlen), dtype=torch.bool
363
  """
364
+
365
+ if kwargs:
366
+ for key, value in kwargs.items():
367
+ if value is not None:
368
+ logger.warning('Flash attention implementation does not support kwargs: %s', key)
369
+
370
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
371
+
372
  hidden_states = self.embeddings(
373
  input_ids, position_ids=position_ids, token_type_ids=token_type_ids
374
  )
 
410
  sequence_output = sequence_output[masked_tokens_mask[subset_mask]]
411
  pooled_output = self.pooler(pool_input, pool=False) if self.pooler is not None else None
412
 
413
+ if not return_dict:
414
+ return sequence_output, pooled_output
415
+
416
  return BaseModelOutputWithPoolingAndCrossAttentions(
417
  last_hidden_state=sequence_output,
418
  pooler_output=pooled_output,
419
  )
420
 
421
 
422
+ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
423
+ _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
424
+
425
+ def __init__(self, config):
426
  super().__init__(config)
 
 
427
 
428
+ if config.is_decoder:
429
+ logger.warning(
430
+ "If you want to use `XLMRobertaForMaskedLM` make sure `config.is_decoder=False` for "
431
+ "bi-directional self-attention."
432
+ )
433
+
434
+ self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
435
+ self.lm_head = XLMRobertaLMHead(config)
436
 
437
  # Initialize weights and apply final processing
438
+ self.post_init()
439
+
440
+ def get_input_embeddings(self):
441
+ return self.roberta.embeddings.word_embeddings
442
+
443
+ def get_output_embeddings(self):
444
+ return self.lm_head.decoder
445
+
446
+ def set_output_embeddings(self, new_embeddings):
447
+ self.lm_head.decoder = new_embeddings
448
 
 
 
449
 
450
  def forward(
451
  self,
452
+ input_ids: Optional[torch.LongTensor] = None,
453
+ attention_mask: Optional[torch.FloatTensor] = None,
454
+ token_type_ids: Optional[torch.LongTensor] = None,
455
+ position_ids: Optional[torch.LongTensor] = None,
456
+ head_mask: Optional[torch.FloatTensor] = None,
457
+ inputs_embeds: Optional[torch.FloatTensor] = None,
458
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
459
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
460
+ labels: Optional[torch.LongTensor] = None,
461
+ output_attentions: Optional[bool] = None,
462
+ output_hidden_states: Optional[bool] = None,
463
+ return_dict: Optional[bool] = None,
464
+ ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
465
+ r"""
466
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
467
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
468
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
469
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
470
+ kwargs (`Dict[str, any]`, optional, defaults to *{}*):
471
+ Used to hide legacy arguments that have been deprecated.
472
  """
473
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
 
 
474
 
475
+ outputs = self.roberta(
 
 
476
  input_ids,
477
+ attention_mask=attention_mask,
478
  token_type_ids=token_type_ids,
479
+ position_ids=position_ids,
480
+ head_mask=head_mask,
481
+ inputs_embeds=inputs_embeds,
482
+ encoder_hidden_states=encoder_hidden_states,
483
+ encoder_attention_mask=encoder_attention_mask,
484
+ output_attentions=output_attentions,
485
+ output_hidden_states=output_hidden_states,
486
+ return_dict=return_dict,
487
  )
488
+ sequence_output = outputs[0]
489
+ prediction_scores = self.lm_head(sequence_output)
490
+
491
+ masked_lm_loss = None
492
+ if labels is not None:
493
+ # move labels to correct device to enable model parallelism
494
+ labels = labels.to(prediction_scores.device)
495
+ loss_fct = CrossEntropyLoss()
496
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
497
+
498
+ if not return_dict:
499
+ output = (prediction_scores,) + outputs[2:]
500
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
501
+
502
+ return MaskedLMOutput(
503
+ loss=masked_lm_loss,
504
+ logits=prediction_scores,
505
+ hidden_states=outputs.hidden_states,
506
+ attentions=outputs.attentions,
 
 
507
  )
508
 
509
 
510
+ # class XLMRobertaForPreTraining(XLMRobertaPreTrainedModel):
511
+ # def __init__(self, config: XLMRobertaFlashConfig):
512
+ # super().__init__(config)
513
+ # # If dense_seq_output, we only need to pass the hidden states for the masked out tokens
514
+ # # (around 15%) to the classifier heads.
515
+ # self.dense_seq_output = getattr(config, "dense_seq_output", False)
516
+ # # If last_layer_subset, we only need the compute the last layer for a subset of tokens
517
+ # # (e.g., the tokens we need to compute the masked LM loss and the next-sentence prediction).
518
+ # self.last_layer_subset = getattr(config, "last_layer_subset", False)
519
+ # if self.last_layer_subset:
520
+ # assert self.dense_seq_output, "last_layer_subset requires dense_seq_output"
521
+ # use_xentropy = getattr(config, "use_xentropy", False)
522
+ # if use_xentropy and CrossEntropyLoss is None:
523
+ # raise ImportError("xentropy_cuda is not installed")
524
+ # loss_cls = (
525
+ # nn.CrossEntropyLoss
526
+ # if not use_xentropy
527
+ # else partial(CrossEntropyLoss, inplace_backward=True)
528
+ # )
529
+ #
530
+ # self.xlm = XLMRobertaModel(config)
531
+ # self.cls = XLMRobertaPreTrainingHeads(config)
532
+ # self.mlm_loss = loss_cls(ignore_index=0)
533
+ # self.nsp_loss = loss_cls(ignore_index=-1)
534
+ #
535
+ # # Initialize weights and apply final processing
536
+ # self.apply(partial(_init_weights, initializer_range=config.initializer_range))
537
+ # self.tie_weights()
538
+ #
539
+ # def tie_weights(self):
540
+ # self.cls.predictions.decoder.weight = self.xlm.embeddings.word_embeddings.weight
541
+ #
542
+ # def forward(
543
+ # self,
544
+ # input_ids,
545
+ # position_ids=None,
546
+ # token_type_ids=None,
547
+ # attention_mask=None,
548
+ # labels=None,
549
+ # next_sentence_label=None,
550
+ # ):
551
+ # """
552
+ # If labels are provided, they must be 0 for masked out tokens (as specified in the attention
553
+ # mask).
554
+ # Outputs:
555
+ # if `labels` and `next_sentence_label` are not `None`:
556
+ # Outputs the total_loss which is the sum of the masked language modeling loss and the next
557
+ # sentence classification loss.
558
+ # if `labels` or `next_sentence_label` is `None`:
559
+ # Outputs a tuple comprising
560
+ # - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
561
+ # - the next sentence classification logits of shape [batch_size, 2].
562
+ #
563
+ # """
564
+ # masked_tokens_mask = labels > 0 if (self.last_layer_subset and labels is not None) else None
565
+ # outputs = self.xlm(
566
+ # input_ids,
567
+ # position_ids=position_ids,
568
+ # token_type_ids=token_type_ids,
569
+ # attention_mask=attention_mask.bool() if attention_mask is not None else None,
570
+ # masked_tokens_mask=masked_tokens_mask,
571
+ # )
572
+ # sequence_output, pooled_output = outputs.last_hidden_state, outputs.pooler_output
573
+ # if self.dense_seq_output and labels is not None:
574
+ # masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()
575
+ # if not self.last_layer_subset:
576
+ # sequence_output = index_first_axis(
577
+ # rearrange(sequence_output, "b s d -> (b s) d"), masked_token_idx
578
+ # )
579
+ # prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
580
+ #
581
+ # total_loss = None
582
+ # if labels is not None and next_sentence_label is not None:
583
+ # if (
584
+ # self.dense_seq_output and labels is not None
585
+ # ): # prediction_scores are already flattened
586
+ # masked_lm_loss = self.mlm_loss(
587
+ # prediction_scores, labels.flatten()[masked_token_idx]
588
+ # )
589
+ # else:
590
+ # masked_lm_loss = self.mlm_loss(
591
+ # rearrange(prediction_scores, "... v -> (...) v"),
592
+ # rearrange(labels, "... -> (...)"),
593
+ # )
594
+ # next_sentence_loss = self.nsp_loss(
595
+ # rearrange(seq_relationship_score, "... t -> (...) t"),
596
+ # rearrange(next_sentence_label, "... -> (...)"),
597
+ # )
598
+ # total_loss = masked_lm_loss.float() + next_sentence_loss.float()
599
+ #
600
+ # return BertForPreTrainingOutput(
601
+ # loss=total_loss,
602
+ # prediction_logits=prediction_scores,
603
+ # seq_relationship_logits=seq_relationship_score,
604
+ # )
605
+
606
+
607
  def remap_state_dict(state_dict, config: PretrainedConfig):
608
  """
609
  Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
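With the masked-LM head now wired up in `modeling_xlm_roberta.py`, the model can be exercised end to end as a fill-mask model. The following is a sketch under two assumptions not stated in the commit: the repository is published on the Hub (placeholder id) and it uses the standard XLM-R tokenizer.

```python
# Sketch: exercise the new XLMRobertaForMaskedLM head on a single masked sentence.
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")  # assumed tokenizer
model = AutoModelForMaskedLM.from_pretrained("organization/repo-id", trust_remote_code=True)
model.eval()

text = f"Paris is the capital of {tokenizer.mask_token}."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # MaskedLMOutput.logits, as returned above

# Look up the top predictions at the masked position.
mask_pos = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
top_ids = logits[0, mask_pos].topk(5, dim=-1).indices[0]
print(tokenizer.convert_ids_to_tokens(top_ids.tolist()))
```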
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:61bdee1ea6ae50618c387234ae94a500df9ce095e59d836b8aefef33e9d8884e
-size 1112222546
+oid sha256:cfa8fa7c7e120199548fe7149512c0adfe58f6bc13ce19f09b895aa25e8af910
+size 1113232188
bert_padding.py → xlm_padding.py RENAMED
File without changes