VarunGumma committed
Update modeling_rotary_indictrans.py

modeling_rotary_indictrans.py  CHANGED  (+26 -25)
@@ -31,16 +31,22 @@ from transformers.generation import GenerationMixin
 from transformers.modeling_utils import PreTrainedModel
 from .configuration_rotary_indictrans import RotaryIndicTransConfig
 
-from flash_attn import flash_attn_func, flash_attn_varlen_func
-from flash_attn.bert_padding import (
-    index_first_axis,
-    pad_input,
-    unpad_input,
-)
-
 logger = logging.get_logger(__name__)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
+try:
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import (
+        index_first_axis,
+        pad_input,
+        unpad_input,
+    )
+except ImportError:
+    logger.warning(
+        "It is highly recommended to use `flash_attention_2` for better performance with RotaryIndicTrans."
+        "Falling back to the default `eager` implementation."
+    )
+
 
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):
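
Note on the hunk above: wrapping the flash-attn imports in try/except makes flash_attn an optional dependency; when it is missing, a warning is logged and the model can still run with the default eager attention. A minimal usage sketch under that assumption (the repository id below is a placeholder, not part of this commit, and attn_implementation is the standard transformers argument for selecting the attention backend, assuming the remote code maps it the usual way):

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Mirror the fallback pattern from the hunk above: prefer flash_attention_2
# when the flash_attn package is importable, otherwise use eager attention.
try:
    import flash_attn  # noqa: F401
    attn_impl = "flash_attention_2"
except ImportError:
    attn_impl = "eager"

repo_id = "VarunGumma/rotary-indictrans"  # hypothetical repository id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    attn_implementation=attn_impl,
)
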
@@ -1401,8 +1407,6 @@ class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
 
 # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100Model->RotaryIndicTrans
 class RotaryIndicTransModel(RotaryIndicTransPreTrainedModel):
-    _tied_weights_keys = None
-
     def __init__(self, config: RotaryIndicTransConfig):
         super().__init__(config)
 
@@ -1497,10 +1501,11 @@ class RotaryIndicTransModel(RotaryIndicTransPreTrainedModel):
 
 
 # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100ForConditionalGeneration->RotaryIndicTrans
-class RotaryIndicTransForConditionalGeneration(RotaryIndicTransPreTrainedModel, GenerationMixin):
+class RotaryIndicTransForConditionalGeneration(
+    RotaryIndicTransPreTrainedModel, GenerationMixin
+):
     base_model_prefix = "model"
-    _tied_weights_keys =
-    _label_smoothing = 0.0
+    _tied_weights_keys = ["decoder.embed_tokens.weight", "lm_head.weight"]
 
     def __init__(self, config: RotaryIndicTransConfig):
         super().__init__(config)
@@ -1509,19 +1514,16 @@ class RotaryIndicTransForConditionalGeneration(RotaryIndicTransPreTrainedModel,
             config.decoder_embed_dim, config.decoder_vocab_size, bias=False
         )
 
-        if config.share_decoder_input_output_embed:
-            self.lm_head.weight = self.model.decoder.embed_tokens.weight
-
         self.post_init()
 
-    def tie_weights(self):
-        pass
-
     def get_encoder(self):
-        return self.model.
+        return self.model.encoder
 
     def get_decoder(self):
-        return self.model.
+        return self.model.decoder
+
+    def get_input_embeddings(self):
+        return self.model.encoder.embed_tokens
 
     def get_output_embeddings(self):
         return self.lm_head
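
The accessors in the hunk above back the GenerationMixin now listed in the class signature: for an encoder-decoder model, generate() runs the encoder once through get_encoder() and then decodes against the cached encoder states. A short sketch of how they resolve, assuming a loaded model instance named model:

# Sketch only: the accessors expose the inner RotaryIndicTransModel submodules.
encoder = model.get_encoder()                # -> model.model.encoder
decoder = model.get_decoder()                # -> model.model.decoder
embed_tokens = model.get_input_embeddings()  # -> model.model.encoder.embed_tokens
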
@@ -1529,8 +1531,9 @@ class RotaryIndicTransForConditionalGeneration(RotaryIndicTransPreTrainedModel,
     def set_output_embeddings(self, new_embeddings):
         self.lm_head = new_embeddings
 
-    def
-        self.
+    def tie_weights(self):
+        if self.config.share_decoder_input_output_embed:
+            self._tie_or_clone_weights(self.model.decoder.embed_tokens, self.lm_head)
 
     def forward(
         self,
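
For context on the tie_weights override above: _tie_or_clone_weights is the standard PreTrainedModel helper, so when share_decoder_input_output_embed is set, the decoder input embedding and the LM head end up sharing a single weight tensor, replacing the manual assignment that previously sat in __init__. A minimal check, assuming a loaded model with that flag enabled and default (non-torchscript) settings:

# Sketch only: verify that tying left one shared weight tensor.
embed_weight = model.model.decoder.embed_tokens.weight
head_weight = model.lm_head.weight
assert embed_weight.data_ptr() == head_weight.data_ptr()  # same underlying storage
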
@@ -1594,8 +1597,6 @@ class RotaryIndicTransForConditionalGeneration(RotaryIndicTransPreTrainedModel,
             masked_lm_loss = F.cross_entropy(
                 input=lm_logits.view(-1, self.config.decoder_vocab_size),
                 target=labels.view(-1),
-                ignore_index=-100,
-                label_smoothing=self._label_smoothing,
             )
 
         if not return_dict:
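
On the loss hunk above: ignore_index=-100 and label_smoothing=0.0 are already the defaults of torch.nn.functional.cross_entropy, so dropping the explicit arguments (together with the removed _label_smoothing attribute) preserves behavior as long as label smoothing had been left at 0.0. A small self-contained check:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)
labels = torch.tensor([1, 2, -100, 3])  # -100 positions are ignored by default

explicit = F.cross_entropy(logits, labels, ignore_index=-100, label_smoothing=0.0)
implicit = F.cross_entropy(logits, labels)
assert torch.allclose(explicit, implicit)
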
@@ -1652,4 +1653,4 @@ class RotaryIndicTransForConditionalGeneration(RotaryIndicTransPreTrainedModel,
                     past_state.index_select(0, beam_idx) for past_state in layer_past
                 ),
             )
-        return reordered_past
+        return reordered_past