duzx16 committed
Commit 74d61a6 · 1 Parent(s): 4d01789
Add gradient checkpointing

Browse files:
- config.json +1 -1
- configuration_chatglm.py +5 -0
- modeling_chatglm.py +89 -15
- tokenization_chatglm.py +20 -3
config.json CHANGED
@@ -36,5 +36,5 @@
   "transformers_version": "4.27.1",
   "tie_word_embeddings": false,
   "eos_token_id": 2,
-  "pad_token_id":
+  "pad_token_id": 0
 }
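For reference, a minimal sketch of verifying the new padding id once the checkpoint is loaded; the repo id "THUDM/chatglm2-6b" and the use of trust_remote_code=True are assumptions for illustration, not part of this commit:

# Hypothetical check of the updated config value; substitute the actual repo id.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
print(config.pad_token_id)  # expected: 0 with this commit applied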
configuration_chatglm.py CHANGED
@@ -28,9 +28,12 @@ class ChatGLMConfig(PretrainedConfig):
         attention_softmax_in_fp32=True,
         fp32_residual_connection=False,
         quantization_bit=0,
+        pre_seq_len=None,
+        prefix_projection=False,
         **kwargs
     ):
         self.num_layers = num_layers
+        self.vocab_size = padded_vocab_size
         self.padded_vocab_size = padded_vocab_size
         self.hidden_size = hidden_size
         self.ffn_hidden_size = ffn_hidden_size
@@ -52,4 +55,6 @@ class ChatGLMConfig(PretrainedConfig):
         self.attention_softmax_in_fp32 = attention_softmax_in_fp32
         self.fp32_residual_connection = fp32_residual_connection
         self.quantization_bit = quantization_bit
+        self.pre_seq_len = pre_seq_len
+        self.prefix_projection = prefix_projection
         super().__init__(**kwargs)
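The two new constructor arguments drive prefix tuning (P-tuning v2 style): pre_seq_len sets the number of trainable prefix tokens and prefix_projection chooses between a plain embedding and a two-layer MLP encoder. A minimal sketch with illustrative values (not defaults from this commit; the file is assumed to be importable, e.g. from the checkpoint directory):

# Sketch: constructing the patched config with a trainable prefix enabled.
from configuration_chatglm import ChatGLMConfig

config = ChatGLMConfig(
    pre_seq_len=128,          # length of the trainable prefix; None leaves prefix tuning off
    prefix_projection=False,  # True routes the prefix embedding through a two-layer MLP
)
print(config.pre_seq_len, config.prefix_projection, config.vocab_size)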
modeling_chatglm.py CHANGED
@@ -56,6 +56,36 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
         return scores


+class PrefixEncoder(torch.nn.Module):
+    """
+    The torch.nn model to encode the prefix
+    Input shape: (batch-size, prefix-length)
+    Output shape: (batch-size, prefix-length, 2*layers*hidden)
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.prefix_projection = config.prefix_projection
+        if self.prefix_projection:
+            # Use a two-layer MLP to encode the prefix
+            self.embedding = torch.nn.Embedding(config.pre_seq_len, config.hidden_size)
+            self.trans = torch.nn.Sequential(
+                torch.nn.Linear(config.hidden_size, config.hidden_size),
+                torch.nn.Tanh(),
+                torch.nn.Linear(config.hidden_size, config.num_layers * config.hidden_size * 2)
+            )
+        else:
+            self.embedding = torch.nn.Embedding(config.pre_seq_len, config.num_layers * config.hidden_size * 2)
+
+    def forward(self, prefix: torch.Tensor):
+        if self.prefix_projection:
+            prefix_tokens = self.embedding(prefix)
+            past_key_values = self.trans(prefix_tokens)
+        else:
+            past_key_values = self.embedding(prefix)
+        return past_key_values
+
+
 def split_tensor_along_last_dim(
         tensor: torch.Tensor,
         num_partitions: int,
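A short sketch of the tensor the new PrefixEncoder emits, using a toy config object that only carries the attributes the module reads; all sizes are illustrative and modeling_chatglm.py is assumed to be importable:

# Toy shape check for PrefixEncoder; not the real model configuration.
import torch
from types import SimpleNamespace
from modeling_chatglm import PrefixEncoder

cfg = SimpleNamespace(pre_seq_len=4, hidden_size=8, num_layers=2, prefix_projection=False)
encoder = PrefixEncoder(cfg)
prefix = torch.arange(cfg.pre_seq_len).unsqueeze(0)  # (batch=1, prefix_len=4)
out = encoder(prefix)
print(out.shape)  # torch.Size([1, 4, 32]) == (batch, prefix_len, 2 * num_layers * hidden)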
@@ -566,6 +596,8 @@ class GLMTransformer(torch.nn.Module):
             self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                                  dtype=config.torch_dtype)

+        self.gradient_checkpointing = False
+
     def _get_layer(self, layer_number):
         return self.layers[layer_number]

@@ -577,6 +609,13 @@ class GLMTransformer(torch.nn.Module):
         if not kv_caches:
             kv_caches = [None for _ in range(self.num_layers)]
         presents = () if use_cache else None
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
         all_self_attentions = None
         all_hidden_states = () if output_hidden_states else None
         for index in range(self.num_layers):
@@ -584,14 +623,24 @@ class GLMTransformer(torch.nn.Module):
                 all_hidden_states = all_hidden_states + (hidden_states,)

             layer = self._get_layer(index)
-            layer_ret = layer(
-                hidden_states,
-                attention_mask,
-                rotary_pos_emb,
-                kv_cache=kv_caches[index],
-                use_cache=use_cache
-            )
-            hidden_states, kv_cache = layer_ret
+            if self.gradient_checkpointing and self.training:
+                layer_ret = torch.utils.checkpoint.checkpoint(
+                    layer,
+                    hidden_states,
+                    attention_mask,
+                    rotary_pos_emb,
+                    kv_cache=kv_caches[index],
+                    use_cache=use_cache
+                )
+            else:
+                layer_ret = layer(
+                    hidden_states,
+                    attention_mask,
+                    rotary_pos_emb,
+                    kv_cache=kv_caches[index],
+                    use_cache=use_cache
+                )
+            hidden_states, kv_cache = layer_ret
             if use_cache:
                 presents = presents + (kv_cache,)

@@ -645,7 +694,7 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
         return position_ids

     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module,
+        if isinstance(module, GLMTransformer):
             module.gradient_checkpointing = value


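A hedged sketch of how this flag is normally toggled through the standard PreTrainedModel API; the repo id is a placeholder, and the snippet assumes ChatGLMPreTrainedModel declares supports_gradient_checkpointing = True (not shown in this diff):

# Hypothetical training setup exercising the new gradient-checkpointing path.
from transformers import AutoModel

model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model.gradient_checkpointing_enable()  # applies _set_gradient_checkpointing(value=True) to submodules
model.train()
# During training, GLMTransformer.forward now re-runs each block in the backward
# pass via torch.utils.checkpoint and forces use_cache=False, trading compute for memory.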
@@ -700,11 +749,33 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         self.encoder = init_method(GLMTransformer, config, **init_kwargs)
         self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
                                         dtype=config.torch_dtype, **init_kwargs)
-        self.
+        self.pre_seq_len = config.pre_seq_len
+        self.prefix_projection = config.prefix_projection
+        if self.pre_seq_len is not None:
+            for param in self.parameters():
+                param.requires_grad = False
+            self.prefix_tokens = torch.arange(self.pre_seq_len).long()
+            self.prefix_encoder = PrefixEncoder(config)
+            self.dropout = torch.nn.Dropout(0.1)

     def get_input_embeddings(self):
         return self.embedding.word_embeddings

+    def get_prompt(self, batch_size, device, dtype=torch.half):
+        prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
+        past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
+        past_key_values = past_key_values.view(
+            batch_size,
+            self.pre_seq_len,
+            self.num_layers * 2,
+            self.num_attention_heads,
+            self.hidden_size // self.num_attention_heads
+        )
+        # seq_len, b, nh, hidden_size
+        past_key_values = self.dropout(past_key_values)
+        past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
+        return past_key_values
+
     def forward(
             self,
             input_ids,
@@ -740,6 +811,11 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             rotary_pos_emb = rotary_pos_emb[None, :seq_length]
         rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()

+        if past_key_values is None:
+            if self.pre_seq_len is not None:
+                past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device,
+                                                  dtype=inputs_embeds.dtype)
+
         # Run encoder.
         hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
             inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
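A standalone sketch of the bookkeeping get_prompt performs, mirroring only the view/permute/split sequence to show the resulting past_key_values layout; all sizes are toy values:

# Illustration of the prefix cache layout produced by get_prompt (toy sizes).
import torch

batch_size, pre_seq_len, num_layers, num_heads, head_dim = 2, 4, 3, 2, 5
flat = torch.randn(batch_size, pre_seq_len, num_layers * 2 * num_heads * head_dim)

pkv = flat.view(batch_size, pre_seq_len, num_layers * 2, num_heads, head_dim)
pkv = pkv.permute([2, 1, 0, 3, 4]).split(2)  # one (key, value) pair per layer

print(len(pkv))      # 3 -> one entry per transformer layer
print(pkv[0].shape)  # torch.Size([2, 4, 2, 2, 5]) -> (2, pre_seq_len, batch, heads, head_dim)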
@@ -913,10 +989,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         return response

     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None):
-        prompt = ""
-        for i, (old_query, response) in enumerate(history):
-            prompt += "[Round {}]\n\n问：{}\n\n答：{}\n\n".format(i + 1, old_query, response)
-        prompt += "[Round {}]\n\n问：{}\n\n答：".format(len(history) + 1, query)
+        prompt = tokenizer.build_prompt(query, history=history)
         inputs = tokenizer([prompt], return_tensors="pt")
         inputs = inputs.to(self.device)
         return inputs
@@ -933,7 +1006,6 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         inputs = inputs.to(self.device)
         return inputs

-
     @torch.no_grad()
     def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192, num_beams=1,
              do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, **kwargs):
@@ -969,6 +1041,8 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         inputs = self.build_stream_inputs(tokenizer, query, history=history)
         if past_key_values is not None:
             past_length = past_key_values[0][0].shape[0]
+            if self.transformer.pre_seq_len is not None:
+                past_length -= self.transformer.pre_seq_len
             inputs.position_ids += past_length
             attention_mask = inputs.attention_mask
             attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
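For context, a hedged sketch of the call path these hunks touch: chat() builds its prompt through build_inputs(), which now delegates to tokenizer.build_prompt(), and when a prefix is configured, stream_chat subtracts pre_seq_len from past_length so position ids and the attention mask only count real tokens. The repo id below is a placeholder:

# Hypothetical interactive use; requires the checkpoint weights to be available.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).half().cuda().eval()

response, history = model.chat(tokenizer, "Hello", history=[])
response, history = model.chat(tokenizer, "What can you do?", history=history)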
tokenization_chatglm.py CHANGED
@@ -17,7 +17,7 @@ class SPTokenizer:
         self.n_words: int = self.sp_model.vocab_size()
         self.bos_id: int = self.sp_model.bos_id()
         self.eos_id: int = self.sp_model.eos_id()
-        self.pad_id: int = self.sp_model.
+        self.pad_id: int = self.sp_model.unk_id()
         assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

         special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"]
@@ -55,7 +55,7 @@ class SPTokenizer:

     def convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
-        if index in self.index_special_tokens:
+        if index in self.index_special_tokens or index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
             return ""
         return self.sp_model.IdToPiece(index)

@@ -85,12 +85,20 @@ class ChatGLMTokenizer(PreTrainedTokenizer):

     @property
     def pad_token(self) -> str:
-        return "
+        return "<unk>"

     @property
     def pad_token_id(self):
         return self.get_command("<pad>")

+    @property
+    def eos_token(self) -> str:
+        return "</s>"
+
+    @property
+    def eos_token_id(self):
+        return self.get_command("<eos>")
+
     @property
     def vocab_size(self):
         return self.tokenizer.n_words
@@ -147,6 +155,15 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
         return prefix_tokens

+    def build_prompt(self, query, history=None):
+        if history is None:
+            history = []
+        prompt = ""
+        for i, (old_query, response) in enumerate(history):
+            prompt += "[Round {}]\n\n问：{}\n\n答：{}\n\n".format(i + 1, old_query, response)
+        prompt += "[Round {}]\n\n问：{}\n\n答：".format(len(history) + 1, query)
+        return prompt
+
     def build_inputs_with_special_tokens(
             self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]: