shwu committed on Apr 16, 2023

Commit

769e287

•

1 Parent(s): d1844e4

feat: better modeling_blip2chatglm

Browse files

Files changed (20) hide show

config.json +5 -5
configuration_blip2chatglm.py +1 -1
generation_config.json +4 -0
pytorch_model.bin → ice_text.model +2 -2
modeling_blip2chatglm.py +244 -41
modeling_chatglm.py +82 -54
preprocessor_config.json +24 -0
pytorch_model-00001-of-00009.bin +3 -0
pytorch_model-00002-of-00009.bin +3 -0
pytorch_model-00003-of-00009.bin +3 -0
pytorch_model-00004-of-00009.bin +3 -0
pytorch_model-00005-of-00009.bin +3 -0
pytorch_model-00006-of-00009.bin +3 -0
pytorch_model-00007-of-00009.bin +3 -0
pytorch_model-00008-of-00009.bin +3 -0
pytorch_model-00009-of-00009.bin +3 -0
pytorch_model.bin.index.json +0 -0
special_tokens_map.json +7 -0
tokenization_chatglm.py +433 -0
tokenizer_config.json +23 -0

config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "_commit_hash": null,
   "architectures": [
-    "BlipFor2ChatGLM"
   ],
   "initializer_factor": 1.0,
   "initializer_range": 0.02,
@@ -174,7 +174,7 @@
   "tie_word_embeddings": false,
   "torch_dtype": "float32",
   "transformers_version": null,
-  "use_decoder_only_language_model": false,
   "vision_config": {
     "_name_or_path": "",
     "add_cross_attention": false,
@@ -248,7 +248,7 @@
     "tokenizer_class": null,
     "top_k": 50,
     "top_p": 1.0,
-    "torch_dtype": null,
     "torchscript": false,
     "transformers_version": "4.27.3",
     "typical_p": 1.0,
@@ -256,7 +256,7 @@
   },
   "auto_map": {
     "AutoConfig": "configuration_blip2chatglm.Blip2ChatGLMConfig",
-    "AutoModel": "modeling_blip2chatglm.Blip2ForChatGLM",
-    "AutoModelForCausalLM": "modeling_blip2chatglm.Blip2ChatGLM"
   }
 }

 {
   "_commit_hash": null,
   "architectures": [
+    "Blip2ChatGLMForConditionalGeneration"
   ],
   "initializer_factor": 1.0,
   "initializer_range": 0.02,
   "tie_word_embeddings": false,
   "torch_dtype": "float32",
   "transformers_version": null,
+  "use_decoder_only_language_model": true,
   "vision_config": {
     "_name_or_path": "",
     "add_cross_attention": false,
     "tokenizer_class": null,
     "top_k": 50,
     "top_p": 1.0,
+    "torch_dtype": "float16",
     "torchscript": false,
     "transformers_version": "4.27.3",
     "typical_p": 1.0,
   },
   "auto_map": {
     "AutoConfig": "configuration_blip2chatglm.Blip2ChatGLMConfig",
+    "AutoModel": "modeling_blip2chatglm.Blip2ChatGLMForConditionalGeneration",
+    "AutoModelForCausalLM": "modeling_blip2chatglm.Blip2ChatGLMForConditionalGeneration"
   }
 }

configuration_blip2chatglm.py CHANGED Viewed

@@ -49,7 +49,7 @@ class Blip2ChatGLMConfig(PretrainedConfig):
         self.num_query_tokens = num_query_tokens
         self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
         # self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
-        self.use_decoder_only_language_model = False    # chatglm is an encoder-decoder model
         self.initializer_factor = 1.0
         self.initializer_range = 0.02

         self.num_query_tokens = num_query_tokens
         self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
         # self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+        self.use_decoder_only_language_model = True             # chatglm has no encoder
         self.initializer_factor = 1.0
         self.initializer_range = 0.02

generation_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "_from_model_config": true,
+  "transformers_version": "4.27.3"
+}

pytorch_model.bin → ice_text.model RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4f62d72c97cb28762f2fcb9e9b00e1d23c7d546da79fb4cfde386231b9b8d956
-size 4377310673

 version https://git-lfs.github.com/spec/v1
+oid sha256:5e974d9a69c242ce014c88c2b26089270f6198f3c0b700a887666cd3e816f17e
+size 2706249

modeling_blip2chatglm.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import copy
 from typing import Callable, List, Optional, Tuple, Union
 import torch
 import warnings
 from torch import Tensor, nn
@@ -8,8 +10,14 @@ from transformers import (
     PreTrainedModel,
     Blip2VisionModel,
     Blip2QFormerModel,
     GenerationConfig,
 )
 from transformers.utils import logging
 from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList
@@ -23,9 +31,13 @@ from .configuration_blip2chatglm import Blip2ChatGLMConfig
 logger = logging.get_logger(__name__)
-class Blip2ForChatGLM(PreTrainedModel):
     def __init__(self, config: Blip2ChatGLMConfig):
-        super().__init__(config)
         self.vision_model = Blip2VisionModel(config.vision_config)
@@ -37,21 +49,65 @@ class Blip2ForChatGLM(PreTrainedModel):
         self.language_projection = nn.Linear(
             config.qformer_config.hidden_size, config.text_config.hidden_size
         )
     def forward(
         self,
         pixel_values: torch.FloatTensor,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ):
         return_dict = (
             return_dict if return_dict is not None else self.config.use_return_dict
         )
         # step 1: forward the images through the vision encoder,
         # to get image embeddings of shape (batch_size, seq_len, hidden_size)
-        vision_outputs = self.vision_model.forward(
             pixel_values=pixel_values,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
@@ -65,7 +121,7 @@ class Blip2ForChatGLM(PreTrainedModel):
         )
         query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
-        query_outputs = self.qformer.forward(
             query_embeds=query_tokens,
             encoder_hidden_states=image_embeds,
             encoder_attention_mask=image_attention_mask,
@@ -76,23 +132,54 @@ class Blip2ForChatGLM(PreTrainedModel):
         query_output = query_outputs[0]
         # step 3: use the language model, conditioned on the query outputs and the prompt
-        language_model_inputs = self.language_projection.forward(query_output)
-        return vision_outputs, query_outputs, language_model_inputs
-class Blip2ChatGLM(PreTrainedModel):
-    config_class = Blip2ChatGLMConfig
-    def __init__(
-        self,
-        config: Blip2ChatGLMConfig,
-        blip2: Blip2ForChatGLM,
-        lm: ChatGLMForConditionalGeneration,
-    ) -> None:
-        super().__init__(config)
-        self.blip2 = blip2
-        self.language = lm
     @torch.no_grad()
     def stream_chat(
@@ -106,12 +193,12 @@ class Blip2ChatGLM(PreTrainedModel):
         do_sample=True,
         temperature=1,
     ):
-        device = self.blip2.device
         # 1. Prepare token ids
         images = []
         image_slots = []
-        nvtokens = self.blip2.query_tokens.size(1)
         if history:
             input_ids = tokenizer(
                 f"[Round {len(history)}]\n问：", add_special_tokens=False
@@ -181,27 +268,27 @@ class Blip2ChatGLM(PreTrainedModel):
         # 2. Prepare image embeddings
         if len(images) != 0:
             image = torch.cat(list(images), dim=0)
-            vision_outputs = self.blip2.vision_model.forward(image)
             image_embeds = vision_outputs[0]
             image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
                 device
             )
-            query_tokens = self.blip2.query_tokens.expand(image_embeds.shape[0], -1, -1)
-            query_outputs = self.blip2.qformer.forward(
                 query_embeds=query_tokens,
                 encoder_hidden_states=image_embeds,
                 encoder_attention_mask=image_atts,
             )
             query_output = query_outputs[0]
-            vtokens = self.blip2.language_projection(query_output)
         else:
             vtokens = []
         # 3. Place image embeddings into slots
         input_ids = torch.as_tensor(input_ids, dtype=torch.long).to(device).unsqueeze(0)
-        inputs_embeds = self.language.transformer.word_embeddings(input_ids)
         for slot, vimg in zip(image_slots, vtokens):
             inputs_embeds[0][-slot : -slot + nvtokens, :] = vimg
@@ -216,17 +303,16 @@ class Blip2ChatGLM(PreTrainedModel):
             "logits_processor": logits_processor,
         }
-        for outputs in self.mm_stream_generate(
             input_ids=input_ids, inputs_embeds=inputs_embeds, **gen_kwargs
         ):
             outputs = outputs.tolist()[0][len(input_ids[0]) :]
             response = tokenizer.decode(outputs)
-            response = self.language.process_response(response)
-            new_history = history + [(query, response)]
-            yield response, new_history
     @torch.no_grad()
-    def mm_stream_generate(
         self,
         input_ids,
         inputs_embeds,
@@ -238,10 +324,23 @@ class Blip2ChatGLM(PreTrainedModel):
         ] = None,
         **kwargs,
     ):
         batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
         if generation_config is None:
-            generation_config = self.language.generation_config
         generation_config = copy.deepcopy(generation_config)
         model_kwargs = generation_config.update(**kwargs)
         bos_token_id, eos_token_id = (
@@ -279,7 +378,7 @@ class Blip2ChatGLM(PreTrainedModel):
         if input_ids_seq_length >= generation_config.max_length:
             input_ids_string = (
                 "decoder_input_ids"
-                if self.language.config.is_encoder_decoder
                 else "input_ids"
             )
             logger.warning(
@@ -298,7 +397,7 @@ class Blip2ChatGLM(PreTrainedModel):
             else StoppingCriteriaList()
         )
-        logits_processor = self.language._get_logits_processor(
             generation_config=generation_config,
             input_ids_seq_length=input_ids_seq_length,
             encoder_input_ids=input_ids,
@@ -306,19 +405,19 @@ class Blip2ChatGLM(PreTrainedModel):
             logits_processor=logits_processor,
         )
-        stopping_criteria = self.language._get_stopping_criteria(
             generation_config=generation_config, stopping_criteria=stopping_criteria
         )
-        logits_warper = self.language._get_logits_warper(generation_config)
         unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
         scores = None
         while True:
-            model_inputs = self.language.prepare_inputs_for_generation(
                 input_ids, inputs_embeds=inputs_embeds, **model_kwargs
             )
             # forward pass to get next token
-            outputs = self.language(
                 **model_inputs,
                 return_dict=True,
                 output_attentions=False,
@@ -343,14 +442,14 @@ class Blip2ChatGLM(PreTrainedModel):
             inputs_embeds = torch.cat(
                 [
                     inputs_embeds,
-                    self.language.get_input_embeddings()(next_tokens)[:, None, :],
                 ],
                 dim=1,
             )
-            model_kwargs = self.language._update_model_kwargs_for_generation(
                 outputs,
                 model_kwargs,
-                is_encoder_decoder=self.language.config.is_encoder_decoder,
             )
             unfinished_sequences = unfinished_sequences.mul(
                 (sum(next_tokens != i for i in eos_token_id)).long()
@@ -360,3 +459,107 @@ class Blip2ChatGLM(PreTrainedModel):
             if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
                 break
             yield input_ids

 import copy
+import os
 from typing import Callable, List, Optional, Tuple, Union
 import torch
+from torch.nn import CrossEntropyLoss
 import warnings
 from torch import Tensor, nn
     PreTrainedModel,
     Blip2VisionModel,
     Blip2QFormerModel,
+    Blip2Model,
+    Blip2PreTrainedModel,
+    Blip2ForConditionalGeneration,
     GenerationConfig,
 )
+from transformers.models.blip_2.modeling_blip_2 import (
+    Blip2ForConditionalGenerationModelOutput,
+)
 from transformers.utils import logging
 from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList
 logger = logging.get_logger(__name__)
+class Blip2ChatGLMForConditionalGeneration(Blip2ForConditionalGeneration):
+    config_class = Blip2ChatGLMConfig
     def __init__(self, config: Blip2ChatGLMConfig):
+        """Build the vision encoder, the projection layer and the ChatGLM language model from ``config``."""
+        # NOTE: only Blip2PreTrainedModel.__init__ is invoked here.
+        # Calling super().__init__() directly raises an error because ChatGLM
+        # cannot be found by AutoModel.
+        Blip2PreTrainedModel.__init__(self, config)
         self.vision_model = Blip2VisionModel(config.vision_config)
+        # Linear map from the Q-Former hidden size to the language model hidden size.
         self.language_projection = nn.Linear(
             config.qformer_config.hidden_size, config.text_config.hidden_size
         )
+        # The language model is instantiated directly from the text sub-config.
+        self.language_model = ChatGLMForConditionalGeneration(config.text_config)
+        # Initialize weights and apply final processing
+        # NOTE(review): post_init() is intentionally left disabled in this commit — confirm why.
+        # self.post_init()
+    def setup_dtype(self, vision_encoder_dtype: str = "fp32", lm_dtype: str = "fp16"):
+        if vision_encoder_dtype == "fp32":
+            self.vision_model = self.vision_model.float()
+        elif vision_encoder_dtype == "fp16":
+            self.vision_model = self.vision_model.half()
+        else:
+            raise NotImplementedError(
+                f"Unsupported vision_encoder_dtype: {vision_encoder_dtype}"
+            )
+        if lm_dtype == "fp32":
+            self.language_model = self.language_model.float()
+        elif lm_dtype == "fp16":
+            self.language_model = self.language_model.half()
+        elif lm_dtype == "int4":
+            self.language_model = self.language_model.half().quantize(4)
+        elif lm_dtype == "int8":
+            self.language_model = self.language_model.half().quantize(8)
+        else:
+            raise NotImplementedError(f"Unsupported lm_dtype: {lm_dtype}")
     def forward(
         self,
         pixel_values: torch.FloatTensor,
+        input_ids: torch.LongTensor,
+        image_slot_offset: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
         return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]:
+        """Run vision encoder, Q-Former and language model, optionally computing a causal LM loss.
+        The projected Q-Former outputs overwrite ``num_query_tokens`` consecutive
+        positions of the token embeddings before they are passed to the language model.
+        Args:
+            pixel_values (torch.FloatTensor): images, forwarded to ``self.vision_model``.
+            input_ids (torch.LongTensor): token ids looked up in the language model's
+                input embeddings. The positions that receive image embeddings
+                (``input_ids[:, :num_query_tokens]`` in the prefix case) should be
+                filled with ``tokenizer.unk_token_id``.
+            image_slot_offset (Optional[torch.LongTensor], optional): per-sample start
+                index of the image slot. If not set, all image tokens are placed as a
+                prefix (equivalent to ``torch.zeros(bsz)``). Defaults to None.
+            attention_mask (Optional[torch.LongTensor], optional): forwarded unchanged
+                to the language model. Defaults to None.
+            output_attentions (Optional[bool], optional): forwarded to the vision model
+                and the language model. Defaults to None.
+            output_hidden_states (Optional[bool], optional): forwarded to the vision
+                model and the language model. Defaults to None.
+            labels (Optional[torch.LongTensor], optional): if given, the logits are
+                cropped to the last ``labels.size(1)`` positions, shifted by one, and a
+                mean cross-entropy loss is computed. Defaults to None.
+            return_dict (Optional[bool], optional): falls back to
+                ``self.config.use_return_dict`` when None. Defaults to None.
+        Returns:
+            Union[Tuple, Blip2ForConditionalGenerationModelOutput]: with ``return_dict``,
+            an output object holding loss, logits, vision outputs, Q-Former outputs and
+            language model outputs; otherwise the tuple
+            ``(logits, vision_outputs, query_outputs, outputs)``, prefixed with the loss
+            when it was computed. The returned logits are the cropped ones when
+            ``labels`` is given.
+        """
         return_dict = (
             return_dict if return_dict is not None else self.config.use_return_dict
         )
         # step 1: forward the images through the vision encoder,
         # to get image embeddings of shape (batch_size, seq_len, hidden_size)
+        vision_outputs = self.vision_model(
             pixel_values=pixel_values,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
         )
         query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+        query_outputs = self.qformer(
             query_embeds=query_tokens,
             encoder_hidden_states=image_embeds,
             encoder_attention_mask=image_attention_mask,
         query_output = query_outputs[0]
         # step 3: use the language model, conditioned on the query outputs and the prompt
+        language_model_inputs = self.language_projection(query_output)
+        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
+        if image_slot_offset is None:
+            # image as prefix
+            # write through .data to avoid an in-place operation on a leaf Variable
+            # NOTE(review): writes through .data are presumably not tracked by autograd,
+            # so gradients may not reach the projection through this path — confirm intent.
+            inputs_embeds.data[:, : self.config.num_query_tokens, :] = language_model_inputs
+        else:
+            # one slot per sample, starting at that sample's offset
+            for i, offset in enumerate(image_slot_offset):
+                inputs_embeds.data[i, offset : offset + self.config.num_query_tokens, :] = (
+                    language_model_inputs[i]
+                )
+        # NOTE(review): both input_ids and inputs_embeds are passed; how the language
+        # model resolves the two is not visible here — confirm.
+        outputs = self.language_model(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        logits = outputs.logits if return_dict else outputs[0]
+        loss = None
+        # we compute the loss here since we need to take into account the sequence length of the query embeds
+        if labels is not None:
+            # keep only the trailing positions that line up with the labels
+            logits = logits[:, -labels.size(1) :, :]
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous().to(logits.device)
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss(reduction="mean")
+            loss = loss_fct(
+                shift_logits.view(-1, self.config.text_config.vocab_size),
+                shift_labels.view(-1),
+            )
+        if not return_dict:
+            output = (logits, vision_outputs, query_outputs, outputs)
+            return ((loss,) + output) if loss is not None else output
+        return Blip2ForConditionalGenerationModelOutput(
+            loss=loss,
+            logits=logits,
+            vision_outputs=vision_outputs,
+            qformer_outputs=query_outputs,
+            language_model_outputs=outputs,
+        )
     @torch.no_grad()
     def stream_chat(
         do_sample=True,
         temperature=1,
     ):
+        device = self.device
         # 1. Prepare token ids
         images = []
         image_slots = []
+        nvtokens = self.config.num_query_tokens
         if history:
             input_ids = tokenizer(
                 f"[Round {len(history)}]\n问：", add_special_tokens=False
         # 2. Prepare image embeddings
         if len(images) != 0:
             image = torch.cat(list(images), dim=0)
+            vision_outputs = self.vision_model.forward(image)
             image_embeds = vision_outputs[0]
             image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
                 device
             )
+            query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+            query_outputs = self.qformer.forward(
                 query_embeds=query_tokens,
                 encoder_hidden_states=image_embeds,
                 encoder_attention_mask=image_atts,
             )
             query_output = query_outputs[0]
+            vtokens = self.language_projection(query_output)
         else:
             vtokens = []
         # 3. Place image embeddings into slots
         input_ids = torch.as_tensor(input_ids, dtype=torch.long).to(device).unsqueeze(0)
+        inputs_embeds = self.language_model.transformer.word_embeddings(input_ids)
         for slot, vimg in zip(image_slots, vtokens):
             inputs_embeds[0][-slot : -slot + nvtokens, :] = vimg
             "logits_processor": logits_processor,
         }
+        for outputs in self.stream_generate(
             input_ids=input_ids, inputs_embeds=inputs_embeds, **gen_kwargs
         ):
             outputs = outputs.tolist()[0][len(input_ids[0]) :]
             response = tokenizer.decode(outputs)
+            response = self.language_model.process_response(response)
+            yield response
     @torch.no_grad()
+    def stream_generate(
         self,
         input_ids,
         inputs_embeds,
         ] = None,
         **kwargs,
     ):
+        """slightly modified from chatglm implementation to support inputs_embeds
+        Args:
+            input_ids (_type_): _description_
+            inputs_embeds (_type_): _description_
+            generation_config (Optional[GenerationConfig], optional): _description_. Defaults to None.
+            logits_processor (Optional[LogitsProcessorList], optional): _description_. Defaults to None.
+            stopping_criteria (Optional[StoppingCriteriaList], optional): _description_. Defaults to None.
+            prefix_allowed_tokens_fn (Optional[ Callable[[int, torch.Tensor], List[int]] ], optional): _description_. Defaults to None.
+        Yields:
+            _type_: _description_
+        """
         batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
         if generation_config is None:
+            generation_config = self.language_model.generation_config
         generation_config = copy.deepcopy(generation_config)
         model_kwargs = generation_config.update(**kwargs)
         bos_token_id, eos_token_id = (
         if input_ids_seq_length >= generation_config.max_length:
             input_ids_string = (
                 "decoder_input_ids"
+                if self.language_model.config.is_encoder_decoder
                 else "input_ids"
             )
             logger.warning(
             else StoppingCriteriaList()
         )
+        logits_processor = self.language_model._get_logits_processor(
             generation_config=generation_config,
             input_ids_seq_length=input_ids_seq_length,
             encoder_input_ids=input_ids,
             logits_processor=logits_processor,
         )
+        stopping_criteria = self.language_model._get_stopping_criteria(
             generation_config=generation_config, stopping_criteria=stopping_criteria
         )
+        logits_warper = self.language_model._get_logits_warper(generation_config)
         unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
         scores = None
         while True:
+            model_inputs = self.prepare_inputs_for_generation(
                 input_ids, inputs_embeds=inputs_embeds, **model_kwargs
             )
             # forward pass to get next token
+            outputs = self.language_model(
                 **model_inputs,
                 return_dict=True,
                 output_attentions=False,
             inputs_embeds = torch.cat(
                 [
                     inputs_embeds,
+                    self.language_model.get_input_embeddings()(next_tokens)[:, None, :],
                 ],
                 dim=1,
             )
+            model_kwargs = self.language_model._update_model_kwargs_for_generation(
                 outputs,
                 model_kwargs,
+                is_encoder_decoder=self.language_model.config.is_encoder_decoder,
             )
             unfinished_sequences = unfinished_sequences.mul(
                 (sum(next_tokens != i for i in eos_token_id)).long()
             if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
                 break
             yield input_ids
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past: Optional[torch.Tensor] = None,
+        past_key_values: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> dict:
+        """slightly modified from chatglm implementation to support inputs_embeds
+        Args:
+            input_ids (torch.LongTensor): _description_
+            inputs_embeds (Optional[torch.Tensor], optional): _description_. Defaults to None.
+            past (Optional[torch.Tensor], optional): _description_. Defaults to None.
+            past_key_values (Optional[torch.Tensor], optional): _description_. Defaults to None.
+            attention_mask (Optional[torch.Tensor], optional): _description_. Defaults to None.
+            position_ids (Optional[torch.Tensor], optional): _description_. Defaults to None.
+        Returns:
+            dict: _description_
+        """
+        batch_size, seq_length = input_ids.shape
+        MASK, gMASK = self.language_model.config.mask_token_id, self.language_model.config.gmask_token_id
+        seqs = input_ids.tolist()
+        mask_positions, use_gmasks = [], []
+        for seq in seqs:
+            mask_token = gMASK if gMASK in seq else MASK
+            use_gmask = mask_token == gMASK
+            mask_positions.append(seq.index(mask_token))
+            use_gmasks.append(use_gmask)
+        # only last token for input_ids if past is not None
+        if past is not None or past_key_values is not None:
+            last_token = input_ids[:, -1].unsqueeze(-1)
+            if attention_mask is not None and attention_mask.dtype == torch.bool:
+                attention_mask = attention_mask[:, :, -1:]
+            else:
+                attention_mask = None
+            if position_ids is not None:
+                position_ids = position_ids[..., -1:]
+            else:
+                context_lengths = [seq.index(self.language_model.config.bos_token_id) for seq in seqs]
+                if self.language_model.position_encoding_2d:
+                    position_ids = torch.tensor(
+                        [
+                            [mask_position, seq_length - context_length]
+                            for mask_position, context_length in zip(
+                                mask_positions, context_lengths
+                            )
+                        ],
+                        dtype=torch.long,
+                        device=input_ids.device,
+                    ).unsqueeze(-1)
+                else:
+                    position_ids = torch.tensor(
+                        [mask_position for mask_position in mask_positions],
+                        dtype=torch.long,
+                        device=input_ids.device,
+                    ).unsqueeze(-1)
+            if past is None:
+                past = past_key_values
+            return {
+                "input_ids": last_token,
+                "past_key_values": past,
+                "position_ids": position_ids,
+                "attention_mask": attention_mask,
+            }
+        else:
+            if attention_mask is not None and attention_mask.dtype != torch.bool:
+                logger.warning_once(
+                    f"The dtype of attention mask ({attention_mask.dtype}) is not bool"
+                )
+                attention_mask = None
+            if attention_mask is None:
+                attention_mask = self.language_model.get_masks(input_ids, device=input_ids.device)
+            if position_ids is None:
+                position_ids = self.language_model.get_position_ids(
+                    input_ids,
+                    device=input_ids.device,
+                    mask_positions=mask_positions,
+                    use_gmasks=use_gmasks,
+                )
+            if inputs_embeds is not None:
+                assert input_ids.size(1) == inputs_embeds.size(
+                    1
+                ), f"Make sure that both input_ids ({input_ids.size(1)}) and inputs_embeds ({inputs_embeds.size(1)}) have the same length."
+                return {
+                    "inputs_embeds": inputs_embeds,
+                    "past_key_values": past,
+                    "position_ids": position_ids,
+                    "attention_mask": attention_mask,
+                }
+            else:
+                return {
+                    "input_ids": input_ids,
+                    "past_key_values": past,
+                    "position_ids": position_ids,
+                    "attention_mask": attention_mask,
+                }

modeling_chatglm.py CHANGED Viewed

@@ -55,7 +55,7 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         if torch.isnan(scores).any() or torch.isinf(scores).any():
             scores.zero_()
-            scores[..., 20005] = 5e4
         return scores
@@ -280,10 +280,8 @@ def attention_fn(
     # [sk, b, np, hn] -> [sk, b * np, hn]
     key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
-    matmul_result = torch.empty(
-        output_size[0] * output_size[1],
-        output_size[2],
-        output_size[3],
         dtype=query_layer.dtype,
         device=query_layer.device,
     )
@@ -348,10 +346,18 @@ def attention_fn(
     return outputs
 class SelfAttention(torch.nn.Module):
     def __init__(self, hidden_size, num_attention_heads,
                  layer_id, hidden_size_per_attention_head=None, bias=True,
-                 params_dtype=torch.float, position_encoding_2d=True):
         super(SelfAttention, self).__init__()
         self.layer_id = layer_id
@@ -379,7 +385,7 @@ class SelfAttention(torch.nn.Module):
         self.inner_hidden_size = num_attention_heads * self.hidden_size_per_attention_head
         # Strided linear layer.
-        self.query_key_value = skip_init(
             torch.nn.Linear,
             hidden_size,
             3 * self.inner_hidden_size,
@@ -387,7 +393,7 @@ class SelfAttention(torch.nn.Module):
             dtype=params_dtype,
         )
-        self.dense = skip_init(
             torch.nn.Linear,
             self.inner_hidden_size,
             hidden_size,
@@ -500,8 +506,12 @@ class GEGLU(torch.nn.Module):
 class GLU(torch.nn.Module):
     def __init__(self, hidden_size, inner_hidden_size=None,
-                 layer_id=None, bias=True, activation_func=gelu, params_dtype=torch.float):
         super(GLU, self).__init__()
         self.layer_id = layer_id
         self.activation_func = activation_func
@@ -510,7 +520,7 @@ class GLU(torch.nn.Module):
         if inner_hidden_size is None:
             inner_hidden_size = 4 * hidden_size
         self.inner_hidden_size = inner_hidden_size
-        self.dense_h_to_4h = skip_init(
             torch.nn.Linear,
             self.hidden_size,
             self.inner_hidden_size,
@@ -518,7 +528,7 @@ class GLU(torch.nn.Module):
             dtype=params_dtype,
         )
         # Project back to h.
-        self.dense_4h_to_h = skip_init(
             torch.nn.Linear,
             self.inner_hidden_size,
             self.hidden_size,
@@ -554,7 +564,8 @@ class GLMBlock(torch.nn.Module):
             use_bias=True,
             params_dtype=torch.float,
             num_layers=28,
-            position_encoding_2d=True
     ):
         super(GLMBlock, self).__init__()
         # Set output layer initialization if not provided.
@@ -574,7 +585,8 @@ class GLMBlock(torch.nn.Module):
             hidden_size_per_attention_head=hidden_size_per_attention_head,
             bias=use_bias,
             params_dtype=params_dtype,
-            position_encoding_2d=self.position_encoding_2d
         )
         # Layernorm on the input data.
@@ -589,6 +601,7 @@ class GLMBlock(torch.nn.Module):
             bias=use_bias,
             layer_id=layer_id,
             params_dtype=params_dtype,
         )
     def forward(
@@ -676,8 +689,10 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
         return attention_mask
-    def get_position_ids(self, input_ids, mask_positions, device, gmask=False):
         batch_size, seq_length = input_ids.shape
         context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
         if self.position_encoding_2d:
             position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
@@ -691,8 +706,8 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
             position_ids = torch.stack((position_ids, block_position_ids), dim=1)
         else:
             position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
-            if not gmask:
-                for i, context_length in enumerate(context_lengths):
                     position_ids[context_length:] = mask_positions[i]
         return position_ids
@@ -783,9 +798,12 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
     `encoder_hidden_states` is then expected as an input to the forward pass.
     """
-    def __init__(self, config: ChatGLMConfig):
         super().__init__(config)
         # recording parameters
         self.max_sequence_length = config.max_sequence_length
         self.hidden_size = config.hidden_size
@@ -800,7 +818,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         self.pre_seq_len = config.pre_seq_len
         self.prefix_projection = config.prefix_projection
-        self.word_embeddings = skip_init(
             torch.nn.Embedding,
             num_embeddings=self.vocab_size, embedding_dim=self.hidden_size,
             dtype=self.params_dtype
@@ -819,6 +837,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
                 use_bias=True,
                 params_dtype=self.params_dtype,
                 position_encoding_2d=self.position_encoding_2d,
             )
         self.layers = torch.nn.ModuleList(
@@ -894,12 +913,18 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
                 )
                 use_cache = False
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
             batch_size, seq_length = input_ids.shape[:2]
         elif inputs_embeds is not None:
-            # NOTE: fix
             batch_size, seq_length = inputs_embeds.shape[:2]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")
@@ -923,15 +948,20 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             if position_ids is None:
                 MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
-                mask_token = gMASK if gMASK in input_ids else MASK
-                use_gmask = True if gMASK in input_ids else False
-                mask_positions = [seq.tolist().index(mask_token) for seq in input_ids]
                 position_ids = self.get_position_ids(
                     input_ids,
                     mask_positions=mask_positions,
                     device=input_ids.device,
-                    gmask=use_gmask
                 )
         if self.pre_seq_len is not None and attention_mask is not None:
@@ -950,10 +980,10 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         if attention_mask is None:
             attention_mask = torch.zeros(1, 1, device=input_ids.device).bool()
-        else:
-            pass
-            # NOTE: this is a hack to make the code work with the LAVIS training
-            # attention_mask = attention_mask.to(input_ids.device)
         for i, layer in enumerate(self.layers):
@@ -1009,8 +1039,12 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
 class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
-    def __init__(self, config: ChatGLMConfig):
         super().__init__(config)
         # self.hidden_size = config.hidden_size
         # self.params_dtype = torch.half
@@ -1019,9 +1053,9 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         self.position_encoding_2d = config.position_encoding_2d
-        self.transformer = ChatGLMModel(config)
-        self.lm_head = skip_init(
             nn.Linear,
             config.hidden_size,
             config.vocab_size,
@@ -1080,7 +1114,6 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
     def prepare_inputs_for_generation(
             self,
             input_ids: torch.LongTensor,
-            inputs_embeds: Optional[torch.Tensor] = None,
             past: Optional[torch.Tensor] = None,
             past_key_values: Optional[torch.Tensor] = None,
             attention_mask: Optional[torch.Tensor] = None,
@@ -1089,10 +1122,13 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
     ) -> dict:
         batch_size, seq_length = input_ids.shape
         MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
-        mask_token = gMASK if gMASK in input_ids else MASK
-        use_gmask = True if gMASK in input_ids else False
         seqs = input_ids.tolist()
-        mask_positions = [seq.index(mask_token) for seq in seqs]
         # only last token for input_ids if past is not None
         if past is not None or past_key_values is not None:
@@ -1135,23 +1171,15 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
                     input_ids,
                     device=input_ids.device,
                     mask_positions=mask_positions,
-                    gmask=use_gmask
                 )
-            if inputs_embeds is not None:
-                assert input_ids.size(1) == inputs_embeds.size(1), f"Make sure that both input_ids ({input_ids.size(1)}) and inputs_embeds ({inputs_embeds.size(1)}) have the same length."
-                return {
-                    "inputs_embeds": inputs_embeds,
-                    "past_key_values": past,
-                    "position_ids": position_ids,
-                    "attention_mask": attention_mask
-                }
-            else:
-                return {
-                    "input_ids": input_ids,
-                    "past_key_values": past,
-                    "position_ids": position_ids,
-                    "attention_mask": attention_mask
-                }
     def forward(
             self,

     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         if torch.isnan(scores).any() or torch.isinf(scores).any():
             scores.zero_()
+            scores[..., 5] = 5e4
         return scores
     # [sk, b, np, hn] -> [sk, b * np, hn]
     key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
+    matmul_result = torch.zeros(
+        1, 1, 1,
         dtype=query_layer.dtype,
         device=query_layer.device,
     )
     return outputs
+def default_init(cls, *args, **kwargs):
+    return cls(*args, **kwargs)
 class SelfAttention(torch.nn.Module):
     def __init__(self, hidden_size, num_attention_heads,
                  layer_id, hidden_size_per_attention_head=None, bias=True,
+                 params_dtype=torch.float, position_encoding_2d=True, empty_init=True):
+        if empty_init:
+            init_method = skip_init
+        else:
+            init_method = default_init
         super(SelfAttention, self).__init__()
         self.layer_id = layer_id
         self.inner_hidden_size = num_attention_heads * self.hidden_size_per_attention_head
         # Strided linear layer.
+        self.query_key_value = init_method(
             torch.nn.Linear,
             hidden_size,
             3 * self.inner_hidden_size,
             dtype=params_dtype,
         )
+        self.dense = init_method(
             torch.nn.Linear,
             self.inner_hidden_size,
             hidden_size,
 class GLU(torch.nn.Module):
     def __init__(self, hidden_size, inner_hidden_size=None,
+                 layer_id=None, bias=True, activation_func=gelu, params_dtype=torch.float, empty_init=True):
         super(GLU, self).__init__()
+        if empty_init:
+            init_method = skip_init
+        else:
+            init_method = default_init
         self.layer_id = layer_id
         self.activation_func = activation_func
         if inner_hidden_size is None:
             inner_hidden_size = 4 * hidden_size
         self.inner_hidden_size = inner_hidden_size
+        self.dense_h_to_4h = init_method(
             torch.nn.Linear,
             self.hidden_size,
             self.inner_hidden_size,
             dtype=params_dtype,
         )
         # Project back to h.
+        self.dense_4h_to_h = init_method(
             torch.nn.Linear,
             self.inner_hidden_size,
             self.hidden_size,
             use_bias=True,
             params_dtype=torch.float,
             num_layers=28,
+            position_encoding_2d=True,
+            empty_init=True
     ):
         super(GLMBlock, self).__init__()
         # Set output layer initialization if not provided.
             hidden_size_per_attention_head=hidden_size_per_attention_head,
             bias=use_bias,
             params_dtype=params_dtype,
+            position_encoding_2d=self.position_encoding_2d,
+            empty_init=empty_init
         )
         # Layernorm on the input data.
             bias=use_bias,
             layer_id=layer_id,
             params_dtype=params_dtype,
+            empty_init=empty_init
         )
     def forward(
         return attention_mask
+    def get_position_ids(self, input_ids, mask_positions, device, use_gmasks=None):
         batch_size, seq_length = input_ids.shape
+        if use_gmasks is None:
+            use_gmasks = [False] * batch_size
         context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
         if self.position_encoding_2d:
             position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
             position_ids = torch.stack((position_ids, block_position_ids), dim=1)
         else:
             position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
+            for i, context_length in enumerate(context_lengths):
+                if not use_gmasks[i]:
                     position_ids[context_length:] = mask_positions[i]
         return position_ids
     `encoder_hidden_states` is then expected as an input to the forward pass.
     """
+    def __init__(self, config: ChatGLMConfig, empty_init=True):
         super().__init__(config)
+        if empty_init:
+            init_method = skip_init
+        else:
+            init_method = default_init
         # recording parameters
         self.max_sequence_length = config.max_sequence_length
         self.hidden_size = config.hidden_size
         self.pre_seq_len = config.pre_seq_len
         self.prefix_projection = config.prefix_projection
+        self.word_embeddings = init_method(
             torch.nn.Embedding,
             num_embeddings=self.vocab_size, embedding_dim=self.hidden_size,
             dtype=self.params_dtype
                 use_bias=True,
                 params_dtype=self.params_dtype,
                 position_encoding_2d=self.position_encoding_2d,
+                empty_init=empty_init
             )
         self.layers = torch.nn.ModuleList(
                 )
                 use_cache = False
+        # if input_ids is not None and inputs_embeds is not None:
+        #     raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        # elif input_ids is not None:
+        #     batch_size, seq_length = input_ids.shape[:2]
+        # elif inputs_embeds is not None:
+        #     batch_size, seq_length. _ = inputs_embeds.shape[:2]
+        # else:
+        #     raise ValueError("You have to specify either input_ids or inputs_embeds")
+        if input_ids is not None:
             batch_size, seq_length = input_ids.shape[:2]
         elif inputs_embeds is not None:
             batch_size, seq_length = inputs_embeds.shape[:2]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")
             if position_ids is None:
                 MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
+                seqs = input_ids.tolist()
+                mask_positions, use_gmasks = [], []
+                for seq in seqs:
+                    mask_token = gMASK if gMASK in seq else MASK
+                    use_gmask = mask_token == gMASK
+                    mask_positions.append(seq.index(mask_token))
+                    use_gmasks.append(use_gmask)
                 position_ids = self.get_position_ids(
                     input_ids,
                     mask_positions=mask_positions,
                     device=input_ids.device,
+                    use_gmasks=use_gmasks
                 )
         if self.pre_seq_len is not None and attention_mask is not None:
         if attention_mask is None:
             attention_mask = torch.zeros(1, 1, device=input_ids.device).bool()
+        # NOTE: this is a hack to make the code work with the LAVIS training
+        # else:
+        #     pass
+        #     attention_mask = attention_mask.to(input_ids.device)
         for i, layer in enumerate(self.layers):
 class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
+    def __init__(self, config: ChatGLMConfig, empty_init=True):
         super().__init__(config)
+        if empty_init:
+            init_method = skip_init
+        else:
+            init_method = default_init
         # self.hidden_size = config.hidden_size
         # self.params_dtype = torch.half
         self.position_encoding_2d = config.position_encoding_2d
+        self.transformer = ChatGLMModel(config, empty_init=empty_init)
+        self.lm_head = init_method(
             nn.Linear,
             config.hidden_size,
             config.vocab_size,
     def prepare_inputs_for_generation(
             self,
             input_ids: torch.LongTensor,
             past: Optional[torch.Tensor] = None,
             past_key_values: Optional[torch.Tensor] = None,
             attention_mask: Optional[torch.Tensor] = None,
     ) -> dict:
         batch_size, seq_length = input_ids.shape
         MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
         seqs = input_ids.tolist()
+        mask_positions, use_gmasks = [], []
+        for seq in seqs:
+            mask_token = gMASK if gMASK in seq else MASK
+            use_gmask = mask_token == gMASK
+            mask_positions.append(seq.index(mask_token))
+            use_gmasks.append(use_gmask)
         # only last token for input_ids if past is not None
         if past is not None or past_key_values is not None:
                     input_ids,
                     device=input_ids.device,
                     mask_positions=mask_positions,
+                    use_gmasks=use_gmasks
                 )
+            return {
+                "input_ids": input_ids,
+                "past_key_values": past,
+                "position_ids": position_ids,
+                "attention_mask": attention_mask
+            }
     def forward(
             self,

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "BlipImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "processor_class": "Blip2Processor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 224,
+    "width": 224
+  }
+}

pytorch_model-00001-of-00009.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:81ec9cc9a7e6034300a115898aac9fda06c69cf15d1b3c470d633ae7ce0ad3c9
+size 1995030990

pytorch_model-00002-of-00009.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de3166cf720b1a7cf6be0872f773d1f5e587e109435540f6511c37391827f1d6
+size 1983142386

pytorch_model-00003-of-00009.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a21fec7efe30123a73cd4bc77a4f8bf58c26e808743d47c606950215afd5e6c
+size 1913134013

pytorch_model-00004-of-00009.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b31bbd7aa605cde4258795220732aa24505ef451bf7e86a434c23c7fb75207e3
+size 1879578439

pytorch_model-00005-of-00009.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d45c57fd01a8e6b10a5d31d01af88580212e991705c5308ffcfe76bce8eb9df1
+size 1879571453

pytorch_model-00006-of-00009.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d1032cc31e7f2cda475a12f3a4016934c7d1c82c35b1cec93e159f0bbbc428c
+size 1980242201

pytorch_model-00007-of-00009.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e0088535e5adf2f7b2cc2064aff91ffb979fb895a8cb2e2eee14e97a358c192a
+size 1913134077

pytorch_model-00008-of-00009.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a45b9e99ae15bad8d1722abd5f6e441c6cb4fe87bfa32f6c01e5b0a58409ec5d
+size 1208293115

pytorch_model-00009-of-00009.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6323bcad07ce5cc7934323c438abe9a8f45029553cd29098fe22314b14edb9a
+size 1069286314

pytorch_model.bin.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "bos_token": "<sop>",
+  "eos_token": "<eop>",
+  "mask_token": "[MASK]",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

tokenization_chatglm.py ADDED Viewed

	@@ -0,0 +1,433 @@

+"""Tokenization classes for ChatGLM."""
+from typing import List, Optional, Union
+import os
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+from typing import Dict
+import sentencepiece as spm
+import numpy as np
+logger = logging.get_logger(__name__)
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "THUDM/chatglm-6b": 2048,
+}
+class TextTokenizer:
+    def __init__(self, model_path):
+        self.sp = spm.SentencePieceProcessor()
+        self.sp.Load(model_path)
+        self.num_tokens = self.sp.vocab_size()
+    def encode(self, text):
+        return self.sp.EncodeAsIds(text)
+    def decode(self, ids: List[int]):
+        return self.sp.DecodeIds(ids)
+    def tokenize(self, text):
+        return self.sp.EncodeAsPieces(text)
+    def convert_tokens_to_ids(self, tokens):
+        return [self.sp.PieceToId(token) for token in tokens]
+    def convert_token_to_id(self, token):
+        return self.sp.PieceToId(token)
+    def convert_id_to_token(self, idx):
+        return self.sp.IdToPiece(idx)
+    def __len__(self):
+        return self.num_tokens
+class SPTokenizer:
+    def __init__(
+            self,
+            vocab_file,
+            num_image_tokens=20000,
+            max_blank_length=80,
+            byte_fallback=True,
+    ):
+        assert vocab_file is not None
+        self.vocab_file = vocab_file
+        self.num_image_tokens = num_image_tokens
+        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
+        self.max_blank_length = max_blank_length
+        self.byte_fallback = byte_fallback
+        self.text_tokenizer = TextTokenizer(vocab_file)
+    def _get_text_tokenizer(self):
+        return self.text_tokenizer
+    @staticmethod
+    def get_blank_token(length: int):
+        assert length >= 2
+        return f"<|blank_{length}|>"
+    @staticmethod
+    def get_tab_token():
+        return f"<|tab|>"
+    @property
+    def num_text_tokens(self):
+        return self.text_tokenizer.num_tokens
+    @property
+    def num_tokens(self):
+        return self.num_image_tokens + self.num_text_tokens
+    @staticmethod
+    def _encode_whitespaces(text: str, max_len: int = 80):
+        text = text.replace("\t", SPTokenizer.get_tab_token())
+        for i in range(max_len, 1, -1):
+            text = text.replace(" " * i, SPTokenizer.get_blank_token(i))
+        return text
+    def _preprocess(self, text: str, linebreak=True, whitespaces=True):
+        if linebreak:
+            text = text.replace("\n", "<n>")
+        if whitespaces:
+            text = self._encode_whitespaces(text, max_len=self.max_blank_length)
+        return text
+    def encode(
+            self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
+    ) -> List[int]:
+        """
+        @param text: Text to encode.
+        @param linebreak: Whether to encode newline (\n) in text.
+        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+        @param special_tokens: (stale entry: this method takes no `special_tokens` argument; TODO remove or implement.)
+        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
+        """
+        text = self._preprocess(text, linebreak, whitespaces)
+        if not add_dummy_prefix:
+            text = "<n>" + text
+        tmp = self._get_text_tokenizer().encode(text)
+        tokens = [x + self.num_image_tokens for x in tmp]
+        return tokens if add_dummy_prefix else tokens[2:]
+    def decode(self, text_ids: List[int]) -> str:
+        ids = [int(_id) - self.num_image_tokens for _id in text_ids]
+        ids = [_id for _id in ids if _id >= 0]
+        text = self._get_text_tokenizer().decode(ids)
+        text = text.replace("<n>", "\n")
+        text = text.replace(SPTokenizer.get_tab_token(), "\t")
+        for i in range(2, self.max_blank_length + 1):
+            text = text.replace(self.get_blank_token(i), " " * i)
+        return text
+    def tokenize(
+            self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
+    ) -> List[str]:
+        """
+        @param text: Text to encode.
+        @param linebreak: Whether to encode newline (\n) in text.
+        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+        @param special_tokens: (stale entry: this method takes no `special_tokens` argument; TODO remove or implement.)
+        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
+        """
+        text = self._preprocess(text, linebreak, whitespaces)
+        if not add_dummy_prefix:
+            text = "<n>" + text
+        tokens = self._get_text_tokenizer().tokenize(text)
+        return tokens if add_dummy_prefix else tokens[2:]
+    def __getitem__(self, x: Union[int, str]):
+        if isinstance(x, int):
+            if x < self.num_image_tokens:
+                return "<image_{}>".format(x)
+            else:
+                return self.text_tokenizer.convert_id_to_token(x - self.num_image_tokens)
+        elif isinstance(x, str):
+            if x.startswith("<image_") and x.endswith(">") and x[7:-1].isdigit():
+                return int(x[7:-1])
+            else:
+                return self.text_tokenizer.convert_token_to_id(x) + self.num_image_tokens
+        else:
+            raise ValueError("The key should be str or int.")
+class ChatGLMTokenizer(PreTrainedTokenizer):
+    """
+    Construct a ChatGLM tokenizer. Backed by a SentencePiece model (`ice_text.model`) loaded through `SPTokenizer`.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+    """
+    vocab_files_names = {"vocab_file": "ice_text.model"}
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+    def __init__(
+            self,
+            vocab_file,
+            do_lower_case=False,
+            remove_space=False,
+            bos_token='<sop>',
+            eos_token='<eop>',
+            end_token='</s>',
+            mask_token='[MASK]',
+            gmask_token='[gMASK]',
+            padding_side="left",
+            num_image_tokens=20000,
+            **kwargs
+    ) -> None:
+        super().__init__(
+            do_lower_case=do_lower_case,
+            remove_space=remove_space,
+            padding_side=padding_side,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            end_token=end_token,
+            mask_token=mask_token,
+            gmask_token=gmask_token,
+            num_image_tokens=num_image_tokens,
+            **kwargs
+        )
+        self.do_lower_case = do_lower_case
+        self.remove_space = remove_space
+        self.vocab_file = vocab_file
+        self.bos_token = bos_token
+        self.eos_token = eos_token
+        self.end_token = end_token
+        self.mask_token = mask_token
+        self.gmask_token = gmask_token
+        self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)
+        """ Initialisation """
+    @property
+    def gmask_token_id(self) -> Optional[int]:
+        if self.gmask_token is None:
+            return None
+        return self.convert_tokens_to_ids(self.gmask_token)
+    @property
+    def end_token_id(self) -> Optional[int]:
+        """
+        `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
+        set.
+        """
+        if self.end_token is None:
+            return None
+        return self.convert_tokens_to_ids(self.end_token)
+    @property
+    def vocab_size(self):
+        """ Returns vocab size """
+        return self.sp_tokenizer.num_tokens
+    def get_vocab(self):
+        """ Returns vocab as a dict """
+        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def preprocess_text(self, inputs):
+        if self.remove_space:
+            outputs = " ".join(inputs.strip().split())
+        else:
+            outputs = inputs
+        if self.do_lower_case:
+            outputs = outputs.lower()
+        return outputs
+    def _tokenize(self, text, **kwargs):
+        """ Returns a tokenized string. """
+        text = self.preprocess_text(text)
+        seq = self.sp_tokenizer.tokenize(text)
+        return seq
+    def _decode(
+            self,
+            token_ids: Union[int, List[int]],
+            skip_special_tokens: bool = False,
+            clean_up_tokenization_spaces: bool = True,
+            **kwargs
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if len(token_ids) == 0:
+            return ""
+        if self.pad_token_id in token_ids:  # remove pad
+            token_ids = list(filter((self.pad_token_id).__ne__, token_ids))
+        return self.sp_tokenizer.decode(token_ids)
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        return self.sp_tokenizer[token]
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_tokenizer[index]
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        """
+        Save the vocabulary and special tokens file to a directory.
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+            filename_prefix (`str`, *optional*):
+                An optional prefix to add to the names of the saved files. Currently unused by this implementation.
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, self.vocab_files_names["vocab_file"]
+            )
+        else:
+            vocab_file = save_directory
+        with open(self.vocab_file, 'rb') as fin:
+            proto_str = fin.read()
+        with open(vocab_file, "wb") as writer:
+            writer.write(proto_str)
+        return (vocab_file,)
+    def build_inputs_with_special_tokens(
+            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
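+        Build model inputs from a sequence or a pair of sequences by concatenating and adding special tokens. Both
+        input lists are extended in place. A ChatGLM sequence has the following format (default token strings shown):
+        - single sequence: `X [gMASK] <sop>` when `X` contains neither `[MASK]` nor `[gMASK]`; `X <sop>` when `X`
+          already ends with a mask token; `X </s> <sop>` when `X` contains a mask token that is not its last token
+        - pair of sequences: the single-sequence layout followed by `B <eop>` (`<eop>` is appended when `B` is empty
+          or does not already end with it)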
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        mask_ids = self.sp_tokenizer[self.mask_token]
+        gmask_ids = self.sp_tokenizer[self.gmask_token]
+        eos_id = self.sp_tokenizer[self.eos_token]
+        if mask_ids not in token_ids_0 and gmask_ids not in token_ids_0:
+            token_ids_0 += [gmask_ids]
+        if token_ids_0[-1] != mask_ids and token_ids_0[-1] != gmask_ids:
+            token_ids_0 += [self.sp_tokenizer[self.end_token]]
+        token_ids_0 += [self.sp_tokenizer[self.bos_token]]
+        if token_ids_1 is not None:
+            if not token_ids_1 or token_ids_1[-1] != eos_id:
+                token_ids_1 += [eos_id]
+            token_ids_0 += token_ids_1
+        return token_ids_0
+    def _pad(
+            self,
+            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+            max_length: Optional[int] = None,
+            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+            pad_to_multiple_of: Optional[int] = None,
+            return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length
+                - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
+                The tokenizer padding side is defined in self.padding_side:
+                    - 'left': pads on the left of the sequences
+                    - 'right': not supported here; this method asserts that self.padding_side == "left"
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.0` (Volta).
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        bos_token_id = self.sp_tokenizer[self.bos_token]
+        mask_token_id = self.sp_tokenizer[self.mask_token]
+        gmask_token_id = self.sp_tokenizer[self.gmask_token]
+        assert self.padding_side == "left"
+        required_input = encoded_inputs[self.model_input_names[0]]
+        seq_length = len(required_input)
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+        # Initialize attention mask if not present.
+        if max_length is not None:
+            if "attention_mask" not in encoded_inputs:
+                if bos_token_id in required_input:
+                    context_length = required_input.index(bos_token_id)
+                else:
+                    context_length = seq_length
+                attention_mask = np.ones((1, seq_length, seq_length))
+                attention_mask = np.tril(attention_mask)
+                attention_mask[:, :, :context_length] = 1
+                attention_mask = np.bool_(attention_mask < 0.5)
+                encoded_inputs["attention_mask"] = attention_mask
+            if "position_ids" not in encoded_inputs:
+                position_ids = np.arange(seq_length, dtype=np.int64)
+                mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
+                if mask_token in required_input:
+                    mask_position = required_input.index(mask_token)
+                    position_ids[context_length:] = mask_position
+                block_position_ids = np.concatenate(
+                    [np.zeros(context_length, dtype=np.int64),
+                     np.arange(1, seq_length - context_length + 1, dtype=np.int64)])
+                encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
+                                                          pad_width=[(0, 0), (difference, 0), (difference, 0)],
+                                                          mode='constant', constant_values=True)
+            if "token_type_ids" in encoded_inputs:
+                encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                    "token_type_ids"
+                ]
+            if "special_tokens_mask" in encoded_inputs:
+                encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+            if "position_ids" in encoded_inputs:
+                encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
+                                                        pad_width=[(0, 0), (difference, 0)])
+            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+        return encoded_inputs

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_chatglm.ChatGLMTokenizer",
+      null
+    ]
+  },
+  "bos_token": "<sop>",
+  "do_lower_case": false,
+  "end_token": "</s>",
+  "eos_token": "<eop>",
+  "gmask_token": "[gMASK]",
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "num_image_tokens": 0,
+  "pad_token": "<pad>",
+  "padding_side": "left",
+  "processor_class": "Blip2Processor",
+  "remove_space": false,
+  "special_tokens_map_file": null,
+  "tokenizer_class": "ChatGLMTokenizer",
+  "unk_token": "<unk>"
+}