Files changed (3)
  1. README.md +15 -10
  2. modeling_chatglm.py +7 -97
  3. tokenization_chatglm.py +2 -3
README.md CHANGED

````diff
@@ -74,17 +74,22 @@ For more instructions, including how to run CLI and web demos, and model quantiz
 
 ## 引用
 
-如果你觉得我们的工作有帮助的话,请考虑引用下列论文。
-
-If you find our work helpful, please consider citing the following paper.
+如果你觉得我们的工作有帮助的话,请考虑引用下列论文,ChatGLM2-6B 的论文会在近期公布,敬请期待~
 
 ```
-@misc{glm2024chatglm,
-  title={ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools},
-  author={Team GLM and Aohan Zeng and Bin Xu and Bowen Wang and Chenhui Zhang and Da Yin and Diego Rojas and Guanyu Feng and Hanlin Zhao and Hanyu Lai and Hao Yu and Hongning Wang and Jiadai Sun and Jiajie Zhang and Jiale Cheng and Jiayi Gui and Jie Tang and Jing Zhang and Juanzi Li and Lei Zhao and Lindong Wu and Lucen Zhong and Mingdao Liu and Minlie Huang and Peng Zhang and Qinkai Zheng and Rui Lu and Shuaiqi Duan and Shudan Zhang and Shulin Cao and Shuxun Yang and Weng Lam Tam and Wenyi Zhao and Xiao Liu and Xiao Xia and Xiaohan Zhang and Xiaotao Gu and Xin Lv and Xinghan Liu and Xinyi Liu and Xinyue Yang and Xixuan Song and Xunkai Zhang and Yifan An and Yifan Xu and Yilin Niu and Yuantao Yang and Yueyan Li and Yushi Bai and Yuxiao Dong and Zehan Qi and Zhaoyu Wang and Zhen Yang and Zhengxiao Du and Zhenyu Hou and Zihan Wang},
-  year={2024},
-  eprint={2406.12793},
-  archivePrefix={arXiv},
-  primaryClass={cs.CL}
+@article{zeng2022glm,
+  title={GLM-130B: An open bilingual pre-trained model},
+  author={Zeng, Aohan and Liu, Xiao and Du, Zhengxiao and Wang, Zihan and Lai, Hanyu and Ding, Ming and Yang, Zhuoyi and Xu, Yifan and Zheng, Wendi and Xia, Xiao and others},
+  journal={arXiv preprint arXiv:2210.02414},
+  year={2022}
+}
+```
+```
+@inproceedings{du2022glm,
+  title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling},
+  author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie},
+  booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+  pages={320--335},
+  year={2022}
 }
 ```
````
modeling_chatglm.py CHANGED
```diff
@@ -11,14 +11,12 @@ import torch.utils.checkpoint
 import torch.nn.functional as F
 from torch import nn
 from torch.nn import CrossEntropyLoss, LayerNorm
-from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
 from torch.nn.utils import skip_init
 from typing import Optional, Tuple, Union, List, Callable, Dict, Any
 
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
-    SequenceClassifierOutputWithPast,
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
```
```diff
@@ -865,10 +863,12 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             outputs: ModelOutput,
             model_kwargs: Dict[str, Any],
             is_encoder_decoder: bool = False,
+            standardize_cache_format: bool = False,
     ) -> Dict[str, Any]:
         # update past_key_values
-        cache_name, cache = self._extract_past_from_model_output(outputs)
-        model_kwargs[cache_name] = cache
+        model_kwargs["past_key_values"] = self._extract_past_from_model_output(
+            outputs, standardize_cache_format=standardize_cache_format
+        )
 
         # update attention mask
         if "attention_mask" in model_kwargs:
```
```diff
@@ -895,7 +895,6 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             past_key_values: Optional[torch.Tensor] = None,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.Tensor] = None,
-            use_cache: Optional[bool] = None,
             is_first_forward: bool = True,
             **kwargs
     ) -> dict:
```
```diff
@@ -903,16 +902,14 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         if position_ids is None:
             position_ids = self.get_position_ids(input_ids, device=input_ids.device)
         if not is_first_forward:
-            if past_key_values is not None:
-                position_ids = position_ids[..., -1:]
-                input_ids = input_ids[:, -1:]
+            position_ids = position_ids[..., -1:]
+            input_ids = input_ids[:, -1:]
         return {
             "input_ids": input_ids,
             "past_key_values": past_key_values,
             "position_ids": position_ids,
             "attention_mask": attention_mask,
-            "return_last_logit": True,
-            "use_cache": use_cache
+            "return_last_logit": True
         }
 
     def forward(
```
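With this hunk, `prepare_inputs_for_generation` slices down to the newest token on every non-first pass instead of only when a cache is present, and it stops threading `use_cache` through the prepared inputs (the `generate` hunk below drops the matching `model_kwargs["use_cache"]` assignment). The pattern being served is ordinary KV-cache decoding: the first forward feeds the whole prompt and fills the cache, and each later forward only needs the latest token and its position. A minimal sketch of such a loop, with `use_cache=True` passed explicitly since the prepared inputs no longer carry it (`greedy_decode` is a made-up helper; the real `generate()` also handles attention masks, sampling, and stopping criteria):

```python
import torch

# Hedged sketch of the decode loop these hooks serve, not code from the repo.
@torch.no_grad()
def greedy_decode(model, input_ids, max_new_tokens=32):
    past_key_values = None
    for step in range(max_new_tokens):
        # First pass: the whole prompt goes in. Later passes:
        # prepare_inputs_for_generation slices input_ids and position_ids
        # down to the newest token, since the KV cache covers the rest.
        model_inputs = model.prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            is_first_forward=(step == 0),
        )
        outputs = model(**model_inputs, use_cache=True)
        # return_last_logit=True means only the final position's logits are
        # computed, so indexing [:, -1] selects the same values either way.
        next_token = outputs.logits[:, -1].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, next_token], dim=-1)
        past_key_values = outputs.past_key_values
    return input_ids
```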
```diff
@@ -1089,7 +1086,6 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         generation_config = self.generation_config
         generation_config = copy.deepcopy(generation_config)
         model_kwargs = generation_config.update(**kwargs)
-        model_kwargs["use_cache"] = generation_config.use_cache
         bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
 
         if isinstance(eos_token_id, int):
```
```diff
@@ -1195,89 +1191,3 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device,
                                             **kwargs)
         return self
-
-
-class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
-    def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
-        super().__init__(config)
-
-        self.num_labels = config.num_labels
-        self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
-
-        self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half)
-        if config.classifier_dropout is not None:
-            self.dropout = nn.Dropout(config.classifier_dropout)
-        else:
-            self.dropout = None
-        self.config = config
-
-        if self.config.quantization_bit:
-            self.quantize(self.config.quantization_bit, empty_init=True)
-
-    def forward(
-            self,
-            input_ids: Optional[torch.LongTensor] = None,
-            position_ids: Optional[torch.LongTensor] = None,
-            attention_mask: Optional[torch.Tensor] = None,
-            full_attention_mask: Optional[torch.Tensor] = None,
-            past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
-            inputs_embeds: Optional[torch.LongTensor] = None,
-            labels: Optional[torch.LongTensor] = None,
-            use_cache: Optional[bool] = None,
-            output_hidden_states: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]:
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        transformer_outputs = self.transformer(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            full_attention_mask=full_attention_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        hidden_states = transformer_outputs[0]
-        pooled_hidden_states = hidden_states[-1]
-        if self.dropout is not None:
-            pooled_hidden_states = self.dropout(pooled_hidden_states)
-        logits = self.classifier_head(pooled_hidden_states)
-
-        loss = None
-        if labels is not None:
-            if self.config.problem_type is None:
-                if self.num_labels == 1:
-                    self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
-                    self.config.problem_type = "single_label_classification"
-                else:
-                    self.config.problem_type = "multi_label_classification"
-
-            if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
-                if self.num_labels == 1:
-                    loss = loss_fct(logits.squeeze().float(), labels.squeeze())
-                else:
-                    loss = loss_fct(logits.float(), labels)
-            elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1))
-            elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(logits.float(), labels.view(-1, self.num_labels))
-
-        if not return_dict:
-            output = (logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
-        return SequenceClassifierOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
-        )
```
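This hunk deletes `ChatGLMForSequenceClassification` outright, which is also why the first hunk drops the `MSELoss`/`BCEWithLogitsLoss` and `SequenceClassifierOutputWithPast` imports. The head itself was simple: pool the hidden state of the final position, apply optional dropout and a linear layer, and use the standard Hugging Face `problem_type` dispatch for the loss. For anyone who still needs that behavior on top of the remaining base model, a minimal external reconstruction (`LastTokenClassifier` is a hypothetical name; like the removed code, it assumes hidden states arrive as `[seq_len, batch, hidden]`, so index `-1` selects the last token):

```python
from typing import Optional

import torch
from torch import nn

# Hedged sketch mirroring the removed classification head, not repo code.
class LastTokenClassifier(nn.Module):
    def __init__(self, hidden_size: int, num_labels: int,
                 dropout: Optional[float] = None):
        super().__init__()
        self.dropout = nn.Dropout(dropout) if dropout is not None else nn.Identity()
        self.head = nn.Linear(hidden_size, num_labels)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # hidden_states: [seq_len, batch, hidden] -> logits: [batch, num_labels]
        pooled = self.dropout(hidden_states[-1])
        return self.head(pooled)
```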
 
tokenization_chatglm.py CHANGED
```diff
@@ -65,7 +65,8 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
 
     model_input_names = ["input_ids", "attention_mask", "position_ids"]
 
-    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
+    def __init__(self, vocab_file, padding_side="left", **kwargs):
+        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=False, **kwargs)
         self.name = "GLMTokenizer"
 
         self.vocab_file = vocab_file
```
```diff
@@ -75,7 +76,6 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             "<eos>": self.tokenizer.eos_id,
             "<pad>": self.tokenizer.pad_id
         }
-        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
 
     def get_command(self, token):
         if token in self.special_tokens:
```
```diff
@@ -197,7 +197,6 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             self,
             encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
             max_length: Optional[int] = None,
-            padding_side: str = "left",
             padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
             pad_to_multiple_of: Optional[int] = None,
             return_attention_mask: Optional[bool] = None,
```
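All three tokenizer hunks are transformers-version accommodations. Whether `PreTrainedTokenizer.__init__` can run before the SentencePiece backend exists differs between releases (some base constructors immediately resolve special tokens through the subclass's vocab methods, others do not), and only newer releases pass a `padding_side` argument into `_pad`; this commit settles on calling the base constructor first and on a `_pad` without the extra parameter. If a single override had to tolerate both `_pad` calling conventions, a catch-all is one option (a signature-only sketch under that assumption; the body stays as in the repo, and the absorbed argument is ignored because this tokenizer always pads on the left):

```python
from typing import Dict, Optional, Union

from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
from transformers.utils import PaddingStrategy

def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        **kwargs,  # absorbs padding_side on versions that pass it
) -> dict:
    ...  # body unchanged from the repo's implementation
```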
 