zR committed
Commit 7031540
1 Parent(s): ade85af
Files changed (5):
  1. README.md +4 -1
  2. README_en.md +3 -1
  3. config.json +1 -1
  4. generation_config.json +1 -1
  5. modeling_chatglm.py +5 -223
README.md CHANGED
@@ -46,7 +46,10 @@ GLM-4V-9B 是一个多模态语言模型,具备视觉理解能力,其相关
 
 ## 运行模型
 
-更多推理代码和依赖信息,请访问我们的 [github](https://github.com/THUDM/GLM-4)
+**更多推理代码和依赖信息,请访问我们的 [github](https://github.com/THUDM/GLM-4)。**
+
+**请严格按照[依赖](https://github.com/THUDM/GLM-4/blob/main/basic_demo/requirements.txt)安装,否则无法正常运行。**
+
 
 ```python
 import torch
README_en.md CHANGED
@@ -29,7 +29,9 @@ GLM-4V-9B is a multimodal language model with visual understanding capabilities.
 
 ## Quick Start
 
-For more inference code and requirements, please visit our [github page](https://github.com/THUDM/GLM-4).
+**For more inference code and requirements, please visit our [github page](https://github.com/THUDM/GLM-4).**
+
+**Please strictly follow the [dependencies](https://github.com/THUDM/GLM-4/blob/main/basic_demo/requirements.txt) to install, otherwise it will not run properly**
 
 
 ```python
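Both README hunks lead into the same Quick Start code block, of which the diff context only shows the first lines. For reference, that snippet has roughly the shape below; this is a minimal sketch reconstructed from the published model card rather than part of this commit, and the image path, `max_length`, and sampling settings are placeholders to adapt.

```python
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4v-9b", trust_remote_code=True)

# Build a chat-formatted prompt that carries the image alongside the text query.
query = "Describe this picture."
image = Image.open("your_image.jpg").convert("RGB")
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "image": image, "content": query}],
    add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
).to(device)

model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4v-9b",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).to(device).eval()

# Decode only the newly generated tokens, not the prompt.
gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs["input_ids"].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```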
config.json CHANGED
@@ -50,7 +50,7 @@
   "seq_length": 8192,
   "use_cache": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.40.2",
+  "transformers_version": "4.42.4",
   "tie_word_embeddings": false,
   "eos_token_id": [151329, 151336, 151338],
   "pad_token_id": 151329,
generation_config.json CHANGED
@@ -9,5 +9,5 @@
   "temperature": 0.8,
   "max_length": 8192,
   "top_p": 0.8,
-  "transformers_version": "4.40.2"
+  "transformers_version": "4.42.4"
 }
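The `transformers_version` fields in `config.json` and `generation_config.json` are metadata, but they record the library version the checkpoint's remote code now targets. A hedged sketch of an explicit pre-flight check a launch script might add (the `4.42.4` pin comes from this commit; `packaging` is assumed available, as it ships as a transformers dependency):

```python
from packaging import version
import transformers

REQUIRED = "4.42.4"  # value written into config.json by this commit
if version.parse(transformers.__version__) < version.parse(REQUIRED):
    raise RuntimeError(
        f"Found transformers {transformers.__version__}; this checkpoint's remote code "
        f"targets {REQUIRED}+. Install it with: pip install transformers=={REQUIRED}"
    )
```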
modeling_chatglm.py CHANGED
@@ -1,18 +1,13 @@
-""" PyTorch ChatGLM model. """
-import json
+""" PyTorch GLM-4V model. """
 import math
-import copy
-import warnings
 import sys
-
 import torch
 import torch.utils.checkpoint
 import torch.nn.functional as F
 from torch import nn
 from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
 from torch.nn.utils import skip_init
-from typing import Optional, Tuple, Union, List, Callable, Dict, Any
-from copy import deepcopy
+from typing import Optional, Tuple, Union, List, Dict, Any
 
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
@@ -853,11 +848,6 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
         batch_size, seq_length = input_ids.shape
         position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
 
-    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
-        if not self.supports_gradient_checkpointing:
-            raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
-
-
 class Embedding(torch.nn.Module):
     """Language model embeddings."""
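Dropping this override matters for fine-tuning: as written, it only validated support and never delegated to `PreTrainedModel`, so calling `gradient_checkpointing_enable()` effectively did nothing; with it removed, the stock transformers implementation applies again. A hedged sketch of how a training script would now turn checkpointing on (the kwargs and the paired `enable_input_require_grads()` call are the usual transformers idioms, not code shipped in this repository):

```python
# Assumes `model` was loaded as in the Quick Start example above.
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
model.enable_input_require_grads()  # keep gradients flowing into the checkpointed blocks
model.train()
```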
 
@@ -1095,9 +1085,10 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         standardize_cache_format: bool = False,
     ) -> Dict[str, Any]:
         # update past_key_values
-        model_kwargs["past_key_values"] = self._extract_past_from_model_output(
+        cache_name, cache = self._extract_past_from_model_output(
             outputs, standardize_cache_format=standardize_cache_format
         )
+        model_kwargs[cache_name] = cache
 
         # update attention mask
         if "attention_mask" in model_kwargs:
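The two-line replacement tracks an API change in the generation utilities: in the transformers release this commit targets (4.42.x, per `config.json`), the private helper `_extract_past_from_model_output` returns a `(cache_name, cache)` pair instead of the cache alone, so the override must store the cache under whatever key the helper reports. A compact sketch of the old and new contracts (illustrative only, since the helper is internal transformers API):

```python
# transformers 4.40.x style: the helper returned the cache itself.
# model_kwargs["past_key_values"] = self._extract_past_from_model_output(
#     outputs, standardize_cache_format=standardize_cache_format)

# transformers 4.42.x style: the helper returns (name, cache), e.g.
# ("past_key_values", <per-layer (key, value) tensors>), and the caller
# files the cache under that reported name.
cache_name, cache = self._extract_past_from_model_output(
    outputs, standardize_cache_format=standardize_cache_format
)
model_kwargs[cache_name] = cache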
@@ -1204,7 +1195,6 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
 
         loss = None
         if labels is not None:
-            # https://github.com/THUDM/GLM-4/issues/264
             new_labels = []
             for i in range(len(input_ids)):
                 input_id = input_ids[i].tolist()
@@ -1216,16 +1206,12 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
                     (
                         labels[i, :boi_token_pos + 1],
                         torch.tensor([-100]).to(labels.device).to(labels.dtype).repeat(1600),
-                        labels[i, eoi_token_pos:])))  # 在两个token之间加入
+                        labels[i, eoi_token_pos:])))
 
             labels = torch.stack(new_labels, dim=0)
-
             lm_logits = lm_logits.to(torch.float32)
-
-            # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-100)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
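The hunk above only drops comments, but the surrounding logic is worth spelling out: between the positions found via `config.boi_token_id` and `config.eoi_token_id` the model splices in 1600 vision-token embeddings per image, so the labels must be widened with 1600 `ignore_index` (-100) entries to stay aligned with the logits. A toy, self-contained rendering of that step (tensor values are made up; the real code locates the positions in each sequence and asserts exactly one placeholder sits between them):

```python
import torch

# Toy example: pretend 7 is the boi token id and 8 the eoi token id, with exactly
# one placeholder position between them (the real code asserts eoi_pos - boi_pos == 2).
labels = torch.tensor([10, 7, 0, 8, 11])
boi_token_pos, eoi_token_pos = 1, 3

padded = torch.cat((
    labels[:boi_token_pos + 1],                     # keep everything up to and including boi
    torch.full((1600,), -100, dtype=labels.dtype),  # 1600 ignored slots for the image patches
    labels[eoi_token_pos:],                         # resume at eoi
))
assert padded.numel() == 1604  # 5 original positions - 1 placeholder + 1600 image slots
```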
 
@@ -1263,210 +1249,6 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             for layer_past in past
         )
 
-    def process_response(self, output, history):
-        content = ""
-        history = deepcopy(history)
-        for response in output.split("<|assistant|>"):
-            if "\n" in response:
-                metadata, content = response.split("\n", maxsplit=1)
-            else:
-                metadata, content = "", response
-            if not metadata.strip():
-                content = content.strip()
-                history.append({"role": "assistant", "metadata": metadata, "content": content})
-                content = content.replace("[[训练时间]]", "2023年")
-            else:
-                history.append({"role": "assistant", "metadata": metadata, "content": content})
-                if history[0]["role"] == "system" and "tools" in history[0]:
-                    parameters = json.loads(content)
-                    content = {"name": metadata.strip(), "parameters": parameters}
-                else:
-                    content = {"name": metadata.strip(), "content": content}
-        return content, history
-
-    @torch.inference_mode()
-    def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", image=None,
-             max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None,
-             **kwargs):
-        if history is None:
-            history = []
-        if logits_processor is None:
-            logits_processor = LogitsProcessorList()
-        logits_processor.append(InvalidScoreLogitsProcessor())
-        gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
-                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
-        message = {"role": role, "content": query}
-        if image is not None:
-            message["image"] = image
-        history.append(message)
-        inputs = tokenizer.apply_chat_template(history, add_generation_prompt=True, tokenize=True,
-                                               return_tensors="pt", return_dict=True)
-        inputs = inputs.to(self.device)
-        eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|user|>"),
-                        tokenizer.convert_tokens_to_ids("<|observation|>")]
-        outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id)
-        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
-        response = tokenizer.decode(outputs)
-        response, history = self.process_response(response, history)
-        return response, history
-
-    @torch.inference_mode()
-    def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", image=None,
-                    past_key_values=None, max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8,
-                    logits_processor=None, return_past_key_values=False, **kwargs):
-        if history is None:
-            history = []
-        if logits_processor is None:
-            logits_processor = LogitsProcessorList()
-        logits_processor.append(InvalidScoreLogitsProcessor())
-        eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|user|>"),
-                        tokenizer.convert_tokens_to_ids("<|observation|>")]
-        gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
-                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
-        message = {"role": role, "content": "query"}
-        if image is not None:
-            message["image"] = image
-        if past_key_values is None:
-            inputs = tokenizer.apply_chat_template(history + [message],
-                                                   add_generation_prompt=True, tokenize=True, return_tensors="pt",
-                                                   return_dict=True)
-        else:
-            inputs = tokenizer.apply_chat_template([message], add_special_tokens=False,
-                                                   add_generation_prompt=True, tokenize=True, return_tensors="pt",
-                                                   return_dict=True)
-        inputs = inputs.to(self.device)
-        if past_key_values is not None:
-            past_length = past_key_values[0][0].shape[2]
-            if self.transformer.pre_seq_len is not None:
-                past_length -= self.transformer.pre_seq_len
-            inputs.position_ids += past_length
-            attention_mask = inputs.attention_mask
-            attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
-            inputs['attention_mask'] = attention_mask
-        history.append({"role": role, "content": query})
-        for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
-                                            eos_token_id=eos_token_id, return_past_key_values=return_past_key_values,
-                                            **gen_kwargs):
-            if return_past_key_values:
-                outputs, past_key_values = outputs
-            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
-            response = tokenizer.decode(outputs)
-            if response and response[-1] != "�":
-                response, new_history = self.process_response(response, history)
-                if return_past_key_values:
-                    yield response, new_history, past_key_values
-                else:
-                    yield response, new_history
-
-    @torch.inference_mode()
-    def stream_generate(
-            self,
-            input_ids,
-            generation_config: Optional[GenerationConfig] = None,
-            logits_processor: Optional[LogitsProcessorList] = None,
-            stopping_criteria: Optional[StoppingCriteriaList] = None,
-            prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
-            return_past_key_values=False,
-            **kwargs,
-    ):
-        batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
-
-        if generation_config is None:
-            generation_config = self.generation_config
-        generation_config = copy.deepcopy(generation_config)
-        model_kwargs = generation_config.update(**kwargs)
-        model_kwargs["use_cache"] = generation_config.use_cache
-        bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
-
-        if isinstance(eos_token_id, int):
-            eos_token_id = [eos_token_id]
-        eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
-
-        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
-        if has_default_max_length and generation_config.max_new_tokens is None:
-            warnings.warn(
-                f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
-                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
-                " recommend using `max_new_tokens` to control the maximum length of the generation.",
-                UserWarning,
-            )
-        elif generation_config.max_new_tokens is not None:
-            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
-            if not has_default_max_length:
-                logger.warn(
-                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
-                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
-                    "Please refer to the documentation for more information. "
-                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
-                    UserWarning,
-                )
-
-        if input_ids_seq_length >= generation_config.max_length:
-            input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
-            logger.warning(
-                f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
-                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
-                " increasing `max_new_tokens`."
-            )
-
-        # 2. Set generation parameters if not already defined
-        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
-        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
-
-        logits_processor = self._get_logits_processor(
-            generation_config=generation_config,
-            input_ids_seq_length=input_ids_seq_length,
-            encoder_input_ids=input_ids,
-            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
-            logits_processor=logits_processor,
-        )
-
-        stopping_criteria = self._get_stopping_criteria(
-            generation_config=generation_config, stopping_criteria=stopping_criteria
-        )
-        logits_warper = self._get_logits_warper(generation_config)
-
-        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
-        scores = None
-        while True:
-            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-            # forward pass to get next token
-            outputs = self(
-                **model_inputs,
-                return_dict=True,
-                output_attentions=False,
-                output_hidden_states=False,
-            )
-
-            next_token_logits = outputs.logits[:, -1, :]
-
-            # pre-process distribution
-            next_token_scores = logits_processor(input_ids, next_token_logits)
-            next_token_scores = logits_warper(input_ids, next_token_scores)
-
-            # sample
-            probs = nn.functional.softmax(next_token_scores, dim=-1)
-            if generation_config.do_sample:
-                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
-            else:
-                next_tokens = torch.argmax(probs, dim=-1)
-            # update generated ids, model inputs, and length for next step
-            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
-            model_kwargs = self._update_model_kwargs_for_generation(
-                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
-            )
-            unfinished_sequences = unfinished_sequences.mul(
-                next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
-            )
-            if return_past_key_values:
-                yield input_ids, outputs.past_key_values
-            else:
-                yield input_ids
-            # stop when each sentence is finished, or if we exceed the maximum length
-            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
-                break
-
-
 class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
     def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
         super().__init__(config)
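This final hunk accounts for most of the 223 removed lines: `process_response`, `chat`, `stream_chat`, and the custom `stream_generate` loop are gone, leaving the standard `generate()` path (with `apply_chat_template`, as in the Quick Start sketch above) as the supported interface. For the streaming use case that `stream_chat` covered, the stock transformers streamer can be used instead; a hedged sketch, reusing the `model`, `tokenizer`, and `inputs` names from the earlier example:

```python
from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(**inputs, max_new_tokens=512, do_sample=True,
                         top_p=0.8, temperature=0.8, streamer=streamer)

# generate() blocks, so run it in a worker thread and consume tokens as they arrive.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for new_text in streamer:
    print(new_text, end="", flush=True)
thread.join()
```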
 