czczup committed on
Commit
186a8d8
1 Parent(s): 810c831

Upload folder using huggingface_hub

README.md CHANGED
@@ -74,8 +74,10 @@ We provide an example code to run InternVL2-2B using `transformers`.
 > Please use transformers==4.37.2 to ensure the model works normally.
 
 ```python
+import numpy as np
 import torch
 import torchvision.transforms as T
+from decord import VideoReader, cpu
 from PIL import Image
 from torchvision.transforms.functional import InterpolationMode
 from transformers import AutoModel, AutoTokenizer
@@ -204,7 +206,22 @@ response, history = model.chat(tokenizer, pixel_values, question, generation_con
 print(f'User: {question}')
 print(f'Assistant: {response}')
 
-# multi-image multi-round conversation (多图多轮对话)
+# multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
+pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
+pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
+pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
+
+question = '<image>\nDescribe the two images in detail.'
+response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                               history=None, return_history=True)
+
+question = 'What are the similarities and differences between these two images.'
+response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                               history=history, return_history=True)
+print(f'User: {question}')
+print(f'Assistant: {response}')
+
+# multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
 pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
@@ -278,7 +295,7 @@ video_path = './examples/red-panda.mp4'
 pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
 pixel_values = pixel_values.to(torch.bfloat16).cuda()
 video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
-question = video_prefix + 'What is the red panda doing?'
+question = video_prefix + 'What is the red panda doing?'
 # Frame1: <image>\nFrame2: <image>\n...\nFrame31: <image>\n{question}
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
@@ -286,7 +303,7 @@ response, history = model.chat(tokenizer, pixel_values, question, generation_con
 print(f'User: {question}')
 print(f'Assistant: {response}')
 
-question = 'Describe this video in detail.'
+question = 'Describe this video in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=history, return_history=True)
@@ -315,7 +332,7 @@ from lmdeploy.vl import load_image
 model = 'OpenGVLab/InternVL2-2B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -336,7 +353,7 @@ from lmdeploy.vl.constants import IMAGE_TOKEN
 
 model = 'OpenGVLab/InternVL2-2B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -362,7 +379,7 @@ from lmdeploy.vl import load_image
 
 model = 'OpenGVLab/InternVL2-2B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -386,7 +403,7 @@ from lmdeploy.vl import load_image
 
 model = 'OpenGVLab/InternVL2-2B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -509,7 +526,7 @@ from lmdeploy.vl import load_image
 model = 'OpenGVLab/InternVL2-2B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -530,7 +547,7 @@ from lmdeploy.vl.constants import IMAGE_TOKEN
 
 model = 'OpenGVLab/InternVL2-2B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -555,7 +572,7 @@ from lmdeploy.vl import load_image
 
 model = 'OpenGVLab/InternVL2-2B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -579,7 +596,7 @@ from lmdeploy.vl import load_image
 
 model = 'OpenGVLab/InternVL2-2B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -613,4 +630,4 @@ print(sess.response.text)
   journal={arXiv preprint arXiv:2404.16821},
   year={2024}
 }
-```
+```
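The new `numpy` and `decord` imports support the video inference example later in the README, where `load_video` samples frames with `decord.VideoReader`. As a rough sketch of what that frame sampling involves (the helper name `sample_video_frames` and its exact behavior are illustrative only; the README's own `load_video` additionally tiles each frame dynamically):

```python
# Hedged sketch: evenly sample frames from a video with decord and normalize them.
# Not the README's load_video; only illustrates why numpy and decord are imported.
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from torchvision.transforms.functional import InterpolationMode

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def sample_video_frames(video_path, num_segments=8, input_size=448):
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    indices = np.linspace(0, len(vr) - 1, num_segments).astype(np.int64)
    transform = T.Compose([
        T.ToPILImage(),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
    frames = [transform(vr[int(i)].asnumpy()) for i in indices]
    return torch.stack(frames)  # (num_segments, 3, H, W)
```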
config.json CHANGED
@@ -12,7 +12,7 @@
   "dynamic_image_size": true,
   "force_image_size": 448,
   "llm_config": {
-    "_name_or_path": "./pretrained/internlm2-chat-1_8b",
+    "_name_or_path": "internlm/internlm2-chat-1_8b",
     "add_cross_attention": false,
     "architectures": [
       "InternLM2ForCausalLM"
@@ -111,86 +111,32 @@
   "use_llm_lora": 0,
   "use_thumbnail": true,
   "vision_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
     "architectures": [
       "InternVisionModel"
     ],
     "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
     "drop_path_rate": 0.0,
     "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
     "hidden_act": "gelu",
     "hidden_size": 1024,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
     "image_size": 448,
     "initializer_factor": 1.0,
     "initializer_range": 0.02,
     "intermediate_size": 4096,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
     "layer_norm_eps": 1e-06,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
     "model_type": "intern_vit_6b",
-    "no_repeat_ngram_size": 0,
    "norm_type": "layer_norm",
     "num_attention_heads": 16,
-    "num_beam_groups": 1,
-    "num_beams": 1,
     "num_channels": 3,
     "num_hidden_layers": 24,
-    "num_return_sequences": 1,
     "output_attentions": false,
     "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
     "patch_size": 14,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
     "qk_normalization": false,
     "qkv_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
     "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": null,
     "torch_dtype": "bfloat16",
-    "torchscript": false,
     "transformers_version": "4.37.2",
-    "typical_p": 1.0,
     "use_bfloat16": true,
     "use_flash_attn": true
   }
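The trimmed `vision_config` keeps only the fields that differ from the `transformers` defaults; the removed keys (generation settings such as `top_k` and `num_beams`, placeholder label maps, and so on) are re-created with default values when the config is instantiated. A quick sanity check, sketched under the assumption that the published repo id is `OpenGVLab/InternVL2-2B`:

```python
from transformers import AutoConfig

# trust_remote_code pulls the repo's custom configuration class.
config = AutoConfig.from_pretrained('OpenGVLab/InternVL2-2B', trust_remote_code=True)
print(config.llm_config._name_or_path)         # 'internlm/internlm2-chat-1_8b'
print(config.vision_config.num_hidden_layers)  # 24, from the trimmed vision_config
```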
modeling_internlm2.py CHANGED
@@ -709,6 +709,7 @@ class InternLM2PreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ['InternLM2DecoderLayer']
     _skip_keys_device_placement = 'past_key_values'
+    _supports_flash_attn_2 = True
 
     def _init_weights(self, module):
         std = self.config.initializer_range
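Setting `_supports_flash_attn_2 = True` advertises to `transformers` that the InternLM2 language model in this repo can run with the FlashAttention-2 code path. A minimal loading sketch (the model card's own example remains the authoritative recipe; this assumes flash-attn is installed and a CUDA GPU is available):

```python
import torch
from transformers import AutoModel

# Hedged sketch: with flash-attn installed, the InternLM2 part of InternVL2-2B
# can now be dispatched to FlashAttention-2 by the remote code.
model = AutoModel.from_pretrained(
    'OpenGVLab/InternVL2-2B',
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).eval().cuda()
```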
modeling_internvl_chat.py CHANGED
@@ -7,6 +7,7 @@ import warnings
 from typing import Any, List, Optional, Tuple, Union
 
 import torch.utils.checkpoint
+import transformers
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
@@ -23,6 +24,14 @@ from .modeling_internlm2 import InternLM2ForCausalLM
 logger = logging.get_logger(__name__)
 
 
+def version_cmp(v1, v2, op='eq'):
+    import operator
+
+    from packaging import version
+    op_func = getattr(operator, op)
+    return op_func(version.parse(v1), version.parse(v2))
+
+
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
@@ -31,6 +40,7 @@ class InternVLChatModel(PreTrainedModel):
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)
 
+        assert version_cmp(transformers.__version__, '4.36.2', 'ge')
         image_size = config.force_image_size or config.vision_config.image_size
         patch_size = config.vision_config.patch_size
         self.patch_size = patch_size
@@ -183,36 +193,44 @@ class InternVLChatModel(PreTrainedModel):
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds
 
-    def batch_chat(self, tokenizer, pixel_values, num_patches_list, questions, generation_config, history=None,
-                   return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
-                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False):
+    def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
+                   history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
+                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
         if history is not None or return_history:
             print('Now multi-turn chat is not supported in batch_chat.')
             raise NotImplementedError
+
+        if image_counts is not None:
+            num_patches_list = image_counts
+            print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
+
         img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
         self.img_context_token_id = img_context_token_id
 
-        from .conversation import get_conv_template
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            print(f'dynamic ViT batch size: {image_bs}')
 
         queries = []
-        if verbose:
-            image_bs = pixel_values.shape[0]
-            print(f'dynamic ViT batch size: {image_bs}, num_patches_list: {num_patches_list}')
         for idx, num_patches in enumerate(num_patches_list):
-            image_token = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
-            question = image_token + '\n' + questions[idx]
+            question = questions[idx]
+            if pixel_values is not None and '<image>' not in question:
+                question = '<image>\n' + question
             template = get_conv_template(self.template)
             template.append_message(template.roles[0], question)
             template.append_message(template.roles[1], None)
             query = template.get_prompt()
+
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace('<image>', image_tokens, 1)
             queries.append(query)
+
         tokenizer.padding_side = 'left'
         model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
         input_ids = model_inputs['input_ids'].cuda()
         attention_mask = model_inputs['attention_mask'].cuda()
         eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
         generation_config['eos_token_id'] = eos_token_id
-
         generation_output = self.generate(
             pixel_values=pixel_values,
             input_ids=input_ids,
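The reworked `batch_chat` takes `questions` right after `pixel_values`, turns `num_patches_list` into an optional keyword, keeps the deprecated `image_counts` as an alias, and now inserts the `<image>` placeholder itself when a question lacks one. A hedged usage sketch, assuming `model`, `tokenizer`, `generation_config`, and the README's `load_image` helper are already set up:

```python
# Batched single-turn inference over several images with the new signature.
pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

# '<image>\n' is prepended automatically when a question does not contain it.
questions = ['Describe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values, questions, generation_config,
                             num_patches_list=num_patches_list)
for question, response in zip(questions, responses):
    print(f'User: {question}\nAssistant: {response}')
```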
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
+{
+  "crop_size": 448,
+  "do_center_crop": true,
+  "do_normalize": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "resample": 3,
+  "size": 448
+}
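The new preprocessor config describes a 448-pixel bicubic resize and center crop followed by ImageNet normalization (`resample: 3` is PIL's BICUBIC). Expressed as a roughly equivalent torchvision pipeline, purely for illustration:

```python
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

# Mirrors preprocessor_config.json: resize + 448x448 center crop, ImageNet mean/std.
preprocess = T.Compose([
    T.Resize(448, interpolation=InterpolationMode.BICUBIC),
    T.CenterCrop(448),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
```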
tokenization_internlm2_fast.py ADDED
@@ -0,0 +1,211 @@
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization Fast class for InternLM."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, Optional, Tuple
+
+from tokenizers import Tokenizer, decoders, normalizers, processors
+from tokenizers.models import BPE
+from transformers.convert_slow_tokenizer import (SLOW_TO_FAST_CONVERTERS,
+                                                 SentencePieceExtractor,
+                                                 SpmConverter)
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from transformers.utils import logging
+
+from .tokenization_internlm2 import InternLM2Tokenizer
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
+
+
+# Modified from transformers.convert_slow_tokenizer.LlamaConverter
+class InternLM2Converter(SpmConverter):
+    handle_byte_fallback = True
+
+    def vocab(self, proto):
+        vocab = [
+            ('<unk>', 0.0),
+            ('<s>', 0.0),
+            ('</s>', 0.0),
+        ]
+        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
+        return vocab
+
+    def unk_id(self, proto):
+        unk_id = 0
+        return unk_id
+
+    def decoder(self, replacement, add_prefix_space):
+        return decoders.Sequence(
+            [
+                decoders.Replace('▁', ' '),
+                decoders.ByteFallback(),
+                decoders.Fuse(),
+                decoders.Strip(content=' ', left=1),
+            ]
+        )
+
+    def tokenizer(self, proto):
+        model_type = proto.trainer_spec.model_type
+        vocab_scores = self.vocab(proto)
+        # special tokens
+        added_tokens = self.original_tokenizer.added_tokens_decoder
+        for i in range(len(vocab_scores)):
+            piece, score = vocab_scores[i]
+            if i in added_tokens:
+                vocab_scores[i] = (added_tokens[i].content, score)
+        if model_type == 1:
+            raise RuntimeError('InternLM2 is supposed to be a BPE model!')
+
+        elif model_type == 2:
+            _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
+            bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
+            tokenizer = Tokenizer(
+                BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
+            )
+            tokenizer.add_special_tokens(
+                [added_token for index, added_token in added_tokens.items()]
+            )
+        else:
+            raise Exception(
+                "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
+            )
+
+        return tokenizer
+
+    def normalizer(self, proto):
+        normalizers_list = []
+        if proto.normalizer_spec.add_dummy_prefix:
+            normalizers_list.append(normalizers.Prepend(prepend='▁'))
+        normalizers_list.append(normalizers.Replace(pattern=' ', content='▁'))
+        return normalizers.Sequence(normalizers_list)
+
+    def pre_tokenizer(self, replacement, add_prefix_space):
+        return None
+
+
+SLOW_TO_FAST_CONVERTERS['InternLM2Tokenizer'] = InternLM2Converter
+
+
+# Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
+class InternLM2TokenizerFast(PreTrainedTokenizerFast):
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = InternLM2Tokenizer
+    padding_side = 'left'
+    model_input_names = ['input_ids', 'attention_mask']
+    _auto_class = 'AutoTokenizer'
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token='<unk>',
+        bos_token='<s>',
+        eos_token='</s>',
+        pad_token='</s>',
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            sp_model_kwargs=sp_model_kwargs,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            decode_with_prefix_space=decode_with_prefix_space,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+        self._add_bos_token = add_bos_token
+        self._add_eos_token = add_eos_token
+        self.update_post_processor()
+        self.vocab_file = vocab_file
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
+    def update_post_processor(self):
+        """
+        Updates the underlying post processor with the current `bos_token` and `eos_token`.
+        """
+        bos = self.bos_token
+        bos_token_id = self.bos_token_id
+        if bos is None and self.add_bos_token:
+            raise ValueError('add_bos_token = True but bos_token = None')
+
+        eos = self.eos_token
+        eos_token_id = self.eos_token_id
+        if eos is None and self.add_eos_token:
+            raise ValueError('add_eos_token = True but eos_token = None')
+
+        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
+        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
+
+        special_tokens = []
+        if self.add_bos_token:
+            special_tokens.append((bos, bos_token_id))
+        if self.add_eos_token:
+            special_tokens.append((eos, eos_token_id))
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=single, pair=pair, special_tokens=special_tokens
+        )
+
+    @property
+    def add_eos_token(self):
+        return self._add_eos_token
+
+    @property
+    def add_bos_token(self):
+        return self._add_bos_token
+
+    @add_eos_token.setter
+    def add_eos_token(self, value):
+        self._add_eos_token = value
+        self.update_post_processor()
+
+    @add_bos_token.setter
+    def add_bos_token(self, value):
+        self._add_bos_token = value
+        self.update_post_processor()
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
+                'tokenizer.'
+            )
+
+        if not os.path.isdir(save_directory):
+            logger.error(f'Vocabulary path ({save_directory}) should be a directory')
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
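Because the class registers itself via `_auto_class = 'AutoTokenizer'`, the fast tokenizer can be loaded through the standard remote-code path. A minimal sketch, assuming the repo's `tokenizer_config.json` maps `AutoTokenizer` to this class:

```python
from transformers import AutoTokenizer

# use_fast=True selects InternLM2TokenizerFast when the repo registers it;
# otherwise loading falls back to the slow InternLM2Tokenizer.
tokenizer = AutoTokenizer.from_pretrained('OpenGVLab/InternVL2-2B',
                                          trust_remote_code=True, use_fast=True)
print(tokenizer('Hello, InternVL2!').input_ids)
```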