BAAI / Bunny-Llama-3-8B-V
BoyaWu10 committed
Commit ca14d76
1 Parent(s): 4327721
README.md CHANGED
@@ -9,19 +9,17 @@ license: apache-2.0
  <img src="./icon.png" alt="Logo" width="350">
  </p>

- 📖 [Technical report](https://arxiv.org/abs/2402.11530) | 🏠 [Code](https://github.com/BAAI-DCAI/Bunny) | 🐰 [Demo](https://wisemodel.cn/spaces/baai/Bunny)
+ 📖 [Technical report](https://arxiv.org/abs/2402.11530) | 🏠 [Code](https://github.com/BAAI-DCAI/Bunny) | 🐰 [3B Demo](https://wisemodel.cn/spaces/baai/Bunny) | 🐰 [8B Demo](https://252412006bcde38bfa.gradio.live/)

  This is Bunny-Llama-3-8B-V.

  Bunny is a family of lightweight but powerful multimodal models. It offers multiple plug-and-play vision encoders, like EVA-CLIP and SigLIP, and language backbones, including Llama-3-8B, Phi-1.5, StableLM-2 and Phi-2. To compensate for the decrease in model size, we construct more informative training data by curated selection from a broader data source.

- We provide Bunny-Llama-3-8B-V, which is built upon [SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384) and [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B).
+ We provide Bunny-Llama-3-8B-V, which is built upon [SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384) and [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B). More details about this model can be found in [GitHub](https://github.com/BAAI-DCAI/Bunny).

- The model is pretrained on LAION-2M and finetuned on Bunny-695K. More details about this model can be found in [GitHub](https://github.com/BAAI-DCAI/Bunny).
-
- | | MME \\(^{\text{P}}\\) | MME \\(^{\text{C}}\\) | MMB \\(^{\text{T/D}}\\) | SEED | MMMU \\(^{\text{V/T}}\\) | VQA \\(^{\text{v2}}\\) | GQA | SQA \\(^{\text{I}}\\) | POPE |
+ | | MME \\(^{\text{P}}\\) | MME \\(^{\text{C}}\\) | MMB \\(^{\text{T/D}}\\) | SEED(-IMG) | MMMU \\(^{\text{V/T}}\\) | VQA \\(^{\text{v2}}\\) | GQA | SQA \\(^{\text{I}}\\) | POPE |
  | ------------------ | :--------------: | :--------------: | :----------------: | :--: | :-----------------: | :---------------: | :--: | :--------------: | :--: |
- | Bunny-Llama-3-8B-V | 1571.8 | 297.1 | 74.3/74.0 | 65.1 | 39.1/35.4 | 81.94 | 63.7 | 74.3 | 86.7 |
+ | Bunny-Llama-3-8B-V | 1592.2 | 335.0 | 76.2/75.6 | 66.0(73.3) | 39.7/36.8 | 82.5 | 64.4 | 75.7 | 87.6 |



@@ -65,7 +63,7 @@ tokenizer = AutoTokenizer.from_pretrained(
  prompt = 'Why is the image funny?'
  text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{prompt} ASSISTANT:"
  text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
- input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+ input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0)

  # image, sample images can be found in images folder
  image = Image.open('example_2.png')
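
For reference, a minimal end-to-end sketch of the quickstart that the `input_ids` change above belongs to. The loading calls mirror the README (the hunk header shows `tokenizer = AutoTokenizer.from_pretrained(`); `model.process_images` and `model.generate(..., images=...)` come from the Bunny remote code and are assumptions insofar as they are not visible in this diff. The new `[1:]` drops the `<|begin_of_text|>` token that the tokenizer prepends to the second chunk, so only one BOS remains in the spliced sequence; `-200` is the placeholder id the model replaces with image features.

```python
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

device = 'cuda'  # or 'cpu' with torch.float32

# Load model and tokenizer with the repo's remote code (assumed setup, as in the README).
model = AutoModelForCausalLM.from_pretrained(
    'BAAI/Bunny-Llama-3-8B-V',
    torch_dtype=torch.float16,
    device_map='auto',
    trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
    'BAAI/Bunny-Llama-3-8B-V',
    trust_remote_code=True)

prompt = 'Why is the image funny?'
text = (f"A chat between a curious user and an artificial intelligence assistant. "
        f"The assistant gives helpful, detailed, and polite answers to the user's questions. "
        f"USER: <image>\n{prompt} ASSISTANT:")

# Tokenize the text on both sides of the <image> placeholder, then splice in the
# image-token id (-200). [1:] strips the BOS prepended to the second chunk.
text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:],
                         dtype=torch.long).unsqueeze(0).to(device)

# Image preprocessing and generation (method names assumed from the Bunny remote code).
image = Image.open('example_2.png')
image_tensor = model.process_images([image], model.config).to(dtype=model.dtype, device=device)

output_ids = model.generate(input_ids, images=image_tensor, max_new_tokens=100, use_cache=True)[0]
print(tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip())
```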
config.json CHANGED
@@ -19,7 +19,7 @@
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mm_hidden_size": 1152,
- "mm_projector_lr": 2e-05,
+ "mm_projector_lr": 1e-05,
  "mm_projector_type": "mlp2x_gelu",
  "mm_vision_tower": "google/siglip-so400m-patch14-384",
  "model_type": "bunny-llama",
@@ -34,9 +34,9 @@
  "tokenizer_model_max_length": 2048,
  "tokenizer_padding_side": "right",
  "torch_dtype": "float16",
- "transformers_version": "4.38.2",
+ "transformers_version": "4.40.0",
  "tune_mm_mlp_adapter": false,
- "unfreeze_vision_tower": false,
+ "unfreeze_vision_tower": true,
  "use_cache": true,
  "use_mm_proj": true,
  "vocab_size": 128257
generation_config.json CHANGED
@@ -3,5 +3,5 @@
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "pad_token_id": 128001,
- "transformers_version": "4.38.2"
+ "transformers_version": "4.40.0"
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:72d105f11150862dc188687e07b20cf28e5b53b47750889fc76c99ab8768c342
+ oid sha256:6ec0a7fd9ad460c3e0a4531b16c290472d8d133c348e9649efa2e9458e936a39
  size 4976706784
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:67b7dbce5a065f1591c9b7594a3afc5297f96af80ec3c62a8d92093e80ab0cd7
+ oid sha256:ec257e1403e02aaac7d2d79e67ad4f5378c4b041037aa00862b1983b014729b1
  size 4999802616
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8a831144fabf9cb4a781bee44c48864bf5e28416d8678e1f06870e019e537335
+ oid sha256:fb70b23deec1b9d9512676ba5c42ea0d3c980c1cba387fc9cb14e46be6153b08
  size 4915916080
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:912a671e9f42e978074747d1b90819eecdc4fb63563bc04fb203d82b909e0998
+ oid sha256:423517f46806e703f787ecb8ac332c729f167baeaca7e500da9462583ed8cda1
  size 2067676408
modeling_bunny_llama.py CHANGED
@@ -604,7 +604,7 @@ class BunnyMetaModel:
  super(BunnyMetaModel, self).__init__(config)

  if hasattr(config, "mm_vision_tower"):
- self.vision_tower = build_vision_tower(config, delay_load=True)
+ self.vision_tower = build_vision_tower(config, delay_load=False)
  self.mm_projector = build_vision_projector(config)

  def get_vision_tower(self):
tokenizer.json CHANGED
@@ -2306,15 +2306,6 @@
  "rstrip": false,
  "normalized": false,
  "special": true
- },
- {
- "id": 128256,
- "content": "<unk>",
- "single_word": false,
- "lstrip": false,
- "rstrip": false,
- "normalized": false,
- "special": true
  }
  ],
  "normalizer": null,
@@ -2338,58 +2329,69 @@
  ]
  },
  "post_processor": {
- "type": "TemplateProcessing",
- "single": [
- {
- "SpecialToken": {
- "id": "<|begin_of_text|>",
- "type_id": 0
- }
- },
- {
- "Sequence": {
- "id": "A",
- "type_id": 0
- }
- }
- ],
- "pair": [
- {
- "SpecialToken": {
- "id": "<|begin_of_text|>",
- "type_id": 0
- }
- },
- {
- "Sequence": {
- "id": "A",
- "type_id": 0
- }
- },
+ "type": "Sequence",
+ "processors": [
  {
- "SpecialToken": {
- "id": "<|begin_of_text|>",
- "type_id": 1
- }
+ "type": "ByteLevel",
+ "add_prefix_space": true,
+ "trim_offsets": false,
+ "use_regex": true
  },
  {
- "Sequence": {
- "id": "B",
- "type_id": 1
- }
- }
- ],
- "special_tokens": {
- "<|begin_of_text|>": {
- "id": "<|begin_of_text|>",
- "ids": [
- 128000
+ "type": "TemplateProcessing",
+ "single": [
+ {
+ "SpecialToken": {
+ "id": "<|begin_of_text|>",
+ "type_id": 0
+ }
+ },
+ {
+ "Sequence": {
+ "id": "A",
+ "type_id": 0
+ }
+ }
+ ],
+ "pair": [
+ {
+ "SpecialToken": {
+ "id": "<|begin_of_text|>",
+ "type_id": 0
+ }
+ },
+ {
+ "Sequence": {
+ "id": "A",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "<|begin_of_text|>",
+ "type_id": 1
+ }
+ },
+ {
+ "Sequence": {
+ "id": "B",
+ "type_id": 1
+ }
+ }
  ],
- "tokens": [
- "<|begin_of_text|>"
- ]
+ "special_tokens": {
+ "<|begin_of_text|>": {
+ "id": "<|begin_of_text|>",
+ "ids": [
+ 128000
+ ],
+ "tokens": [
+ "<|begin_of_text|>"
+ ]
+ }
+ }
  }
- }
+ ]
  },
  "decoder": {
  "type": "ByteLevel",
@@ -2405,6 +2407,7 @@
  "end_of_word_suffix": null,
  "fuse_unk": false,
  "byte_fallback": false,
+ "ignore_merges": true,
  "vocab": {
  "!": 0,
  "\"": 1,
tokenizer_config.json CHANGED
@@ -1,6 +1,4 @@
  {
- "add_bos_token": true,
- "add_eos_token": false,
  "added_tokens_decoder": {
  "128000": {
  "content": "<|begin_of_text|>",
@@ -2049,14 +2047,6 @@
  "rstrip": false,
  "single_word": false,
  "special": true
- },
- "128256": {
- "content": "<unk>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
  }
  },
  "bos_token": "<|begin_of_text|>",
@@ -2067,7 +2057,5 @@
  "attention_mask"
  ],
  "model_max_length": 1000000000000000019884624838656,
- "tokenizer_class": "LlamaTokenizer",
- "unk_token": "<unk>",
- "use_default_system_prompt": false
+ "tokenizer_class": "PreTrainedTokenizerFast"
  }
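
The net effect of the tokenizer files above (the rebuilt `post_processor` running ByteLevel before the Llama-3 `TemplateProcessing`, the removal of the `<unk>` entry at id 128256, and the switch to `PreTrainedTokenizerFast`) is that `<|begin_of_text|>` is added exactly once per encoded sequence, which is what `text_chunks[1][1:]` in the README snippet relies on. A quick sanity check, as a sketch rather than part of the repo:

```python
from transformers import AutoTokenizer

# Verify the rebuilt tokenizer prepends exactly one BOS per encoded chunk.
tok = AutoTokenizer.from_pretrained('BAAI/Bunny-Llama-3-8B-V', trust_remote_code=True)

ids = tok('Why is the image funny?').input_ids
assert ids[0] == 128000          # <|begin_of_text|> added by the TemplateProcessing step
assert ids.count(128000) == 1    # exactly one BOS, hence the [1:] on the second chunk
```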