Update
- README.md +5 -7
- config.json +3 -3
- generation_config.json +1 -1
- model-00001-of-00004.safetensors +1 -1
- model-00002-of-00004.safetensors +1 -1
- model-00003-of-00004.safetensors +1 -1
- model-00004-of-00004.safetensors +1 -1
- modeling_bunny_llama.py +1 -1
- tokenizer.json +59 -56
- tokenizer_config.json +1 -13
README.md
CHANGED
@@ -9,19 +9,17 @@ license: apache-2.0
 <img src="./icon.png" alt="Logo" width="350">
 </p>

-📖 [Technical report](https://arxiv.org/abs/2402.11530) | 🏠 [Code](https://github.com/BAAI-DCAI/Bunny) | 🐰 [Demo](https://wisemodel.cn/spaces/baai/Bunny)
+📖 [Technical report](https://arxiv.org/abs/2402.11530) | 🏠 [Code](https://github.com/BAAI-DCAI/Bunny) | 🐰 [3B Demo](https://wisemodel.cn/spaces/baai/Bunny) | 🐰 [8B Demo](https://252412006bcde38bfa.gradio.live/)

 This is Bunny-Llama-3-8B-V.

 Bunny is a family of lightweight but powerful multimodal models. It offers multiple plug-and-play vision encoders, like EVA-CLIP, SigLIP and language backbones, including Llama-3-8B, Phi-1.5, StableLM-2 and Phi-2. To compensate for the decrease in model size, we construct more informative training data by curated selection from a broader data source.

-We provide Bunny-Llama-3-8B-V, which is built upon [SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384) and [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B).
+We provide Bunny-Llama-3-8B-V, which is built upon [SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384) and [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B). More details about this model can be found in [GitHub](https://github.com/BAAI-DCAI/Bunny).

-
-
-| | MME \\(^{\text{P}}\\) | MME \\(^{\text{C}}\\) | MMB \\(^{\text{T/D}}\\) | SEED | MMMU \\(^{\text{V/T}}\\) | VQA \\(^{\text{v2}}\\) | GQA | SQA \\(^{\text{I}}\\) | POPE |
+| | MME \\(^{\text{P}}\\) | MME \\(^{\text{C}}\\) | MMB \\(^{\text{T/D}}\\) | SEED(-IMG) | MMMU \\(^{\text{V/T}}\\) | VQA \\(^{\text{v2}}\\) | GQA | SQA \\(^{\text{I}}\\) | POPE |
 | ------------------ | :--------------: | :--------------: | :----------------: | :--: | :-----------------: | :---------------: | :--: | :--------------: | :--: |
-| Bunny-Llama-3-8B-V |
+| Bunny-Llama-3-8B-V | 1592.2 | 335.0 | 76.2/75.6 | 66.0(73.3) | 39.7/36.8 | 82.5 | 64.4 | 75.7 | 87.6 |

@@ -65,7 +63,7 @@ tokenizer = AutoTokenizer.from_pretrained(
 prompt = 'Why is the image funny?'
 text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{prompt} ASSISTANT:"
 text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
-input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0)

 # image, sample images can be found in images folder
 image = Image.open('example_2.png')
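The one functional change in the README's Python snippet is `text_chunks[1]` → `text_chunks[1][1:]`. The Llama-3 fast tokenizer prepends `<|begin_of_text|>` (id 128000) to every call, so the chunk after the `<image>` split would otherwise carry a second BOS token in the middle of the prompt; the `[1:]` slice keeps a single BOS at the start. A minimal sketch of the updated construction, assuming the public repo id for this model card:

```python
import torch
from transformers import AutoTokenizer

# assumed repo id; adjust to a local path if the files are already downloaded
tokenizer = AutoTokenizer.from_pretrained("BAAI/Bunny-Llama-3-8B-V", trust_remote_code=True)

prompt = "Why is the image funny?"
text = f"USER: <image>\n{prompt} ASSISTANT:"
text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("<image>")]

# text_chunks[1][0] is 128000 (<|begin_of_text|>), added automatically by the
# tokenizer; drop it so only the first chunk contributes a BOS, then splice in
# Bunny's -200 image placeholder between the two chunks.
input_ids = torch.tensor(
    text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long
).unsqueeze(0)
```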
config.json
CHANGED
@@ -19,7 +19,7 @@
 "intermediate_size": 14336,
 "max_position_embeddings": 8192,
 "mm_hidden_size": 1152,
-"mm_projector_lr":
+"mm_projector_lr": 1e-05,
 "mm_projector_type": "mlp2x_gelu",
 "mm_vision_tower": "google/siglip-so400m-patch14-384",
 "model_type": "bunny-llama",
@@ -34,9 +34,9 @@
 "tokenizer_model_max_length": 2048,
 "tokenizer_padding_side": "right",
 "torch_dtype": "float16",
-"transformers_version": "4.
+"transformers_version": "4.40.0",
 "tune_mm_mlp_adapter": false,
-"unfreeze_vision_tower":
+"unfreeze_vision_tower": true,
 "use_cache": true,
 "use_mm_proj": true,
 "vocab_size": 128257
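A quick way to confirm the three fields touched here, assuming the updated config.json is in the working directory:

```python
import json

with open("config.json") as f:
    cfg = json.load(f)

# fields changed by this commit
print(cfg["mm_projector_lr"])        # 1e-05
print(cfg["transformers_version"])   # 4.40.0
print(cfg["unfreeze_vision_tower"])  # True
```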
generation_config.json
CHANGED
@@ -3,5 +3,5 @@
 "bos_token_id": 128000,
 "eos_token_id": 128001,
 "pad_token_id": 128001,
-"transformers_version": "4.
+"transformers_version": "4.40.0"
 }
model-00001-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:6ec0a7fd9ad460c3e0a4531b16c290472d8d133c348e9649efa2e9458e936a39
 size 4976706784
model-00002-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ec257e1403e02aaac7d2d79e67ad4f5378c4b041037aa00862b1983b014729b1
 size 4999802616
model-00003-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:fb70b23deec1b9d9512676ba5c42ea0d3c980c1cba387fc9cb14e46be6153b08
 size 4915916080
model-00004-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:423517f46806e703f787ecb8ac332c729f167baeaca7e500da9462583ed8cda1
 size 2067676408
modeling_bunny_llama.py
CHANGED
@@ -604,7 +604,7 @@ class BunnyMetaModel:
         super(BunnyMetaModel, self).__init__(config)

         if hasattr(config, "mm_vision_tower"):
-            self.vision_tower = build_vision_tower(config, delay_load=
+            self.vision_tower = build_vision_tower(config, delay_load=False)
             self.mm_projector = build_vision_projector(config)

     def get_vision_tower(self):
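For context on `delay_load=False`: a sketch of the usual pattern, not the repo's implementation — a delay_load flag typically decides whether the vision encoder is instantiated at construction time or deferred until `load_model()` is called, so passing False here builds the SigLIP tower as soon as `BunnyMetaModel` is constructed and its parameters exist before the checkpoint shards are loaded.

```python
# Illustrative sketch only; the class and attribute names below are
# hypothetical, not taken from modeling_bunny_llama.py.
class LazyVisionTower:
    def __init__(self, config, delay_load=False):
        self.config = config
        self.encoder = None
        if not delay_load:
            # delay_load=False: build the image encoder right away
            self.load_model()

    def load_model(self):
        if self.encoder is None:
            # placeholder for constructing the encoder named in
            # config.mm_vision_tower (e.g. the SigLIP tower)
            self.encoder = object()
```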
tokenizer.json
CHANGED
@@ -2306,15 +2306,6 @@
 "rstrip": false,
 "normalized": false,
 "special": true
-},
-{
-"id": 128256,
-"content": "<unk>",
-"single_word": false,
-"lstrip": false,
-"rstrip": false,
-"normalized": false,
-"special": true
 }
 ],
 "normalizer": null,
@@ -2338,58 +2329,69 @@
 ]
 },
 "post_processor": {
-"type": "TemplateProcessing",
-"single": [
-{
-"SpecialToken": {
-"id": "<|begin_of_text|>",
-"type_id": 0
-}
-},
-{
-"Sequence": {
-"id": "A",
-"type_id": 0
-}
-}
-],
-"pair": [
-{
-"SpecialToken": {
-"id": "<|begin_of_text|>",
-"type_id": 0
-}
-},
-{
-"Sequence": {
-"id": "A",
-"type_id": 0
-}
-},
+"type": "Sequence",
+"processors": [
 {
-"SpecialToken": {
-"id": "<|begin_of_text|>",
-"type_id": 1
-}
+"type": "ByteLevel",
+"add_prefix_space": true,
+"trim_offsets": false,
+"use_regex": true
 },
 {
-"Sequence": {
-"id": "B",
-"type_id": 1
-}
-}
+"type": "TemplateProcessing",
+"single": [
+{
+"SpecialToken": {
+"id": "<|begin_of_text|>",
+"type_id": 0
+}
+},
+{
+"Sequence": {
+"id": "A",
+"type_id": 0
+}
+}
+],
+"pair": [
+{
+"SpecialToken": {
+"id": "<|begin_of_text|>",
+"type_id": 0
+}
+},
+{
+"Sequence": {
+"id": "A",
+"type_id": 0
+}
+},
+{
+"SpecialToken": {
+"id": "<|begin_of_text|>",
+"type_id": 1
+}
+},
+{
+"Sequence": {
+"id": "B",
+"type_id": 1
+}
+}
 ],
-"tokens": [
-"<|begin_of_text|>"
-]
+"special_tokens": {
+"<|begin_of_text|>": {
+"id": "<|begin_of_text|>",
+"ids": [
+128000
+],
+"tokens": [
+"<|begin_of_text|>"
+]
+}
+}
 }
-}
+]
 },
 "decoder": {
 "type": "ByteLevel",
@@ -2405,6 +2407,7 @@
 "end_of_word_suffix": null,
 "fuse_unk": false,
 "byte_fallback": false,
+"ignore_merges": true,
 "vocab": {
 "!": 0,
 "\"": 1,
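The net effect of this file's changes: the `<unk>` added token (id 128256) is dropped, `"ignore_merges": true` is added to the BPE model, and the post-processor becomes a `Sequence` of a `ByteLevel` step followed by the original `TemplateProcessing`. A small check with the `tokenizers` library, assuming the updated tokenizer.json is in the working directory, that the BOS token is still prepended:

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")

# the TemplateProcessing step inside the new Sequence post-processor should
# still prepend <|begin_of_text|> (id 128000) to single sequences
enc = tok.encode("Why is the image funny?")
print(enc.ids[0])     # 128000
print(enc.tokens[0])  # <|begin_of_text|>
```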
tokenizer_config.json
CHANGED
@@ -1,6 +1,4 @@
 {
-"add_bos_token": true,
-"add_eos_token": false,
 "added_tokens_decoder": {
 "128000": {
 "content": "<|begin_of_text|>",
@@ -2049,14 +2047,6 @@
 "rstrip": false,
 "single_word": false,
 "special": true
-},
-"128256": {
-"content": "<unk>",
-"lstrip": false,
-"normalized": false,
-"rstrip": false,
-"single_word": false,
-"special": true
 }
 },
 "bos_token": "<|begin_of_text|>",
@@ -2067,7 +2057,5 @@
 "attention_mask"
 ],
 "model_max_length": 1000000000000000019884624838656,
-"tokenizer_class": "
-"unk_token": "<unk>",
-"use_default_system_prompt": false
+"tokenizer_class": "PreTrainedTokenizerFast"
 }
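With `add_bos_token`/`add_eos_token` and the `<unk>` entry removed and `tokenizer_class` set to `PreTrainedTokenizerFast`, BOS insertion now comes entirely from the fast tokenizer's post-processor. A minimal sanity check, assuming the updated tokenizer files from this commit are in the working directory:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
print(type(tok).__name__)                  # PreTrainedTokenizerFast
ids = tok("hello").input_ids
print(tok.convert_ids_to_tokens(ids[:1]))  # ['<|begin_of_text|>']
```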