# coding=utf-8
""" mPLUGOwl3 model configuration"""
import os
from typing import Union

from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
from transformers.utils import logging

from .configuration_hyper_qwen2 import HyperQwen2Config

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
class mPLUGOwl3Config(HyperQwen2Config):
    """Configuration for mPLUGOwl3: a ``HyperQwen2Config`` plus a SigLIP vision config.

    On top of whatever the parent configuration stores, an instance carries:

    * ``use_cache`` -- the value passed to ``__init__``.
    * ``vision_config`` -- a ``SiglipVisionConfig`` instance.
    * ``image_size`` / ``patch_size`` -- copied from ``vision_config`` so they
      can be read directly from the top-level configuration.
    """

    model_type = "mplugowl3"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Keyword arguments used to build the vision config when the caller does
    # not supply one. Treat as read-only: it is shared by every instance.
    default_vision_config = {
        "hidden_size": 1152,
        "image_size": 384,
        "intermediate_size": 4304,
        "model_type": "siglip_vision_model",
        "num_attention_heads": 16,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }

    def __init__(
        self,
        use_cache=True,
        vision_config=None,
        **kwargs,
    ):
        """Build the configuration.

        Args:
            use_cache: Stored on the instance as ``use_cache``.
            vision_config: One of
                * ``None`` -- build a ``SiglipVisionConfig`` from
                  ``default_vision_config``;
                * ``dict`` -- unpacked as keyword arguments into
                  ``SiglipVisionConfig``;
                * ``SiglipVisionConfig`` -- used as-is (not copied).
            **kwargs: Forwarded unchanged to ``HyperQwen2Config.__init__``.

        Raises:
            TypeError: If ``vision_config`` is none of the accepted types.
        """
        # NOTE(review): this is assigned *before* the parent initializer runs.
        # If HyperQwen2Config.__init__ also assigns ``use_cache`` from its own
        # default, the value set here would be overwritten -- confirm against
        # the parent class.
        self.use_cache = use_cache

        # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes
        if vision_config is None:
            self.vision_config = SiglipVisionConfig(**self.default_vision_config)
            logger.info("vision_config is None, using default vision config")
        elif isinstance(vision_config, dict):
            self.vision_config = SiglipVisionConfig(**vision_config)
        elif isinstance(vision_config, SiglipVisionConfig):
            self.vision_config = vision_config
        else:
            # Without this branch ``self.vision_config`` would never be set and
            # the attribute reads below would fail with an unhelpful
            # AttributeError instead of pointing at the bad argument.
            raise TypeError(
                "vision_config must be None, a dict, or a SiglipVisionConfig, "
                f"got {type(vision_config).__name__}"
            )

        # Mirror the vision geometry on the top-level configuration.
        self.image_size = self.vision_config.image_size
        self.patch_size = self.vision_config.patch_size

        super().__init__(**kwargs)