# coding=utf-8
""" mPLUGOwl3 model configuration"""

from transformers.utils import logging
from transformers.models.siglip.configuration_siglip import SiglipVisionConfig

from .configuration_hyper_qwen2 import HyperQwen2Config

logger = logging.get_logger(__name__)


class mPLUGOwl3Config(HyperQwen2Config):
    model_type = "mplugowl3"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default SigLIP vision tower: so400m variant, 378px input, 14px patches.
    default_vision_config = {
        "hidden_size": 1152,
        "image_size": 378,
        "intermediate_size": 4304,
        "model_type": "siglip_vision_model",
        "num_attention_heads": 16,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }

    def __init__(
        self,
        use_cache=True,
        vision_config=None,
        **kwargs,
    ):
        self.use_cache = use_cache

        # Same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit, with tgt_sizes added.
        if vision_config is None:
            self.vision_config = SiglipVisionConfig(**self.default_vision_config)
            logger.info("vision_config is None, using default vision config")
        elif isinstance(vision_config, dict):
            self.vision_config = SiglipVisionConfig(**vision_config)
        elif isinstance(vision_config, SiglipVisionConfig):
            self.vision_config = vision_config
        else:
            # Reject unsupported types early rather than leaving self.vision_config unset.
            raise TypeError(
                f"vision_config must be None, a dict, or a SiglipVisionConfig, got {type(vision_config)}"
            )

        self.image_size = 378
        self.patch_size = self.vision_config.patch_size

        super().__init__(**kwargs)
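
# A minimal usage sketch, not part of the original file: it shows the three
# accepted forms of `vision_config` (None, dict, SiglipVisionConfig). It assumes
# HyperQwen2Config, like other Transformers configs, is constructible with no
# arguments; the printed values follow from default_vision_config above.
if __name__ == "__main__":
    # Default construction: the SigLIP vision config is built from default_vision_config.
    config = mPLUGOwl3Config()
    print(config.model_type)                # "mplugowl3"
    print(config.vision_config.image_size)  # 378
    print(config.patch_size)                # 14

    # A dict is converted to a SiglipVisionConfig; keys not given fall back to
    # SiglipVisionConfig defaults. The values here are illustrative only.
    custom = mPLUGOwl3Config(vision_config={"patch_size": 16, "image_size": 384})
    print(custom.patch_size)                # 16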