# apollonia-7b/vision_tower/internvideo2/configuration_internvideo2.py
from transformers import PretrainedConfig


class InternVideo2Config(PretrainedConfig):
    model_type = "internvideo2"

    def __init__(
        self,
        img_size=224,
        patch_size=14,
        tubelet_size=1,
        num_frames=8,
        d_model=1408,
        num_heads=16,
        depth=40,
        mlp_ratio=48 / 11,
        qkv_bias=False,
        init_values=1e-5,
        use_checkpoint=False,
        checkpoint_num=0,
        use_flash_attn=False,
        use_fused_mlp=False,
        use_fused_rmsnorm=False,
        qk_normalization=True,
        clip_embed_dim=1408,
        attn_pool_num_heads=16,
        clip_teacher_embed_dim=512,
        clip_teacher_final_dim=512,
        clip_student_return_interval=4,
        clip_return_layer=3,
        clip_norm_type="l2",
        sep_image_video_pos_embed=False,
        **kwargs,
    ):
"""
This is the configuration class to store the configuration of a `InternVideo2Model`.
It is used to instantiate a InternVideo2 model according to the specified arguments,
defining the model architecture.
Args:
img_size (int, optional): Input image size. Defaults to 224.
patch_size (int, optional): Size of each patch. Defaults to 14.
tubelet_size (int, optional): Temporal tubelet size. Defaults to 1.
num_frames (int, optional): Number of frames in the video input. Defaults to 8.
d_model (int, optional): Dimension of the model embeddings. Defaults to 1408.
num_heads (int, optional): Number of attention heads. Defaults to 16.
depth (int, optional): Number of transformer encoder layers. Defaults to 40.
mlp_ratio (float, optional): Ratio of MLP hidden dim to embedding dim. Defaults to 48/11.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Defaults to False.
init_values (float, optional): Initial values for layer scale. Defaults to 1e-5.
use_checkpoint (bool, optional): Whether to use gradient checkpointing. Defaults to False.
checkpoint_num (int, optional): Number of layers to apply checkpointing. Defaults to 0.
use_flash_attn (bool, optional): Whether to use FlashAttention. Defaults to False.
use_fused_mlp (bool, optional): Whether to use fused MLP. Defaults to False.
use_fused_rmsnorm (bool, optional): Whether to use fused RMSNorm. Defaults to False.
qk_normalization (bool, optional): Whether to apply QK normalization. Defaults to True.
clip_embed_dim (int, optional): Embedding dimension for CLIP. Defaults to 1408.
attn_pool_num_heads (int, optional): Number of heads for attention pooling. Defaults to 16.
clip_teacher_embed_dim (int, optional): Embedding dimension for CLIP teacher model. Defaults to 512.
clip_teacher_final_dim (int, optional): Final embedding dimension for CLIP teacher model. Defaults to 512.
clip_student_return_interval (int, optional): Interval for returning student layers. Defaults to 4.
clip_return_layer (int, optional): Number of layers to return for alignment. Defaults to 3.
clip_norm_type (str, optional): Normalization type for CLIP ('l2' or 'none'). Defaults to 'l2'.
sep_image_video_pos_embed (bool, optional): Whether to use separate position embeddings for image and video. Defaults to False.
**kwargs: Additional keyword arguments.
"""
        super().__init__(**kwargs)
        self.img_size = img_size
        self.patch_size = patch_size
        self.tubelet_size = tubelet_size
        self.num_frames = num_frames
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.init_values = init_values
        self.use_checkpoint = use_checkpoint
        self.checkpoint_num = checkpoint_num
        self.use_flash_attn = use_flash_attn
        self.use_fused_mlp = use_fused_mlp
        self.use_fused_rmsnorm = use_fused_rmsnorm
        self.qk_normalization = qk_normalization
        self.clip_embed_dim = clip_embed_dim
        self.attn_pool_num_heads = attn_pool_num_heads
        self.clip_teacher_embed_dim = clip_teacher_embed_dim
        self.clip_teacher_final_dim = clip_teacher_final_dim
        self.clip_student_return_interval = clip_student_return_interval
        self.clip_return_layer = clip_return_layer
        self.clip_norm_type = clip_norm_type
        self.sep_image_video_pos_embed = sep_image_video_pos_embed
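

# A minimal usage sketch, not part of the original file: it instantiates the config with
# defaults, overrides a couple of fields, and prints the JSON form via helpers inherited
# from `PretrainedConfig`. The `__main__` guard and the chosen overrides are illustrative.
if __name__ == "__main__":
    # Override two fields; all other values fall back to the defaults defined above.
    config = InternVideo2Config(num_frames=16, use_flash_attn=True)
    print(config.model_type, config.d_model, config.num_frames)

    # `to_json_string()` serializes the full configuration, which is what gets written
    # out as config.json when the config is saved with `save_pretrained`.
    print(config.to_json_string())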