from transformers import PretrainedConfig


class InternVideo2Config(PretrainedConfig):
    model_type = "internvideo2"

    def __init__(
        self,
        img_size=224,
        patch_size=14,
        tubelet_size=1,
        num_frames=8,
        d_model=1408,
        num_heads=16,
        depth=40,
        mlp_ratio=48 / 11,
        qkv_bias=False,
        init_values=1e-5,
        use_checkpoint=False,
        checkpoint_num=0,
        use_flash_attn=False,
        use_fused_mlp=False,
        use_fused_rmsnorm=False,
        qk_normalization=True,
        clip_embed_dim=1408,
        attn_pool_num_heads=16,
        clip_teacher_embed_dim=512,
        clip_teacher_final_dim=512,
        clip_student_return_interval=4,
        clip_return_layer=3,
        clip_norm_type="l2",
        sep_image_video_pos_embed=False,
        **kwargs,
    ):
        """
        This is the configuration class to store the configuration of an `InternVideo2Model`. It is used to
        instantiate an InternVideo2 model according to the specified arguments, defining the model architecture.

        Args:
            img_size (int, optional): Input image size. Defaults to 224.
            patch_size (int, optional): Size of each patch. Defaults to 14.
            tubelet_size (int, optional): Temporal tubelet size. Defaults to 1.
            num_frames (int, optional): Number of frames in the video input. Defaults to 8.
            d_model (int, optional): Dimension of the model embeddings. Defaults to 1408.
            num_heads (int, optional): Number of attention heads. Defaults to 16.
            depth (int, optional): Number of transformer encoder layers. Defaults to 40.
            mlp_ratio (float, optional): Ratio of MLP hidden dim to embedding dim. Defaults to 48/11.
            qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Defaults to False.
            init_values (float, optional): Initial values for layer scale. Defaults to 1e-5.
            use_checkpoint (bool, optional): Whether to use gradient checkpointing. Defaults to False.
            checkpoint_num (int, optional): Number of layers to apply checkpointing to. Defaults to 0.
            use_flash_attn (bool, optional): Whether to use FlashAttention. Defaults to False.
            use_fused_mlp (bool, optional): Whether to use a fused MLP. Defaults to False.
            use_fused_rmsnorm (bool, optional): Whether to use fused RMSNorm. Defaults to False.
            qk_normalization (bool, optional): Whether to apply QK normalization. Defaults to True.
            clip_embed_dim (int, optional): Embedding dimension for CLIP. Defaults to 1408.
            attn_pool_num_heads (int, optional): Number of heads for attention pooling. Defaults to 16.
            clip_teacher_embed_dim (int, optional): Embedding dimension of the CLIP teacher model. Defaults to 512.
            clip_teacher_final_dim (int, optional): Final embedding dimension of the CLIP teacher model. Defaults to 512.
            clip_student_return_interval (int, optional): Interval at which student layers are returned. Defaults to 4.
            clip_return_layer (int, optional): Number of layers to return for alignment. Defaults to 3.
            clip_norm_type (str, optional): Normalization type for CLIP ('l2' or 'none'). Defaults to 'l2'.
            sep_image_video_pos_embed (bool, optional): Whether to use separate position embeddings for image and video. Defaults to False.
            **kwargs: Additional keyword arguments passed to `PretrainedConfig`.
        """
        super().__init__(**kwargs)
        self.img_size = img_size
        self.patch_size = patch_size
        self.tubelet_size = tubelet_size
        self.num_frames = num_frames
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.init_values = init_values
        self.use_checkpoint = use_checkpoint
        self.checkpoint_num = checkpoint_num
        self.use_flash_attn = use_flash_attn
        self.use_fused_mlp = use_fused_mlp
        self.use_fused_rmsnorm = use_fused_rmsnorm
        self.qk_normalization = qk_normalization
        self.clip_embed_dim = clip_embed_dim
        self.attn_pool_num_heads = attn_pool_num_heads
        self.clip_teacher_embed_dim = clip_teacher_embed_dim
        self.clip_teacher_final_dim = clip_teacher_final_dim
        self.clip_student_return_interval = clip_student_return_interval
        self.clip_return_layer = clip_return_layer
        self.clip_norm_type = clip_norm_type
        self.sep_image_video_pos_embed = sep_image_video_pos_embed