|
from transformers import PretrainedConfig |
|
|
|
|
|
class InternVideo2Config(PretrainedConfig):
    """Configuration class to store the configuration of an `InternVideo2Model`.

    It is used to instantiate an InternVideo2 model according to the specified
    arguments, defining the model architecture. Instantiating a configuration
    with the defaults yields the large (1.4B-dim, 40-layer) vision encoder
    described by the default values below.

    Args:
        img_size (int, optional): Input image size. Defaults to 224.
        patch_size (int, optional): Size of each spatial patch. Defaults to 14.
        tubelet_size (int, optional): Temporal tubelet size. Defaults to 1.
        num_frames (int, optional): Number of frames in the video input. Defaults to 8.
        d_model (int, optional): Dimension of the model embeddings. Defaults to 1408.
        num_heads (int, optional): Number of attention heads. Defaults to 16.
        depth (int, optional): Number of transformer encoder layers. Defaults to 40.
        mlp_ratio (float, optional): Ratio of MLP hidden dim to embedding dim.
            Defaults to 48/11 (~4.364), which keeps the MLP hidden size a
            multiple of 256 when ``d_model`` is 1408.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value.
            Defaults to False.
        init_values (float, optional): Initial values for layer scale. Defaults to 1e-5.
        use_checkpoint (bool, optional): Whether to use gradient checkpointing.
            Defaults to False.
        checkpoint_num (int, optional): Number of layers to apply checkpointing to.
            Defaults to 0.
        use_flash_attn (bool, optional): Whether to use FlashAttention. Defaults to False.
        use_fused_mlp (bool, optional): Whether to use a fused MLP kernel. Defaults to False.
        use_fused_rmsnorm (bool, optional): Whether to use a fused RMSNorm kernel.
            Defaults to False.
        qk_normalization (bool, optional): Whether to apply QK normalization.
            Defaults to True.
        clip_embed_dim (int, optional): Embedding dimension for CLIP. Defaults to 1408.
        attn_pool_num_heads (int, optional): Number of heads for attention pooling.
            Defaults to 16.
        clip_teacher_embed_dim (int, optional): Embedding dimension of the CLIP teacher
            model. Defaults to 512.
        clip_teacher_final_dim (int, optional): Final embedding dimension of the CLIP
            teacher model. Defaults to 512.
        clip_student_return_interval (int, optional): Interval (in layers) at which
            student layers are returned for teacher alignment. Defaults to 4.
        clip_return_layer (int, optional): Number of layers to return for alignment.
            Defaults to 3.
        clip_norm_type (str, optional): Normalization type for CLIP alignment
            ('l2' or 'none'). Defaults to 'l2'.
        sep_image_video_pos_embed (bool, optional): Whether to use separate position
            embeddings for image and video inputs. Defaults to False.
        **kwargs: Additional keyword arguments forwarded to `PretrainedConfig`.
    """

    model_type = "internvideo2"

    def __init__(
        self,
        img_size=224,
        patch_size=14,
        tubelet_size=1,
        num_frames=8,
        d_model=1408,
        num_heads=16,
        depth=40,
        mlp_ratio=48 / 11,
        qkv_bias=False,
        init_values=1e-5,
        use_checkpoint=False,
        checkpoint_num=0,
        use_flash_attn=False,
        use_fused_mlp=False,
        use_fused_rmsnorm=False,
        qk_normalization=True,
        clip_embed_dim=1408,
        attn_pool_num_heads=16,
        clip_teacher_embed_dim=512,
        clip_teacher_final_dim=512,
        clip_student_return_interval=4,
        clip_return_layer=3,
        clip_norm_type="l2",
        sep_image_video_pos_embed=False,
        **kwargs,
    ):
        # Let PretrainedConfig consume/standardize common kwargs first
        # (e.g. name_or_path, torch_dtype) before storing model-specific ones.
        super().__init__(**kwargs)

        # Vision input geometry.
        self.img_size = img_size
        self.patch_size = patch_size
        self.tubelet_size = tubelet_size
        self.num_frames = num_frames

        # Transformer encoder architecture.
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.init_values = init_values

        # Training / kernel options.
        self.use_checkpoint = use_checkpoint
        self.checkpoint_num = checkpoint_num
        self.use_flash_attn = use_flash_attn
        self.use_fused_mlp = use_fused_mlp
        self.use_fused_rmsnorm = use_fused_rmsnorm
        self.qk_normalization = qk_normalization

        # CLIP projection / teacher-alignment options.
        self.clip_embed_dim = clip_embed_dim
        self.attn_pool_num_heads = attn_pool_num_heads
        self.clip_teacher_embed_dim = clip_teacher_embed_dim
        self.clip_teacher_final_dim = clip_teacher_final_dim
        self.clip_student_return_interval = clip_student_return_interval
        self.clip_return_layer = clip_return_layer
        self.clip_norm_type = clip_norm_type

        # Positional-embedding layout.
        self.sep_image_video_pos_embed = sep_image_video_pos_embed
|
|