File size: 4,659 Bytes
f01c2b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from transformers import PretrainedConfig


class InternVideo2Config(PretrainedConfig):
    """Configuration container for an `InternVideo2Model`.

    Every constructor argument is stored unchanged as an instance attribute
    of the same name; any extra keyword arguments are forwarded to
    `PretrainedConfig.__init__`. This class performs no validation of the
    values it receives.

    Args:
        img_size (int, optional): Input image size. Defaults to 224.
        patch_size (int, optional): Size of each patch. Defaults to 14.
        tubelet_size (int, optional): Temporal tubelet size. Defaults to 1.
        num_frames (int, optional): Number of frames in the video input.
            Defaults to 8.
        d_model (int, optional): Dimension of the model embeddings.
            Defaults to 1408.
        num_heads (int, optional): Number of attention heads. Defaults to 16.
        depth (int, optional): Number of transformer encoder layers.
            Defaults to 40.
        mlp_ratio (float, optional): Ratio of the MLP hidden dimension to the
            embedding dimension. Defaults to 48/11.
        qkv_bias (bool, optional): If True, add a learnable bias to query,
            key and value. Defaults to False.
        init_values (float, optional): Initial values for layer scale.
            Defaults to 1e-5.
        use_checkpoint (bool, optional): Whether to use gradient
            checkpointing. Defaults to False.
        checkpoint_num (int, optional): Number of layers to apply
            checkpointing to. Defaults to 0.
        use_flash_attn (bool, optional): Whether to use FlashAttention.
            Defaults to False.
        use_fused_mlp (bool, optional): Whether to use a fused MLP.
            Defaults to False.
        use_fused_rmsnorm (bool, optional): Whether to use a fused RMSNorm.
            Defaults to False.
        qk_normalization (bool, optional): Whether to apply QK normalization.
            Defaults to True.
        clip_embed_dim (int, optional): Embedding dimension for CLIP.
            Defaults to 1408.
        attn_pool_num_heads (int, optional): Number of heads for attention
            pooling. Defaults to 16.
        clip_teacher_embed_dim (int, optional): Embedding dimension of the
            CLIP teacher model. Defaults to 512.
        clip_teacher_final_dim (int, optional): Final embedding dimension of
            the CLIP teacher model. Defaults to 512.
        clip_student_return_interval (int, optional): Interval for returning
            student layers. Defaults to 4.
        clip_return_layer (int, optional): Number of layers to return for
            alignment. Defaults to 3.
        clip_norm_type (str, optional): Normalization type for CLIP
            ('l2' or 'none'). Defaults to 'l2'.
        sep_image_video_pos_embed (bool, optional): Whether to use separate
            position embeddings for image and video. Defaults to False.
        **kwargs: Additional keyword arguments passed through to
            `PretrainedConfig`.
    """

    model_type = "internvideo2"

    def __init__(
        self,
        img_size=224,
        patch_size=14,
        tubelet_size=1,
        num_frames=8,
        d_model=1408,
        num_heads=16,
        depth=40,
        mlp_ratio=48 / 11,
        qkv_bias=False,
        init_values=1e-5,
        use_checkpoint=False,
        checkpoint_num=0,
        use_flash_attn=False,
        use_fused_mlp=False,
        use_fused_rmsnorm=False,
        qk_normalization=True,
        clip_embed_dim=1408,
        attn_pool_num_heads=16,
        clip_teacher_embed_dim=512,
        clip_teacher_final_dim=512,
        clip_student_return_interval=4,
        clip_return_layer=3,
        clip_norm_type="l2",
        sep_image_video_pos_embed=False,
        **kwargs,
    ):
        """Record the given hyperparameters; see the class docstring for each one."""
        # Base-class setup runs first, exactly as before, so the attributes
        # assigned below are written after whatever the parent initialises.
        super().__init__(**kwargs)

        # --- Input geometry ---
        self.img_size = img_size
        self.patch_size = patch_size
        self.tubelet_size = tubelet_size
        self.num_frames = num_frames

        # --- Transformer backbone ---
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.init_values = init_values

        # --- Checkpointing and fused / accelerated kernels ---
        self.use_checkpoint = use_checkpoint
        self.checkpoint_num = checkpoint_num
        self.use_flash_attn = use_flash_attn
        self.use_fused_mlp = use_fused_mlp
        self.use_fused_rmsnorm = use_fused_rmsnorm
        self.qk_normalization = qk_normalization

        # --- CLIP projection / teacher-student settings ---
        self.clip_embed_dim = clip_embed_dim
        self.attn_pool_num_heads = attn_pool_num_heads
        self.clip_teacher_embed_dim = clip_teacher_embed_dim
        self.clip_teacher_final_dim = clip_teacher_final_dim
        self.clip_student_return_interval = clip_student_return_interval
        self.clip_return_layer = clip_return_layer
        self.clip_norm_type = clip_norm_type

        # --- Positional embeddings ---
        self.sep_image_video_pos_embed = sep_image_video_pos_embed