model upload

Browse files

Files changed (11) hide show

config.json +43 -0
configuration_flamingo.py +35 -0
flamingo_pytorch.py +220 -0
generation_config.json +7 -0
modeling_flamingo.py +516 -0
preprocessor_config.json +23 -0
pytorch_model.bin +3 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +5 -0
utils.py +37 -0

config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "_name_or_path": "facebook/opt-125m",
+  "_remove_final_layer_norm": false,
+  "activation_dropout": 0.0,
+  "activation_function": "relu",
+  "architectures": [
+    "FlamingoForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_flamingo.FlamingoConfig",
+    "AutoModelForCausalLM": "modeling_flamingo.FlamingoForCausalLM"
+  },
+  "attention_dropout": 0.0,
+  "bos_token_id": 2,
+  "cross_attn_every": 2,
+  "do_layer_norm_before": true,
+  "dropout": 0.1,
+  "enable_bias": true,
+  "eos_token_id": 2,
+  "ffn_dim": 3072,
+  "finetune_LM": true,
+  "hidden_size": 768,
+  "id_perceiver": false,
+  "init_std": 0.02,
+  "inp_dim": 768,
+  "layer_norm_elementwise_affine": true,
+  "layerdrop": 0.0,
+  "max_position_embeddings": 2048,
+  "media_token_id": 32768,
+  "model_type": "opt",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "only_attend_immediate_media": true,
+  "pad_token_id": 1,
+  "perceiver_depth": 2,
+  "perceiver_num_latents": 64,
+  "prefix": "</s>",
+  "torch_dtype": "float32",
+  "transformers_version": "4.29.0",
+  "use_cache": true,
+  "vocab_size": 32778,
+  "word_embed_proj_dim": 768
+}

configuration_flamingo.py ADDED Viewed

	@@ -0,0 +1,35 @@

+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Union
+import transformers.models.opt.configuration_opt as configuration_opt
+class FlamingoConfig(configuration_opt.OPTConfig, dict):
+    """OPT configuration extended with two Flamingo-specific settings.
+
+    Stores ``media_token_id`` and ``cross_attn_every`` in addition to what
+    ``OPTConfig`` stores, and mirrors the instance attributes into the
+    ``dict`` base class.
+
+    Args:
+        cross_attn_every: stored as ``self.cross_attn_every``.
+        vocab_size: forwarded to ``OPTConfig.__init__``.
+        media_token_id: stored as ``self.media_token_id``.
+        **kwargs: forwarded unchanged to ``OPTConfig.__init__``.
+    """
+    model_type = "flamingo"
+    def __init__(
+        self,
+        cross_attn_every=2,
+        vocab_size=32778,
+        media_token_id=32768,
+        **kwargs,
+    ):
+        # Each base class is initialised explicitly rather than via super().
+        configuration_opt.OPTConfig.__init__(
+                self, vocab_size=vocab_size, **kwargs)
+        self.media_token_id = media_token_id
+        self.cross_attn_every = cross_attn_every
+        # Snapshot the attributes set so far into the dict base; attributes
+        # assigned after this call are not copied.
+        # NOTE(review): presumably done so the config can be handled as a
+        # plain mapping (e.g. when serialising) -- confirm with the author.
+        dict.__init__(self, **self.__dict__)

flamingo_pytorch.py ADDED Viewed

	@@ -0,0 +1,220 @@

+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from einops_exts import rearrange_many, repeat_many
+import pdb
+def exists(val):
+    """Return ``True`` for any value other than ``None`` (falsy values included)."""
+    if val is None:
+        return False
+    return True
+def FeedForward(dim, mult = 4):
+    """Build a pre-norm, bias-free two-layer MLP.
+
+    The stack is LayerNorm -> Linear(dim, dim * mult) -> GELU ->
+    Linear(dim * mult, dim), returned as an ``nn.Sequential``.
+    """
+    hidden_dim = int(dim * mult)
+    stages = [
+        nn.LayerNorm(dim),
+        nn.Linear(dim, hidden_dim, bias = False),
+        nn.GELU(),
+        nn.Linear(hidden_dim, dim, bias = False),
+    ]
+    return nn.Sequential(*stages)
+class PerceiverAttention(nn.Module):
+    """Attention layer in which latents query media features.
+
+    Queries are projected from the latents only; keys and values are
+    projected from the media features concatenated with the latents.
+    """
+    def __init__(
+        self,
+        *,
+        dim,
+        dim_head = 64,
+        heads = 8
+    ):
+        """
+        Args:
+            dim: feature size of the media features and of the latents.
+            dim_head: feature size of one attention head.
+            heads: number of attention heads.
+        """
+        super().__init__()
+        self.heads = heads
+        self.scale = dim_head ** -0.5
+        width = dim_head * heads
+        self.norm_media = nn.LayerNorm(dim)
+        self.norm_latents = nn.LayerNorm(dim)
+        self.to_q = nn.Linear(dim, width, bias = False)
+        self.to_kv = nn.Linear(dim, width * 2, bias = False)
+        self.to_out = nn.Linear(width, dim, bias = False)
+    def forward(self, x, latents):
+        """Attend from ``latents`` to ``x`` concatenated with ``latents``.
+
+        Axis names used in the patterns below:
+        b - batch
+        t - time
+        n - sequence
+        h - heads
+        d - dimension
+
+        Both inputs are rank 4, laid out as ``(b, t, n, d)``; the result has
+        the same layout as ``latents``.
+        """
+        num_heads = self.heads
+        media = self.norm_media(x)
+        normed_latents = self.norm_latents(latents)
+        queries = self.to_q(normed_latents)
+        # unlike the original Perceiver, the latents are also part of the
+        # keys / values, so they are appended to the media along the sequence axis
+        context = torch.cat((media, normed_latents), dim = -2)
+        keys, values = self.to_kv(context).chunk(2, dim = -1)
+        queries, keys, values = rearrange_many(
+            (queries, keys, values), 'b t n (h d) -> b h t n d', h = num_heads)
+        queries = queries * self.scale
+        # attention scores, shifted by the row maximum before the softmax
+        scores = einsum('... i d, ... j d  -> ... i j', queries, keys)
+        scores = scores - scores.amax(dim = -1, keepdim = True).detach()
+        weights = scores.softmax(dim = -1)
+        mixed = einsum('... i j, ... j d -> ... i d', weights, values)
+        mixed = rearrange(mixed, 'b h t n d -> b t n (h d)', h = num_heads)
+        return self.to_out(mixed)
+class PerceiverResampler(nn.Module):
+    """Resample media features into a fixed number of latent vectors.
+
+    A learned set of ``num_latents`` latents is refined by ``depth`` rounds
+    of ``PerceiverAttention`` followed by a feed-forward block, each with a
+    residual connection, and layer-normalised at the end.
+    """
+    def __init__(
+        self,
+        *,
+        dim,
+        depth,
+        dim_head = 64,
+        heads = 8,
+        num_latents = 64,
+        num_time_embeds = 4,
+        ff_mult = 4,
+        inp_dim=None,
+    ):
+        """
+        Args:
+            dim: working feature size.
+            depth: number of (attention, feed-forward) pairs.
+            dim_head: feature size per attention head.
+            heads: number of attention heads.
+            num_latents: number of learned latent vectors.
+            num_time_embeds: number of learned time position embeddings.
+            ff_mult: hidden-size multiplier of the feed-forward blocks.
+            inp_dim: when given, inputs are first projected from ``inp_dim``
+                to ``dim`` by a bias-free linear layer.
+        """
+        super().__init__()
+        self.latents = nn.Parameter(torch.randn(num_latents, dim))
+        self.time_pos_emb = nn.Parameter(torch.randn(num_time_embeds, 1, dim))
+        self.inp_linear = nn.Linear(inp_dim, dim, bias=False) if inp_dim is not None else None
+        self.layers = nn.ModuleList([
+            nn.ModuleList([
+                PerceiverAttention(dim = dim, dim_head = dim_head, heads = heads),
+                FeedForward(dim = dim, mult = ff_mult)
+            ])
+            for _ in range(depth)
+        ])
+        self.norm = nn.LayerNorm(dim)
+    def forward(self, x):
+        """Return the resampled latents for ``x``.
+
+        A rank-3 input ``(b, n, d)`` is given a singleton time axis first,
+        so the attention layers always see ``(b, t, n, d)``.
+        """
+        if x.ndim == 3:
+            x = rearrange(x, 'b n d -> b 1 n d')
+        if self.inp_linear is not None:
+            x = self.inp_linear(x)
+        batch, times = x.shape[0], x.shape[1]
+        # add one learned embedding per time step
+        x = x + self.time_pos_emb[:times]
+        # every (batch, time) slot starts from the same learned latents
+        latents = repeat(self.latents, 'n d -> b m n d', b = batch, m = times)
+        for attend, feed_forward in self.layers:
+            latents = attend(x, latents) + latents
+            latents = feed_forward(latents) + latents
+        return self.norm(latents)
+# gated cross attention
+class MaskedCrossAttention(nn.Module):
+    """Cross-attention from text tokens to media latents, with media-aware masking.
+
+    When ``media_locations`` is supplied, each text position may only attend
+    to the media selected by its running media count (see ``forward``).
+    """
+    def __init__(
+        self,
+        *,
+        dim,
+        dim_head = 64,
+        heads = 8,
+        only_attend_immediate_media = True
+    ):
+        """
+        Args:
+            dim: feature size of the text and media inputs.
+            dim_head: feature size per attention head.
+            heads: number of attention heads.
+            only_attend_immediate_media: if True, text attends only to the
+                most recent preceding media; otherwise to all preceding media.
+        """
+        super().__init__()
+        self.scale = dim_head ** -0.5
+        self.heads = heads
+        inner_dim = dim_head * heads
+        self.norm = nn.LayerNorm(dim)
+        self.to_q = nn.Linear(dim, inner_dim, bias = False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False)
+        self.to_out = nn.Linear(inner_dim, dim, bias = False)
+        # whether for text to only attend to immediate preceding image, or all images
+        self.only_attend_immediate_media = only_attend_immediate_media
+    def forward(
+        self,
+        x,
+        media,
+        media_locations = None
+    ):
+        """Attend from text features ``x`` to ``media``.
+
+        Args:
+            x: text features laid out as ``(b, n, d)``.
+            media: media latents laid out as ``(b, t, n, d)``; the time and
+                latent axes are flattened into one key/value sequence.
+            media_locations: optional boolean tensor ``(b, i)`` marking the
+                text positions that hold a media token. Without it no
+                masking is applied.
+
+        Returns:
+            Tensor with the same layout as ``x``.
+        """
+        _, t, m = media.shape[:3]
+        h = self.heads
+        x = self.norm(x)
+        q = self.to_q(x)
+        media = rearrange(media, 'b t n d -> b (t n) d')
+        k, v = self.to_kv(media).chunk(2, dim = -1)
+        q, k, v = rearrange_many((q, k, v), 'b n (h d) -> b h n d', h = h)
+        q = q * self.scale
+        sim = einsum('... i d, ... j d -> ... i j', q, k)
+        if exists(media_locations):
+            text_time = media_locations.cumsum(dim = -1) # at each boolean of True, increment the time counter (relative to media time)
+            media_time = torch.arange(t, device = x.device) + 1
+            # text time must equal media time if only attending to most immediate image
+            # otherwise, as long as text time is greater than media time (if attending to all previous images / media)
+            mask_op = torch.eq if self.only_attend_immediate_media else torch.ge
+            text_to_media_mask = mask_op(rearrange(text_time, 'b i -> b 1 i 1'), repeat(media_time, 'j -> 1 1 1 (j m)', m = m))
+            sim = sim.masked_fill(~text_to_media_mask, -torch.finfo(sim.dtype).max)
+        sim = sim - sim.amax(dim = -1, keepdim = True).detach()
+        attn = sim.softmax(dim = -1)
+        if exists(media_locations) and self.only_attend_immediate_media:
+            # any text without a preceding media needs to have attention zeroed out
+            text_without_media_mask = text_time == 0
+            text_without_media_mask = rearrange(text_without_media_mask, 'b i -> b 1 i 1')
+            # masked_fill is out-of-place: the result has to be assigned back.
+            # Previously it was discarded, so rows whose scores were all masked
+            # kept the uniform weights produced by the softmax.
+            # NOTE(review): weights trained with the old behaviour may need to
+            # be re-validated after this change.
+            attn = attn.masked_fill(text_without_media_mask, 0.)
+        out = einsum('... i j, ... j d -> ... i d', attn, v)
+        out = rearrange(out, 'b h n d -> b n (h d)')
+        return self.to_out(out)
+class GatedCrossAttentionBlock(nn.Module):
+    """Masked cross-attention and a feed-forward block, each behind a tanh gate.
+
+    Both gate parameters start at zero, so ``tanh(gate)`` is zero and a
+    freshly constructed block returns its input unchanged.
+    """
+    def __init__(
+        self,
+        *,
+        dim,
+        dim_head = 64,
+        heads = 8,
+        ff_mult = 4,
+        only_attend_immediate_media = True
+    ):
+        """
+        Args:
+            dim: feature size of the text and media inputs.
+            dim_head: feature size per attention head.
+            heads: number of attention heads.
+            ff_mult: hidden-size multiplier of the feed-forward block.
+            only_attend_immediate_media: passed through to ``MaskedCrossAttention``.
+        """
+        super().__init__()
+        self.attn = MaskedCrossAttention(
+            dim = dim,
+            dim_head = dim_head,
+            heads = heads,
+            only_attend_immediate_media = only_attend_immediate_media,
+        )
+        self.attn_gate = nn.Parameter(torch.tensor([0.]))
+        self.ff = FeedForward(dim, mult = ff_mult)
+        self.ff_gate = nn.Parameter(torch.tensor([0.]))
+    def forward(
+        self,
+        x,
+        media,                  # media tensor, encoded by perceiver resample - (batch, time, latents, dim)
+        media_locations = None  # boolean tensor indicating positions of media - (batch, sequence)
+    ):
+        """Apply the gated cross-attention, then the gated feed-forward, both residual."""
+        attended = self.attn(x, media, media_locations = media_locations)
+        x = attended * self.attn_gate.tanh() + x
+        transformed = self.ff(x)
+        x = transformed * self.ff_gate.tanh()  + x
+        return x

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 2,
+  "eos_token_id": 2,
+  "pad_token_id": 1,
+  "transformers_version": "4.29.0"
+}

modeling_flamingo.py ADDED Viewed

	@@ -0,0 +1,516 @@

+import random
+import pdb
+from einops import rearrange
+from typing import List, Optional, Tuple, Union
+import os
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+import transformers.models.opt.modeling_opt as modeling_opt
+from transformers.models.opt.modeling_opt\
+        import OPTDecoderLayer, OPTPreTrainedModel, OPTConfig
+from transformers import ViTModel
+from .utils import exists, freeze_all_layers_, unfreeze_all_layers_
+from .flamingo_pytorch import GatedCrossAttentionBlock, PerceiverResampler
+class OPTLearnedPositionalEmbedding(nn.Embedding):
+    """
+    Learned positional embeddings up to a fixed maximum size, looked up from
+    positions derived from an attention mask.
+    """
+    def __init__(self, num_embeddings: int, embedding_dim: int):
+        # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
+        # and adjust num_embeddings appropriately. Other models don't have this hack
+        self.offset = 2
+        table_size = num_embeddings + self.offset
+        super().__init__(table_size, embedding_dim)
+    def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
+        """Embed the positions implied by ``attention_mask`` ([bsz x seqlen]).
+
+        The position of a token is the running count of unmasked entries
+        minus one; masked entries get position ``-1``. The first
+        ``past_key_values_length`` columns are dropped and ``self.offset`` is
+        added before the table lookup.
+        """
+        mask = attention_mask.long()
+        # running count of attended tokens, zeroed where the mask is zero
+        running = torch.cumsum(mask, dim=1).type_as(mask)
+        positions = (running * mask).long() - 1
+        # keep only the columns that are not already covered by the cache
+        fresh = positions[:, past_key_values_length:]
+        return super().forward(fresh + self.offset)
+class OPTDecoder(modeling_opt.OPTDecoder):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]
+    Args:
+        config: OPTConfig
+        embed_tokens (nn.Embedding): output embedding
+    """
+    def __init__(self, config: OPTConfig):
+        """Build the OPT decoder stack plus the Flamingo media modules.
+
+        Besides the usual embeddings, decoder layers and final norm, this
+        creates ``perceiver_resampler`` and ``gated_attn_layers``; the latter
+        holds a ``GatedCrossAttentionBlock`` at every layer index divisible
+        by ``config.cross_attn_every`` and ``None`` elsewhere.
+        """
+        # The parent decoder constructor is bypassed; only the pretrained-model base is initialised.
+        OPTPreTrainedModel.__init__(self, config)
+        self.dropout = config.dropout
+        self.layerdrop = config.layerdrop
+        self.padding_idx = config.pad_token_id
+        self.max_target_positions = config.max_position_embeddings
+        self.vocab_size = config.vocab_size
+        self.media_token_id = config.media_token_id
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx)
+        self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
+        # Projections are only needed when the word-embedding width differs from the hidden width.
+        if config.word_embed_proj_dim != config.hidden_size:
+            self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
+            self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False)
+        else:
+            self.project_out = None
+            self.project_in = None
+        # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
+        # with checkpoints that have been fine-tuned before transformers v4.20.1
+        # see https://github.com/facebookresearch/metaseq/pull/164
+        if config.do_layer_norm_before and not config._remove_final_layer_norm:
+            self.final_layer_norm = nn.LayerNorm(config.hidden_size)
+        else:
+            self.final_layer_norm = None
+        dim_head = config.hidden_size // config.num_attention_heads
+        # Media front-end: a full resampler, a pass-through, or a single linear projection.
+        if not config.id_perceiver:
+            self.perceiver_resampler = PerceiverResampler(
+                dim=config.hidden_size,
+                depth=config.perceiver_depth,
+                dim_head=dim_head,
+                heads=config.num_attention_heads,
+                num_latents=config.perceiver_num_latents,
+                inp_dim=config.inp_dim,
+            )
+        elif config.inp_dim is None:
+            self.perceiver_resampler = nn.Identity()
+        else:
+            self.perceiver_resampler = nn.Linear(
+                    config.inp_dim, config.hidden_size,
+                    bias=False)
+        self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        # One slot per decoder layer; slots without cross-attention hold None.
+        cross_attn_slots = []
+        for ind in range(config.num_hidden_layers):
+            if ind % config.cross_attn_every:
+                cross_attn_slots.append(None)
+            else:
+                cross_attn_slots.append(GatedCrossAttentionBlock(
+                    dim=config.hidden_size, dim_head=dim_head, heads=config.num_attention_heads,
+                    only_attend_immediate_media=config.only_attend_immediate_media))
+        self.gated_attn_layers = nn.ModuleList(cross_attn_slots)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+        # in flamingo mode, freeze everything but perceiver and gated cross attention
+        if not config.finetune_LM:
+            freeze_all_layers_(self)
+            unfreeze_all_layers_(self.perceiver_resampler)
+            for cross_attn in self.gated_attn_layers:
+                if exists(cross_attn):
+                    unfreeze_all_layers_(cross_attn)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        pixel_values=None,
+        image_embeds=None
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        r"""
+        Run the decoder stack, applying the gated cross-attention blocks whenever image features are available.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Required when `pixel_values` or `image_embeds`
+                is given, because the media token positions are read from it.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                Defaults to all ones.
+            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*):
+                Cached states, one entry per decoder layer. `past_key_values[0][0].shape[2]` is taken as the cached
+                sequence length and `past_key_values[idx]` is handed to decoder layer `idx`.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                Mutually exclusive with `input_ids`.
+            use_cache (`bool`, *optional*):
+                Whether to collect and return the per-layer cache. Defaults to `config.use_cache`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            pixel_values (*optional*):
+                Raw images to run through `self.img_encoder`. A 4-D tensor gets an extra axis inserted at dim 1
+                before encoding. Mutually exclusive with `image_embeds`.
+            image_embeds (*optional*):
+                Precomputed image features; they are passed through `self.perceiver_resampler` before use.
+
+        Raises:
+            ValueError: if both or neither of `input_ids` / `inputs_embeds` are given, if image inputs are given
+                without `input_ids`, or if `head_mask` does not have one entry per layer.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+        # Take the batch size from whichever input was supplied; reading it from
+        # `input_ids` unconditionally broke the `inputs_embeds`-only path.
+        batch = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
+        flamingo_mode = exists(pixel_values) or exists(image_embeds)
+        # derive the media token ids (as a boolean tensor), for calculating the masked cross attention
+        media_locations = None
+        if flamingo_mode:
+            if input_ids is None:
+                raise ValueError(
+                    "`input_ids` is required when `pixel_values` or `image_embeds` is given, "
+                    "because the media token positions are read from it"
+                )
+            # NOTE(review): only the tokens present in this call are inspected; when decoding
+            # with `past_key_values`, media tokens in the cached prefix are not seen here -- confirm
+            # that callers account for this.
+            media_locations = input_ids == self.media_token_id
+        assert not (exists(pixel_values) and exists(image_embeds))
+        # encode images into embeddings
+        # with the img_encoder attached to this module
+        # it can also accept precomputed image embeddings
+        if exists(pixel_values):
+            # `img_encoder` is attached from outside this class, so it may be absent altogether.
+            img_encoder = getattr(self, 'img_encoder', None)
+            assert exists(img_encoder), 'img_encoder must be passed in for automatic image encoding'
+            if len(pixel_values.shape) == 4:
+                pixel_values = torch.unsqueeze(pixel_values, 1)
+            pixel_values = rearrange(pixel_values, 'b t ... -> (b t) ...')
+            # the image encoder is run without gradient tracking
+            with torch.no_grad():
+                if getattr(img_encoder, 'vision_model', None) is not None:
+                    image_outputs = img_encoder.vision_model(
+                            pixel_values=pixel_values,
+                            output_hidden_states=True, return_dict=True)
+                else:
+                    image_outputs = img_encoder(
+                            pixel_values=pixel_values,
+                            output_hidden_states=True, return_dict=True)
+            image_embeds = image_outputs['last_hidden_state']
+            image_embeds = rearrange(image_embeds, '(b t) ... -> b t ...', b = batch)
+        if exists(image_embeds):
+            image_embeds = self.perceiver_resampler(image_embeds)
+        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # embed positions
+        if attention_mask is None:
+            attention_mask = torch.ones(inputs_embeds.shape[:2], dtype=torch.bool, device=inputs_embeds.device)
+        pos_embeds = self.embed_positions(attention_mask, past_key_values_length)
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        )
+        if self.project_in is not None:
+            inputs_embeds = self.project_in(inputs_embeds)
+        hidden_states = inputs_embeds + pos_embeds
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+        # check if head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask], ["head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != (len(self.layers)):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {head_mask.size()[0]}."
+                    )
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+            # the gated cross-attention (where this layer has one) runs before the decoder layer itself
+            flamingo_cross_attn = self.gated_attn_layers[idx]
+            if exists(flamingo_cross_attn) and exists(image_embeds):
+                hidden_states = flamingo_cross_attn(
+                    hidden_states,
+                    image_embeds,
+                    media_locations = media_locations
+                )
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        if self.final_layer_norm is not None:
+            hidden_states = self.final_layer_norm(hidden_states)
+        if self.project_out is not None:
+            hidden_states = self.project_out(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class OPTModel(modeling_opt.OPTModel):
+    """``modeling_opt.OPTModel`` subclass whose ``decoder`` is this file's ``OPTDecoder``."""
+    def __init__(self, config: OPTConfig):
+        # OPTPreTrainedModel.__init__ is called directly instead of super().__init__().
+        # NOTE(review): presumably so the parent constructor does not build its own
+        # decoder first -- confirm.
+        OPTPreTrainedModel.__init__(self, config)
+        self.decoder = OPTDecoder(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+class OPTForCausalLM(modeling_opt.OPTForCausalLM):
+    """``modeling_opt.OPTForCausalLM`` subclass built on this file's ``OPTModel``."""
+    # Checkpoint keys matching these patterns may be absent when loading.
+    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
+    def __init__(self, config):
+        # OPTPreTrainedModel.__init__ is called directly instead of super().__init__().
+        # NOTE(review): presumably so the parent constructor does not build the stock
+        # OPT model first -- confirm.
+        OPTPreTrainedModel.__init__(self, config)
+        self.model = OPTModel(config)
+        # the lm_head weight is automatically tied to the embed tokens weight
+        self.lm_head = nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+def set_default_if_nonexist(config, key, value):
+    """Set ``config.<key>`` to ``value`` when it is missing or ``None``.
+
+    Other existing values, including falsy ones such as ``0`` or ``False``,
+    are left untouched. ``config`` is modified in place and returned.
+    """
+    current = getattr(config, key, None)
+    if current is None:
+        setattr(config, key, value)
+    return config
+def setup_default_flamingo_configs(config):
+    """Fill in every Flamingo setting that ``config`` lacks (or has as ``None``).
+
+    ``config`` is modified in place and returned.
+    """
+    # NOTE(review): the fallbacks for `cross_attn_every` (3) and `media_token_id`
+    # (50265) differ from the FlamingoConfig constructor defaults (2 and 32768);
+    # confirm which values are intended.
+    fallbacks = (
+        ('perceiver_depth', 2),
+        ('perceiver_num_latents', 64),
+        ('cross_attn_every', 3),
+        ('only_attend_immediate_media', True),
+        ('media_token_id', 50265),
+        ('inp_dim', 768),
+        ('finetune_LM', True),
+        ('id_perceiver', False),
+    )
+    for key, value in fallbacks:
+        set_default_if_nonexist(config, key, value)
+    return config
+class FlamingoForCausalLM(modeling_opt.OPTForCausalLM):
+    _keys_to_ignore_on_load_missing = [
+            r"lm_head.weight",
+            ]
+    def __init__(self, config):
+        """Build the language model and attach a frozen DINO ViT image encoder.
+
+        Constructing an instance calls ``ViTModel.from_pretrained`` for
+        ``"facebook/dino-vitb16"``.
+        """
+        # OPTPreTrainedModel.__init__ is called directly instead of super().__init__().
+        OPTPreTrainedModel.__init__(self, config)
+        # Fill in any Flamingo settings missing from `config` before the decoder reads them.
+        config = setup_default_flamingo_configs(config)
+        self.model = OPTModel(config)
+        # the lm_head weight is automatically tied to the embed tokens weight
+        self.lm_head = nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+        # The image encoder is attached only after post_init().
+        # NOTE(review): presumably so its pretrained weights are not re-initialised -- confirm.
+        self.model.decoder.img_encoder = None
+        self.loss_fct = CrossEntropyLoss()
+        dino_model = ViTModel.from_pretrained("facebook/dino-vitb16")
+        self.setup_vis_encoder(dino_model)
+    def setup_vis_encoder(self, img_encoder):
+        """Attach ``img_encoder`` to the decoder and pass it to ``freeze_all_layers_``."""
+        self.model.decoder.img_encoder = img_encoder
+        # NOTE(review): freeze_all_layers_ comes from .utils; presumably it disables
+        # gradients for every parameter of the encoder -- confirm.
+        freeze_all_layers_(img_encoder)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        *args, **kwargs) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+                Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
+                tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import GPT2Tokenizer, OPTForCausalLM
+        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
+        >>> tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")
+        >>> prompt = "Hey, are you consciours? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model.decoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            *args, **kwargs)
+        logits = self.lm_head(outputs[0]).contiguous()
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss = self.loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "feature_extractor_type": "ViTFeatureExtractor",
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "ViTFeatureExtractor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 224,
+    "width": 224
+  }
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d489df10d3aea1f59fdfeabaa5b0ea4ec5a35832f61c0965537441c6d93892ef
+size 1022117679

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "additional_special_tokens": [
+    "<image>",
+    "<PERSON>"
+  ],
+  "pad_token": "<pad>"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "clean_up_tokenization_spaces": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}

utils.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import torch
+def exists(val):
+    return val is not None
+# for controlling freezing during training of flamingo
+def set_module_requires_grad_(module, requires_grad):
+    for param in module.parameters():
+        param.requires_grad = requires_grad
+def freeze_all_layers_(module):
+    set_module_requires_grad_(module, False)
+def unfreeze_all_layers_(module):
+    set_module_requires_grad_(module, True)
+def freeze_model_and_make_eval_(model):
+    model.eval()
+    freeze_all_layers_(model)
+def _make_att_wd_mask(
+        input_ids_shape: torch.Size,
+        dtype: torch.dtype, device: torch.device,
+        past_key_values_length: int = 0,
+        att_wd_size: int = 0,
+    ):
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(
+            mask_cond > (mask_cond - att_wd_size).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)