Muennighoff committed on
Commit 18652d8
Parent(s): d13896f
Cp over files
- beam_search.py +1087 -0
- config_molmoe.py +9 -5
- constants.py +571 -0
- data_factory.py +222 -0
- data_utils.py +827 -0
- dataset_sizes.py +262 -0
- exceptions.py +50 -0
- iterable_dataset.py +266 -0
- modeling_molmoe.py +4 -4
- multimodal_preprocessor.py +1549 -0
- preprocesssors.py +2472 -0
- prompts.py +385 -0
- seqio_tokenizer.py +659 -0
- tasks.py +2548 -0
- torch_util.py +183 -0
- util.py +1 -1
- utils.py +195 -0
beam_search.py
ADDED
@@ -0,0 +1,1087 @@
"""
This is a self-contained and flexible beam search implementation adapted from
AllenNLP's beam search: https://github.com/allenai/allennlp/blob/main/allennlp/nn/beam_search.py
"""

import copy
import warnings
from abc import abstractmethod
from inspect import signature
from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, cast

import torch

__all__ = [
    "Sampler",
    "DeterministicSampler",
    "MultinomialSampler",
    "TopKSampler",
    "TopPSampler",
    "GumbelSampler",
    "FinalSequenceScorer",
    "SequenceLogProbabilityScorer",
    "LengthNormalizedSequenceLogProbabilityScorer",
    "Constraint",
    "RepeatedNGramBlockingConstraint",
    "BeamSearch",
]

StateType = Dict[str, torch.Tensor]
StepFunctionTypeWithTimestep = Callable[[torch.Tensor, StateType, int], Tuple[torch.Tensor, StateType]]
StepFunctionTypeNoTimestep = Callable[[torch.Tensor, StateType], Tuple[torch.Tensor, StateType]]

StepFunctionType = TypeVar("StepFunctionType", StepFunctionTypeWithTimestep, StepFunctionTypeNoTimestep)
"""
The type of step function that can be passed to [`BeamSearch.search`](#search).

This can either be [`StepFunctionTypeWithTimestep`](#stepfunctiontypewithtimestep)
or [`StepFunctionTypeNoTimestep`](#stepfunctiontypenotimestep).
"""

ConstraintStateType = List[List[Dict[str, Any]]]


class Sampler:
    """
    An abstract class that can be used to sample candidates (either nodes or beams)
    within `BeamSearch`.

    A `Sampler` just has three methods, `init_state()`, `sample_nodes()` and `sample_beams()`.

    `init_state()` takes three arguments:

    - a tensor of starting log probs with shape `(batch_size, num_classes)`,
    - the batch size, an int,
    - and the number of classes, also an int.

    It returns a state dictionary with any state tensors needed for subsequent
    calls to `sample_nodes()` and `sample_beams()`.

    By default this method just returns an empty dictionary.

    Both `sample_nodes()` and `sample_beams()` should take three arguments:

    - tensor of normalized log probabilities with shape `(batch_size, num_examples)`,
    - an integer representing the number of samples to take for each example in the batch,
    - and a state dictionary which could contain any tensors needed for the `Sampler` to keep
      track of state.

    For `sample_nodes()`, `num_examples = num_classes`, but for `sample_beams`,
    `num_examples = beam_size * per_node_beam_size`.

    The return value should be a tuple containing:

    - a tensor of log probabilities of the sampled examples with shape `(batch_size, num_samples)`,
    - a tensor of indices of the sampled examples with shape `(batch_size, num_samples)`,
    - and the updated state dictionary.

    A default implementation of `sample_beams` is provided, which just deterministically
    picks the `k` examples with highest log probability.
    """

    def init_state(
        self, start_class_log_probabilities: torch.Tensor, batch_size: int, num_classes: int
    ) -> StateType:
        del start_class_log_probabilities, batch_size, num_classes
        return {}

    @abstractmethod
    def sample_nodes(
        self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        raise NotImplementedError

    def sample_beams(
        self, log_probs: torch.Tensor, beam_size: int, state: StateType
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        del state
        selected_log_probs, selected_indices = torch.topk(log_probs, beam_size, dim=-1)
        return selected_log_probs, selected_indices, {}


class DeterministicSampler(Sampler):
    """
    A `Sampler` that just deterministically returns the `k` nodes or beams with highest
    log probability.
    """

    def sample_nodes(
        self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        del state
        selected_log_probs, selected_indices = torch.topk(log_probs, per_node_beam_size, dim=-1)
        return selected_log_probs, selected_indices, {}


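# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the committed file): a
# minimal check of the `Sampler` contract documented above, using the built-in
# `DeterministicSampler`. Per the docstring, `sample_nodes` takes
# `(batch_size, num_classes)` log-probs and returns `(batch_size,
# per_node_beam_size)` log-probs and indices plus an updated state dict.
def _example_sampler_contract() -> None:
    sampler = DeterministicSampler()
    # (batch_size=2, num_classes=5) normalized log probabilities
    log_probs = torch.log_softmax(torch.randn(2, 5), dim=-1)
    state = sampler.init_state(log_probs, batch_size=2, num_classes=5)
    top_lps, top_ids, state = sampler.sample_nodes(log_probs, per_node_beam_size=3, state=state)
    assert top_lps.shape == (2, 3) and top_ids.shape == (2, 3)
# ---------------------------------------------------------------------------

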
class MultinomialSampler(Sampler):
    """
    A `Sampler` which samples nodes from the given multinomial distribution. Beams are sampled
    in the default, deterministic way.

    :param temperature: A `temperature` below 1.0 produces a sharper probability distribution and a `temperature`
        above 1.0 produces a flatter probability distribution.
    :param with_replacement: Whether to sample with replacement.

    """

    def __init__(
        self,
        temperature: float = 1.0,
        with_replacement: bool = False,
    ) -> None:
        self.temperature = temperature
        self.with_replacement = with_replacement

    def sample_nodes(
        self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        if self.temperature != 1.0:
            _probabilities = torch.nn.functional.softmax(log_probs / self.temperature, dim=-1)
        else:
            _probabilities = log_probs.exp()

        selected_indices = torch.multinomial(_probabilities, per_node_beam_size, replacement=self.with_replacement)

        return torch.gather(log_probs, 1, selected_indices), selected_indices, state


class TopKSampler(Sampler):
    """
    A `Sampler` which redistributes the probability mass function for nodes among the
    top `k` choices, then samples from that subset after re-normalizing the probabilities.

    Beams are sampled in the default, deterministic way.

    :param k: The number of top choices to be selected from.
    :param temperature: A `temperature` below 1.0 produces a sharper probability distribution and a `temperature`
        above 1.0 produces a flatter probability distribution.
    :param with_replacement: If set to `True`, samples will be selected with replacement from the top k choices.
    """

    def __init__(
        self,
        k: int = 1,
        temperature: float = 1.0,
        with_replacement: bool = False,
    ):
        self.k = k
        self.temperature = temperature or 1.0
        self.with_replacement = with_replacement

    def sample_nodes(
        self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        if not per_node_beam_size <= self.k <= log_probs.size()[1]:
            raise ValueError(
                "k must be a positive integer no less than per_node_beam_size and no greater than vocabulary size"
            )

        # shape (both): (batch_size, k)
        top_k_log_probs, top_k_indices = log_probs.topk(self.k, dim=-1)

        # Apply temperature if necessary.
        # shape: (batch_size, k)
        if self.temperature != 1.0:
            top_k_log_probs = top_k_log_probs / self.temperature

        # Re-normalize the subset.
        # shape: (batch_size, k)
        normalized_top_k_probs = torch.nn.functional.softmax(top_k_log_probs, dim=-1)

        # Sample from the re-normalized subset.
        # NOTE: These indices are not indices into `log_probs`, they are indices into `top_k_log_probs`.
        # shape: (batch_size, per_node_beam_size)
        sampled_indices = torch.multinomial(
            normalized_top_k_probs, per_node_beam_size, replacement=self.with_replacement
        )

        # Convert `sampled_indices` back to indices in the original `log_probs` tensor.
        # shape: (batch_size, per_node_beam_size)
        indices = top_k_indices.gather(-1, sampled_indices)

        return log_probs.gather(1, indices), indices, state


class TopPSampler(Sampler):
    """
    A `Sampler` which redistributes the probability mass function for nodes among
    the top choices with a cumulative probability of at least `p`, then samples from that subset
    after re-normalizing the probabilities.

    Beams are sampled in the default, deterministic way.

    :param p:
        The cumulative probability cutoff threshold. A higher value of `p` will result in more possible
        examples to sample from. If `with_replacement` is `False` and the number of possible samples is
        insufficient to sample without replacement from when calling `sample_nodes`, then the top
        `per_node_beam_size` examples will be chosen.
    :param temperature:
        A `temperature` below 1.0 produces a sharper probability distribution and a `temperature`
        above 1.0 produces a flatter probability distribution.
    :param with_replacement:
        If set to `True`, samples will be selected with replacement from the top choices.

    """

    def __init__(
        self,
        p: float = 0.9,
        temperature: float = 1.0,
        with_replacement: bool = False,
    ):
        if p < 0.0 or p > 1.0:
            raise ValueError("p must be a positive float no greater than 1.0")
        self.p = p
        self.temperature = temperature or 1.0
        self.with_replacement = with_replacement

    def sample_nodes(
        self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        if not per_node_beam_size <= log_probs.size()[1]:
            raise ValueError("per_node_beam_size cannot be greater than vocabulary size")

        # First apply temperature coefficient:
        if self.temperature != 1.0:
            _log_probs = torch.nn.functional.log_softmax(log_probs / self.temperature, dim=-1)
        else:
            _log_probs = log_probs

        # Sort the probabilities in descending order to then find cumulative sum
        log_probs_descending, sorting_indices = torch.sort(_log_probs, descending=True)

        # shape: (batch_size, num_classes)
        probabilities_descending = log_probs_descending.exp()
        probabilities_summed = torch.cumsum(probabilities_descending, dim=-1)

        # Create a mask for filtering out probabilities that don't make the top `p`.
        # shape: (batch_size, num_classes)
        exclusion_mask = probabilities_summed >= self.p

        # We want to include the first index where probabilities_summed >= p, so we shift over one.
        exclusion_mask[..., 1:] = exclusion_mask[..., :-1].clone()
        exclusion_mask[..., 0] = False

        # Make sure there's at least `per_node_beam_size` options to be selected.
        if not self.with_replacement:
            exclusion_mask[..., :per_node_beam_size] = False

        log_probs_descending[exclusion_mask] = torch.finfo(log_probs.dtype).min

        # Now re-normalize the included log probs.
        # shape: (batch_size, num_classes)
        filtered_probabilities = torch.nn.functional.softmax(log_probs_descending, dim=-1)

        # Sample from the re-normalized subset.
        # NOTE: These indices are not indices into `log_probs`, they are indices into `log_probs_descending`.
        # shape: (batch_size, per_node_beam_size)
        sampled_indices = torch.multinomial(
            filtered_probabilities, per_node_beam_size, replacement=self.with_replacement
        )

        # Convert `sampled_indices` back to indices in the original `log_probs` tensor.
        # shape: (batch_size, per_node_beam_size)
        selected_indices = sorting_indices.gather(-1, sampled_indices)

        # Return (selected log probabilities, selected classes)
        # shape: (len(log_probs),1) , (len(log_probs), 1)
        return torch.gather(log_probs, 1, selected_indices), selected_indices, state


class GumbelSampler(Sampler):
    """
    A `Sampler` which uses the Gumbel-Top-K trick to sample without replacement. See
    [*Stochastic Beams and Where to Find Them: The Gumbel-Top-k Trick for Sampling
    Sequences Without Replacement*, W Kool, H Van Hoof and M Welling, 2019]
    (https://api.semanticscholar.org/CorpusID:76662039).

    :param temperature: A `temperature` below 1.0 produces a sharper probability distribution and a `temperature`
        above 1.0 produces a flatter probability distribution.
    """

    def __init__(self, temperature: float = 1.0):
        self.temperature = temperature

    def init_state(
        self, start_class_log_probabilities: torch.Tensor, batch_size: int, num_classes: int
    ) -> StateType:
        # shape: (batch_size, num_classes)
        zeros = start_class_log_probabilities.new_zeros((batch_size, num_classes))

        # shape: (batch_size, num_classes)
        G_phi_S = self.gumbel_with_max(start_class_log_probabilities, zeros)

        return {"G_phi_S": G_phi_S}

    def sample_nodes(
        self,
        log_probs: torch.Tensor,
        per_node_beam_size: int,
        state: StateType,
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        # First apply temperature coefficient:
        # shape: (batch_size * beam_size, num_classes)
        if self.temperature != 1.0:
            _log_probs = torch.nn.functional.log_softmax(log_probs / self.temperature, dim=-1)
        else:
            _log_probs = log_probs

        # shape: (group_size,)
        phi_S = state["phi_S"]

        # shape: (group_size, num_classes)
        phi_S = phi_S.unsqueeze(-1).expand_as(_log_probs)

        # shape: (group_size, num_classes)
        phi_S_new = phi_S + _log_probs

        # shape: (group_size, 1)
        G_phi_S = state["G_phi_S"].unsqueeze(-1)

        # shape: (group_size, num_classes)
        G_phi_S_new = self.gumbel_with_max(phi_S_new, G_phi_S)

        # Replace NaNs with very negative number.
        # shape: (group_size, num_classes)
        # G_phi_S_new[G_phi_S_new.isnan()] = torch.finfo(G_phi_S_new.dtype).min

        # shape (both): (group_size, per_node_beam_size)
        top_G_phi_S_new, top_indices = torch.topk(G_phi_S_new, per_node_beam_size, dim=-1)

        # shape: (group_size, per_node_beam_size)
        top_log_probs = log_probs.gather(1, top_indices)

        return top_log_probs, top_indices, {"G_phi_S": top_G_phi_S_new}

    def sample_beams(
        self,
        log_probs: torch.Tensor,
        beam_size: int,
        state: StateType,
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        """
        Returns the beams with the highest perturbed log probabilities.
        """
        # shape (log_probs): (batch_size, beam_size * per_node_beam_size)

        batch_size = log_probs.size()[0]

        # shape: (batch_size * beam_size, per_node_beam_size)
        G_phi_S = state["G_phi_S"]

        # shape: (batch_size, beam_size * per_node_beam_size)
        G_phi_S = G_phi_S.reshape_as(log_probs)

        # shape (both): (batch_size, beam_size)
        G_phi_S_new, selected_indices = torch.topk(G_phi_S, beam_size, dim=-1)

        # shape: (batch_size, beam_size)
        selected_log_probs = log_probs.gather(1, selected_indices)

        # Now sort the selected beams by their true log prob.
        # shape (all): (batch_size, beam_size)
        selected_log_probs, sort_indices = selected_log_probs.sort(dim=-1, descending=True)
        selected_indices = selected_indices.gather(1, sort_indices)
        G_phi_S_new = G_phi_S_new.gather(1, sort_indices)

        # shape: (batch_size * beam_size,)
        G_phi_S_new = G_phi_S_new.reshape(batch_size * beam_size)

        # shape: (batch_size * beam_size,)
        phi_S = selected_log_probs.reshape(batch_size * beam_size)

        return selected_log_probs, selected_indices, {"G_phi_S": G_phi_S_new, "phi_S": phi_S}

    def gumbel(self, phi) -> torch.Tensor:
        """
        Sample `Gumbel(phi)`.

        `phi` should have shape `(batch_size, num_classes)`.
        """
        return -torch.log(-torch.log(torch.rand_like(phi))) + phi

    def gumbel_with_max(self, phi, T) -> torch.Tensor:
        """
        Sample `Gumbel(phi)` conditioned on the maximum value being equal to `T`.

        `phi` should have shape `(batch_size, num_classes)` and `T` should have
        shape `(batch_size, 1)`.
        """
        # Shape: (batch_size, num_classes)
        G_phi = self.gumbel(phi)

        # Now we find the maximum from these samples.
        # Shape: (batch_size, )
        Z, _ = G_phi.max(dim=-1)

        # Shape: (batch_size, num_classes)
        v = T - G_phi + torch.log1p(-torch.exp(G_phi - Z.unsqueeze(-1)))

        # Shape: (batch_size, num_classes)
        return T - torch.nn.functional.relu(v) - torch.log1p(torch.exp(-v.abs()))


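# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the committed file): the
# Gumbel-Top-k idea that `GumbelSampler` builds on. Perturbing log-probabilities
# with independent Gumbel noise and taking the top-k yields k distinct samples,
# i.e. sampling without replacement; `gumbel_with_max` above additionally
# conditions the noise so the running maximum matches the parent beam's
# perturbed score.
def _example_gumbel_top_k(log_probs: torch.Tensor, k: int) -> torch.Tensor:
    # log_probs: (batch_size, num_classes); returns k indices per row, all distinct.
    gumbel_noise = -torch.log(-torch.log(torch.rand_like(log_probs)))
    return torch.topk(log_probs + gumbel_noise, k, dim=-1).indices
# ---------------------------------------------------------------------------

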
class FinalSequenceScorer:
    """
    An abstract class that can be used to score the final generated sequences found
    by beam search. Given the predicted sequences and the corresponding log probabilities of
    those sequences, the class calculates and returns the final score of the sequences.

    The default implementation scores the sequences using the sum of the log probabilities of
    the sequence, which is passed as input.
    """

    @abstractmethod
    def score(self, predictions: torch.Tensor, log_probabilities: torch.Tensor, end_index: int) -> torch.Tensor:
        """
        Score the final predictions found by beam search.
        Returns a tensor of the final sequence scores of shape `(batch_size, beam_size)`.

        :param predictions: A tensor containing the initial predictions with shape `(batch_size, beam_size, max_steps)`.
        :param log_probabilities: A tensor containing the log probabilities of the sequence, defined as the sum
            of the log probabilities per token, with shape `(batch_size, beam_size)`.
        :param end_index: The index of the end symbol.

        """
        raise NotImplementedError


class SequenceLogProbabilityScorer(FinalSequenceScorer):
    """
    A :class:`FinalSequenceScorer` which scores the sequences by the sum of the log probabilities
    across the sequence's tokens.
    """

    def score(self, predictions: torch.Tensor, log_probabilities: torch.Tensor, end_index: int) -> torch.Tensor:
        del predictions, end_index
        # The sum of the sequence log probabilities is the input parameter, so just
        # return it.
        return log_probabilities


class LengthNormalizedSequenceLogProbabilityScorer(FinalSequenceScorer):
    """
    A :class:`FinalSequenceScorer` which scores the sequences by the average log probability of the
    tokens in the sequence. It optionally includes a length penalty which promotes
    or demotes sequences based on their lengths. The final score for a sequence will
    be `(sequence_log_probability) / (sequence_length ** length_penalty)`. The sequence length
    here includes the end token.

    :param length_penalty: The length penalty to use. A value of 1.0 means no length penalty is used.
        A value > 1.0 favors longer sequences, and < 1.0 favors shorter sequences.
    """

    def __init__(self, length_penalty: float = 1.0):
        super().__init__()
        self.length_penalty = length_penalty

    def score(self, predictions: torch.Tensor, log_probabilities: torch.Tensor, end_index: int) -> torch.Tensor:
        # shape: (batch_size, beam_size)
        lengths = (predictions != end_index).long().sum(dim=2)

        # If the sequence ended during beam search, the `log_probabilities` will include
        # the transition to the end token. Therefore, in such situations, `lengths` is
        # actually off by 1. This corrects for that.
        # shape: (batch_size, beam_size)
        is_end_token = predictions[:, :, -1] == end_index
        lengths += is_end_token.long()

        # shape: (batch_size, beam_size)
        average_log_probs = log_probabilities / (lengths**self.length_penalty)
        return average_log_probs


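# ---------------------------------------------------------------------------
# Illustrative note (editor's addition, not part of the committed file): the
# effect of the formula in `LengthNormalizedSequenceLogProbabilityScorer`. With
# the default `length_penalty=1.0`, a 4-token sequence with summed log-prob -6.0
# scores -6.0 / 4 = -1.5, while a 2-token sequence with summed log-prob -4.0
# scores -4.0 / 2 = -2.0, so the longer sequence is preferred even though its
# raw (summed) log probability is lower.
# ---------------------------------------------------------------------------

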
class Constraint:
    """
    An abstract class that can be used to enforce constraints on the output predictions
    by manipulating the class log probabilities during beam search.

    A `Constraint` just has three methods that need to be implemented by subclasses:
    `init_state()`, `apply()` and `_update_state()`.

    `init_state()` takes one argument:

    - the batch size, an int

    It returns a constraint state, which is a nested list of dictionaries, with any state needed for subsequent
    calls to `apply()` and `update_state()`. The length of the outer list should be equal to `batch_size`.
    Each inner list should be of length 1.

    `apply()` takes two arguments:

    - the constraint state, which is a nested list of dictionaries. The length of the outer list is `batch_size`
      and the length of each inner list is `beam_size` except on the first time `apply()` is called when it is 1.
    - `class_log_probabilities`, a tensor of shape `(batch_size, beam_size, num_classes)` that contains the
      log probabilities for the classes during search. The first time `apply()` is called, `beam_size = 1`.

    The `apply()` method should return new `class_log_probabilities` that enforce the constraint
    for this step of beam search. For instance, it may prevent a specific class from being selected by setting
    the corresponding log probability to a negligible value such as `float("-inf")` or
    `torch.finfo(class_log_probabilities.dtype).min`.

    `_update_state()` takes two arguments:

    - the copied parent constraint state, which is a nested list of dictionaries. `state[i][j]` contains the
      copied state for the parent of `last_prediction[i, j]`. It is unique to that batch and beam, so it can be
      directly edited in-place without affecting the others.
    - `last_prediction`, a tensor of shape `(batch_size, beam_size)` containing the predictions from the last
      step of beam search.

    The `_update_state()` function should return a new constraint state, a nested list of dictionaries of
    length `batch_size` and inner list of length `beam_size`, one for each of the predictions in `last_prediction`.

    """

    @abstractmethod
    def init_state(
        self,
        batch_size: int,
    ) -> ConstraintStateType:
        raise NotImplementedError

    @abstractmethod
    def apply(
        self,
        state: ConstraintStateType,
        class_log_probabilities: torch.Tensor,
    ) -> torch.Tensor:
        raise NotImplementedError

    @staticmethod
    def _copy_state(
        state: ConstraintStateType,
        batch_size: int,
        beam_size: int,
        last_backpointer: Optional[torch.Tensor] = None,
    ) -> ConstraintStateType:
        """
        Copies the `state`. This method copies the data in `state` using `copy.deepcopy()`. If this
        is not appropriate for your constraint, you will need to implement the copying yourself.
        """
        new_state = []
        for i in range(batch_size):
            batch_state = []
            for j in range(beam_size):
                if last_backpointer is None:
                    # This is the first prediction, so the backpointer is 0
                    backpointer = 0
                else:
                    backpointer = last_backpointer[i, j].item()
                batch_state.append(copy.deepcopy(state[i][backpointer]))  # type: ignore
            new_state.append(batch_state)
        return new_state

    def update_state(
        self,
        state: ConstraintStateType,
        last_prediction: torch.Tensor,
        last_backpointer: Optional[torch.Tensor] = None,
    ) -> ConstraintStateType:
        batch_size, beam_size = last_prediction.size()
        new_state = self._copy_state(state, batch_size, beam_size, last_backpointer)
        return self._update_state(new_state, last_prediction)

    @abstractmethod
    def _update_state(
        self,
        state: ConstraintStateType,
        last_prediction: torch.Tensor,
    ) -> ConstraintStateType:
        raise NotImplementedError


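# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the committed file): a
# minimal `Constraint` that permanently blocks a fixed set of token ids,
# following the `init_state()` / `apply()` / `_update_state()` contract
# documented above. (`RepeatedNGramBlockingConstraint` below is the constraint
# that actually ships with this file.)
class _ExampleBannedTokensConstraint(Constraint):
    def __init__(self, banned_token_ids: List[int]) -> None:
        self.banned_token_ids = banned_token_ids

    def init_state(self, batch_size: int) -> ConstraintStateType:
        # No per-beam state is needed; one empty dict per (batch, beam) slot.
        return [[{}] for _ in range(batch_size)]

    def apply(self, state: ConstraintStateType, class_log_probabilities: torch.Tensor) -> torch.Tensor:
        # shape: (batch_size, beam_size, num_classes)
        class_log_probabilities[:, :, self.banned_token_ids] = torch.finfo(class_log_probabilities.dtype).min
        return class_log_probabilities

    def _update_state(self, state: ConstraintStateType, last_prediction: torch.Tensor) -> ConstraintStateType:
        # Nothing to track between steps.
        return state
# ---------------------------------------------------------------------------

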
class RepeatedNGramBlockingConstraint(Constraint):
    def __init__(self, ngram_size: int, **kwargs) -> None:
        super().__init__(**kwargs)
        self.ngram_size = ngram_size

    def init_state(
        self,
        batch_size: int,
    ) -> ConstraintStateType:
        return [[{"seen_ngrams": {}, "current_prefix": []}] for _ in range(batch_size)]

    def apply(
        self,
        state: ConstraintStateType,
        class_log_probabilities: torch.Tensor,
    ) -> torch.Tensor:
        for i, batch in enumerate(state):
            for j, beam in enumerate(batch):
                current_prefix = tuple(beam["current_prefix"])
                seen_ngrams = beam["seen_ngrams"]
                try:
                    disallowed_indices = seen_ngrams[current_prefix]
                    class_log_probabilities[i, j, disallowed_indices] = torch.finfo(
                        class_log_probabilities.dtype
                    ).min
                except KeyError:
                    # We have not seen this prefix before, so there is no index
                    # that needs to be blocked
                    pass
        return class_log_probabilities

    def _update_state(
        self,
        state: ConstraintStateType,
        last_prediction: torch.Tensor,
    ) -> ConstraintStateType:
        for i, batch in enumerate(state):
            for j, beam in enumerate(batch):
                prediction = last_prediction[i, j].item()
                prefix = beam["current_prefix"]
                seen_ngrams = beam["seen_ngrams"]

                if len(prefix) == self.ngram_size - 1:
                    # This is a new ngram that we have to remember
                    if tuple(prefix) not in seen_ngrams:
                        seen_ngrams[tuple(prefix)] = []
                    seen_ngrams[tuple(prefix)].append(prediction)

                # Create the new prefix, removing the oldest index if the prefix
                # is too long
                prefix.append(prediction)
                if len(prefix) == self.ngram_size:
                    prefix.pop(0)
        return state


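# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the committed file):
# blocking repeated trigrams during decoding by passing the constraint to
# `BeamSearch` (defined below). The `eos_token_id` argument is a placeholder
# for the model's actual EOS id.
def _example_blocked_ngram_search(eos_token_id: int) -> "BeamSearch":
    return BeamSearch(
        end_index=eos_token_id,
        beam_size=5,
        constraints=[RepeatedNGramBlockingConstraint(ngram_size=3)],
    )
# ---------------------------------------------------------------------------

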
class BeamSearch:
    """
    Implements the beam search algorithm for decoding the most likely sequences.

    :param end_index: The index of the "stop" or "end" token in the vocabulary. Usually the EOS token ID.

    :param max_steps: The maximum number of decoding steps to take, i.e. the maximum length
        of the predicted sequences.

    :param beam_size: The width of the beam used.

    :param per_node_beam_size: The maximum number of candidates to consider per node, at each step in the search.
        If not given, this just defaults to `beam_size`. Setting this parameter
        to a number smaller than `beam_size` may give better results, as it can introduce
        more diversity into the search. See
        [*Beam Search Strategies for Neural Machine Translation*, Freitag and Al-Onaizan, 2017]
        (https://api.semanticscholar.org/CorpusID:2229477).

    :param sampler: An optional `Sampler` which is used to pick next candidate nodes and beams.
        If not specified, `DeterministicSampler` will be used, which just takes the
        `per_node_beam_size` most likely nodes and the `beam_size` most likely beams.

        Using the [`GumbelSampler`](#gumbelsampler), on the other hand, will give you
        [Stochastic Beam Search](https://api.semanticscholar.org/CorpusID:76662039).

    :param min_steps: The minimum number of decoding steps to take, i.e. the minimum length of
        the predicted sequences. This does not include the start or end tokens. If `None`,
        no minimum is enforced.

    :param final_sequence_scorer: An optional `FinalSequenceScorer` which is used to score the final generated sequences.
        The output from this module is what is returned by the `search` method. If not
        specified, `SequenceLogProbabilityScorer` will be used, which scores the sequences
        by the sum of the token log probabilities.

    :param constraints: An optional list of `Constraint`s which should be applied during beam search. If not
        provided, no constraints will be enforced.

    """

    def __init__(
        self,
        end_index: int,
        *,
        max_steps: int = 50,
        beam_size: int = 10,
        per_node_beam_size: Optional[int] = None,
        sampler: Optional[Sampler] = None,
        min_steps: Optional[int] = None,
        final_sequence_scorer: Optional[FinalSequenceScorer] = None,
        constraints: Optional[List[Constraint]] = None,
        distributed_model: bool = False
    ) -> None:
        if not max_steps > 0:
            raise ValueError("max_steps must be positive")
        if not beam_size > 0:
            raise ValueError("beam_size must be positive")
        if per_node_beam_size is not None and not per_node_beam_size > 0:
            raise ValueError("per_node_beam_size must be positive")
        if min_steps is not None:
            if not min_steps >= 0:
                raise ValueError("min_steps must be non-negative")
            if not min_steps <= max_steps:
                raise ValueError("min_steps must be less than or equal to max_steps")

        self._end_index = end_index
        self.max_steps = max_steps
        self.beam_size = beam_size
        self.per_node_beam_size = per_node_beam_size or beam_size
        self.sampler = sampler or DeterministicSampler()
        self.min_steps = min_steps or 0
        self.final_sequence_scorer = final_sequence_scorer or SequenceLogProbabilityScorer()
        self.constraints = constraints or []
        self.distributed_model = distributed_model

    @staticmethod
    def _reconstruct_sequences(predictions, backpointers):
        # Reconstruct the sequences.
        # shape: [(batch_size, beam_size, 1)]
        reconstructed_predictions = [predictions[-1].unsqueeze(2)]

        if not backpointers:
            return reconstructed_predictions

        # shape: (batch_size, beam_size)
        cur_backpointers = backpointers[-1]

        for timestep in range(len(predictions) - 2, 0, -1):
            # shape: (batch_size, beam_size, 1)
            cur_preds = predictions[timestep].gather(1, cur_backpointers).unsqueeze(2)

            reconstructed_predictions.append(cur_preds)

            # shape: (batch_size, beam_size)
            cur_backpointers = backpointers[timestep - 1].gather(1, cur_backpointers)

        # shape: (batch_size, beam_size, 1)
        final_preds = predictions[0].gather(1, cur_backpointers).unsqueeze(2)

        reconstructed_predictions.append(final_preds)

        return reconstructed_predictions

    def search(
        self,
        start_predictions: torch.Tensor,
        start_state: StateType,
        step: StepFunctionType,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Given a starting state and a step function, apply beam search to find the
        most likely target sequences.

        Returns a tuple of `(predictions, final_scores)`, where `predictions`
        has shape `(batch_size, beam_size, max_steps)` and `final_scores`
        has shape `(batch_size, beam_size)`.

        .. note::
            If your step function returns `-inf` for some log probabilities
            (like if you're using a masked log-softmax) then some of the "best"
            sequences returned may also have `-inf` log probability. Specifically
            this happens when the beam size is smaller than the number of actions
            with finite log probability (non-zero probability) returned by the step function.
            Therefore if you're using a mask you may want to check the results from `search`
            and potentially discard sequences with non-finite log probability.

        :param start_predictions: A tensor containing the initial predictions with shape `(batch_size,)`.
            Usually the initial predictions are just the index of the "start" token
            in the target vocabulary.

        :param start_state: The initial state passed to the `step` function. Each value of the state dict
            should be a tensor of shape `(batch_size, *)`, where `*` means any other
            number of dimensions.

        :param step: A function that is responsible for computing the next most likely tokens,
            given the current state and the predictions from the last time step.
            The function should accept two or three arguments:

            - a tensor of shape `(group_size,)` representing the index of the predicted
              tokens from the last time step,
            - the current state, a `StateType`, and
            - optionally, the timestep, an `int`.

            The `group_size` will be `batch_size * beam_size`, except in the initial
            step, for which it will just be `batch_size`.

            The function is expected to return a tuple, where the first element
            is a tensor of shape `(group_size, vocab_size)` containing
            the log probabilities of the tokens for the next step, and the second
            element is the updated state. The tensor in the state should have shape
            `(group_size, *)`, where `*` means any other number of dimensions.

        """
        step_signature = signature(step)
        if len(step_signature.parameters) < 3:
            # If the step function we're given does not take the time step argument, wrap it
            # in one that does.
            old_step = cast(StepFunctionTypeNoTimestep, step)

            def new_step(last_predictions: torch.Tensor, state: Dict[str, torch.Tensor], time_step: int):
                del time_step
                return old_step(last_predictions, state)

            return self._search(start_predictions, start_state, new_step)
        else:
            return self._search(start_predictions, start_state, cast(StepFunctionTypeWithTimestep, step))

    def _search(
        self,
        start_predictions: torch.Tensor,
        start_state: StateType,
        step: StepFunctionTypeWithTimestep,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        batch_size = start_predictions.size()[0]

        # List of (batch_size, beam_size) tensors. One for each time step. Does not
        # include the start symbols, which are implicit.
        predictions: List[torch.Tensor] = []

        # List of (batch_size, beam_size) tensors. One for each time step. None for
        # the first. Stores the index n for the parent prediction, i.e.
        # predictions[t-1][i][n], that it came from.
        backpointers: List[torch.Tensor] = []

        constraint_states = [constraint.init_state(batch_size) for constraint in self.constraints]

        # Calculate the first timestep. This is done outside the main loop
        # because we are going from a single decoder input (the output from the
        # encoder) to the top `beam_size` decoder outputs. On the other hand,
        # within the main loop we are going from the `beam_size` elements of the
        # beam to `beam_size`^2 candidates from which we will select the top
        # `beam_size` elements for the next iteration.
        # shape: (batch_size, num_classes)
        start_class_log_probabilities, state = step(start_predictions, start_state, 0)

        num_classes = start_class_log_probabilities.size()[1]

        # Make sure `per_node_beam_size` is not larger than `num_classes`.
        if self.per_node_beam_size > num_classes:
            raise ValueError(
                f"Vocab size ({num_classes:d}) too small "
                f"relative to per_node_beam_size ({self.per_node_beam_size:d}).\n"
                f"Please decrease beam_size or per_node_beam_size."
            )

        sampler_state = self.sampler.init_state(start_class_log_probabilities, batch_size, num_classes)

        # Apply all constraints.
        if self.constraints:
            # shape: (batch_size, 1, num_classes)
            expanded_start_class_log_probabilities = start_class_log_probabilities.unsqueeze(1)
            for constraint, constraint_state in zip(self.constraints, constraint_states):
                expanded_start_class_log_probabilities = constraint.apply(
                    constraint_state, expanded_start_class_log_probabilities
                )
            start_class_log_probabilities = expanded_start_class_log_probabilities.squeeze(1)

        # Prevent selecting the end symbol if there is any min_steps constraint
        if self.min_steps >= 1:
            start_class_log_probabilities[:, self._end_index] = torch.finfo(
                start_class_log_probabilities.dtype
            ).min

        # Get the initial predicted classes and their log probabilities.
        # shape: (batch_size, beam_size), (batch_size, beam_size)
        (
            start_top_log_probabilities,
            start_predicted_classes,
            sampler_state,
        ) = self.sampler.sample_beams(start_class_log_probabilities, self.beam_size, sampler_state)

        if (
            self.beam_size == 1 and
            (start_predicted_classes == self._end_index).all() and
            not self.distributed_model
        ):
            warnings.warn(
                "Empty sequences predicted. You may want to increase the beam size or ensure "
                "your step function is working properly.",
                RuntimeWarning,
            )
            return start_predicted_classes.unsqueeze(-1), start_top_log_probabilities

        # The log probabilities for the last time step.
        # shape: (batch_size, beam_size)
        last_log_probabilities = start_top_log_probabilities

        # shape: [(batch_size, beam_size)]
        predictions.append(start_predicted_classes)

        # Log probability tensor that mandates that the end token is selected.
        # shape: (batch_size * beam_size, num_classes)
        log_probs_after_end = start_class_log_probabilities.new_full(
            (batch_size * self.beam_size, num_classes),
            torch.finfo(start_class_log_probabilities.dtype).min,
        )
        log_probs_after_end[:, self._end_index] = 0.0

        # Set the same state for each element in the beam.
        self._update_initial_state(state, batch_size)

        for i, constraint in enumerate(self.constraints):
            constraint_states[i] = constraint.update_state(constraint_states[i], start_predicted_classes)

        for timestep in range(self.max_steps - 1):
            # shape: (batch_size * beam_size,)
            last_predictions = predictions[-1].reshape(batch_size * self.beam_size)

            # If every predicted token from the last step is `self._end_index`,
            # then we can stop early.
            # FIXME for distributed model we cannot stop early unless all devices are done,
            # for now we just always run to the max limit, ideally we should check all devices
            if not self.distributed_model and (last_predictions == self._end_index).all():
                # finished
                break
            # Take a step. This gets the predicted log probs of the next classes
            # and updates the state.
            # shape: (batch_size * beam_size, num_classes)
            class_log_probabilities, state = step(last_predictions, state, timestep + 1)

            # Apply all constraints.
            if self.constraints:
                # shape: (batch_size, beam_size, num_classes)
                reshaped_class_log_probabilities = class_log_probabilities.view(batch_size, self.beam_size, -1)
                for constraint, constraint_state in zip(self.constraints, constraint_states):
                    reshaped_class_log_probabilities = constraint.apply(
                        constraint_state, reshaped_class_log_probabilities
                    )
                # shape: (batch_size * beam_size, num_classes)
                class_log_probabilities = reshaped_class_log_probabilities.view(batch_size * self.beam_size, -1)

            # The `timestep`-th iteration of the for loop is generating the `timestep + 2`-th token
            # of the sequence (because `timestep` is 0-indexed and we generated the first token
            # before the for loop). Here we block the end index if the search is not allowed to
            # terminate on this iteration.
            if timestep + 2 <= self.min_steps:
                class_log_probabilities[:, self._end_index] = torch.finfo(class_log_probabilities.dtype).min

            # shape: (batch_size * beam_size, num_classes)
            last_predictions_expanded = last_predictions.unsqueeze(-1).expand(
                batch_size * self.beam_size, num_classes
            )

            # Here we are finding any beams where we predicted the end token in
            # the previous timestep and replacing the distribution with a
            # one-hot distribution, forcing the beam to predict the end token
            # this timestep as well.
            # shape: (batch_size * beam_size, num_classes)
            cleaned_log_probabilities = torch.where(
                last_predictions_expanded == self._end_index,
                log_probs_after_end,
                class_log_probabilities,
            )

            # shape (both): (batch_size * beam_size, per_node_beam_size)
            top_log_probabilities, predicted_classes, sampler_state = self.sampler.sample_nodes(
                cleaned_log_probabilities, self.per_node_beam_size, sampler_state
            )

            # Here we expand the last log probabilities to (batch_size * beam_size, per_node_beam_size)
            # so that we can add them to the current log probs for this timestep.
            # This lets us maintain the log probability of each element on the beam.
            # shape: (batch_size * beam_size, per_node_beam_size)
            expanded_last_log_probabilities = (
                last_log_probabilities.unsqueeze(2)
                .expand(batch_size, self.beam_size, self.per_node_beam_size)
                .reshape(batch_size * self.beam_size, self.per_node_beam_size)
            )

            # shape: (batch_size * beam_size, per_node_beam_size)
            summed_top_log_probabilities = top_log_probabilities + expanded_last_log_probabilities

            # shape: (batch_size, beam_size * per_node_beam_size)
            reshaped_summed = summed_top_log_probabilities.reshape(
                batch_size, self.beam_size * self.per_node_beam_size
            )

            # shape: (batch_size, beam_size * per_node_beam_size)
            reshaped_predicted_classes = predicted_classes.reshape(
                batch_size, self.beam_size * self.per_node_beam_size
            )

            # Keep only the top `beam_size` beam indices.
            # shape (both): (batch_size, beam_size)
            (
                restricted_beam_log_probs,
                restricted_beam_indices,
                sampler_state,
            ) = self.sampler.sample_beams(reshaped_summed, self.beam_size, sampler_state)

            # Use the beam indices to extract the corresponding classes.
            # shape: (batch_size, beam_size)
            restricted_predicted_classes = reshaped_predicted_classes.gather(1, restricted_beam_indices)

            predictions.append(restricted_predicted_classes)

            # shape: (batch_size, beam_size)
            last_log_probabilities = restricted_beam_log_probs

            # The beam indices come from a `beam_size * per_node_beam_size` dimension where the
            # indices with a common ancestor are grouped together. Hence
            # dividing by per_node_beam_size gives the ancestor. (Note that this is integer
            # division as the tensor is a LongTensor.)
            # shape: (batch_size, beam_size)
            backpointer = torch.divide(restricted_beam_indices, self.per_node_beam_size, rounding_mode="trunc")
            backpointers.append(backpointer)

            # Keep only the pieces of the state tensors corresponding to the
            # ancestors created this iteration.
            self._update_state(state, backpointer)

            for i, constraint in enumerate(self.constraints):
                constraint_states[i] = constraint.update_state(
                    constraint_states[i], restricted_predicted_classes, last_backpointer=backpointer
                )

        # Warn about "-inf" log probabilities if not using any constraints (negligible
        # log probabilities are expected when using constraints).
        if not self.constraints and (
            not torch.isfinite(last_log_probabilities).all()
            or (last_log_probabilities == torch.finfo(last_log_probabilities.dtype).min).any()
        ):
            warnings.warn(
                "Negligible log probabilities encountered ('-inf' or equivalent). "
                "Some final sequences may not make sense. "
                "This can happen when the beam size is larger than the number of valid (non-zero "
                "probability) transitions that the step function produces.",
                RuntimeWarning,
            )

        reconstructed_predictions = self._reconstruct_sequences(predictions, backpointers)

        # shape: (batch_size, beam_size, max_steps)
        all_predictions = torch.cat(list(reversed(reconstructed_predictions)), 2)

        # Calculate the final sequence scores
        # shape: (batch_size, beam_size)
        final_scores = self.final_sequence_scorer.score(all_predictions, last_log_probabilities, self._end_index)

        # Sort the sequences based on the final scores so the best scoring
        # sequence is at index 0
        sorted_final_scores, sorted_indices = torch.sort(final_scores, dim=1, descending=True)
        sorted_all_predictions = torch.gather(
            all_predictions, 1, sorted_indices.unsqueeze(-1).expand_as(all_predictions)
        )

        return sorted_all_predictions, sorted_final_scores

    def _update_initial_state(self, state: StateType, batch_size: int):
        """
        Expand tensors in a state dictionary from `(batch_size, *)` to `(batch_size * beam_size, *)`.
        """
        for key, state_tensor in state.items():
            if state_tensor is None:
                continue
            # shape: (batch_size * beam_size, *)
            _, *last_dims = state_tensor.size()
            state[key] = (
                state_tensor.unsqueeze(1)
                .expand(batch_size, self.beam_size, *last_dims)
                .reshape(batch_size * self.beam_size, *last_dims)
            )

    def _update_state(self, state: StateType, backpointer: torch.Tensor):
        batch_size = backpointer.size()[0]

        for key, state_tensor in state.items():
            if state_tensor is None:
                continue
            _, *last_dims = state_tensor.size()
            # shape: (batch_size, beam_size, *)
            expanded_backpointer = backpointer.view(batch_size, self.beam_size, *([1] * len(last_dims))).expand(
                batch_size, self.beam_size, *last_dims
            )
            # shape: (batch_size * beam_size, *)
            state[key] = (
                state_tensor.reshape(batch_size, self.beam_size, *last_dims)
                .gather(1, expanded_backpointer)
                .reshape(batch_size * self.beam_size, *last_dims)
            )
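The step-function contract documented in `search()` can be exercised end to end with a toy model. The following is a hedged sketch, not part of the commit: the "bigram table" model, the token ids, and the assumption that the module is importable as `beam_search` are all made up for illustration.

import torch
from beam_search import BeamSearch

# Toy vocabulary: 0 = EOS, 1..4 = ordinary tokens; a fixed transition table plays the role of the model.
transition_log_probs = torch.log_softmax(torch.randn(5, 5), dim=-1)

def step(last_predictions: torch.Tensor, state: dict, timestep: int):
    # last_predictions: (group_size,) token ids from the previous step.
    # Returns (group_size, vocab_size) log-probs and the (unchanged) state, as required by `search()`.
    return transition_log_probs[last_predictions], state

searcher = BeamSearch(end_index=0, max_steps=10, beam_size=3)
start = torch.zeros(2, dtype=torch.long) + 1      # (batch_size,) start token ids
predictions, scores = searcher.search(start, {}, step)
print(predictions.shape, scores.shape)            # (2, 3, <=10) and (2, 3)
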
config_molmoe.py
CHANGED
@@ -27,11 +27,15 @@ import gin
 
 #from olmo.aliases import PathOrStr
 from .aliases import PathOrStr
-from olmo.exceptions import OLMoConfigurationError
-from
-
-from
-
+#from olmo.exceptions import OLMoConfigurationError
+from .exceptions import OLMoConfigurationError
+#from olmo.util import StrEnum, resource_path
+from .util import StrEnum, resource_path
+
+#from olmo.mm_data.data_utils import build_tokenizer
+from .data_utils import build_tokenizer
+#from olmo.multimodal_preprocessor import MultiModalPreprocessor
+from .multimodal_preprocessor import MultiModalPreprocessor
 
 __all__ = [
     "ActivationType",
constants.py
ADDED
@@ -0,0 +1,571 @@
DEFAULT_IMAGE_PATCH_TOKEN = f"<im_patch>"
DEFAULT_IM_START_TOKEN = f"<im_start>"
DEFAULT_IM_END_TOKEN = f"<im_end>"
DEFAULT_IM_COL_TOKEN = f"<im_col>"
IMAGE_PROMPT = "<|image|>"

EXTRA_TOKENS = (DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_COL_TOKEN, IMAGE_PROMPT)


VIT_STANDARD_CONFIGS = {
    "dinov2-large": {
        "image_emb_dim": 1024,
        "image_mlp_dim": 4096,
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_num_layers': 24,
        'image_num_heads': 16,
        'image_num_key_value_heads': 16,
        'image_head_dim': 64,
        'image_mlp_activations': 'gelu',
        'image_default_input_size': (224, 224),
        'image_num_pos': 257,
        'image_norm_eps': 1e-6,
        "image_model_type": "dino"
    },
    "SigLIP-So400m-14-384": {
        "image_emb_dim": 1152,
        'image_num_layers': 27,
        "image_mlp_dim": 4304,
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_num_heads': 16,
        'image_num_key_value_heads': 16,
        'image_head_dim': 72,
        'image_mlp_activations': 'gelu',
        # Although it is called "384" that seems to be an error of the author's
        # part, it actually only handles 378 inputs
        'image_default_input_size': (378, 378),
        'image_num_pos': 729,  # note not CLS token
        'image_norm_eps': 1e-6,
        "image_model_type": "siglip",
        "resize_mode": "siglip"
    },
    "DFN5B-CLIP-ViT-H-14-378": {
        "image_emb_dim": 1280,
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_num_layers': 32,
        'image_num_heads': 16,
        'image_num_key_value_heads': 16,
        'image_head_dim': 80,
        'image_mlp_dim': 5120,
        'image_dropout_rate': 0.0,
        'image_mlp_activations': 'quick_gelu',
        'image_default_input_size': (378, 378),
        'image_num_pos': 730,
        'image_norm_eps': 1e-5,
        "image_model_type": "openai",
        "resize_mode": "no_aspect_ratio"
    },
    'ViT-L/14-336': {
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_emb_dim': 1024,
        'image_num_heads': 16,
        'image_num_layers': 23,
        'image_head_dim': 64,
        'image_mlp_dim': 4096,
        'image_mlp_activations': 'quick_gelu',
        'image_dropout_rate': 0.0,
        'image_num_pos': 577,
        'image_default_input_size': (336, 336),
        'image_norm_eps': 1e-5,
        'image_num_key_value_heads': 16,
        "image_model_type": "openai"
    },
    'EVA02-L-14-336': {
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_emb_dim': 1024,
        'image_num_heads': 16,
        'image_num_layers': 24,
        'image_head_dim': 64,
        'image_mlp_dim': 2730,
        'image_mlp_activations': 'silu',
        'image_dropout_rate': 0.0,
        'image_num_pos': 577,
        'image_default_input_size': (336, 336),
        'image_norm_eps': 1e-6,
        'image_num_key_value_heads': 16,
        "image_model_type": "eva"
    },
    'ViT-L/14': {
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_emb_dim': 1024,
        'image_num_heads': 16,
        # Note the original model has 24 layers, but we don't use the last layer
        'image_num_layers': 23,
        'image_head_dim': 64,
        'image_mlp_dim': 4096,
        'image_mlp_activations': 'quick_gelu',
        'image_dropout_rate': 0.0,
        'image_num_pos': 257,
        'image_default_input_size': (224, 224),
        'image_norm_eps': 1e-5,
        'image_num_key_value_heads': 16,
        "image_model_type": "openai"
    },
    'debug': {
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_emb_dim': 1024,
        'image_num_heads': 16,
        'image_num_layers': 1,
        'image_head_dim': 64,
        'image_mlp_dim': 1024,
        'image_mlp_activations': 'quick_gelu',
        'image_dropout_rate': 0.0,
        'image_num_pos': 577,
        'image_default_input_size': (336, 336),
        'image_norm_eps': 1e-5,
        'image_num_key_value_heads': 16,
        "image_model_type": "openai"
    }
}

OPEN_LLM_STANDARD_CONFIGS = {
    "qwen1.5_7b": {
        'vocab_size': 151936,
        'hidden_size': 4096,
        'intermediate_size': 11008,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 32,
        'max_sequence_length': 2048,
        'max_position_embeddings': 32768,
        'rope_theta': 1000000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        "qkv_bias": True,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "hf-Qwen/Qwen1.5-7B",
    },
    "qwen1.5_14b": {
        'vocab_size': 152064,
        'hidden_size': 5120,
        'intermediate_size': 13696,
        'num_hidden_layers': 40,
        'num_attention_heads': 40,
        'num_key_value_heads': 40,
        'max_sequence_length': 2048,
        'max_position_embeddings': 32768,
        'rope_theta': 1000000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        "qkv_bias": True,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "hf-Qwen/Qwen1.5-14B",
    },
    "qwen1.5_32b": {
        "vocab_size": 152064,
        "hidden_size": 5120,
        "intermediate_size": 27392,
        "num_hidden_layers": 64,
        "num_attention_heads": 40,
        "num_key_value_heads": 8,
        'max_sequence_length': 2048,
        'max_position_embeddings': 32768,
        "rope_theta": 1000000.0,
        'initializer_range': 0.02,
        "rms_norm_eps": 1e-6,
        "qkv_bias": True,
        "tie_word_embeddings": False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "hf-Qwen/Qwen1.5-32B",
    },
    'llama_7b': {
        'vocab_size': 32000,
        'hidden_size': 4096,
        'intermediate_size': 11008,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 32,
        'max_sequence_length': 2048,
        'max_position_embeddings': 8192,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "llama"
    },
    'yi_6b': {
        'vocab_size': 64000,
        'hidden_size': 4096,
        'intermediate_size': 11008,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 4,
        'max_sequence_length': 4096,
        'max_position_embeddings': 4096,
        'rope_theta': 5000000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "yi"
    },
    'yi_9b': {
        'vocab_size': 64000,
        'hidden_size': 4096,
        'intermediate_size': 11008,
        'num_hidden_layers': 48,
        'num_attention_heads': 32,
        'num_key_value_heads': 4,
        'max_sequence_length': 4096,
        'max_position_embeddings': 4096,
        'rope_theta': 10000,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-06,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "yi"
    },
    'yi_34b': {
        'vocab_size': 64000,
        'hidden_size': 7168,
        'intermediate_size': 20480,
        'num_hidden_layers': 60,
        'num_attention_heads': 56,
        'num_key_value_heads': 8,
        'max_sequence_length': 4096,
        'max_position_embeddings': 4096,
        'rope_theta': 5000000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "yi"
    },
    "olmo_1b": {
        'vocab_size': 50304,
        'hidden_size': 2048,
        'intermediate_size': 8192,
        'num_hidden_layers': 16,
        'num_attention_heads': 16,
        'num_key_value_heads': 16,
        'max_sequence_length': 4096,
        'max_position_embeddings': 32768,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': True,
        'hidden_act': 'silu',
        'norm_module': 'OlmoLayerNorm',
        "tokenizer": "hf-allenai/OLMo-1B"
    },
    "olmo_7b": {
        'vocab_size': 50304,
        'hidden_size': 4096,
        'intermediate_size': 22016//2,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 32,
        'max_sequence_length': 4096,
        'max_position_embeddings': 32768,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'OlmoLayerNorm',
        "tokenizer": "hf-allenai/OLMo-7B",
    },
    "olmo_1.7_7b": {
        'vocab_size': 50304,
        'hidden_size': 4096,
        'intermediate_size': 22016//2,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 32,
        'max_sequence_length': 4096,
        'max_position_embeddings': 32768,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        "qkv_clip": 8,
        'norm_module': 'OlmoLayerNorm',
        "tokenizer": "hf-allenai/OLMo-1.7-7B",
    },
    'mistral_7b': {
        'vocab_size': 32000,
        'hidden_size': 4096,
        'intermediate_size': 14336,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 8,
        'max_sequence_length': 4096,
        'max_position_embeddings': 32768,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "mistral"
    },
    'mistral0.3_7b': {
        'vocab_size': 32768,
        'hidden_size': 4096,
        'intermediate_size': 14336,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 8,
        'max_sequence_length': 4096,
        'max_position_embeddings': 32768,
        'rope_theta': 1000000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "mistral0.3"
    },
    "mistral0.2_22b": {
        'vocab_size': 32000,
        'hidden_size': 6144,
        'intermediate_size': 16384,
        'num_hidden_layers': 56,
        'num_attention_heads': 48,
        'num_key_value_heads': 8,
        'max_sequence_length': 4096,
        'max_position_embeddings': 32768,
        'rope_theta': 1000000,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "mistral"
    },
    'llama_13b': {
        'vocab_size': 32000,
        'hidden_size': 5120,
        'intermediate_size': 13824,
        'num_hidden_layers': 40,
        'num_attention_heads': 40,
        'num_key_value_heads': 40,
        'max_sequence_length': 2048,
        'max_position_embeddings': 8192,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        "norm_module": 'RMSNorm',
        'rope_theta': 10000.0,
        "tokenizer": "llama"
    },
    'llama_70b': {
        'vocab_size': 32000,
        'hidden_size': 8192,
        'intermediate_size': 28672,
        'num_hidden_layers': 80,
        'num_attention_heads': 64,
        'num_key_value_heads': 8,
        'max_sequence_length': 8192,
        'max_position_embeddings': 8192,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        "tokenizer": "llama"
    },
    'llama_70bflash': {
        'vocab_size': 32000,
        'hidden_size': 8192,
        'intermediate_size': 28672,
        'num_hidden_layers': 80,
        'num_attention_heads': 64,
        'num_key_value_heads': 8,
        'max_sequence_length': 8192,
        'max_position_embeddings': 8192,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'scan_attention': True,
        'scan_mlp': True,
        'hidden_act': 'silu',
        "tokenizer": "llama"
    },
    'llama3_8b': {
        'vocab_size': 128256,
        'hidden_size': 4096,
        'intermediate_size': 14336,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 8,
        'max_sequence_length': 8192,
        'max_position_embeddings': 8192,
        'rope_theta': 500000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "hf-meta-llama/Meta-Llama-3-8B",

    },
    'llama3_70b': {
        'vocab_size': 128256,
        'hidden_size': 8192,
        'intermediate_size': 28672,
        'num_hidden_layers': 80,
        'num_attention_heads': 64,
        'num_key_value_heads': 8,
        'max_sequence_length': 8192,
        'max_position_embeddings': 8192,
        'rope_theta': 500000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "hf-meta-llama/Meta-Llama-3-70B",
    },
    'open_llama_3b': {
        'vocab_size': 32000,
        'hidden_size': 3200,
        'intermediate_size': 8640,
        'num_hidden_layers': 26,
        'num_attention_heads': 32,
        'max_sequence_length': 2048,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        'max_position_embeddings': 2048,
        'num_key_value_heads': 32,
        'rope_theta': 10000.0,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "llama"
    },
    'gemma_2b': {
        'vocab_size': 256000,
        'hidden_size': 2048,
        'intermediate_size': 16384,
        'num_hidden_layers': 18,
        'num_attention_heads': 8,
        'max_sequence_length': 8192,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        'max_position_embeddings': 8192,
        'num_key_value_heads': 1,
        'rope_theta': 10000.0,
        'tie_word_embeddings': True,
        'normalize_input_embeds': True,
        'norm_module': 'GemmaRMSNorm',
        'hidden_act': 'gelu',
        "tokenizer": "gemma"
    },
    'gemma_7b': {
        'vocab_size': 256000,
        'hidden_size': 3072,
        'intermediate_size': 24576,
        'num_hidden_layers': 28,
        'num_attention_heads': 16,
        'max_sequence_length': 8192,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        'max_position_embeddings': 8192,
        'num_key_value_heads': 16,
        'rope_theta': 10000.0,
        'tie_word_embeddings': True,
        'normalize_input_embeds': True,
        'norm_module': 'GemmaRMSNorm',
        'hidden_act': 'gelu',
        "tokenizer": "gemma"
    },
    'tiny_llama_1b': {
        'vocab_size': 32000,
        'hidden_size': 2048,
        'intermediate_size': 5632,
        'num_hidden_layers': 22,
        'num_attention_heads': 32,
        'max_sequence_length': 2048,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'max_position_embeddings': 2048,
        'num_key_value_heads': 4,
        'rope_theta': 10000.0,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "llama"
    },
    'debug': {  # A small model for debugging
        'vocab_size': 32000,
        'hidden_size': 512,
        'intermediate_size': 512,
        'num_hidden_layers': 1,
        'num_attention_heads': 8,
        'max_sequence_length': 4096,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'max_position_embeddings': 4096,
        'num_key_value_heads': 8,
        'rope_theta': 10000.0,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "llama"
    },
    'gemma2_9b': {
        'vocab_size': 256000,
        'hidden_size': 3584,
        'head_dim': 256,
        'intermediate_size': 14336,
        'num_hidden_layers': 42,
        'num_attention_heads': 16,
        'max_sequence_length': 8192,
        "query_pre_attn_scalar": 224,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        'max_position_embeddings': 8192,
        'num_key_value_heads': 8,
        'rope_theta': 10000.0,
        'tie_word_embeddings': False,
        'normalize_input_embeds': True,
        'norm_module': 'GemmaRMSNorm',
        'hidden_act': 'gelu_tanh',
        "tokenizer": "hf-google/gemma-2-9b",
        "attn_logit_softcapping": 50.0,
        "final_logit_softcapping": 30.0,
    },
    'gemma2_27b': {
        'vocab_size': 256000,
        'hidden_size': 4608,
        'head_dim': 128,
        'intermediate_size': 36864,
        'num_hidden_layers': 46,
        'num_attention_heads': 32,
        'max_sequence_length': 8192,
        "query_pre_attn_scalar": 144,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        'max_position_embeddings': 8192,
        'num_key_value_heads': 16,
        'rope_theta': 10000.0,
        'tie_word_embeddings': False,
        'normalize_input_embeds': True,
        'norm_module': 'GemmaRMSNorm',
        'hidden_act': 'gelu_tanh',
        "tokenizer": "hf-google/gemma-2-27b",
        "attn_logit_softcapping": 50.0,
        "final_logit_softcapping": 30.0,
    },
}
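A small usage sketch (not part of the commit) of how these lookup tables might be consumed when assembling a model config; the merging helper is an assumption for illustration only:

from .constants import VIT_STANDARD_CONFIGS, OPEN_LLM_STANDARD_CONFIGS

def build_model_kwargs(vit_name: str, llm_name: str) -> dict:
    """Hypothetical helper: merge a vision-tower preset with an LLM preset."""
    kwargs = dict(VIT_STANDARD_CONFIGS[vit_name])        # e.g. "SigLIP-So400m-14-384"
    kwargs.update(OPEN_LLM_STANDARD_CONFIGS[llm_name])   # e.g. "qwen1.5_7b"
    return kwargs

print(build_model_kwargs("SigLIP-So400m-14-384", "qwen1.5_7b")["tokenizer"])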
data_factory.py
ADDED
@@ -0,0 +1,222 @@
'''
Dataset factory to load data from huggingface and others.
'''
import dataclasses
import logging
from typing import List, Optional

import numpy as np
import tensorflow as tf

from .data_utils import add_segment_ids
from .dataset_sizes import get_dataset_size
from .tasks import get_task
from .multimodal_preprocessor import MultiModalPreprocessor
import seqio

from .torch_util import get_global_rank

log = logging.getLogger(__name__)


@dataclasses.dataclass
class SeqioDataset:
    mixture_or_task_name: str
    seq_len: int
    global_batch_size: int
    max_crops: int = None
    is_training: bool = False
    for_inference: bool = False
    split: str = 'train'
    shuffle: bool = True
    num_epochs: int = None
    drop_remainder: bool = True
    seed: int = None
    pack: bool = False
    use_custom_packing_ops: bool = False
    use_memory_cache: bool = False
    shuffle_buffer_size: Optional[int] = None
    different_host_mixture_seeds: bool = True
    disable_autotune: bool = True
    trim_output_features: bool = True

    @classmethod
    def from_dict(cls, data):
        return cls(**data)

    def get_task_feature_lengths_dict(self, max_crops):
        if self.max_crops is not None:
            assert self.max_crops >= max_crops
            max_crops = self.max_crops
        return dict(
            target_tokens=self.seq_len,
            loss_masks=self.seq_len,
            images=max_crops,
            image_positions=max_crops,
            image_input_idx=max_crops,
            is_training=self.is_training
        )

    def build(self, preprocessor: MultiModalPreprocessor, shard_id, num_shards):
        shard_info = seqio.ShardInfo(index=shard_id, num_shards=num_shards)
        task_feature_lengths_dict = self.get_task_feature_lengths_dict(
            preprocessor.get_max_total_crops())

        seed = self.seed
        assert seed is not None

        batch_size = self.global_batch_size // num_shards

        if isinstance(self.mixture_or_task_name, (dict, list, tuple)):
            if isinstance(self.mixture_or_task_name, dict):
                items = self.mixture_or_task_name.items()
            else:
                items = self.mixture_or_task_name
            task_list = []
            for task, weight in items:
                task = get_task(preprocessor, task, self.is_training, self.for_inference)
                task_list.append((task, weight))
            mixture_or_task = task_list
        else:
            mixture_or_task = get_task(
                preprocessor, self.mixture_or_task_name, self.is_training, self.for_inference)

        in_memory_shuffle = self.shuffle
        if not self.drop_remainder:
            # Used if we want to evaluate on an eval dataset without dropping any examples.
            # To do this, we pad the dataset with dummy examples marked as invalid in their
            # metadata so we can still get fixed-sized batches.
            assert self.num_epochs is not None
            assert not self.pack
            assert not isinstance(mixture_or_task, list), "Inference datasets cannot be mixtures"
            logging.info(
                f"Initializing inf. dataset {mixture_or_task.name}: replica_batch_size={batch_size}"
                f' seed={seed}, sharding={shard_info.index}/{shard_info.num_shards}'
            )
            ds = mixture_or_task.get_dataset(
                sequence_length=task_feature_lengths_dict,
                split=self.split,
                shuffle=in_memory_shuffle,
                num_epochs=self.num_epochs,
                seed=seed,
                try_in_mem_cache=self.use_memory_cache,
                trim_output_features=self.trim_output_features
            )

            try:
                n = len(ds)
            except TypeError:
                dataset_len = get_dataset_size(self.mixture_or_task_name, self.split)
                logging.info(f"Setting dataset len to {dataset_len} based on DATASET_SIZES")
                n = dataset_len
                ds = tf.data.experimental.assert_cardinality(n)(ds)

            remainder = n % self.global_batch_size
            if remainder > 0:
                n_to_pad = self.global_batch_size - remainder
            else:
                n_to_pad = 0
            assert "metadata/valid" not in ds.element_spec
            def add_valid(x):
                x["metadata/valid"] = True
                return x
            def add_invalid(x):
                x["metadata/valid"] = False
                return x
            ds = ds.map(add_valid)
            if n_to_pad > 0:
                to_pad = ds.take(1).map(add_invalid).cache().repeat(n_to_pad)
                ds = ds.concatenate(to_pad)

            # Shard after padding to ensure shards are the same length
            ds = ds.shard(num_shards=num_shards, index=shard_id)

            ds = preprocessor.get_post_mixing_preprocessor()(
                ds, task_feature_lengths=task_feature_lengths_dict)
            data_iter = ds.batch(batch_size, drop_remainder=True, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            # Make it possible for client to get the size of the batched/sharded dataset with `len()`
            new_len = (n + n_to_pad) // self.global_batch_size
            data_iter = tf.data.experimental.assert_cardinality(new_len)(data_iter)
        else:
            if isinstance(mixture_or_task, list):
                total_rate = sum(x[1] for x in mixture_or_task)
                mixture_or_task = [(task, r/total_rate) for task, r in mixture_or_task]
                sorted_tasks: List[seqio.Task] = sorted(mixture_or_task, key=lambda x: -x[1])

                if self.different_host_mixture_seeds and shard_info:
                    # If each process has the same seed they will draw from the datasets in the same
                    # order, which can make the global batches very non-random if there are
                    # many processes each with a small batch size. To fix this, we give each host
                    # a different seed based on its rank to use when mixing
                    mix_seed = seed + shard_info.index*4397
                else:
                    mix_seed = seed

                logging.info(
                    f"Initializing mixture: replica_batch_size={batch_size} seed={seed}, "
                    f"mix_seed={mix_seed}, sharding={shard_info.index}/{shard_info.num_shards} rates:"
                )
                for task, rate in sorted_tasks:
                    logging.info(f"\t{task.name}: {rate:0.4f}")

                datasets = []
                rates = []
                for task, rate in sorted_tasks:
                    assert rate > 0
                    datasets.append(task.get_dataset(
                        task_feature_lengths_dict,
                        split=self.split,
                        shuffle=self.shuffle,
                        seed=seed,
                        shard_info=shard_info,
                        num_epochs=self.num_epochs,
                        try_in_mem_cache=self.use_memory_cache,
                        trim_output_features=self.trim_output_features
                    ))
                    rates.append(rate)

                # If any of the sub-tasks have subsegment_ids, we need to ensure all the tasks have
                # a subsegment_ids field so they can be mixed
                if any("subsegment_ids" in ds.element_spec for ds in datasets):
                    for ix, ds in enumerate(datasets):
                        if "subsegment_ids" not in ds.element_spec:
                            datasets[ix] = add_segment_ids(ds)

                ds = tf.data.Dataset.sample_from_datasets(
                    datasets, rates, seed=mix_seed, stop_on_empty_dataset=False)
            else:
                logging.info(
                    f"Initializing dataset {mixture_or_task.name}: replica_batch_size={batch_size}"
                    f' seed={seed}, sharding={shard_info.index}/{shard_info.num_shards}'
                )
                ds = mixture_or_task.get_dataset(
                    task_feature_lengths_dict,
                    split=self.split,
                    shuffle=self.shuffle,
                    seed=seed,
                    shard_info=shard_info,
                    num_epochs=self.num_epochs,
                    try_in_mem_cache=self.use_memory_cache,
                    trim_output_features=self.trim_output_features
                )
            data_iter = preprocessor.get_post_mixing_preprocessor()(
                ds, task_feature_lengths=task_feature_lengths_dict)
            data_iter = data_iter.batch(batch_size, drop_remainder=True, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            ds = ds.prefetch(2)

        # Following https://github.com/google-research/big_vision/blob/b8dab6e4de3436849415f37c591399c93b1eaf39/big_vision/input_pipeline.py#L228
        # These options try to stop tf datasets from eating all our RAM if we are using a
        # large mixture
        # This options are used by default in some google codebases
        # For example: (https://github.com/google-research/big_vision/blob/b8dab6e4de3436849415f37c591399c93b1eaf39/big_vision/input_pipeline.py#L228)
        # They don't seem to harm throughput and can save RAM so we use them as well
        options = tf.data.Options()
        options.experimental_optimization.inject_prefetch = False
        options.threading.max_intra_op_parallelism = 1
        if self.disable_autotune:
            # Following https://www.tensorflow.org/datasets/performances
            # This reduces RAM and checkpoint size by a lot
            options.autotune.enabled = False
        data_iter = data_iter.with_options(options)

        return data_iter
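A hedged usage sketch (not part of the file) of how `SeqioDataset` is meant to be driven; the task name and the `MultiModalPreprocessor` constructor arguments are placeholders, not the real API:

# Hypothetical driver code; the task name and preprocessor construction are assumptions.
from .data_factory import SeqioDataset
from .multimodal_preprocessor import MultiModalPreprocessor

preprocessor = MultiModalPreprocessor(tokenizer="hf-Qwen/Qwen1.5-7B")  # assumed signature
dataset = SeqioDataset.from_dict(dict(
    mixture_or_task_name="some_registered_task",  # placeholder task name
    seq_len=2048,
    global_batch_size=128,
    is_training=True,
    seed=0,
))
data_iter = dataset.build(preprocessor, shard_id=0, num_shards=8)
for batch in data_iter.take(1):
    print({k: v.shape for k, v in batch.items()})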
data_utils.py
ADDED
@@ -0,0 +1,827 @@
1 |
+
import abc
|
2 |
+
import dataclasses
|
3 |
+
import functools
|
4 |
+
import os
|
5 |
+
from os import environ
|
6 |
+
from typing import Mapping, Optional, Sequence, List
|
7 |
+
from absl import logging
|
8 |
+
import clu
|
9 |
+
import gin
|
10 |
+
from pathlib import Path
|
11 |
+
|
12 |
+
import seqio
|
13 |
+
from seqio import utils
|
14 |
+
from seqio.feature_converters import _check_exact_match, _check_lengths
|
15 |
+
|
16 |
+
import tensorflow as tf
|
17 |
+
from tensorflow.python.ops import control_flow_ops
|
18 |
+
from tensorflow.python.ops.image_ops_impl import _ImageDimensions, _CheckAtLeast3DImage, _assert, _is_tensor
|
19 |
+
|
20 |
+
from tensorflow.python.framework import ops
|
21 |
+
from tensorflow.python.ops import array_ops
|
22 |
+
from transformers import PreTrainedTokenizerFast
|
23 |
+
|
24 |
+
from . import seqio_tokenizer as vocab
|
25 |
+
from .constants import *
|
26 |
+
from .utils import pop_metadata
|
27 |
+
from .util import is_url
|
28 |
+
|
29 |
+
DEFAULT_EXTRA_IDS = 0
|
30 |
+
OutputFeaturesType = Mapping[str, utils.Feature]
|
31 |
+
|
32 |
+
|
33 |
+
def build_tokenizer(
|
34 |
+
tokenizer_type, has_extra_token=True,
|
35 |
+
adds_space=False,
|
36 |
+
olmo_bos_token_id=1, olmo_eos_token_id=2,
|
37 |
+
tokenizer_dir="gs://mm-olmo/tokenizer",
|
38 |
+
pad_tokenizer_to=None, cache={},
|
39 |
+
):
|
40 |
+
cache_key = (tokenizer_type, has_extra_token, adds_space, olmo_bos_token_id,
|
41 |
+
olmo_eos_token_id, pad_tokenizer_to)
|
42 |
+
if cache_key in cache:
|
43 |
+
return cache[cache_key]
|
44 |
+
|
45 |
+
if tokenizer_type == 'llama':
|
46 |
+
tok = vocab.SentencePieceVocabulary(
|
47 |
+
os.path.join(tokenizer_dir, "llama_tokenizer.model"),
|
48 |
+
extra_ids=DEFAULT_EXTRA_IDS,
|
49 |
+
reverse_extra_ids=True,
|
50 |
+
extra_tokens=EXTRA_TOKENS if has_extra_token else None,
|
51 |
+
)
|
52 |
+
elif tokenizer_type == 'yi':
|
53 |
+
tok = vocab.SentencePieceVocabulary(
|
54 |
+
os.path.join(tokenizer_dir, "yi_tokenizer.model"),
|
55 |
+
extra_ids=DEFAULT_EXTRA_IDS,
|
56 |
+
reverse_extra_ids=True,
|
57 |
+
extra_tokens=EXTRA_TOKENS if has_extra_token else None,
|
58 |
+
)
|
59 |
+
elif tokenizer_type == 'mistral':
|
60 |
+
tok = vocab.SentencePieceVocabulary(
|
61 |
+
os.path.join(tokenizer_dir, "mistral_tokenizer.model"),
|
62 |
+
extra_ids=DEFAULT_EXTRA_IDS,
|
63 |
+
reverse_extra_ids=True,
|
64 |
+
extra_tokens=EXTRA_TOKENS if has_extra_token else None,
|
65 |
+
)
|
66 |
+
|
67 |
+
elif tokenizer_type == "mistral0.3":
|
68 |
+
tok = vocab.SentencePieceVocabulary(
|
69 |
+
os.path.join(tokenizer_dir, "mistral0.3_tokenizer.model.v3"),
|
70 |
+
extra_ids=DEFAULT_EXTRA_IDS,
|
71 |
+
reverse_extra_ids=True,
|
72 |
+
extra_tokens=EXTRA_TOKENS if has_extra_token else None,
|
73 |
+
)
|
74 |
+
elif tokenizer_type == 'gemma':
|
75 |
+
tok = vocab.SentencePieceVocabulary(
|
76 |
+
os.path.join(tokenizer_dir, "gemma_tokenizer.model"),
|
77 |
+
extra_ids=DEFAULT_EXTRA_IDS,
|
78 |
+
reverse_extra_ids=True,
|
79 |
+
extra_tokens=EXTRA_TOKENS if has_extra_token else None,
|
80 |
+
)
|
81 |
+
elif tokenizer_type.startswith("hf-"):
|
82 |
+
# FIXME When using the beaker image "sanghol/mm-olmo" for hosting endpoints,
|
83 |
+
# we should set the cache_dir, otherwise FileNotFound errors will be raised
|
84 |
+
cache_dir = None if tokenizer_dir is None or is_url(tokenizer_dir) else tokenizer_dir
|
85 |
+
from transformers import AutoTokenizer
|
86 |
+
|
87 |
+
extra_tokens = list(EXTRA_TOKENS)
|
88 |
+
if pad_tokenizer_to is not None:
|
89 |
+
tokenizer = AutoTokenizer.from_pretrained(tokenizer_type[3:], token=environ.get("HF_ACCESS_TOKEN"), cache_dir=cache_dir)
|
90 |
+
n_extra_tokens = pad_tokenizer_to - len(tokenizer)
|
91 |
+
# This handles a case where the LLM embedding matrix is larger than the vocab size
|
92 |
+
# We need the extra tokens in `EXTRA_TOKENS` to be assigned id's higher than the embedding
|
93 |
+
# matrix size, not the vocab size, since we will concat the embedding and matrix with
|
94 |
+
# the special token embedding matrix, so we pad the vocab with additional special tokens
|
95 |
+
if n_extra_tokens > 0:
|
96 |
+
logging.info(f"Padding tokenizer with {n_extra_tokens} tokens")
|
97 |
+
extra_tokens = [f"|<EXTRA_TOKENS_{i}>|" for i in range(n_extra_tokens)] + extra_tokens
|
98 |
+
|
99 |
+
bos_token_id = None
|
100 |
+
|
101 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
102 |
+
tokenizer_type[3:], additional_special_tokens=extra_tokens,
|
103 |
+
token=environ.get("HF_ACCESS_TOKEN"),
|
104 |
+
cache_dir=cache_dir,
|
105 |
+
)
|
106 |
+
if ("qwen2" in tokenizer_type.lower()) or ("olmo" in tokenizer_type.lower()):
|
107 |
+
# These tokenizers do not have a BOS, and instead use EOS as a generic seperator token.
|
108 |
+
# In this case we will use EOS as BOS
|
109 |
+
assert tokenizer.bos_token_id is None
|
110 |
+
bos_token_id = tokenizer.eos_token_id
|
111 |
+
|
112 |
+
if pad_tokenizer_to is not None:
|
113 |
+
for ix, tok in enumerate(EXTRA_TOKENS):
|
114 |
+
ids = tokenizer.encode(tok, add_special_tokens=False)
|
115 |
+
assert ids == [pad_tokenizer_to + ix]
|
116 |
+
|
117 |
+
tok = vocab.HfTokenizerWrapper(tokenizer, bos_token_id=bos_token_id, adds_space=adds_space)
|
118 |
+
elif tokenizer_type.startswith("olmo-"):
|
119 |
+
from olmo.tokenizer import Tokenizer
|
120 |
+
assert Path(tokenizer_type[5:]).is_file()
|
121 |
+
tokenizer = Tokenizer.from_file(
|
122 |
+
tokenizer_type[5:],
|
123 |
+
eos_token_id=olmo_eos_token_id,
|
124 |
+
pad_token_id=-1,
|
125 |
+
)
|
126 |
+
tok = vocab.OLMoTokenizerWrapper(tokenizer, bos_token_id=olmo_bos_token_id, adds_space=adds_space)
|
127 |
+
else:
|
128 |
+
raise NotImplementedError(tokenizer_type)
|
129 |
+
cache[cache_key] = tok
|
130 |
+
return tok
|
131 |
+
|
132 |
+
|
133 |
+
def get_special_token_ids(tokenizer):
|
134 |
+
if isinstance(tokenizer, (vocab.HfTokenizerWrapper, vocab.OLMoTokenizerWrapper)):
|
135 |
+
ids = tokenizer.encode("".join(EXTRA_TOKENS))
|
136 |
+
if len(ids) == len(EXTRA_TOKENS) + 1:
|
137 |
+
ids = ids[1:]
|
138 |
+
elif ("gemma_tokenizer" in tokenizer._sentencepiece_model_file or
|
139 |
+
"yi_tokenizer" in tokenizer._sentencepiece_model_file
|
140 |
+
):
|
141 |
+
# Not sure why ATM, but the LLaMa tokenizer will add an extra space token
|
142 |
+
# if this string starts with a space, while the gemma one needs the leading space
|
143 |
+
ids = tokenizer.encode(" " + " ".join(EXTRA_TOKENS))
|
144 |
+
else:
|
145 |
+
ids = tokenizer.encode(" ".join(EXTRA_TOKENS))
|
146 |
+
|
147 |
+
assert len(ids) == len(EXTRA_TOKENS)
|
148 |
+
return {k: i for k, i in zip(EXTRA_TOKENS, ids)}
|
149 |
+
|
150 |
+
|
151 |
+
def _append_to_innermost_axis(
|
152 |
+
tensor: tf.Tensor, scalar: tf.Tensor,
|
153 |
+
) -> tf.Tensor:
|
154 |
+
"""Appends `scalar` to each slice in the innermost axis of `tensor`.
|
155 |
+
|
156 |
+
>>> _append_to_innermost_axis([1, 2, 3], -1)
|
157 |
+
[1, 2, 3, -1]
|
158 |
+
>>> _append_to_innermost_axis([[1, 2], [3, 4]], -1)
|
159 |
+
[[1, 2, -1], [3, 4, -1]]
|
160 |
+
>>> _append_to_innermost_axis(tf.ragged.constant([[1, 2], [3]]), -1)
|
161 |
+
[[1, 2, -1], [3, -1]]
|
162 |
+
|
163 |
+
Args:
|
164 |
+
tensor: The tensor that should have a value appended.
|
165 |
+
scalar: The value to append.
|
166 |
+
|
167 |
+
Returns:
|
168 |
+
A copy of `tensor` with `scalar` appended to each slice along
|
169 |
+
the innermost axis.
|
170 |
+
"""
|
171 |
+
if isinstance(tensor, tf.RaggedTensor):
|
172 |
+
if tensor.shape.rank > 2:
|
173 |
+
return tensor.with_values(
|
174 |
+
_append_to_innermost_axis(tensor.values, scalar)
|
175 |
+
)
|
176 |
+
else:
|
177 |
+
return tf.concat([tensor, tf.fill([tensor.nrows(), 1], scalar)], axis=1)
|
178 |
+
else:
|
179 |
+
ndims = tf.rank(tensor)
|
180 |
+
paddings = tf.concat(
|
181 |
+
[tf.zeros((ndims - 1, 2), dtype=tf.int32), tf.constant([[0, 1]])],
|
182 |
+
axis=0,
|
183 |
+
)
|
184 |
+
return tf.pad(tensor, paddings=paddings, constant_values=scalar)
|
185 |
+
|
186 |
+
|
187 |
+
def _shift_right_by_one(tensor: tf.Tensor, bos_id: int = 0) -> tf.Tensor:
|
188 |
+
"""Shift the input tensor to the right by one position without wrapping."""
|
189 |
+
|
190 |
+
if not (tensor.dtype.is_integer or tensor.dtype.is_floating):
|
191 |
+
raise ValueError(f"Only numeric types are supported. Got: {tensor.dtype}")
|
192 |
+
# tf.roll wraps around the axis.
|
193 |
+
rolled = tf.roll(tensor, shift=1, axis=0)
|
194 |
+
|
195 |
+
# Zero out the first position by multiplying with [0, 1, 1, ..., 1].
|
196 |
+
depth = tf.shape(tensor)[0]
|
197 |
+
mask = tf.one_hot(0, depth=depth, on_value=0, off_value=1, dtype=tensor.dtype)
|
198 |
+
|
199 |
+
# Expand dims of mask to broadcast to rolled.
|
200 |
+
dim_expansion = [slice(None, None)] + [None] * (len(rolled.shape) - 1)
|
201 |
+
mask = mask[dim_expansion]
|
202 |
+
return rolled * mask + (1 - mask) * bos_id
|
203 |
+
|
204 |
+
|
205 |
+
def make_autoregressive_inputs(
|
206 |
+
targets: tf.Tensor,
|
207 |
+
sequence_id: tf.Tensor = None,
|
208 |
+
output_dtype: Optional[tf.dtypes.DType] = None,
|
209 |
+
bos_id: int = 0,
|
210 |
+
) -> tf.Tensor:
|
211 |
+
"""Generate inputs for an autoregressive model, by shifting the targets.
|
212 |
+
|
213 |
+
Modified from mesh_tensorflow.transformer.transformer.autoregressive_inputs.
|
214 |
+
|
215 |
+
For the first element of each sequence, the returned input id is 0.
|
216 |
+
|
217 |
+
For a "packed" dataset, also pass the sequence_id tensor, which aligns
|
218 |
+
with the targets tensor and contains different values for different
|
219 |
+
concatenated examples.
|
220 |
+
|
221 |
+
Example for a packed dataset:
|
222 |
+
|
223 |
+
```
|
224 |
+
targets = [3, 8, 2, 9, 2, 5, 4, 2, -1, -1]
|
225 |
+
sequence_id = [1, 1, 1, 2, 2, 3, 3, 3, 0, 0]
|
226 |
+
inputs = [1, 3, 8, 1, 9, 1, 5, 4, -1, -1]
|
227 |
+
| | |
|
228 |
+
These positions are set to 0 if sequence_id is not
|
229 |
+
None.
|
230 |
+
```
|
231 |
+
|
232 |
+
Args:
|
233 |
+
targets: a tf.int32 tensor with shape [length].
|
234 |
+
sequence_id: an optional tensor with the same shape as targets.
|
235 |
+
output_dtype: an optional output data type.
|
236 |
+
bos_id: bos id.
|
237 |
+
|
238 |
+
Returns:
|
239 |
+
a tensor with dtype tf.int32 and the same shape as targets.
|
240 |
+
"""
|
241 |
+
output_dtype = output_dtype or targets.dtype
|
242 |
+
if sequence_id is not None and not sequence_id.dtype.is_integer:
|
243 |
+
raise ValueError(
|
244 |
+
"The sequence_id should be integer-valued tensors for a packed dataset."
|
245 |
+
)
|
246 |
+
if sequence_id is not None and len(targets.shape) > 1:
|
247 |
+
raise ValueError(
|
248 |
+
"Only 1-D sequences are supported with packing. Got a "
|
249 |
+
f"packed {len(targets.shape)}-D sequence."
|
250 |
+
)
|
251 |
+
|
252 |
+
inputs = _shift_right_by_one(targets, bos_id)
|
253 |
+
if inputs.dtype != output_dtype:
|
254 |
+
inputs = tf.cast(inputs, output_dtype)
|
255 |
+
|
256 |
+
# We should have a 0 at the beginning of each sequence rather than the
|
257 |
+
# shifted EOS (e.g. 1) from the previous sequence.
|
258 |
+
if sequence_id is not None:
|
259 |
+
not_first_in_sequence = tf.equal(
|
260 |
+
sequence_id, _shift_right_by_one(sequence_id)
|
261 |
+
)
|
262 |
+
not_first_in_sequence = tf.cast(not_first_in_sequence, output_dtype)
|
263 |
+
first_ids = tf.cast((1 - not_first_in_sequence) * bos_id, output_dtype)
|
264 |
+
inputs = inputs * not_first_in_sequence + first_ids
|
265 |
+
return inputs
|
266 |
+
|
267 |
+
|
268 |
+
@tf.function
|
269 |
+
def sum_except_first_axis(tensor):
|
270 |
+
# Compute the sum along all axes except the first
|
271 |
+
axes_to_sum = tuple(range(1, len(tensor.shape)))
|
272 |
+
return tf.reduce_sum(tensor, axis=axes_to_sum)
|
273 |
+
|
274 |
+
|
275 |
+
@seqio.map_over_dataset()
|
276 |
+
def add_segment_ids(ex):
|
277 |
+
ex["subsegment_ids"] = tf.zeros_like(ex["target_tokens"], dtype=tf.int32)
|
278 |
+
return ex
|
279 |
+
|
280 |
+
|
281 |
+
def trim_and_pad_dataset(
|
282 |
+
dataset: tf.data.Dataset, feature_lengths: Mapping[str, int]
|
283 |
+
) -> tf.data.Dataset:
|
284 |
+
"""Trim and pad first dimension of features to `feature_lengths`.
|
285 |
+
|
286 |
+
Args:
|
287 |
+
dataset: tf.data.Dataset, the dataset to trim/pad examples in.
|
288 |
+
feature_lengths: map from feature key to final length. Other features will
|
289 |
+
be returned unchanged.
|
290 |
+
|
291 |
+
Returns:
|
292 |
+
Trimmed/padded tf.data.Dataset.
|
293 |
+
"""
|
294 |
+
|
295 |
+
def _trim_and_pad(k: str, t: tf.Tensor) -> tf.Tensor:
|
296 |
+
"""Trim/pad to the first axis of `t` to be of size `length`."""
|
297 |
+
if k not in feature_lengths:
|
298 |
+
return t
|
299 |
+
if isinstance(t, tf.RaggedTensor):
|
300 |
+
t = t.to_tensor()
|
301 |
+
|
302 |
+
constant_values = -1
|
303 |
+
length_k = feature_lengths[k]
|
304 |
+
if isinstance(length_k, int):
|
305 |
+
t = t[:length_k]
|
306 |
+
pad_amt = length_k - tf.shape(t)[0]
|
307 |
+
padded_t = tf.pad(t, [(0, pad_amt)] + [(0, 0)] * (len(t.shape) - 1), constant_values=constant_values)
|
308 |
+
padded_t.set_shape([length_k] + t.shape.as_list()[1:])
|
309 |
+
return padded_t
|
310 |
+
|
311 |
+
slices = tuple((slice(0, limit) for limit in length_k))
|
312 |
+
t = t[slices]
|
313 |
+
pad_amt = tf.pad((length_k - tf.shape(t))[..., None], ((0, 0), (1, 0)), constant_values=constant_values)
|
314 |
+
padded_t = tf.pad(t, pad_amt, constant_values=constant_values)
|
315 |
+
padded_t.set_shape(length_k)
|
316 |
+
return padded_t
|
317 |
+
|
318 |
+
return dataset.map(
|
319 |
+
lambda x: {k: _trim_and_pad(k, t) for k, t in x.items()},
|
320 |
+
num_parallel_calls=tf.data.experimental.AUTOTUNE,
|
321 |
+
)
|
322 |
+
|
323 |
+
|
324 |
+
def get_3d_subsegments(segmented_suffix):
|
325 |
+
q_lens, text_lens = segmented_suffix.nested_row_lengths()
|
326 |
+
text_segments = tf.range(0, tf.shape(text_lens)[0], dtype=tf.int32)
|
327 |
+
question_repeat = tf.reshape(tf.stack([tf.ones_like(q_lens), q_lens-1], 1), [-1])
|
328 |
+
question_offset = tf.range(1, tf.shape(q_lens)[0]+1, dtype=tf.int32)*200
|
329 |
+
question_offset = tf.reshape(tf.stack([question_offset, question_offset-100], 1), [-1])
|
330 |
+
text_segments = text_segments + tf.repeat(question_offset, question_repeat)
|
331 |
+
segment_ids = tf.cast(tf.repeat(text_segments, text_lens), tf.int32)
|
332 |
+
return segment_ids
|
333 |
+
|
334 |
+
|
335 |
+
def assert_not_truncated(ds, keys, max_val):
|
336 |
+
def _check(ex):
|
337 |
+
for k in keys:
|
338 |
+
tf.assert_less(tf.shape(ex[k])[0], max_val+1,
|
339 |
+
message=f"Field {k} was unexpectedly truncated max_len={max_val}")
|
340 |
+
return ex
|
341 |
+
return ds.map(_check)
|
342 |
+
|
343 |
+
|
344 |
+
def apply_with_random_selector(x, func, num_cases):
|
345 |
+
"""Computes func(x, sel), with sel sampled from [0...num_cases-1].
|
346 |
+
Args:
|
347 |
+
x: input Tensor.
|
348 |
+
func: Python function to apply.
|
349 |
+
num_cases: Python int32, number of cases to sample sel from.
|
350 |
+
Returns:
|
351 |
+
The result of func(x, sel), where func receives the value of the
|
352 |
+
selector as a python integer, but sel is sampled dynamically.
|
353 |
+
"""
|
354 |
+
sel = tf.random.uniform([], maxval=num_cases, dtype=tf.int32)
|
355 |
+
# Pass the real x only to one of the func calls.
|
356 |
+
return control_flow_ops.merge([
|
357 |
+
func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
|
358 |
+
for case in range(num_cases)])[0]
|
359 |
+
|
360 |
+
|
361 |
+
def denormalize_boxes(boxes, image_shape):
|
362 |
+
"""Converts boxes normalized by [height, width] to pixel coordinates.
|
363 |
+
Args:
|
364 |
+
boxes: a tensor whose last dimension is 4 representing the coordinates of
|
365 |
+
boxes in ymin, xmin, ymax, xmax order.
|
366 |
+
image_shape: a list of two integers, a two-element vector or a tensor such
|
367 |
+
that all but the last dimensions are `broadcastable` to `boxes`. The last
|
368 |
+
dimension is 2, which represents [height, width].
|
369 |
+
Returns:
|
370 |
+
denormalized_boxes: a tensor whose shape is the same as `boxes` representing
|
371 |
+
the denormalized boxes.
|
372 |
+
Raises:
|
373 |
+
ValueError: If the last dimension of boxes is not 4.
|
374 |
+
"""
|
375 |
+
with tf.name_scope('denormalize_boxes'):
|
376 |
+
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
|
377 |
+
height, width = image_shape
|
378 |
+
height = tf.cast(height, dtype=boxes.dtype)
|
379 |
+
width = tf.cast(width, dtype=boxes.dtype)
|
380 |
+
else:
|
381 |
+
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
|
382 |
+
height, width = tf.split(image_shape, 2, axis=-1)
|
383 |
+
|
384 |
+
ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
|
385 |
+
ymin = ymin * height
|
386 |
+
xmin = xmin * width
|
387 |
+
ymax = ymax * height
|
388 |
+
xmax = xmax * width
|
389 |
+
|
390 |
+
denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
|
391 |
+
return denormalized_boxes
|
392 |
+
|
393 |
+
def pad_to_bounding_box(image, offset_height, offset_width, target_height,
|
394 |
+
target_width, value=0):
|
395 |
+
|
396 |
+
return pad_to_bounding_box_internal(
|
397 |
+
image,
|
398 |
+
offset_height,
|
399 |
+
offset_width,
|
400 |
+
target_height,
|
401 |
+
target_width,
|
402 |
+
check_dims=True,
|
403 |
+
value=value)
|
404 |
+
|
405 |
+
def pad_to_bounding_box_internal(image, offset_height, offset_width,
|
406 |
+
target_height, target_width, check_dims, value):
|
407 |
+
|
408 |
+
with ops.name_scope(None, 'pad_to_bounding_box_with_one_internal', [image]):
|
409 |
+
image = ops.convert_to_tensor(image, name='image')
|
410 |
+
|
411 |
+
is_batch = True
|
412 |
+
image_shape = image.get_shape()
|
413 |
+
if image_shape.ndims == 3:
|
414 |
+
is_batch = False
|
415 |
+
image = array_ops.expand_dims(image, 0)
|
416 |
+
elif image_shape.ndims is None:
|
417 |
+
is_batch = False
|
418 |
+
image = array_ops.expand_dims(image, 0)
|
419 |
+
image.set_shape([None] * 4)
|
420 |
+
elif image_shape.ndims != 4:
|
421 |
+
raise ValueError(
|
422 |
+
'\'image\' (shape %s) must have either 3 or 4 dimensions.' %
|
423 |
+
image_shape)
|
424 |
+
|
425 |
+
batch, height, width, depth = _ImageDimensions(image, rank=4)
|
426 |
+
|
427 |
+
after_padding_width = target_width - offset_width - width
|
428 |
+
|
429 |
+
after_padding_height = target_height - offset_height - height
|
430 |
+
|
431 |
+
if check_dims:
|
432 |
+
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
|
433 |
+
assert_ops += _assert(offset_height >= 0, ValueError,
|
434 |
+
'offset_height must be >= 0')
|
435 |
+
assert_ops += _assert(offset_width >= 0, ValueError,
|
436 |
+
'offset_width must be >= 0')
|
437 |
+
assert_ops += _assert(after_padding_width >= 0, ValueError,
|
438 |
+
'width must be <= target - offset')
|
439 |
+
assert_ops += _assert(after_padding_height >= 0, ValueError,
|
440 |
+
'height must be <= target - offset')
|
441 |
+
image = control_flow_ops.with_dependencies(assert_ops, image)
|
442 |
+
|
443 |
+
# Do not pad on the depth dimensions.
|
444 |
+
paddings = array_ops.reshape(
|
445 |
+
tf.stack([
|
446 |
+
0, 0, offset_height, after_padding_height, offset_width,
|
447 |
+
after_padding_width, 0, 0
|
448 |
+
]), [4, 2])
|
449 |
+
padded = array_ops.pad(image, paddings, constant_values=value)
|
450 |
+
|
451 |
+
padded_shape = [
|
452 |
+
None if _is_tensor(i) else i
|
453 |
+
for i in [batch, target_height, target_width, depth]
|
454 |
+
]
|
455 |
+
padded.set_shape(padded_shape)
|
456 |
+
|
457 |
+
if not is_batch:
|
458 |
+
padded = array_ops.squeeze(padded, axis=[0])
|
459 |
+
|
460 |
+
return padded
|
461 |
+
|
462 |
+
def resize_and_crop_boxes(boxes, image_scale, output_size, offset, paddings):
|
463 |
+
"""Resizes boxes to output size with scale and offset.
|
464 |
+
Args:
|
465 |
+
boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
|
466 |
+
image_scale: 2D float `Tensor` representing scale factors that apply to
|
467 |
+
[height, width] of input image.
|
468 |
+
output_size: 2D `Tensor` or `int` representing [height, width] of target
|
469 |
+
output image size.
|
470 |
+
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
|
471 |
+
boxes.
|
472 |
+
paddings: 2D `Tensor` representing top/left paddings.
|
473 |
+
Returns:
|
474 |
+
boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
|
475 |
+
"""
|
476 |
+
# Adjusts box coordinates based on image_scale, offset and paddings.
|
477 |
+
boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
|
478 |
+
boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
|
479 |
+
boxes += tf.tile(tf.expand_dims(paddings, axis=0), [1, 2])
|
480 |
+
# Clips the boxes.
|
481 |
+
boxes = clip_boxes(boxes, output_size)
|
482 |
+
return boxes
|
483 |
+
|
484 |
+
def clip_boxes(boxes, image_shape):
|
485 |
+
"""Clips boxes to image boundaries.
|
486 |
+
Args:
|
487 |
+
boxes: a tensor whose last dimension is 4 representing the coordinates of
|
488 |
+
boxes in ymin, xmin, ymax, xmax order.
|
489 |
+
image_shape: a list of two integers, a two-element vector or a tensor such
|
490 |
+
that all but the last dimensions are `broadcastable` to `boxes`. The last
|
491 |
+
dimension is 2, which represents [height, width].
|
492 |
+
Returns:
|
493 |
+
clipped_boxes: a tensor whose shape is the same as `boxes` representing the
|
494 |
+
clipped boxes.
|
495 |
+
Raises:
|
496 |
+
ValueError: If the last dimension of boxes is not 4.
|
497 |
+
"""
|
498 |
+
if boxes.shape[-1] != 4:
|
499 |
+
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
|
500 |
+
boxes.shape[-1]))
|
501 |
+
|
502 |
+
with tf.name_scope('clip_boxes'):
|
503 |
+
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
|
504 |
+
height, width = image_shape
|
505 |
+
max_length = [height, width, height, width]
|
506 |
+
else:
|
507 |
+
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
|
508 |
+
height, width = tf.unstack(image_shape, axis=-1)
|
509 |
+
max_length = tf.stack(
|
510 |
+
[height, width, height, width], axis=-1)
|
511 |
+
|
512 |
+
clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
|
513 |
+
return clipped_boxes
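# --- Illustrative sketch (editor's example, not part of the committed file) ---
# A box spilling past a 100x200 image is clamped to the image bounds.
import tensorflow as tf
example_boxes = tf.constant([[-5.0, 10.0, 120.0, 250.0]])  # ymin, xmin, ymax, xmax
example_clipped = clip_boxes(example_boxes, [100, 200])    # -> [[0., 10., 100., 200.]]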
|
514 |
+
|
515 |
+
|
516 |
+
def get_non_empty_box_indices(boxes):
|
517 |
+
"""Get indices for non-empty boxes."""
|
518 |
+
# Selects indices where both box height and width are greater than 0.
|
519 |
+
height = boxes[:, 2] - boxes[:, 0]
|
520 |
+
width = boxes[:, 3] - boxes[:, 1]
|
521 |
+
indices = tf.where(
|
522 |
+
tf.logical_and(tf.greater(height, 0), tf.greater(width, 0)))
|
523 |
+
return indices[:, 0]
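# --- Illustrative sketch (editor's example, not part of the committed file) ---
# The second box below has zero height, so only index 0 is kept.
import tensorflow as tf
example_boxes = tf.constant([[0., 0., 10., 10.], [5., 5., 5., 9.]])
example_keep = get_non_empty_box_indices(example_boxes)  # -> [0]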
|
524 |
+
|
525 |
+
|
526 |
+
def resize_and_pad(image, desired_output_size, masks=None, boxes=None, labels=None,
|
527 |
+
random_scale_min=0.1, random_scale_max=2.0, do_random_scale=False,
|
528 |
+
shrink_both_sides=True, boxes1=None, filter_box=True,
|
529 |
+
desired_target_size=None, random_scale_ratio=0.0,
|
530 |
+
resize_method=tf.image.ResizeMethod.BILINEAR, return_outputs=True,
|
531 |
+
pad_value=0, normalize=True):
|
532 |
+
desired_height, desired_width = desired_output_size
|
533 |
+
desired_height_f = tf.cast(desired_height, dtype=tf.float32)
|
534 |
+
desired_width_f = tf.cast(desired_width, dtype=tf.float32)
|
535 |
+
|
536 |
+
height = tf.cast(tf.shape(image)[0], tf.float32)
|
537 |
+
width = tf.cast(tf.shape(image)[1], tf.float32)
|
538 |
+
|
539 |
+
if boxes is not None:
|
540 |
+
# Converts boxes from normalized coordinates to pixel coordinates.
|
541 |
+
# Now the coordinates of boxes are w.r.t. the original image.
|
542 |
+
boxes = denormalize_boxes(boxes, [height, width])
|
543 |
+
|
544 |
+
if boxes1 is not None:
|
545 |
+
boxes1 = denormalize_boxes(boxes1, [height, width])
|
546 |
+
|
547 |
+
if do_random_scale:
|
548 |
+
random_scale_factor = tf.random.uniform([], random_scale_min, random_scale_max)
|
549 |
+
if not shrink_both_sides:
|
550 |
+
# Max random is where scale * W > W_desired
|
551 |
+
# scale * H > H_desired
|
552 |
+
rsf_max = tf.maximum(desired_width_f / width, desired_height_f / height)
|
553 |
+
random_scale_factor = tf.minimum(rsf_max, random_scale_factor)
|
554 |
+
|
555 |
+
scaled_y = tf.cast(random_scale_factor * desired_height_f, tf.int32)
|
556 |
+
scaled_x = tf.cast(random_scale_factor * desired_width_f, tf.int32)
|
557 |
+
|
558 |
+
# Recompute the accurate scale_factor using rounded scaled image size.
|
559 |
+
image_scale_y = tf.cast(scaled_y, tf.float32) / height
|
560 |
+
image_scale_x = tf.cast(scaled_x, tf.float32) / width
|
561 |
+
|
562 |
+
image_scale = tf.cond(tf.less(
|
563 |
+
tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32),
|
564 |
+
tf.cast(random_scale_ratio, tf.float32)),
|
565 |
+
lambda: tf.maximum(image_scale_x, image_scale_y),
|
566 |
+
lambda: tf.minimum(image_scale_x, image_scale_y))
|
567 |
+
|
568 |
+
# image_scale = tf.minimum(image_scale_x, image_scale_y)
|
569 |
+
|
570 |
+
# Conceptual Captions has some REALLY WIDE images, I believe;
|
571 |
+
# this ensures that we won't scale any side below 64 pixels
|
572 |
+
image_scale = tf.maximum(image_scale, 64.0 / tf.minimum(height, width))
|
573 |
+
|
574 |
+
# Select non-zero random offset (x, y) if scaled image is larger than
|
575 |
+
# self._output_size.
|
576 |
+
scaled_height = tf.cast(height * image_scale, tf.int32)
|
577 |
+
scaled_width = tf.cast(width * image_scale, tf.int32)
|
578 |
+
offset_y = tf.cast(scaled_height - desired_height, tf.float32)
|
579 |
+
offset_x = tf.cast(scaled_width - desired_width, tf.float32)
|
580 |
+
offset_y = tf.maximum(0.0, offset_y) * tf.random.uniform([], 0, 1)
|
581 |
+
offset_x = tf.maximum(0.0, offset_x) * tf.random.uniform([], 0, 1)
|
582 |
+
offset_y = tf.cast(offset_y, tf.int32)
|
583 |
+
offset_x = tf.cast(offset_x, tf.int32)
|
584 |
+
else:
|
585 |
+
image_scale_y = desired_height_f / height
|
586 |
+
image_scale_x = desired_width_f / width
|
587 |
+
image_scale = tf.minimum(image_scale_x, image_scale_y)
|
588 |
+
scaled_height = tf.cast(height * image_scale, tf.int32)
|
589 |
+
scaled_width = tf.cast(width * image_scale, tf.int32)
|
590 |
+
offset_y = tf.constant(0)
|
591 |
+
offset_x = tf.constant(0)
|
592 |
+
|
593 |
+
# Now resize and crop
|
594 |
+
if resize_method == 'random' and do_random_scale:
|
595 |
+
resize_methods = sorted([k for k in tf.image.ResizeMethod.__dict__.keys() if k.isupper()])
|
596 |
+
image = apply_with_random_selector(
|
597 |
+
image,
|
598 |
+
lambda x, method_idx: tf.image.resize(x, [scaled_height, scaled_width],
|
599 |
+
tf.image.ResizeMethod.__dict__[resize_methods[method_idx]],
|
600 |
+
antialias=True),
|
601 |
+
num_cases=len(resize_methods))
|
602 |
+
|
603 |
+
elif resize_method != 'random':
|
604 |
+
image = tf.image.resize(image, [scaled_height, scaled_width], method=resize_method, antialias=True)
|
605 |
+
else:
|
606 |
+
image = tf.image.resize(image, [scaled_height, scaled_width],
|
607 |
+
method=tf.image.ResizeMethod.BILINEAR, antialias=True)
|
608 |
+
|
609 |
+
image = tf.clip_by_value(image, 0.0, 1.0)
|
610 |
+
|
611 |
+
# H x W x C
|
612 |
+
image = image[offset_y:offset_y + desired_height, offset_x:offset_x + desired_width, :]
|
613 |
+
|
614 |
+
H = tf.shape(image)[0]
|
615 |
+
W = tf.shape(image)[1]
|
616 |
+
|
617 |
+
top_pad = (desired_height - H) // 2
|
618 |
+
left_pad = (desired_width - W) // 2
|
619 |
+
|
620 |
+
image_mask = pad_to_bounding_box(
|
621 |
+
tf.ones_like(image, dtype=tf.bool), top_pad, left_pad, desired_height, desired_width)[:,:,0]
|
622 |
+
|
623 |
+
image = pad_to_bounding_box(image, top_pad, left_pad, desired_height, desired_width, value=pad_value)
|
624 |
+
|
625 |
+
if isinstance(desired_height, int) and isinstance(desired_width, int):
|
626 |
+
image.set_shape([desired_height, desired_width, 3])
|
627 |
+
|
628 |
+
if masks is not None and tf.size(masks) != 0:
|
629 |
+
masks = tf.image.resize(masks, [scaled_height, scaled_width],
|
630 |
+
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
|
631 |
+
|
632 |
+
if len(masks.shape) == 3:
|
633 |
+
masks = masks[offset_y:offset_y + desired_height, offset_x:offset_x + desired_width]
|
634 |
+
else:
|
635 |
+
masks = masks[:, offset_y:offset_y + desired_height, offset_x:offset_x + desired_width]
|
636 |
+
|
637 |
+
masks = pad_to_bounding_box(masks, top_pad, left_pad, desired_height, desired_width)
|
638 |
+
masks = tf.image.resize(masks, desired_target_size,
|
639 |
+
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
|
640 |
+
|
641 |
+
indices = None
|
642 |
+
if boxes is not None:
|
643 |
+
# NOTE: the box shifting below has not been fully tested yet.
|
644 |
+
boxes = resize_and_crop_boxes(
|
645 |
+
boxes,
|
646 |
+
tf.stack([image_scale, image_scale]),
|
647 |
+
[desired_height, desired_width],
|
648 |
+
tf.cast(tf.stack([offset_y, offset_x]), dtype=tf.float32),
|
649 |
+
tf.cast(tf.stack([top_pad, left_pad]), dtype=tf.float32))
|
650 |
+
|
651 |
+
if filter_box:
|
652 |
+
indices = get_non_empty_box_indices(boxes)
|
653 |
+
else:
|
654 |
+
indices = tf.range(tf.shape(boxes)[0])
|
655 |
+
boxes = tf.gather(boxes, indices)
|
656 |
+
|
657 |
+
if labels is not None:
|
658 |
+
labels = tf.gather(labels, indices)
|
659 |
+
|
660 |
+
if boxes1 is not None:
|
661 |
+
boxes1 = resize_and_crop_boxes(
|
662 |
+
boxes1,
|
663 |
+
tf.stack([image_scale, image_scale]),
|
664 |
+
[desired_height, desired_width],
|
665 |
+
tf.cast(tf.stack([offset_y, offset_x]), dtype=tf.float32),
|
666 |
+
tf.cast(tf.stack([top_pad, left_pad]), dtype=tf.float32))
|
667 |
+
|
668 |
+
image_info = tf.stack([
|
669 |
+
tf.cast(top_pad, tf.float32),
|
670 |
+
tf.cast(left_pad, tf.float32),
|
671 |
+
1.0 / image_scale,
|
672 |
+
height,
|
673 |
+
width,
|
674 |
+
tf.cast(offset_y, dtype=tf.float32) / height,
|
675 |
+
tf.cast(offset_x, dtype=tf.float32) / width,
|
676 |
+
tf.cast(offset_y, dtype=tf.float32),
|
677 |
+
tf.cast(offset_x, dtype=tf.float32),
|
678 |
+
tf.cast(scaled_height, dtype=tf.float32),
|
679 |
+
tf.cast(scaled_width, dtype=tf.float32),
|
680 |
+
])
|
681 |
+
|
682 |
+
if boxes1 is not None:
|
683 |
+
outputs = (image_info, masks, boxes, labels, indices, boxes1)
|
684 |
+
else:
|
685 |
+
outputs = (image_info, masks, boxes, labels, indices)
|
686 |
+
|
687 |
+
if normalize:
|
688 |
+
image = normalize_image(image)
|
689 |
+
|
690 |
+
if return_outputs:
|
691 |
+
return image, image_mask, outputs
|
692 |
+
else:
|
693 |
+
return image, image_mask
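# --- Illustrative sketch (editor's example, not part of the committed file) ---
# Typical inference-style call: a float image in [0, 1] is scaled to fit 336x336,
# centre-padded, and normalized; the bool mask marks non-padding pixels.
import tensorflow as tf
example_img = tf.random.uniform([480, 640, 3])  # float32 in [0, 1)
example_out, example_mask = resize_and_pad(example_img, (336, 336), return_outputs=False)
# example_out: (336, 336, 3) float32, example_mask: (336, 336) bool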
|
694 |
+
|
695 |
+
|
696 |
+
def _remove_bars_from_frames(frames, black_bar=True, threshold=32, max_perc_to_trim=0.3):
|
697 |
+
"""
|
698 |
+
:param frames: [num_frames, height, width, 3]
|
699 |
+
:param threshold: Pixels must be at least this intense for us to not trim
|
700 |
+
:param max_perc_to_trim: Will trim at most this fraction of the image in each dimension (0.3 by default)
|
701 |
+
:return:
|
702 |
+
"""
|
703 |
+
# Detect black bars####################
|
704 |
+
frames_shape = tf.shape(frames)
|
705 |
+
h, w = frames_shape[1], frames_shape[2]
|
706 |
+
if black_bar:
|
707 |
+
has_content = tf.reduce_max(frames, axis=(0, -1)) >= threshold
|
708 |
+
else:
|
709 |
+
has_content = tf.reduce_min(frames, axis=(0, -1)) <= threshold
|
710 |
+
|
711 |
+
y_frames = tf.cast(tf.reshape(tf.where(tf.reduce_any(has_content, axis=1)), [-1]), tf.int32)
|
712 |
+
nhbars = tf.shape(y_frames)[0]
|
713 |
+
y_frames = tf.cond(nhbars > 0, lambda: y_frames, lambda: tf.expand_dims(tf.cast(h // 2, tf.int32), axis=0))
|
714 |
+
|
715 |
+
y1 = tf.minimum(y_frames[0], tf.cast(tf.cast(h, tf.float32) * max_perc_to_trim, tf.int32))
|
716 |
+
y2 = tf.maximum(y_frames[-1] + 1, tf.cast(tf.cast(h, tf.float32) * (1 - max_perc_to_trim), tf.int32))
|
717 |
+
|
718 |
+
x_frames = tf.cast(tf.reshape(tf.where(tf.reduce_any(has_content, axis=0)), [-1]), tf.int32)
|
719 |
+
nvbars = tf.shape(x_frames)[0]
|
720 |
+
x_frames = tf.cond(nvbars > 0, lambda: x_frames, lambda: tf.expand_dims(tf.cast(w // 2, tf.int32), axis=0))
|
721 |
+
|
722 |
+
x1 = tf.minimum(x_frames[0], tf.cast(tf.cast(w, tf.float32) * max_perc_to_trim, tf.int32))
|
723 |
+
x2 = tf.maximum(x_frames[-1] + 1, tf.cast(tf.cast(w, tf.float32) * (1 - max_perc_to_trim), tf.int32))
|
724 |
+
|
725 |
+
frames = frames[:, y1:y2, x1:x2]
|
726 |
+
return frames
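# --- Illustrative sketch (editor's example, not part of the committed file) ---
# Letterboxed frames (dark rows/columns shared by every frame) are trimmed away,
# but never by more than max_perc_to_trim of each dimension; values are made up.
import tensorflow as tf
example_frames = tf.random.uniform([8, 360, 640, 3], maxval=256, dtype=tf.int32)
example_trimmed = _remove_bars_from_frames(example_frames, black_bar=True, threshold=32)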
|
727 |
+
|
728 |
+
def convert_video_dtype(video, dtype):
|
729 |
+
"""
|
730 |
+
Converts tensor to dtype and scales the values.
|
731 |
+
Video equivalent of tf.convert_image_dtype: https://www.tensorflow.org/api_docs/python/tf/image/convert_image_dtype
|
732 |
+
"""
|
733 |
+
return tf.map_fn(
|
734 |
+
fn=functools.partial(
|
735 |
+
tf.image.convert_image_dtype,
|
736 |
+
dtype=dtype),
|
737 |
+
elems=video,
|
738 |
+
fn_output_signature=dtype)
|
739 |
+
|
740 |
+
|
741 |
+
def stateless_shuffle(x: tf.Tensor, seed):
|
742 |
+
if hasattr(tf.random.experimental, 'stateless_shuffle'):
|
743 |
+
return tf.random.experimental.stateless_shuffle(x, seed=seed)
|
744 |
+
else:
|
745 |
+
vals = tf.random.stateless_uniform(tf.shape(x)[:1], seed)
|
746 |
+
ixs = tf.argsort(vals)
|
747 |
+
return tf.gather(x, ixs)
|
748 |
+
|
749 |
+
|
750 |
+
def stateless_permutation(n: int, seed):
|
751 |
+
if hasattr(tf.random.experimental, 'stateless_shuffle'):
|
752 |
+
ix = tf.range(0, n, dtype=tf.int32)
|
753 |
+
return tf.random.experimental.stateless_shuffle(ix, seed=seed)
|
754 |
+
else:
|
755 |
+
vals = tf.random.stateless_uniform([n], seed)
|
756 |
+
return tf.argsort(vals)
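# --- Illustrative sketch (editor's example, not part of the committed file) ---
# With the same seed, permuting indices and shuffling tf.range(n) agree (at least
# when tf.random.experimental.stateless_shuffle is available in the installed TF).
import tensorflow as tf
example_seed = tf.constant([3, 7], dtype=tf.int32)
example_perm = stateless_permutation(5, example_seed)
example_shuf = stateless_shuffle(tf.range(5, dtype=tf.int32), example_seed)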
|
757 |
+
|
758 |
+
|
759 |
+
@seqio.map_over_dataset
|
760 |
+
def _strip_metadata(example):
|
761 |
+
return pop_metadata(example)[0]
|
762 |
+
|
763 |
+
|
764 |
+
def sample_patches(mask, n_patches, stateless=False, seeds=None):
|
765 |
+
input_sample_valid = tf.boolean_mask(tf.range(tf.shape(mask)[0]), mask)
|
766 |
+
input_sample_masked = tf.boolean_mask(tf.range(tf.shape(mask)[0]), mask == 0)
|
767 |
+
if stateless:
|
768 |
+
encoder_pos_ids = tf.concat([
|
769 |
+
stateless_shuffle(input_sample_valid, seeds[0]),
|
770 |
+
stateless_shuffle(input_sample_masked, seeds[1])], axis=0)[:n_patches]
|
771 |
+
else:
|
772 |
+
encoder_pos_ids = tf.concat([
|
773 |
+
tf.random.shuffle(input_sample_valid),
|
774 |
+
tf.random.shuffle(input_sample_masked)], axis=0)[:n_patches]
|
775 |
+
encoder_pos_ids = tf.reshape(encoder_pos_ids, (n_patches,))
|
776 |
+
encoder_pos_ids = tf.cast(encoder_pos_ids, tf.int32)
|
777 |
+
return encoder_pos_ids
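# --- Illustrative sketch (editor's example, not part of the committed file) ---
# Draws 4 patch positions, preferring positions where the mask is True and only
# falling back to masked positions if there are not enough valid ones.
import tensorflow as tf
example_mask = tf.constant([True, True, False, True, False, True])
example_pos = sample_patches(example_mask, n_patches=4)  # 4 indices, valid ones first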
|
778 |
+
|
779 |
+
|
780 |
+
@gin.configurable()
|
781 |
+
def normalize_image(image,
|
782 |
+
offset=(0.48145466, 0.4578275, 0.40821073),
|
783 |
+
scale=(0.26862954, 0.26130258, 0.27577711)):
|
784 |
+
"""Normalizes the image to zero mean and unit variance."""
|
785 |
+
offset = tf.constant(offset)
|
786 |
+
offset = tf.expand_dims(offset, axis=0)
|
787 |
+
offset = tf.expand_dims(offset, axis=0)
|
788 |
+
image -= tf.cast(offset, image.dtype)
|
789 |
+
|
790 |
+
scale = tf.constant(scale)
|
791 |
+
scale = tf.expand_dims(scale, axis=0)
|
792 |
+
scale = tf.expand_dims(scale, axis=0)
|
793 |
+
image /= tf.cast(scale, image.dtype)
|
794 |
+
return image
|
795 |
+
|
796 |
+
|
797 |
+
def unnormalize_image(image,
|
798 |
+
offset=(0.48145466, 0.4578275, 0.40821073),
|
799 |
+
scale=(0.26862954, 0.26130258, 0.27577711)):
|
800 |
+
"""Normalizes the image to zero mean and unit variance."""
|
801 |
+
scale = tf.cast(tf.expand_dims(tf.expand_dims(tf.constant(scale), axis=0), axis=0), image.dtype)
|
802 |
+
image *= scale
|
803 |
+
|
804 |
+
offset = tf.cast(tf.expand_dims(tf.expand_dims(tf.constant(offset), axis=0), axis=0), image.dtype)
|
805 |
+
image += offset
|
806 |
+
return image
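# --- Illustrative sketch (editor's example, not part of the committed file) ---
# normalize_image / unnormalize_image are inverses (CLIP mean/std by default),
# so a round trip recovers the input up to floating point error.
import tensorflow as tf
example_img = tf.random.uniform([4, 4, 3])
example_roundtrip = unnormalize_image(normalize_image(example_img))
# tf.reduce_max(tf.abs(example_roundtrip - example_img)) is ~1e-7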
|
807 |
+
|
808 |
+
|
809 |
+
def flatten_parts(ds: tf.data.Dataset, parts: List[str], add_index=False, dataset_size=None) -> tf.data.Dataset:
|
810 |
+
def _flatten(ex):
|
811 |
+
flat_key = {k: ex[k] for k in parts}
|
812 |
+
if add_index:
|
813 |
+
flat_key['index'] = tf.range(len(ex[parts[0]]))
|
814 |
+
|
815 |
+
flat_ds = tf.data.Dataset.from_tensor_slices(flat_key)
|
816 |
+
|
817 |
+
def _merge(_flat_ex):
|
818 |
+
for k, v in ex.items():
|
819 |
+
if k not in parts:
|
820 |
+
_flat_ex[k] = v
|
821 |
+
return _flat_ex
|
822 |
+
return flat_ds.map(_merge)
|
823 |
+
|
824 |
+
ds = ds.flat_map(_flatten)
|
825 |
+
if dataset_size is not None:
|
826 |
+
ds = tf.data.experimental.assert_cardinality(dataset_size)(ds)
|
827 |
+
return ds
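# --- Illustrative sketch (editor's example, not part of the committed file) ---
# Each example with a list-valued field is expanded into one example per entry,
# copying the remaining (scalar) fields; the data below is made up.
import tensorflow as tf
example_ds = tf.data.Dataset.from_tensors({"image_id": 7, "question": ["a", "b", "c"]})
example_flat = flatten_parts(example_ds, ["question"], add_index=True)
# yields {"image_id": 7, "question": b"a", "index": 0}, ... (3 examples)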
|
dataset_sizes.py
ADDED
@@ -0,0 +1,262 @@
1 |
+
DATASET_SIZES = {
|
2 |
+
("cockatoo_qa_v2", "train"): 194820,
|
3 |
+
("user_qa", "train"): 71172,
|
4 |
+
|
5 |
+
("text_vqa", "train"): 34602,
|
6 |
+
("chart_qa", "train"): 28299,
|
7 |
+
("chart_qa_prompting", "train"): 28299,
|
8 |
+
("chart_qa_weighted", "train"): 28299,
|
9 |
+
("tally_qa", "train"): 132981,
|
10 |
+
("doc_qa", "train"): 39463,
|
11 |
+
("info_qa", "train"): 23946,
|
12 |
+
("okvqa", "train"): 9009,
|
13 |
+
("gqa", "train"): 943000,
|
14 |
+
("gqa_multi", "train"): 72140,
|
15 |
+
("coco_2014_vqa", "train"): 443757, # (82783, 443757)
|
16 |
+
("coco_captioning_karpathy", "train"): 414113, # (82783, 414113)
|
17 |
+
("coco_captioning_karpathy_multi", "train"): 82783,
|
18 |
+
("coco_2014_vqa_multi", "train"): 82783,
|
19 |
+
("science_qa_img", "train"): 6218,
|
20 |
+
("ai2_diagram", "train"): 11389,
|
21 |
+
("a_okvqa_mc", "train"): 17056,
|
22 |
+
("a_okvqa_da", "train"): 17056,
|
23 |
+
("ocr_vqa", "train"): 166043,
|
24 |
+
("st_qa", "train"): 25050,
|
25 |
+
("ocr_qa", "train"): 166043,
|
26 |
+
|
27 |
+
("dv_qa", "train"): 200000,
|
28 |
+
("tabwmp_da", "train"): 23059,
|
29 |
+
("figure_qa", "train"): 100000,
|
30 |
+
("figure_qa_zero_shot", "train"): 100000,
|
31 |
+
("plot_qa", "train"): 157070,
|
32 |
+
('clocks', 'train'): 800269,
|
33 |
+
('clocks', 'validation'): 25600,
|
34 |
+
|
35 |
+
("st_qa", "test"): 4070,
|
36 |
+
('text_vqa', "test"): 5734,
|
37 |
+
('okvqa', "test"): 5046,
|
38 |
+
('chart_qa', "test"): 1250,
|
39 |
+
('doc_qa', "test"): 5188,
|
40 |
+
('info_qa', "test"): 3288,
|
41 |
+
('gqa', "test"): 95336,
|
42 |
+
('coco_captioning_karpathy', "test"): 25010,
|
43 |
+
("science_qa_img", "test"): 2017,
|
44 |
+
("ai2_diagram", "test"): 3088,
|
45 |
+
("a_okvqa_mc_eval", "test"): 6702,
|
46 |
+
("a_okvqa_da_eval", "test"): 6109,
|
47 |
+
|
48 |
+
("ai2_diagram_v2", "train"): 10950,
|
49 |
+
("ai2_diagram_v2", "validation"): 1463,
|
50 |
+
("ai2_diagram_v2", "test"): 3088,
|
51 |
+
("vqa_v2_test", "test2015"): 555187,
|
52 |
+
|
53 |
+
("ai2_diagram_v2_transparent", "train"): 10950,
|
54 |
+
("ai2_diagram_v2_transparent", "validation"): 1463,
|
55 |
+
("ai2_diagram_v2_transparent", "test"): 3088,
|
56 |
+
|
57 |
+
# splits in mix_data include both transparent + opaque boxes
|
58 |
+
("ai2_diagram_v2_mix_transparent", "train"): 15042,
|
59 |
+
("ai2_diagram_v2_mix_transparent", "validation"): 1980,
|
60 |
+
("ai2_diagram_v2_mix_transparent", "test"): 4272,
|
61 |
+
|
62 |
+
# vaia_qa
|
63 |
+
('vaia_qa', 'train'): 477052,
|
64 |
+
('vaia_qa', 'validation'): 1024,
|
65 |
+
|
66 |
+
('vaia_qa_latex_image', 'train'): 477052,
|
67 |
+
('vaia_qa_latex_image', 'validation'): 1024,
|
68 |
+
('vaia_qa_latex_image_only', 'train'): 42605,
|
69 |
+
('vaia_qa_latex_image_only', 'validation'): 1024,
|
70 |
+
('vaia_qa_latex_all_image_only', 'train'): 154266,
|
71 |
+
('vaia_qa_latex_all_image_only', 'validation'): 1024,
|
72 |
+
|
73 |
+
("vaia_qa_latex_image_math_subset_short_answer", 'train'): 198161,
|
74 |
+
("vaia_qa_latex_image_math_subset_short_answer", 'validation'): 419,
|
75 |
+
("vaia_qa_latex_image_math_subset_mc_only_short_answer", "train"): 57568,
|
76 |
+
("vaia_qa_latex_image_math_subset_mc_only_short_answer", "validation"): 118,
|
77 |
+
("vaia_qa_latex_image_math_subset_mc_only_short_answer_first", "train"): 57568,
|
78 |
+
("vaia_qa_latex_image_math_subset_mc_only_short_answer_first", "validation"): 118,
|
79 |
+
|
80 |
+
("vaia_qa_latex_image_all_image_only_short_answer", "train"): 86752,
|
81 |
+
("vaia_qa_latex_image_all_image_only_short_answer", "validation"): 92,
|
82 |
+
("vaia_qa_latex_image_all_image_only_short_answer_first", "train"): 86752,
|
83 |
+
("vaia_qa_latex_image_all_image_only_short_answer_first", "validation"): 92,
|
84 |
+
("vaia_qa_latex_image_math_subset_image_only_short_answer", "train"): 21726,
|
85 |
+
("vaia_qa_latex_image_math_subset_image_only_short_answer", "validation"): 48,
|
86 |
+
|
87 |
+
('vqa_online', 'train'): 62722,
|
88 |
+
('vqa_online', 'validation'): 1024,
|
89 |
+
('vqa_online', 'test'): 1024,
|
90 |
+
|
91 |
+
('vqa_online_gpt_longQ_longA', 'train'): 62722,
|
92 |
+
('vqa_online_gpt_longQ_longA', 'validation'): 1024,
|
93 |
+
('vqa_online_gpt_longQ_longA', 'test'): 1024,
|
94 |
+
|
95 |
+
("tally_qa", "validation"): 38589,
|
96 |
+
('text_vqa', "validation"): 5000,
|
97 |
+
('okvqa', "validation"): 5046,
|
98 |
+
('chart_qa', "validation"): 960*2,
|
99 |
+
('chart_qa_prompting_explanation', "validation"): 960*2,
|
100 |
+
('chart_qa_ex', "validation"): 960*2,
|
101 |
+
('chart_qa_human', "validation"): 960,
|
102 |
+
('chart_qa_aug', "validation"): 960,
|
103 |
+
('doc_qa', "validation"): 5349,
|
104 |
+
('info_qa', "validation"): 2801,
|
105 |
+
('coco_2014_vqa', "validation"): 214354, # 40504 images
|
106 |
+
('coco_2014_vqa_multi', "validation"): 214354,
|
107 |
+
('coco_captioning_karpathy', "validation"): 25010,
|
108 |
+
('gqa', "validation"): 132062,
|
109 |
+
("science_qa_img", "validation"): 2097,
|
110 |
+
("ai2_diagram", "validation"): 1024,
|
111 |
+
("a_okvqa_mc", "validation"): 1145,
|
112 |
+
("a_okvqa_da", "validation"): 1075,
|
113 |
+
("charxiv_descriptive", "validation"): 1000,
|
114 |
+
("charxiv_descriptive", "test"): 1320,
|
115 |
+
("charxiv_reasoning", "validation"): 1000,
|
116 |
+
("charxiv_reasoning", "test"): 1320,
|
117 |
+
("fintabnetqa", "validation"): 125,
|
118 |
+
("fintabnetqa", "test"): 250,
|
119 |
+
("vwtq", "validation"): 125,
|
120 |
+
("vwtq", "test"): 750,
|
121 |
+
("vwtq_syn", "validation"): 125,
|
122 |
+
("vwtq_syn", "test"): 250,
|
123 |
+
("vtabfact", "validation"): 125,
|
124 |
+
("vtabfact", "test"): 250,
|
125 |
+
("nutrition_fact", "validation"): 100,
|
126 |
+
("nutrition_fact", "test"): 100,
|
127 |
+
|
128 |
+
("mmmu_test", "validation"): 900,
|
129 |
+
("count_bench", "test"): 500,
|
130 |
+
("mmmu_test", "test"): 10500,
|
131 |
+
("real_world_qa_test", "test"): 765,
|
132 |
+
("real_world_qa_no_instruction", "test"): 765,
|
133 |
+
("real_world_qa_dbg", "test"): 765,
|
134 |
+
("real_world_qa_as_user_qa", "test"): 765,
|
135 |
+
|
136 |
+
("seed_bench_test", "test"): 19241,
|
137 |
+
("pope_test", "test"): 9000,
|
138 |
+
("mme_test", "test"): 2374,
|
139 |
+
("math_vista_test", "validation"): 1000,
|
140 |
+
("math_vista_demo", "validation"): 1000,
|
141 |
+
("math_vista_v2", "validation"): 1000,
|
142 |
+
|
143 |
+
("math_vista_test", "test"): 5141,
|
144 |
+
("mmbench_test", "validation"): 4329,
|
145 |
+
("mmbench_test", "test"): 6666,
|
146 |
+
("sugar_crepe_test", "test"): 15022,
|
147 |
+
("blink_test", "validation"): 1901,
|
148 |
+
("dense_caption_eval_dbg", "validation"): 1,
|
149 |
+
|
150 |
+
("refclef_unc", "train"): 17978,
|
151 |
+
("refclef_unc", "validation"): 12029,
|
152 |
+
("refcoco_unc", "train"): 16994,
|
153 |
+
("refcoco_unc", "validation"): 10834,
|
154 |
+
("refcocoplus_unc", "train"): 16992,
|
155 |
+
("refcocoplus_unc", "validation"): 10758,
|
156 |
+
("refcocog_umd", "train"): 21899,
|
157 |
+
("refcocog_umd", "validation"): 4896,
|
158 |
+
("refclef_unc", "testA"): 3449,
|
159 |
+
("refclef_unc", "testB"): 3221,
|
160 |
+
("refclef_unc", "testC"): 2664,
|
161 |
+
("refclef_unc", "testAB"): 116,
|
162 |
+
("refclef_unc", "testBC"): 86,
|
163 |
+
("refcoco_unc", "testA"): 5657,
|
164 |
+
("refcoco_unc", "testB"): 5095,
|
165 |
+
("refcocoplus_unc", "testA"): 5726,
|
166 |
+
("refcocoplus_unc", "testB"): 4889,
|
167 |
+
("refcocog_umd", "test"): 9602,
|
168 |
+
("countbench_qa_point_count", "huggingface"): 490,
|
169 |
+
('countbench_qa', 'huggingface'): 490,
|
170 |
+
|
171 |
+
('cockatoo_712k_sept6', 'train'): 712121,
|
172 |
+
('cockatoo_712k_sept6', 'validation'): 5120,
|
173 |
+
('user_qa', 'train'): 71172,
|
174 |
+
('user_qa', 'validation'): 2048,
|
175 |
+
|
176 |
+
# pointing
|
177 |
+
("pointing_test", "test"): 436,
|
178 |
+
|
179 |
+
("fast_flickr_count_qa_point_count", "train"): 36916,
|
180 |
+
("fast_flickr_count_qa_point_count", "validation"): 163,
|
181 |
+
("fast_flickr_count_qa_point_count", "test"): 540,
|
182 |
+
("fast_flickr_count_qa_pointing", "train"): 36916,
|
183 |
+
("fast_flickr_count_qa_pointing", "validation"): 163,
|
184 |
+
("fast_flickr_count_qa_pointing", "test"): 540,
|
185 |
+
('pointing', 'train'): 309216,
|
186 |
+
('point_count', 'train'): 309216,
|
187 |
+
('pointing', 'validation'): 2054,
|
188 |
+
('point_count', 'validation'): 2054,
|
189 |
+
('point_count_high_freq', 'train'): 113840,
|
190 |
+
('point_count_high_freq', 'validation'): 3969,
|
191 |
+
('pointing_high_freq', 'train'): 113840,
|
192 |
+
('pointing_high_freq', 'validation'): 3969,
|
193 |
+
('point_qa', 'train'): 27856,
|
194 |
+
('point_qa', 'validation'): 978,
|
195 |
+
("a_okvqa_da", "test"): 6109,
|
196 |
+
("a_okvqa_mc", "test"): 6702,
|
197 |
+
("user_questions_for_elo", "test"): 14851,
|
198 |
+
("user_questions_for_elo_long", "test"): 1368,
|
199 |
+
("user_questions_for_elo_9_to_12", "test"): 3000,
|
200 |
+
|
201 |
+
("sim_point_count_qa", "train"): 522611,
|
202 |
+
("sim_point_count_qa", "validation"): 800,
|
203 |
+
("sim_point_count_qa", "test"): 800,
|
204 |
+
("sim_count_qa", "train"): 522611,
|
205 |
+
("sim_count_qa", "validation"): 800,
|
206 |
+
("sim_count_qa", "test"): 800,
|
207 |
+
|
208 |
+
("scifi_charts_qa", "validation"): 1024,
|
209 |
+
("scifi_table_qa", "validation"): 1024,
|
210 |
+
("scifi_natural_qa", "validation"): 128,
|
211 |
+
("scifi_nutrition_qa", "validation"): 128,
|
212 |
+
("scifi_document_qa", "validation"): 1024,
|
213 |
+
("scifi_diagram_qa", "validation"): 1024,
|
214 |
+
("scifi_charts_qa", "train"): 233622,
|
215 |
+
("scifi_table_qa", "train"): 93036,
|
216 |
+
("scifi_document_qa", "train"): 142559,
|
217 |
+
("scifi_diagram_qa", "train"): 33102,
|
218 |
+
|
219 |
+
("scifi_charts_qa_split", "train"): 116814,
|
220 |
+
("scifi_table_qa_split", "train"): 46518,
|
221 |
+
("scifi_document_qa_split", "train"): 71282,
|
222 |
+
("scifi_diagram_qa_split", "train"): 16551,
|
223 |
+
|
224 |
+
("scifi_charts_qa_exp_split", "train"): 116814,
|
225 |
+
("scifi_table_qa_exp_split", "train"): 46518,
|
226 |
+
("scifi_document_qa_exp_split", "train"): 71282,
|
227 |
+
("scifi_diagram_qa_exp_split", "train"): 16551,
|
228 |
+
|
229 |
+
("android_control", "train"): 74714,
|
230 |
+
("android_control", "validation"): 690,
|
231 |
+
("android_control", "test"): 3897,
|
232 |
+
|
233 |
+
("synthetic_qa_v3_multi_turn", "train"): 9824,
|
234 |
+
("synthetic_qa_v3", "train"): 162855,
|
235 |
+
("synthetic_qa_v3_style_tag", "train"): 162855,
|
236 |
+
("synthetic_qa_v3_as_user_qa", "train"): 162855,
|
237 |
+
}
|
238 |
+
|
239 |
+
|
240 |
+
for (name, split), count in list(DATASET_SIZES.items()):
|
241 |
+
if name in ["chart_qa"]:
|
242 |
+
DATASET_SIZES[(name + "_scifi", split)] = count
|
243 |
+
if name in ["android_control"]:
|
244 |
+
for k in ["ll", "hl", "hl_ll", "hl_cot"]:
|
245 |
+
DATASET_SIZES[(f"{name}_{k}", split)] = count
|
246 |
+
if name in ["scifi_charts_qa" ,"scifi_table_qa", "scifi_document_qa", "scifi_diagram_qa", "scifi_datikz_qa"]:
|
247 |
+
DATASET_SIZES[(name + "_exp", split)] = count
|
248 |
+
DATASET_SIZES[(name[:-3] + "_exp", split)] = count
|
249 |
+
DATASET_SIZES[(name[:-3] + "_demo", split)] = count
|
250 |
+
if name in ["ai2_diagram_v2_mix_transparent"]:
|
251 |
+
DATASET_SIZES[("ai2_diagram_v2_mix_transparent_one_style", split)] = count
|
252 |
+
if name in ["chart_qa", "info_qa", "doc_qa", "text_vqa", "coco_2014_vqa",
|
253 |
+
"ai2_diagram_v2_mix_transparent", "countbench_qa", "chart_qa_human"]:
|
254 |
+
DATASET_SIZES[(name + "_demo", split)] = count
|
255 |
+
|
256 |
+
|
257 |
+
def get_dataset_size(name, split):
|
258 |
+
if name.endswith("_eval"):
|
259 |
+
if (name, split) in DATASET_SIZES:
|
260 |
+
return DATASET_SIZES[(name, split)]
|
261 |
+
name = name[:-len('_eval')]
|
262 |
+
return DATASET_SIZES[(name, split)]
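# Illustrative sketch (editor's example, not part of the committed file): task names
# ending in "_eval" fall back to the size registered for the base task.
assert get_dataset_size("chart_qa_eval", "test") == DATASET_SIZES[("chart_qa", "test")]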
|
exceptions.py
ADDED
@@ -0,0 +1,50 @@
1 |
+
__all__ = [
|
2 |
+
"OLMoError",
|
3 |
+
"OLMoConfigurationError",
|
4 |
+
"OLMoCliError",
|
5 |
+
"OLMoEnvironmentError",
|
6 |
+
"OLMoNetworkError",
|
7 |
+
"OLMoCheckpointError",
|
8 |
+
]
|
9 |
+
|
10 |
+
|
11 |
+
class OLMoError(Exception):
|
12 |
+
"""
|
13 |
+
Base class for all custom OLMo exceptions.
|
14 |
+
"""
|
15 |
+
|
16 |
+
|
17 |
+
class OLMoConfigurationError(OLMoError):
|
18 |
+
"""
|
19 |
+
An error with a configuration file.
|
20 |
+
"""
|
21 |
+
|
22 |
+
|
23 |
+
class OLMoCliError(OLMoError):
|
24 |
+
"""
|
25 |
+
An error from incorrect CLI usage.
|
26 |
+
"""
|
27 |
+
|
28 |
+
|
29 |
+
class OLMoEnvironmentError(OLMoError):
|
30 |
+
"""
|
31 |
+
An error from incorrect environment variables.
|
32 |
+
"""
|
33 |
+
|
34 |
+
|
35 |
+
class OLMoNetworkError(OLMoError):
|
36 |
+
"""
|
37 |
+
An error with a network request.
|
38 |
+
"""
|
39 |
+
|
40 |
+
|
41 |
+
class OLMoCheckpointError(OLMoError):
|
42 |
+
"""
|
43 |
+
An error occurred reading or writing from a checkpoint.
|
44 |
+
"""
|
45 |
+
|
46 |
+
|
47 |
+
class OLMoThreadError(Exception):
|
48 |
+
"""
|
49 |
+
Raised when a thread fails.
|
50 |
+
"""
|
iterable_dataset.py
ADDED
@@ -0,0 +1,266 @@
1 |
+
import logging
|
2 |
+
import math
|
3 |
+
import multiprocessing
|
4 |
+
import os
|
5 |
+
import pickle
|
6 |
+
import queue
|
7 |
+
import socket
|
8 |
+
import time
|
9 |
+
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
10 |
+
from multiprocessing.managers import BaseManager
|
11 |
+
from multiprocessing.shared_memory import SharedMemory
|
12 |
+
from os.path import exists
|
13 |
+
from pathlib import Path
|
14 |
+
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
|
15 |
+
|
16 |
+
import psutil
|
17 |
+
import tensorflow as tf
|
18 |
+
import numpy as np
|
19 |
+
import torch
|
20 |
+
import torch.utils.data
|
21 |
+
import clu
|
22 |
+
from clu.data.dataset_iterator import Element
|
23 |
+
|
24 |
+
|
25 |
+
from .aliases import PathOrStr
|
26 |
+
from .torch_util import barrier, get_fs_local_rank, get_global_rank, get_world_size, get_node_rank, \
|
27 |
+
get_local_world_size, get_local_rank, move_to_device
|
28 |
+
from .util import roundrobin, threaded_generator
|
29 |
+
from .data_factory import SeqioDataset
|
30 |
+
from .multimodal_preprocessor import MultiModalPreprocessor
|
31 |
+
from .preprocesssors import rename
|
32 |
+
import torch.distributed as dist
|
33 |
+
from . import tasks
|
34 |
+
|
35 |
+
__all__ = ["MMIterableDataset"]
|
36 |
+
|
37 |
+
log = logging.getLogger(__name__)
|
38 |
+
|
39 |
+
|
40 |
+
def batch_fn(batch, for_inference):
|
41 |
+
if for_inference:
|
42 |
+
out = {}
|
43 |
+
for k, v in batch.items():
|
44 |
+
if k.startswith("metadata/"):
|
45 |
+
out[k] = v
|
46 |
+
else:
|
47 |
+
out[k] = torch.from_numpy(v)
|
48 |
+
return out
|
49 |
+
else:
|
50 |
+
out = {k: torch.from_numpy(v) for k, v in batch.items() if not k.startswith("metadata/")}
|
51 |
+
out["metadata"] = [{} for _ in out["input_ids"]]
|
52 |
+
return out
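# --- Illustrative sketch (editor's example, not part of the committed file) ---
# numpy arrays become torch tensors; "metadata/..." entries pass through as-is
# when for_inference=True. The key names below are made up.
import numpy as np
example_batch = {"input_ids": np.zeros((2, 8), dtype=np.int64), "metadata/ids": ["a", "b"]}
example_out = batch_fn(example_batch, for_inference=True)  # example_out["input_ids"] is a torch.Tensor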
|
53 |
+
|
54 |
+
|
55 |
+
class PyTorchDatasetIterator(clu.data.dataset_iterator.TfDatasetIterator):
|
56 |
+
def __init__(self, dataset, *, checkpoint: bool, for_inference: bool):
|
57 |
+
self.for_inference = for_inference
|
58 |
+
super().__init__(dataset, checkpoint=checkpoint)
|
59 |
+
|
60 |
+
def __next__(self) -> Element:
|
61 |
+
batch = {k: v.numpy() for k, v in next(self.iterator).items()}
|
62 |
+
return batch_fn(batch, self.for_inference)
|
63 |
+
|
64 |
+
def __len__(self) -> int:
|
65 |
+
return len(self._dataset)
|
66 |
+
|
67 |
+
|
68 |
+
class MMIterableDataset(torch.utils.data.IterableDataset[Dict[str, Any]]):
|
69 |
+
def __init__(
|
70 |
+
self,
|
71 |
+
dataset: SeqioDataset,
|
72 |
+
preprocessor: MultiModalPreprocessor,
|
73 |
+
world_size: Optional[int] = None,
|
74 |
+
rank: Optional[int] = None,
|
75 |
+
):
|
76 |
+
self.preprocessor = preprocessor
|
77 |
+
self.rank = rank if rank is not None else get_global_rank()
|
78 |
+
self.world_size = world_size if world_size is not None else get_world_size()
|
79 |
+
self.dataset_config = dataset
|
80 |
+
|
81 |
+
data_iter = dataset.build(
|
82 |
+
self.preprocessor,
|
83 |
+
self.rank,
|
84 |
+
self.world_size,
|
85 |
+
)
|
86 |
+
|
87 |
+
data_iter: tf.data.Dataset = rename(input_ids="input_tokens", labels="target_tokens")(data_iter)
|
88 |
+
self.dataset = data_iter
|
89 |
+
self.data_iter = PyTorchDatasetIterator(
|
90 |
+
data_iter, checkpoint=True, for_inference=dataset.for_inference)
|
91 |
+
|
92 |
+
def reset(self):
|
93 |
+
self.data_iter.reset()
|
94 |
+
|
95 |
+
def save(self, filename: PathOrStr):
|
96 |
+
self.data_iter.save(filename)
|
97 |
+
|
98 |
+
def restore(self, filename: PathOrStr):
|
99 |
+
self.data_iter.restore(filename)
|
100 |
+
|
101 |
+
def __iter__(self) -> Iterator[Dict[str, Any]]:
|
102 |
+
return self.data_iter
|
103 |
+
|
104 |
+
|
105 |
+
def _split_batch(batch, n):
|
106 |
+
subbatches = [{} for _ in range(n)]
|
107 |
+
for k, v in batch.items():
|
108 |
+
assert len(v) % n == 0, f"n={n} but {k} has {len(v)}"
|
109 |
+
subatch_dim = len(v) // n
|
110 |
+
for i, subbatch in enumerate(subbatches):
|
111 |
+
subbatch[k] = v[i * subatch_dim:(i + 1) * subatch_dim]
|
112 |
+
return subbatches
|
113 |
+
|
114 |
+
|
115 |
+
def tf_to_torch_dtype(tf_dtype):
|
116 |
+
dtype_mapping = {
|
117 |
+
tf.float16: torch.float16,
|
118 |
+
tf.float32: torch.float32,
|
119 |
+
tf.float64: torch.float64,
|
120 |
+
tf.int8: torch.int8,
|
121 |
+
tf.uint8: torch.uint8,
|
122 |
+
tf.int16: torch.int16,
|
123 |
+
tf.int32: torch.int32,
|
124 |
+
tf.int64: torch.int64,
|
125 |
+
tf.bool: torch.bool,
|
126 |
+
}
|
127 |
+
return dtype_mapping[tf_dtype]
|
128 |
+
|
129 |
+
|
130 |
+
class PeerToPeer(torch.utils.data.IterableDataset[Dict[str, Any]]):
|
131 |
+
"""
|
132 |
+
This dataloader runs the tf.data.Dataset on one process per node, and then
|
133 |
+
transfers the batch to the other processes. For the 7B model this costs about a 10% performance hit
|
134 |
+
despite my attempts to make it asynchronous
|
135 |
+
|
136 |
+
The advantage is that it avoids the overhead of running multiple tf.data.Dataset
|
137 |
+
in one node
|
138 |
+
"""
|
139 |
+
|
140 |
+
def __init__(
|
141 |
+
self,
|
142 |
+
dataset: SeqioDataset,
|
143 |
+
preprocessor: MultiModalPreprocessor,
|
144 |
+
world_size: Optional[int] = None,
|
145 |
+
rank: Optional[int] = None,
|
146 |
+
device=None
|
147 |
+
):
|
148 |
+
assert get_world_size() % get_local_world_size() == 0
|
149 |
+
self.device = device
|
150 |
+
self.device_batch_size = dataset.global_batch_size // get_world_size()
|
151 |
+
|
152 |
+
self.preprocessor = preprocessor
|
153 |
+
self.seqio_dataset = dataset
|
154 |
+
|
155 |
+
lws = get_local_world_size()
|
156 |
+
|
157 |
+
if get_local_rank() == 0:
|
158 |
+
tf_dataset = dataset.build(
|
159 |
+
self.preprocessor,
|
160 |
+
get_node_rank(),
|
161 |
+
get_world_size() // lws,
|
162 |
+
)
|
163 |
+
|
164 |
+
tf_dataset = rename(input_ids="input_tokens", labels="target_tokens")(tf_dataset)
|
165 |
+
self.dataset = tf_dataset
|
166 |
+
device_spec = {k: ((v.shape[0]//lws,) + tuple(v.shape[1:]), tf_to_torch_dtype(v.dtype))
|
167 |
+
for k, v in tf_dataset.element_spec.items()}
|
168 |
+
else:
|
169 |
+
self.dataset = None
|
170 |
+
device_spec = None
|
171 |
+
|
172 |
+
broadcast = [device_spec]
|
173 |
+
torch.distributed.broadcast_object_list(broadcast)
|
174 |
+
self.device_spec = broadcast[0]
|
175 |
+
|
176 |
+
self._node_group_ranks = ranks = [(i + get_node_rank()*lws) for i in range(lws)]
|
177 |
+
if get_local_rank() == 0:
|
178 |
+
assert get_global_rank() == self._node_group_ranks[0]
|
179 |
+
self._keys = sorted(self.device_spec)
|
180 |
+
self.multithread_pin = False
|
181 |
+
|
182 |
+
def _pin(self, it, on):
|
183 |
+
batch = next(it)
|
184 |
+
batch = {k: torch.from_numpy(v) for k, v in batch.items()}
|
185 |
+
batch = _split_batch(batch, len(self._node_group_ranks))
|
186 |
+
return [{k: v.pin_memory() for k, v in subbatch.items()} for subbatch in batch]
|
187 |
+
|
188 |
+
def _send_pinned(self, batch):
|
189 |
+
requests = []
|
190 |
+
for rank_ix, rank in enumerate(self._node_group_ranks[1:], start=1):
|
191 |
+
for k in self._keys:
|
192 |
+
batch[rank_ix][k] = batch[rank_ix][k].to(self.device, non_blocking=True)
|
193 |
+
requests.append(dist.P2POp(dist.isend, batch[rank_ix][k], rank))
|
194 |
+
ops = dist.batch_isend_irecv(requests)
|
195 |
+
return batch[0], ops
|
196 |
+
|
197 |
+
def _send(self, it, on):
|
198 |
+
if get_local_rank() == 0:
|
199 |
+
try:
|
200 |
+
batch = next(it)
|
201 |
+
batch = {k: torch.from_numpy(v) for k, v in batch.items()}
|
202 |
+
batch = _split_batch(batch, len(self._node_group_ranks))
|
203 |
+
except StopIteration:
|
204 |
+
# Special batch to indicate iteration is done
|
205 |
+
batch = [
|
206 |
+
{k: torch.full(sh, -10, dtype=dtype, device=self.device)
|
207 |
+
for k, (sh, dtype) in self.device_spec.items()}
|
208 |
+
for _ in range(len(self._node_group_ranks))
|
209 |
+
]
|
210 |
+
|
211 |
+
# pin_memory so the device transfer can be non_blocking
|
212 |
+
batch = [{k: v.pin_memory() for k, v in subbatch.items()}
|
213 |
+
for subbatch in batch]
|
214 |
+
|
215 |
+
requests = []
|
216 |
+
for rank_ix, rank in enumerate(self._node_group_ranks[1:], start=1):
|
217 |
+
for k in self._keys:
|
218 |
+
batch[rank_ix][k] = batch[rank_ix][k].to(self.device, non_blocking=True)
|
219 |
+
requests.append(dist.P2POp(dist.isend, batch[rank_ix][k], rank))
|
220 |
+
ops = dist.batch_isend_irecv(requests)
|
221 |
+
batch = batch[0]
|
222 |
+
else:
|
223 |
+
batch = {k: torch.zeros(sh, dtype=dtype, device=self.device)
|
224 |
+
for k, (sh, dtype) in self.device_spec.items()}
|
225 |
+
requests = []
|
226 |
+
for k in self._keys:
|
227 |
+
requests.append(dist.P2POp(dist.irecv, batch[k], self._node_group_ranks[0]))
|
228 |
+
ops = dist.batch_isend_irecv(requests)
|
229 |
+
return batch, ops
|
230 |
+
|
231 |
+
def __iter__(self):
|
232 |
+
on = 0
|
233 |
+
if get_local_rank() == 0:
|
234 |
+
it = iter(self.dataset.as_numpy_iterator())
|
235 |
+
else:
|
236 |
+
it = None
|
237 |
+
|
238 |
+
if get_local_rank() == 0 and self.multithread_pin:
|
239 |
+
# Try to be clever and do memory pinning in a separate thread, in practice
|
240 |
+
# didn't seem to help much so off by default for now
|
241 |
+
# Currently does not support finite dataset
|
242 |
+
with ThreadPoolExecutor(max_workers=1) as pool:
|
243 |
+
_is_sending = self._send_pinned(self._pin(it, on))
|
244 |
+
_is_pinning = pool.submit(self._pin, it, on)
|
245 |
+
on += 1
|
246 |
+
while True:
|
247 |
+
result = _is_sending
|
248 |
+
_is_sending = self._send_pinned(_is_pinning.result())
|
249 |
+
_is_pinning = pool.submit(self._pin, it, on)
|
250 |
+
on += 1
|
251 |
+
for op in result[1]:
|
252 |
+
op.wait()
|
253 |
+
yield result[0]
|
254 |
+
else:
|
255 |
+
_in_flight = self._send(it, on)
|
256 |
+
on += 1
|
257 |
+
while True:
|
258 |
+
on += 1
|
259 |
+
next_batch = self._send(it, on) # queue up the next batch
|
260 |
+
for op in _in_flight[1]: # wait for the current batch
|
261 |
+
op.wait()
|
262 |
+
if _in_flight["input_ids"][0] != -10: # indicates no more data
|
263 |
+
return
|
264 |
+
yield _in_flight[0]
|
265 |
+
_in_flight = next_batch
|
266 |
+
|
modeling_molmoe.py
CHANGED
@@ -39,14 +39,14 @@ import einops
|
|
39 |
from transformers import PreTrainedModel
|
40 |
from transformers.modeling_outputs import CausalLMOutputWithPast
|
41 |
|
42 |
-
from
|
43 |
-
from
|
44 |
BeamSearch,
|
45 |
Constraint,
|
46 |
FinalSequenceScorer,
|
47 |
Sampler
|
48 |
)
|
49 |
-
from
|
50 |
ActivationType,
|
51 |
BlockType,
|
52 |
LayerNormType,
|
@@ -56,7 +56,7 @@ from olmo.config import (
|
|
56 |
AttentionType,
|
57 |
)
|
58 |
|
59 |
-
from
|
60 |
from .config_molmoe import (
|
61 |
MolmoConfig,
|
62 |
VisionBackboneConfig
|
|
|
39 |
from transformers import PreTrainedModel
|
40 |
from transformers.modeling_outputs import CausalLMOutputWithPast
|
41 |
|
42 |
+
from .aliases import PathOrStr
|
43 |
+
from .beam_search import (
|
44 |
BeamSearch,
|
45 |
Constraint,
|
46 |
FinalSequenceScorer,
|
47 |
Sampler
|
48 |
)
|
49 |
+
from .config import (
|
50 |
ActivationType,
|
51 |
BlockType,
|
52 |
LayerNormType,
|
|
|
56 |
AttentionType,
|
57 |
)
|
58 |
|
59 |
+
from .util import resource_path
|
60 |
from .config_molmoe import (
|
61 |
MolmoConfig,
|
62 |
VisionBackboneConfig
|
multimodal_preprocessor.py
ADDED
@@ -0,0 +1,1549 @@
1 |
+
import dataclasses
|
2 |
+
import logging
|
3 |
+
import re
|
4 |
+
from collections import defaultdict
|
5 |
+
from typing import Tuple, Optional, Any, Dict, List, Union, Mapping
|
6 |
+
|
7 |
+
import einops
|
8 |
+
import seqio
|
9 |
+
import numpy as np
|
10 |
+
import tensorflow as tf
|
11 |
+
|
12 |
+
from .mm_data import seqio_tokenizer
|
13 |
+
from .data_utils import pad_to_bounding_box, \
|
14 |
+
get_3d_subsegments, _append_to_innermost_axis, resize_and_pad, \
|
15 |
+
apply_with_random_selector, get_special_token_ids, make_autoregressive_inputs, \
|
16 |
+
trim_and_pad_dataset, assert_not_truncated
|
17 |
+
from .prompts import apply_keyword_prompt, STYLE_TO_GENERAL_PROMPT, GENERAL_PROMPTS_V1
|
18 |
+
from . import constants as config
|
19 |
+
|
20 |
+
|
21 |
+
def siglip_resize(src, imgsize, truncate):
|
22 |
+
"""Resize and preprocess for SigLIP ViT in the offical jax implementation"""
|
23 |
+
assert src.dtype == tf.uint8
|
24 |
+
# SigLIP removes aspect ratio by default
|
25 |
+
resized = tf.image.resize(src, imgsize, method=tf.image.ResizeMethod.BILINEAR, antialias=False)
|
26 |
+
dtype = src.dtype
|
27 |
+
tf_dtype = tf.type_spec_from_value(src).dtype
|
28 |
+
resized = tf.cast(tf.clip_by_value(resized, tf_dtype.min, tf_dtype.max), dtype)
|
29 |
+
|
30 |
+
# Normalize between -1 and 1 without using imagenet standard mean/std
|
31 |
+
vmin=-1; vmax=1; in_min=0; in_max=255.0
|
32 |
+
in_min_t = tf.constant(in_min, tf.float32)
|
33 |
+
in_max_t = tf.constant(in_max, tf.float32)
|
34 |
+
image = tf.cast(resized, tf.float32)
|
35 |
+
image = (image - in_min_t) / (in_max_t - in_min_t)
|
36 |
+
image = vmin + image * (vmax - vmin)
|
37 |
+
if truncate:
|
38 |
+
image = image[:truncate, :truncate]
|
39 |
+
return image
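# --- Illustrative sketch (editor's example, not part of the committed file) ---
# A uint8 image is resized to the target resolution and rescaled from [0, 255]
# to [-1, 1]; the 384x384 size below is just an example value.
import tensorflow as tf
example_img = tf.zeros([480, 640, 3], dtype=tf.uint8)
example_out = siglip_resize(example_img, (384, 384), truncate=None)  # float32 in [-1, 1]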
|
40 |
+
|
41 |
+
|
42 |
+
def extract_bboxes(text, image_w, image_h):
|
43 |
+
points = extract_points(text, image_w, image_h)
|
44 |
+
boxes = []
|
45 |
+
for i in range(len(points)//2):
|
46 |
+
x1, y1 = points[i*2]
|
47 |
+
x2, y2 = points[i*2 + 1]
|
48 |
+
boxes.append([x1, y1, x2, y2])
|
49 |
+
return boxes
|
50 |
+
|
51 |
+
|
52 |
+
def extract_annotated_points(caption, image_w, image_h):
|
53 |
+
points = []
|
54 |
+
for match in re.finditer("<point x=\"([0-9\\.]*)\" y=\"([0-9\\.]*)\" alt=\"([^\"]*)\">", caption):
|
55 |
+
x = float(match.group(1))
|
56 |
+
y = float(match.group(2))
|
57 |
+
points.append(([[x, y]], match.group(3)))
|
58 |
+
for match in re.finditer("<points ([^<]*) alt=\"([^\"]*)\">", caption):
|
59 |
+
loc_str = match.group(1)
|
60 |
+
locations = defaultdict(dict)
|
61 |
+
if loc_str.startswith("points="):
|
62 |
+
point_grp = []
|
63 |
+
for point_match in re.finditer(r"([0-9]+\.[0-9]),? ([0-9]+\.[0-9])", loc_str):
|
64 |
+
try:
|
65 |
+
point = [float(point_match.group(i)) for i in range(1, 3)]
|
66 |
+
point_grp.append(point)
|
67 |
+
except ValueError:
|
68 |
+
pass
|
69 |
+
else:
|
70 |
+
for val in loc_str.split():
|
71 |
+
try:
|
72 |
+
key, val = val.split("=")
|
73 |
+
locations[key[1:]][key[:1]] = float(val.strip("\""))
|
74 |
+
except ValueError:
|
75 |
+
|
76 |
+
logging.warning(f"Failed to parse {val} from {match.group(0)}")
|
77 |
+
point_grp = []
|
78 |
+
for key, coords in locations.items():
|
79 |
+
if sorted(coords) == ["x", "y"]:
|
80 |
+
point_grp.append([coords["x"], coords["y"]])
|
81 |
+
if point_grp:
|
82 |
+
points.append((point_grp, match.group(2)))
|
83 |
+
|
84 |
+
normalized = []
|
85 |
+
for point_grp, point_text in points:
|
86 |
+
normalized.append((
|
87 |
+
np.array(point_grp) / 100.0 * np.array([image_w, image_h]),
|
88 |
+
point_text,
|
89 |
+
))
|
90 |
+
return normalized
|
91 |
+
|
92 |
+
|
93 |
+
def extract_points(text, image_w, image_h):
|
94 |
+
all_points = []
|
95 |
+
for match in re.finditer(r"Click\(([0-9]+\.[0-9]), ?([0-9]+\.[0-9])\)", text):
|
96 |
+
try:
|
97 |
+
point = [float(match.group(i)) for i in range(1, 3)]
|
98 |
+
except ValueError:
|
99 |
+
pass
|
100 |
+
else:
|
101 |
+
point = np.array(point)
|
102 |
+
if np.max(point) > 100:
|
103 |
+
# Treat as an invalid output
|
104 |
+
continue
|
105 |
+
point /= 100.0
|
106 |
+
point = point * np.array([image_w, image_h])
|
107 |
+
all_points.append(point)
|
108 |
+
|
109 |
+
for match in re.finditer(r"\(([0-9]+\.[0-9]),? ?([0-9]+\.[0-9])\)", text):
|
110 |
+
try:
|
111 |
+
point = [float(match.group(i)) for i in range(1, 3)]
|
112 |
+
except ValueError:
|
113 |
+
pass
|
114 |
+
else:
|
115 |
+
point = np.array(point)
|
116 |
+
if np.max(point) > 100:
|
117 |
+
# Treat as an invalid output
|
118 |
+
continue
|
119 |
+
point /= 100.0
|
120 |
+
point = point * np.array([image_w, image_h])
|
121 |
+
all_points.append(point)
|
122 |
+
for match in re.finditer(r'x\d*="\s*([0-9]+(?:\.[0-9]+)?)"\s+y\d*="\s*([0-9]+(?:\.[0-9]+)?)"', text):
|
123 |
+
try:
|
124 |
+
point = [float(match.group(i)) for i in range(1, 3)]
|
125 |
+
except ValueError:
|
126 |
+
pass
|
127 |
+
else:
|
128 |
+
point = np.array(point)
|
129 |
+
if np.max(point) > 100:
|
130 |
+
# Treat as an invalid output
|
131 |
+
continue
|
132 |
+
point /= 100.0
|
133 |
+
point = point * np.array([image_w, image_h])
|
134 |
+
all_points.append(point)
|
135 |
+
for match in re.finditer(r'(?:\d+|p)\s*=\s*([0-9]{3})\s*,\s*([0-9]{3})', text):
|
136 |
+
try:
|
137 |
+
point = [int(match.group(i)) / 10.0 for i in range(1, 3)]
|
138 |
+
except ValueError:
|
139 |
+
pass
|
140 |
+
else:
|
141 |
+
point = np.array(point)
|
142 |
+
if np.max(point) > 100:
|
143 |
+
# Treat as an invalid output
|
144 |
+
continue
|
145 |
+
point /= 100.0
|
146 |
+
point = point * np.array([image_w, image_h])
|
147 |
+
all_points.append(point)
|
148 |
+
return all_points
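# --- Illustrative sketch (editor's example, not part of the committed file) ---
# Percent coordinates found in the text are rescaled to pixel coordinates.
example_pts = extract_points("Click(50.0, 25.0)", image_w=200, image_h=100)
# -> [array([100., 25.])]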
|
149 |
+
|
150 |
+
|
151 |
+
def extract_points_from_point_count(text, image_w, image_h):
|
152 |
+
all_points = []
|
153 |
+
points = re.findall(r"(\d+\.\d+),\s*(\d+\.\d+)", text)
|
154 |
+
|
155 |
+
for match in points:
|
156 |
+
try:
|
157 |
+
point = [float(match[0]), float(match[1])]
|
158 |
+
except ValueError:
|
159 |
+
pass
|
160 |
+
else:
|
161 |
+
point = np.array(point)
|
162 |
+
if np.max(point) > 100:
|
163 |
+
# Treat as an invalid output
|
164 |
+
continue
|
165 |
+
point = point * np.array([image_w, image_h])
|
166 |
+
all_points.append(point)
|
167 |
+
return all_points
|
168 |
+
|
169 |
+
|
170 |
+
def select_tiling(h, w, patch_size, max_num_patches):
|
171 |
+
"""Decide how best to divide in image of size [w, h] in up to max_num_patches of size patch_size"""
|
172 |
+
original_size = tf.stack([h, w])  # [2]
|
173 |
+
original_res = h * w
|
174 |
+
tilings = []
|
175 |
+
for i in range(1, max_num_patches+1):
|
176 |
+
for j in range(1, max_num_patches+1):
|
177 |
+
if i*j <= max_num_patches:
|
178 |
+
tilings.append((i, j))
|
179 |
+
# sort so argmin and argmax favour smaller tilings in the event of a tie
|
180 |
+
tilings.sort(key=lambda x: (x[0]*x[1], x[0]))
|
181 |
+
candidate_tilings = tf.constant(tilings, dtype=tf.int32) # [n_resolutions, 2]
|
182 |
+
candidate_resolutions = candidate_tilings * patch_size # [n_resolutions, 2]
|
183 |
+
|
184 |
+
# How much we would need to scale the image to fit exactly in each tiling
|
185 |
+
required_scale_d = tf.cast(candidate_resolutions, tf.float32) / tf.cast(original_size[None, :], tf.float32)
|
186 |
+
required_scale = tf.reduce_min(required_scale_d, axis=-1, keepdims=True) # [n_resolutions, 1]
|
187 |
+
if tf.reduce_all(required_scale < 1):
|
188 |
+
# We are forced to downscale, so try to minimize the amount of downscaling
|
189 |
+
ix = tf.argmax(required_scale)[0]
|
190 |
+
else:
|
191 |
+
# Pick the resolution that required the least upscaling so that it most closely fits the image
|
192 |
+
required_scale = tf.where(required_scale < 1.0, 10e9, required_scale)
|
193 |
+
ix = tf.argmin(required_scale)[0]
|
194 |
+
return candidate_tilings[ix]
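# --- Illustrative sketch (editor's example, not part of the committed file) ---
# For a 600x800 image, 336-pixel crops and at most 6 crops, the tiling whose
# resolution needs the least upscaling to cover the image is chosen ([2, 3] here).
example_tiling = select_tiling(600, 800, 336, 6)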
|
195 |
+
|
196 |
+
|
197 |
+
DEMO_STYLES = [
|
198 |
+
"point_count",
|
199 |
+
"pointing",
|
200 |
+
"user_qa",
|
201 |
+
"scifi_charts_exp",
|
202 |
+
"scifi_charts_exp",
|
203 |
+
"scifi_charts_exp",
|
204 |
+
"scifi_charts_exp",
|
205 |
+
"long_caption",
|
206 |
+
"named_entity"
|
207 |
+
]
|
208 |
+
|
209 |
+
|
@dataclasses.dataclass
class MultiModalPreprocessor:
    """Turns text/image inputs into tensors that can be input to the model"""
    tokenizer: Any

    # How to prompt the model
    prompt_templates: str = "none"  # How to template prompts for examples
    message_format: str = "none"  # How to format messages
    system_prompt: Optional[str] = None  # How to generate system prompts
    prompt_override: Optional[str] = None  # Used for setting prompt manually
    always_start_with_space: bool = False  # Always include a leading space for the first bit of text
    default_inference_len: int = 65  # Inference len for length-conditioned prompting

    # How to crop/resize images
    crop_mode: str = "resize"
    max_crops: int = 6
    overlap_margins: Tuple[int, int] = (4, 4)
    do_random_scale: Optional[bool] = False
    resize: str = "default"
    random_scale_max: float = 1.1
    random_scale_min: float = 0.9
    random_scale_ratio: float = 0.5
    use_col_tokens: bool = True

    # Data about the ViT and connector we need when deciding the crops
    base_image_input_size: Tuple[int, int] = (336, 336)
    image_token_length_w: int = 12
    image_token_length_h: int = 12
    image_patch_size: int = 14
    image_padding_mask: bool = False

    # Other settings
    loss_token_weighting: Optional[str] = None
    unconditioned: Union[bool, float] = False  # Ignore images
    fix_image_input_idx: int = 2  # backwards compatibility fix
    pad_to: Optional[int] = None  # experimental feature

    _special_tokens: Dict[str, int] = None
    split_at: Optional[int] = None

    def get_max_total_crops(self):
        if self.crop_mode == "resize":
            return 1
        elif "resize" in self.crop_mode:
            return 1 + self.max_crops
        else:
            return self.max_crops

    @property
    def image_num_patch(self):
        h, w = self.base_image_input_size
        return h//self.image_patch_size, w//self.image_patch_size

    @property
    def special_token_ids(self):
        if self._special_tokens is None:
            self._special_tokens = get_special_token_ids(self.tokenizer)
        return self._special_tokens

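    # With the defaults above (336x336 crops, 14-pixel patches), each crop is split
    # into 24x24 = 576 patches, and the assumed 2x2 pooling in the connector reduces
    # this to 12x12 = 144 image tokens, matching image_token_length_w/h = 12.
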
    def image_to_patches_and_tokens(self, image, is_training):
        """Preprocesses an image

        Args:
            image: [h, w, 3] image to preprocess
        Returns:
            crops: (n_crops, n_patches, patch_dim) individual crops, `n_crops` might
                change between images but the other dimensions are fixed
            tokens: (n_tokens,) tf.int32 tokens, pad tokens indicate where to insert the
                patch features, might include other special tokens as well
            patch_ordering: (n_crops, n_tokens_per_crop) order image features should be inserted
                into the `tokens`, negative values indicate patch features to exclude
            padding_mask: (n_crops, h, w) mask of what pixels are padding, can be None
        """
        do_random_scale = self.do_random_scale
        if do_random_scale:
            do_random_scale = is_training

        base_image_input_size = self.base_image_input_size
        if isinstance(base_image_input_size, int):
            base_image_input_size = (base_image_input_size, base_image_input_size)

        image_token_length_w, image_token_length_h = self.image_token_length_w, self.image_token_length_h
        base_image_input_d = self.image_patch_size
        tokens_per_image = image_token_length_w * image_token_length_h
        image_base_patch_w = base_image_input_size[1] // base_image_input_d
        image_base_patch_h = base_image_input_size[0] // base_image_input_d
        extra_image = False
        patch_ordering = None

        if self.resize == "default":
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
            def _resize(_image, sz):
                return resize_and_pad(
                    _image, sz,
                    do_random_scale=do_random_scale,
                    random_scale_max=self.random_scale_max,
                    random_scale_min=self.random_scale_min,
                    random_scale_ratio=self.random_scale_ratio,
                    return_outputs=False,
                    resize_method='random' if is_training else tf.image.ResizeMethod.BILINEAR)
        elif self.resize == "stretch":
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
            assert not do_random_scale

            def _resize(_image, sz):
                if not is_training:
                    img = tf.image.resize(_image, sz, antialias=True, method=tf.image.ResizeMethod.BILINEAR)
                else:
                    resize_methods = sorted([k for k in tf.image.ResizeMethod.__dict__.keys() if k.isupper()])
                    img = apply_with_random_selector(
                        _image,
                        lambda x, method_idx: tf.image.resize(x, sz,
                                                              tf.image.ResizeMethod.__dict__[resize_methods[method_idx]],
                                                              antialias=True),
                        num_cases=len(resize_methods))
                return img, tf.ones(tf.shape(img)[:2], dtype=tf.bool)
        elif self.resize in "siglip":
            assert not do_random_scale

            def _resize(_image, sz):
                img = siglip_resize(_image, sz, truncate=None)
                return img, tf.ones(tf.shape(img)[:2], dtype=tf.bool)
        else:
            raise NotImplementedError(self.resize)

        def _img_to_patches(_img, _img_mask, dy=1, dx=1):
            _img = einops.rearrange(
                _img, '(dy h dh) (dx w dw) c -> (dy dx) (h w) (dh dw c)',
                dh=base_image_input_d,
                dw=base_image_input_d,
                dy=dy,
                dx=dx,
                h=image_base_patch_h,
                w=image_base_patch_w
            )
            _img_mask = einops.rearrange(
                _img_mask, '(dy h dh) (dx w dw) -> (dy dx) (h w) (dh dw)',
                dh=base_image_input_d,
                dw=base_image_input_d,
                dy=dy,
                dx=dx,
                h=image_base_patch_h,
                w=image_base_patch_w
            )
            return _img, tf.reduce_mean(tf.cast(_img_mask, tf.float32), -1)

        mode = self.crop_mode
        if mode == "resize":
            patches, img_mask = _resize(image, base_image_input_size)
            patches, img_mask = _img_to_patches(patches, img_mask)
            image_layout_impatch_w = 1
            image_layout_impatch_h = 1
            patch_ordering = tf.range(tokens_per_image)[None, :]

        elif mode in ["overlap", "overlap-and-resize-c2"]:
            original_image_h = tf.shape(image, out_type=tf.int32)[0]
            original_image_w = tf.shape(image, out_type=tf.int32)[1]
            crop_size = base_image_input_size[0]

            # Discard this many patches from the (left/top, right/bottom) of crops
            left_margin, right_margin = self.overlap_margins
            # left_margin, right_margin = 2, 2
            assert left_margin % 2 == 0  # Required for compatibility with 2x2 pooling
            total_margin_pixels = base_image_input_d*(right_margin + left_margin)  # pixels removed per dim
            crop_patches = base_image_input_size[0] // base_image_input_d  # patches per crop dim
            crop_window_patches = crop_patches - (right_margin + left_margin)  # usable patches
            crop_window_size = crop_window_patches * base_image_input_d
            tiling = select_tiling(original_image_h - total_margin_pixels, original_image_w - total_margin_pixels,
                                   crop_window_size, self.max_crops)
            src, img_mask = _resize(
                image, [tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels])

            n_crops = tiling[0]*tiling[1]
            patches_arr = tf.TensorArray(
                tf.float32, n_crops, element_shape=[crop_size, crop_size, 3])
            mask_arr = tf.TensorArray(
                tf.bool, n_crops, element_shape=[crop_size, crop_size])
            # We assume 2x2 pooling, but can allow padding the right/bottom with extra
            # patches if the number of patches per side is not even
            assert (crop_patches+1)//2 == image_token_length_h
            assert (crop_patches+1)//2 == image_token_length_w
            patch_ordering_arr = tf.TensorArray(
                tf.int32, n_crops, element_shape=[image_token_length_h, image_token_length_w])
            on = 0
            on_patch = 0
            for i in range(tiling[0]):
                y0 = i*crop_window_size
                if i == 0:
                    crop_y0 = 0
                else:
                    crop_y0 = left_margin // 2

                crop_h = image_base_patch_h - (right_margin + left_margin)
                if i == 0:
                    crop_h += left_margin
                if i == (tiling[0]-1):
                    crop_h += right_margin
                for j in range(tiling[1]):
                    x0 = j*crop_window_size
                    if j == 0:
                        crop_x0 = 0
                    else:
                        crop_x0 = left_margin // 2

                    crop_w = image_base_patch_w - (right_margin + left_margin)
                    if j == 0:
                        crop_w += left_margin
                    if j == (tiling[1]-1):
                        crop_w += right_margin

                    pooled_w = (crop_w + 1) // 2
                    pooled_h = (crop_h + 1) // 2
                    patch_ordering_arr = patch_ordering_arr.write(
                        on_patch,
                        pad_to_bounding_box(
                            tf.reshape(tf.range(on, on+pooled_h*pooled_w, dtype=tf.int32), (pooled_h, pooled_w, 1)),
                            crop_y0, crop_x0, image_token_length_h, image_token_length_w, value=-1
                        )[:, :, 0]
                    )
                    patches_arr = patches_arr.write(on_patch, src[y0:y0+crop_size, x0:x0+crop_size])
                    mask_arr = mask_arr.write(on_patch, img_mask[y0:y0+crop_size, x0:x0+crop_size])

                    on += pooled_h*pooled_w
                    on_patch += 1
            patches = patches_arr.stack()
            patch_ordering = patch_ordering_arr.stack()
            img_mask = mask_arr.stack()

            image_layout_impatch_w, image_layout_impatch_h = tiling[0], tiling[1]
            patches = einops.rearrange(
                patches, 'p (h dh) (w dw) c -> p (h w) (dh dw c)',
                dh=base_image_input_d,
                dw=base_image_input_d,
                h=image_base_patch_h,
                w=image_base_patch_w
            )
            img_mask = einops.rearrange(
                img_mask, 'p (h dh) (w dw) -> p (h w) (dh dw)',
                dh=base_image_input_d,
                dw=base_image_input_d,
                h=image_base_patch_h,
                w=image_base_patch_w
            )
            img_mask = tf.reduce_mean(tf.cast(img_mask, tf.float32), -1)
            patch_ordering = tf.reshape(patch_ordering, [-1])
            valid = patch_ordering >= 0

            # Transpose, to get left-to-right order
            patch_ordering_rh = tf.reshape(patch_ordering,
                                           [tiling[0], tiling[1], image_token_length_h, image_token_length_w])
            patch_ordering_rh = tf.transpose(patch_ordering_rh, [0, 2, 1, 3])
            patch_ordering_rh = tf.reshape(patch_ordering_rh, [-1])

            # The transpose will screw up which patches are masked, project the
            # new order into the sparse structure of `patch_ordering` to fix this
            patch_ordering = tf.tensor_scatter_nd_update(
                patch_ordering,
                tf.where(valid),
                tf.boolean_mask(patch_ordering_rh, patch_ordering_rh >= 0),
                name="patch_order_transpose_Scatter"
            )

            h = tiling[0]*crop_window_patches + (right_margin+left_margin)
            w = tiling[1]*crop_window_patches + (right_margin+left_margin)
            special_token_ids = self.special_token_ids
            per_row = tf.fill(((w+1)//2,),
                              special_token_ids[config.DEFAULT_IMAGE_PATCH_TOKEN],)
            if self.use_col_tokens:
                per_row = tf.concat([per_row, [special_token_ids[config.DEFAULT_IM_COL_TOKEN]]], 0)

            joint = tf.tile(per_row, [(h+1)//2])
            joint = [
                [special_token_ids[config.DEFAULT_IM_START_TOKEN]],
                joint,
                [special_token_ids[config.DEFAULT_IM_END_TOKEN]]
            ]

            if "resize" in mode:
                resized, resized_mask = _resize(image, base_image_input_size)
                resized, resized_mask = _img_to_patches(resized, resized_mask)
                if 'c2' in mode:
                    patches = tf.concat([resized, patches], 0)
                    image_mask = tf.concat([resized_mask, img_mask], 0)
                else:
                    patches = tf.concat([patches, resized], 0)
                    image_mask = tf.concat([img_mask, resized_mask], 0)

                if patch_ordering is not None:
                    if 'c2' in mode:
                        patch_ordering = tf.where(
                            patch_ordering >= 0,
                            patch_ordering + tokens_per_image,
                            -1
                        )
                        patch_ordering = tf.concat([tf.range(0, tokens_per_image), patch_ordering], 0)
                    else:
                        raise ValueError()
                per_row = tf.fill((image_token_length_w,), special_token_ids[config.DEFAULT_IMAGE_PATCH_TOKEN],)
                if self.use_col_tokens:
                    per_row = tf.concat([per_row, [special_token_ids[config.DEFAULT_IM_COL_TOKEN]]], 0)
                extra_tokens = tf.tile(per_row, [image_token_length_h])
                joint = [
                    [special_token_ids[config.DEFAULT_IM_START_TOKEN]],
                    extra_tokens,
                    [special_token_ids[config.DEFAULT_IM_END_TOKEN]],
                ] + joint

            joint = tf.concat(joint, 0)
            return patches, joint, patch_ordering, img_mask

        elif mode in ["patchify", "patchify-and-resize", "patchify-v2", "patchify-v2-and-resize", "patchify-v2-and-resize-c2"]:
            original_image_w = tf.shape(image, out_type=tf.int32)[0]
            original_image_h = tf.shape(image, out_type=tf.int32)[1]
            assert base_image_input_size[0] == base_image_input_size[1]
            base_patch_size = base_image_input_size[0]
            tiling = select_tiling(original_image_w, original_image_h, base_patch_size, self.max_crops)

            patches, img_mask = _resize(
                image, [tiling[0]*base_patch_size, tiling[1]*base_patch_size])
            patches, img_mask = _img_to_patches(patches, img_mask, tiling[0], tiling[1])
            if 'v2' in mode:
                # Order patches left-to-right not crop-by-crop
                patch_ordering = tf.reshape(
                    tf.range(tokens_per_image*tiling[0]*tiling[1]),
                    [tiling[0], tiling[1], image_token_length_w, image_token_length_h])
                patch_ordering = tf.transpose(patch_ordering, [0, 2, 1, 3])
                patch_ordering = tf.reshape(patch_ordering, (-1, tokens_per_image))
            else:
                patch_ordering = None

            # Given the image size, determine the number of crops per dimension
            image_layout_impatch_w = tiling[0]
            image_layout_impatch_h = tiling[1]

            if "resize" in mode:
                extra_image = True
                resized, resized_mask = _resize(image, base_image_input_size)
                resized, resized_mask = _img_to_patches(resized, resized_mask)
                if 'c2' in mode:
                    patches = tf.concat([resized, patches], 0)
                    image_mask = tf.concat([resized_mask, img_mask], 0)
                else:
                    patches = tf.concat([patches, resized], 0)
                    image_mask = tf.concat([img_mask, resized_mask], 0)

                if patch_ordering is not None:
                    if 'c2' in mode:
                        patch_ordering = tf.concat(
                            [tf.range(0, tokens_per_image)[None, :], patch_ordering+tokens_per_image], 0)
                    else:
                        n = tf.shape(patch_ordering)[0]
                        patch_ordering = tf.concat([patch_ordering, tf.range(n, n+tokens_per_image)[None, :]], 0)
        else:
            raise NotImplementedError(mode)

        special_token_ids = self.special_token_ids

        per_row = tf.fill((image_token_length_w*image_layout_impatch_w,),
                          special_token_ids[config.DEFAULT_IMAGE_PATCH_TOKEN],)
        if self.use_col_tokens:
            per_row = tf.concat([per_row, [special_token_ids[config.DEFAULT_IM_COL_TOKEN]]], 0)

        joint = tf.tile(per_row, [image_token_length_h * image_layout_impatch_h])
        joint = [
            [special_token_ids[config.DEFAULT_IM_START_TOKEN]],
            joint,
            [special_token_ids[config.DEFAULT_IM_END_TOKEN]]
        ]
        if extra_image:
            assert not self.image_padding_mask
            per_row = tf.fill((image_token_length_w,), special_token_ids[config.DEFAULT_IMAGE_PATCH_TOKEN],)
            if self.use_col_tokens:
                per_row = tf.concat([per_row, [special_token_ids[config.DEFAULT_IM_COL_TOKEN]]], 0)
            extra_tokens = tf.tile(per_row, [image_token_length_h])
            if 'c2' in mode:
                joint = [
                    [special_token_ids[config.DEFAULT_IM_START_TOKEN]],
                    extra_tokens,
                    [special_token_ids[config.DEFAULT_IM_END_TOKEN]],
                ] + joint
            else:
                joint += [
                    [special_token_ids[config.DEFAULT_IM_START_TOKEN]],
                    extra_tokens,
                    [special_token_ids[config.DEFAULT_IM_END_TOKEN]]
                ]
        if self.pad_to is not None:
            n = [tf.shape(x)[0] for x in joint]
            assert len(joint[-1]) == 1
            to_pad = self.pad_to - tf.reduce_sum(tf.stack(n))
            joint = tf.concat(joint[:-1] + [
                tf.zeros(to_pad, dtype=tf.int32) - 1,
                joint[-1]
            ], axis=0)
        else:
            joint = tf.concat(joint, 0)
        return patches, tf.concat(joint, 0), patch_ordering, img_mask

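    # Sketch of the "overlap-and-resize-c2" layout, assuming the default 336x336 crops,
    # 14-pixel patches and (4, 4) overlap margins: each crop contributes a 16x16-patch
    # window (224 px), the tiling is chosen by select_tiling over the margin-adjusted
    # image size, and a resized "global" crop is prepended, so per-crop patch_ordering
    # values are offset by the 144 tokens of that global crop.
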
    def build_image_input_idx(self, input_tokens, patch_order, no_image=None):
        """Builds the index used to insert patch features into `input_tokens`"""
        tokens_per_image = self.image_token_length_w * self.image_token_length_h
        if no_image is not None and no_image:
            return tf.zeros((0, tokens_per_image), tf.int32)

        image_input_idx = input_tokens == self.special_token_ids[config.DEFAULT_IMAGE_PATCH_TOKEN]
        image_input_idx = tf.experimental.numpy.nonzero(image_input_idx)[0]
        image_input_idx = tf.cast(image_input_idx, tf.int32)

        if patch_order is not None:
            n_tokens = tf.shape(image_input_idx)[0]
            # Item N should have the value of image_input_index[where(patch_order == n)] if >= 0 else -1
            patch_order = tf.reshape(patch_order, [-1])
            n_patches = tf.shape(patch_order)[0]
            if n_tokens != n_patches:
                # Most complex case where some patches are dropped
                # First invert the valid tokens
                valid = patch_order >= 0
                sorted_patch_ixs = tf.scatter_nd(
                    tf.boolean_mask(patch_order, valid)[:, None],
                    tf.range(tf.reduce_sum(tf.cast(valid, tf.int32)), dtype=tf.int32),
                    [n_tokens],
                    name="valid_order_scatter"
                )

                # Project the inverted mapping into same sparse structure
                tmp = tf.fill(tf.shape(patch_order), -1)
                sorted_patch_ixs_ex = tf.tensor_scatter_nd_update(
                    tmp,
                    tf.where(valid),
                    sorted_patch_ixs,
                    name="order_with_padding_scatter"
                )

                # Do the gather and then re-masked outputs that were masked in `sorted_patch_ixs`
                valid = tf.cast(sorted_patch_ixs_ex >= 0, tf.int32)
                image_input_idx = tf.gather(image_input_idx, sorted_patch_ixs_ex*valid)
                image_input_idx = image_input_idx*valid - 100*(1 - valid)
            else:
                sorted_patch_ixs = tf.scatter_nd(patch_order[:, None], tf.range(n_patches), [n_patches])
                image_input_idx = tf.gather(tf.reshape(image_input_idx, [-1]), sorted_patch_ixs)
        image_input_idx = tf.reshape(image_input_idx, [-1, tokens_per_image])
        return image_input_idx

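    # Summary (a gloss on the method above, not in the original source):
    # build_image_input_idx maps each crop's pooled patch features to positions in the
    # token sequence; entries of -100 mark patches that were dropped and should not be
    # inserted.
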
653 |
+
def build_multimodel_features(self, tokens, mask, subsegments, images, is_training):
|
654 |
+
"""Builds input features by pre-processing `images` and modifying `tokens`
|
655 |
+
to include image col/pad/start/end tokens instead image placeholder tokens
|
656 |
+
"""
|
657 |
+
image_token_id = self.special_token_ids[config.IMAGE_PROMPT]
|
658 |
+
image_idx = tf.experimental.numpy.nonzero(tokens == image_token_id)[0]
|
659 |
+
if images is None or tf.shape(images)[0] == 0:
|
660 |
+
tf.debugging.assert_equal(image_idx, tf.cast(0, tf.int64),
|
661 |
+
"Image placeholders in input, but no images given!")
|
662 |
+
tokens_per_image = self.image_token_length_w * self.image_token_length_h
|
663 |
+
n_pixels = self.image_patch_size ** 2 * 3
|
664 |
+
image_num_patch = np.prod(self.image_num_patch)
|
665 |
+
crops = tf.zeros((0, image_num_patch, n_pixels), dtype=tf.float32)
|
666 |
+
image_idx = tf.zeros((0, tokens_per_image), tf.int32)
|
667 |
+
out = dict(
|
668 |
+
target_tokens=tokens,
|
669 |
+
images=crops,
|
670 |
+
image_input_idx=image_idx,
|
671 |
+
loss_masks=mask
|
672 |
+
)
|
673 |
+
if self.image_padding_mask:
|
674 |
+
out["image_masks"] = tf.zeros((0, image_num_patch), dtype=tf.float32)
|
675 |
+
if subsegments is not None:
|
676 |
+
out["subsegment_ids"] = subsegments
|
677 |
+
return out
|
678 |
+
elif tf.shape(image_idx)[0] == 0 and tf.shape(images)[0] > 0:
|
679 |
+
# As a special case, no image prompt means the images are all at the start
|
680 |
+
image_idx = tf.zeros([tf.shape(images)[0]], tf.int64) - 1
|
681 |
+
else:
|
682 |
+
tf.debugging.assert_equal(
|
683 |
+
tf.shape(images)[0], tf.shape(image_idx)[0],
|
684 |
+
message="Different number of images and image placeholders")
|
685 |
+
|
686 |
+
# Each image will produce a variable number of crops/tokens, so we aggregate things
|
687 |
+
# the results tensor arrays and the concat them
|
688 |
+
tokens_per_image = self.image_token_length_w * self.image_token_length_h
|
689 |
+
n_pixels = self.image_patch_size*self.image_patch_size*3
|
690 |
+
n_patches = self.image_num_patch[0]*self.image_num_patch[1]
|
691 |
+
|
692 |
+
n = tf.shape(images)[0]
|
693 |
+
all_crops = tf.TensorArray(dtype=tf.float32, size=n, infer_shape=False,
|
694 |
+
element_shape=[None, n_patches, n_pixels])
|
695 |
+
all_image_idx = tf.TensorArray(dtype=tf.int32, size=n, infer_shape=False,
|
696 |
+
element_shape=[None, tokens_per_image])
|
697 |
+
out_tokens = tf.TensorArray(dtype=tf.int32, size=n, infer_shape=False,
|
698 |
+
element_shape=[None])
|
699 |
+
out_masks = tf.TensorArray(dtype=tf.float32, size=n, infer_shape=False,
|
700 |
+
element_shape=[None])
|
701 |
+
if self.image_padding_mask:
|
702 |
+
all_crop_masks = tf.TensorArray(dtype=tf.float32, size=n, infer_shape=False,
|
703 |
+
element_shape=[None, None])
|
704 |
+
else:
|
705 |
+
# Dummy array to keep tensorflow's control analysis happy
|
706 |
+
all_crop_masks = tf.TensorArray(dtype=tf.float32, size=0, infer_shape=False,
|
707 |
+
element_shape=[None, None])
|
708 |
+
if subsegments is not None:
|
709 |
+
out_subsegments = tf.TensorArray(dtype=tf.int32, size=n, element_shape=[None])
|
710 |
+
else:
|
711 |
+
out_subsegments = tf.TensorArray(dtype=tf.int32, size=0, element_shape=[None])
|
712 |
+
|
713 |
+
image_idx = tf.cast(image_idx, tf.int32)
|
714 |
+
for ix in range(tf.shape(image_idx)[0]):
|
715 |
+
token_ix = image_idx[ix]
|
716 |
+
crops, image_tokens, patch_ordering, img_mask = self.image_to_patches_and_tokens(images[ix], is_training)
|
717 |
+
patch_idx = self.build_image_input_idx(image_tokens, patch_ordering)
|
718 |
+
|
719 |
+
if token_ix == -1: # -1 is an image inserted at the very start
|
720 |
+
start = 0
|
721 |
+
token_ix = 0
|
722 |
+
end = 0
|
723 |
+
else:
|
724 |
+
start = 0 if ix == 0 else image_idx[ix-1] + 1
|
725 |
+
end = token_ix + 1
|
726 |
+
|
727 |
+
all_image_idx = all_image_idx.write(ix, patch_idx + token_ix)
|
728 |
+
all_crops = all_crops.write(ix, crops)
|
729 |
+
image_token_mask = tf.zeros_like(image_tokens, dtype=tf.float32)
|
730 |
+
|
731 |
+
if ix == (tf.shape(images)[0] - 1):
|
732 |
+
tokens_part = tf.concat([tokens[start:token_ix], image_tokens, tokens[end:]], 0)
|
733 |
+
mask_part = tf.concat([mask[start:token_ix], image_token_mask, mask[end:]], 0)
|
734 |
+
else:
|
735 |
+
tokens_part = tf.concat([tokens[start:token_ix], image_tokens], 0)
|
736 |
+
mask_part = tf.concat([mask[start:token_ix], image_token_mask], 0)
|
737 |
+
|
738 |
+
out_tokens = out_tokens.write(ix, tokens_part)
|
739 |
+
out_masks = out_masks.write(ix, mask_part)
|
740 |
+
if self.image_padding_mask:
|
741 |
+
all_crop_masks = all_crop_masks.write(ix, img_mask)
|
742 |
+
if subsegments is not None:
|
743 |
+
parts = tf.fill([tf.shape(image_tokens)[0]], subsegments[token_ix])
|
744 |
+
if ix == (tf.shape(images)[0] - 1):
|
745 |
+
seg = tf.concat([subsegments[start:token_ix], parts, subsegments[end:]], 0)
|
746 |
+
else:
|
747 |
+
seg = tf.concat([subsegments[start:token_ix], parts], 0)
|
748 |
+
out_subsegments = out_subsegments.write(ix, seg)
|
749 |
+
|
750 |
+
out = dict(
|
751 |
+
target_tokens=out_tokens.concat(),
|
752 |
+
images=all_crops.concat(),
|
753 |
+
image_input_idx=all_image_idx.concat(),
|
754 |
+
loss_masks=out_masks.concat()
|
755 |
+
)
|
756 |
+
if self.image_padding_mask:
|
757 |
+
out["image_masks"] = all_crop_masks.concat()
|
758 |
+
if subsegments is not None:
|
759 |
+
out["subsegment_ids"] = out_subsegments.concat()
|
760 |
+
return out
|
761 |
+
|
762 |
+
def _format_message(self, args):
|
763 |
+
message, ix = args
|
764 |
+
return self.format_message(message, ix)
|
765 |
+
|
766 |
+
def format_message(self, message, ix):
|
767 |
+
"""Applies system formatting to ith message from a sequence of messages"""
|
768 |
+
# If the image placeholder text is not preceded by space it will not get tokenized
|
769 |
+
# correctly by some tokenizers, so double check it here
|
770 |
+
assert config.IMAGE_PROMPT == "<|image|>"
|
771 |
+
tf.debugging.assert_equal(
|
772 |
+
tf.strings.regex_full_match(message, r".*[^ ]<\|image\|>.*"),
|
773 |
+
False,
|
774 |
+
message="Image token must always be preceded by a space"
|
775 |
+
)
|
776 |
+
is_user = ix % 2 == 0
|
777 |
+
if self.message_format == "none" or self.message_format is None:
|
778 |
+
pass
|
779 |
+
elif self.message_format == "role":
|
780 |
+
if is_user:
|
781 |
+
# We put the "System:" prefix here since it doesn't need a loss
|
782 |
+
message = tf.strings.join(["User: ", message, " Assistant:"])
|
783 |
+
elif self.message_format == "cleanup":
|
784 |
+
if is_user:
|
785 |
+
# We put the "System:" prefix here since it doesn't need a loss
|
786 |
+
message = tf.strings.join(
|
787 |
+
[
|
788 |
+
"[[User]]: Correct the spelling and punctuation mistakes on the following transcript based on what appears in the image.\n\n{before} ",
|
789 |
+
message,
|
790 |
+
"\n[[Assistant]]: {after}"
|
791 |
+
]
|
792 |
+
)
|
793 |
+
elif self.message_format == "mistral":
|
794 |
+
if is_user:
|
795 |
+
message = tf.strings.join(["[INST] ", message, " [/INST]"])
|
796 |
+
else:
|
797 |
+
raise NotImplementedError(self.message_format)
|
798 |
+
|
799 |
+
# For now assume a space will be used to separate the messages
|
800 |
+
if not self.tokenizer.adds_space:
|
801 |
+
if ix != 0 or self.always_start_with_space:
|
802 |
+
message = tf.strings.join([" ", message])
|
803 |
+
# Else space added automatically by the tokenizer
|
804 |
+
|
805 |
+
return message
|
806 |
+
|
807 |
+
def get_multi_message_token_input(self, conversations, text_weights=None):
|
808 |
+
"""Build inputs for a ragged tensor of conversations, where each row of the tensor,
|
809 |
+
is a different conversation"""
|
810 |
+
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(
|
811 |
+
conversations.values, re.escape(config.IMAGE_PROMPT))), False, "Segmented prompts must start with the image")
|
812 |
+
|
813 |
+
n_conversation = tf.shape(conversations)[0]
|
814 |
+
ar = tf.TensorArray(dtype=tf.int32, infer_shape=False, element_shape=[None],
|
815 |
+
size=n_conversation)
|
816 |
+
n_messages_per_conversation = conversations.row_lengths()
|
817 |
+
for ix in range(n_conversation):
|
818 |
+
ar = ar.write(ix, tf.range(n_messages_per_conversation[ix], dtype=tf.int32))
|
819 |
+
message_ix = ar.concat()
|
820 |
+
messages = tf.map_fn(
|
821 |
+
self._format_message, elems=(conversations.values, message_ix), fn_output_signature=tf.string)
|
822 |
+
messages = self.tokenizer.encode_tf(messages)
|
823 |
+
|
824 |
+
# Append EOS
|
825 |
+
is_response = message_ix % 2 == 1
|
826 |
+
is_response_int = tf.cast(is_response, tf.int32)
|
827 |
+
eos = tf.RaggedTensor.from_row_lengths(
|
828 |
+
tf.fill([tf.reduce_sum(is_response_int)], self.tokenizer.eos_token_id),
|
829 |
+
tf.cast(is_response_int, messages.row_splits.dtype)
|
830 |
+
)
|
831 |
+
messages = tf.concat([messages, eos], axis=1)
|
832 |
+
|
833 |
+
# Build mask over system responses
|
834 |
+
mask = tf.ones_like(messages) * tf.cast(tf.expand_dims(is_response, axis=1), tf.int32)
|
835 |
+
decoder_loss_weights = tf.cast(mask.values, tf.float32)
|
836 |
+
|
837 |
+
# Build subsegment ids for each conversation
|
838 |
+
tokens_per_message = tf.RaggedTensor.from_row_splits(
|
839 |
+
row_splits=conversations.row_splits,
|
840 |
+
values=messages.row_lengths()
|
841 |
+
)
|
842 |
+
token_per_conversation = tf.reduce_sum(tokens_per_message, axis=1)
|
843 |
+
subsegment_ids = tf.repeat(tf.range(n_conversation, dtype=tf.int32)+1, token_per_conversation)
|
844 |
+
|
845 |
+
image_ix = self.special_token_ids[config.IMAGE_PROMPT]
|
846 |
+
messages = tf.concat([[image_ix], messages.values], axis=0)
|
847 |
+
decoder_loss_weights = tf.concat([[0], decoder_loss_weights], axis=0)
|
848 |
+
subsegment_ids = tf.concat([[10000], subsegment_ids], axis=0)
|
849 |
+
return messages, decoder_loss_weights, subsegment_ids
|
850 |
+
|
851 |
+
def get_multi_response_token_input(self, user_prompt, text, text_weights=None):
|
852 |
+
"""Build tokens for a multi-response-per-image example"""
|
853 |
+
# FIXME this could be relaxed to just having the same prefix
|
854 |
+
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(
|
855 |
+
user_prompt, re.escape(config.IMAGE_PROMPT))), False, "Segmented prompts must start with the image")
|
856 |
+
user_prompt = self.format_message(user_prompt, 0)
|
857 |
+
vocab = self.tokenizer
|
858 |
+
prompts = vocab.encode_tf(user_prompt)
|
859 |
+
response = self.format_message(text, 1)
|
860 |
+
responses = vocab.encode_tf(response)
|
861 |
+
responses = _append_to_innermost_axis(responses, vocab.eos_token_id)
|
862 |
+
response_mask = tf.ones_like(responses, dtype=tf.float32)
|
863 |
+
if text_weights is not None:
|
864 |
+
response_mask *= text_weights
|
865 |
+
image_tokens = tf.constant([self.special_token_ids[config.IMAGE_PROMPT]])
|
866 |
+
|
867 |
+
if len(responses.shape) == 3:
|
868 |
+
# Tricky case where we have multiple questions, each of which has multiple answers
|
869 |
+
assert len(prompts.shape) == 2
|
870 |
+
|
871 |
+
# Also shift the last tokens to the response segment since that tokens will
|
872 |
+
# have multiple possible target tokens to predict
|
873 |
+
last_prompt_tokens = prompts[:, -1:]
|
874 |
+
last_prompt_tokens = tf.repeat(last_prompt_tokens, responses.row_lengths())
|
875 |
+
last_prompt_tokens = tf.RaggedTensor.from_row_splits(
|
876 |
+
values=tf.RaggedTensor.from_row_lengths(
|
877 |
+
values=last_prompt_tokens,
|
878 |
+
row_lengths=tf.ones_like(last_prompt_tokens, dtype=responses.row_splits.dtype)
|
879 |
+
),
|
880 |
+
row_splits=responses.row_splits
|
881 |
+
)
|
882 |
+
responses = tf.concat([last_prompt_tokens, responses], 2)
|
883 |
+
prompts = prompts[:, :-1]
|
884 |
+
|
885 |
+
shared_prefix = image_tokens
|
886 |
+
segmented_suffix = tf.concat([tf.expand_dims(prompts, 1), responses], 1)
|
887 |
+
targets = tf.concat([shared_prefix, segmented_suffix.values.values], 0)
|
888 |
+
|
889 |
+
segmented_mask = tf.concat([
|
890 |
+
tf.zeros_like(tf.expand_dims(prompts, 1), dtype=tf.float32),
|
891 |
+
tf.concat([
|
892 |
+
tf.zeros_like(last_prompt_tokens, dtype=tf.float32),
|
893 |
+
response_mask
|
894 |
+
], 2)
|
895 |
+
], 1).values.values
|
896 |
+
decoder_loss_weights = tf.concat(
|
897 |
+
[tf.zeros_like(shared_prefix, dtype=tf.float32), segmented_mask], 0)
|
898 |
+
|
899 |
+
text_segment_ids = get_3d_subsegments(segmented_suffix)
|
900 |
+
subsegment_ids = tf.concat([
|
901 |
+
tf.zeros_like(shared_prefix) + tf.reduce_max(text_segment_ids)+1,
|
902 |
+
text_segment_ids], 0)
|
903 |
+
subsegment_ids = tf.cast(subsegment_ids, tf.int32)
|
904 |
+
else:
|
905 |
+
if len(prompts.shape) == 1:
|
906 |
+
# One prompt for all responses, we use the last token of the prompt as the
|
907 |
+
# first token of each response segment since there will be multiple targets
|
908 |
+
# for that token, the remaining targets are part of the prefix
|
909 |
+
shared_prefix = tf.concat([image_tokens, prompts[:-1]], 0)
|
910 |
+
prompts = prompts[-1:]
|
911 |
+
prompts = tf.tile(tf.expand_dims(prompts, axis=0), [tf.shape(text)[0], 1])
|
912 |
+
else:
|
913 |
+
shared_prefix = image_tokens
|
914 |
+
|
915 |
+
# Separate prompt for each response
|
916 |
+
segmented_suffix = tf.concat([prompts, responses], 1)
|
917 |
+
segmented_mask = tf.concat([tf.zeros_like(prompts, dtype=tf.float32), response_mask], 1).values
|
918 |
+
|
919 |
+
targets = tf.concat([shared_prefix, segmented_suffix.values], 0)
|
920 |
+
decoder_loss_weights = tf.concat(
|
921 |
+
[tf.zeros_like(shared_prefix, dtype=tf.float32), segmented_mask], 0)
|
922 |
+
subsegments = tf.ragged.row_splits_to_segment_ids(segmented_suffix.row_splits) + 1
|
923 |
+
subsegment_ids = tf.concat([tf.zeros_like(shared_prefix)+10000,
|
924 |
+
tf.cast(subsegments, tf.int32)], 0)
|
925 |
+
return targets, decoder_loss_weights, subsegment_ids
|
926 |
+
|
927 |
+
def get_tokens_input(self, messages, for_inference=False, text_weights=None):
|
928 |
+
"""Gets the token input for an example, using image placeholder tokens to
|
929 |
+
indicate where images features should be inserted
|
930 |
+
|
931 |
+
inputs
|
932 |
+
messages: List or tensor users/system text messages, can have image placeholder tokens
|
933 |
+
for_inference: bool, if true truncate the messages if it is a system message
|
934 |
+
text_weights: Weights per a system message
|
935 |
+
|
936 |
+
returns
|
937 |
+
tokens: [n_tokens] tf.int32 token inputs with image placeholder tokens
|
938 |
+
loss_mask: [n_tokens] tf.float32 token weights for loss
|
939 |
+
subsegment: [n_tokens] tf.int32 or None, subsegment ids used to build more complex
|
940 |
+
attention masks if needed
|
941 |
+
"""
|
942 |
+
if isinstance(messages, tf.RaggedTensor):
|
943 |
+
assert not for_inference, "Cannot have multiple target messages for inference"
|
944 |
+
return self.get_multi_message_token_input(messages, text_weights)
|
945 |
+
elif len(tf.shape(messages[-1])) > 0:
|
946 |
+
assert not for_inference, "Cannot have multiple target messages for inference"
|
947 |
+
assert len(messages) == 2
|
948 |
+
prompt = messages[0]
|
949 |
+
response = messages[1]
|
950 |
+
return self.get_multi_response_token_input(prompt, response, text_weights)
|
951 |
+
else:
|
952 |
+
messages = tf.convert_to_tensor(messages)
|
953 |
+
if for_inference:
|
954 |
+
if tf.shape(messages) % 2 == 0:
|
955 |
+
# Remove the last message since the model should predict it
|
956 |
+
messages = messages[:-1]
|
957 |
+
|
958 |
+
# Apply system formatting
|
959 |
+
ix = tf.range(tf.shape(messages)[0])
|
960 |
+
is_response = ix % 2 == 1
|
961 |
+
messages = tf.map_fn(
|
962 |
+
self._format_message, elems=(messages, ix), fn_output_signature=tf.string)
|
963 |
+
|
964 |
+
# Tokenize
|
965 |
+
messages = self.tokenizer.encode_tf(messages)
|
966 |
+
|
967 |
+
# Add EOS to system messages
|
968 |
+
is_response_int = tf.cast(is_response, tf.int32)
|
969 |
+
eos = tf.RaggedTensor.from_row_lengths(
|
970 |
+
tf.fill([tf.reduce_sum(is_response_int)], self.tokenizer.eos_token_id),
|
971 |
+
tf.cast(is_response_int, messages.row_splits.dtype)
|
972 |
+
)
|
973 |
+
messages = tf.concat([messages, eos], axis=1)
|
974 |
+
targets = messages.values
|
975 |
+
|
976 |
+
# Build mask over system responses
|
977 |
+
mask = tf.ones_like(messages) * tf.cast(tf.expand_dims(is_response, axis=1), tf.int32)
|
978 |
+
decoder_loss_weights = tf.cast(mask.values, tf.float32)
|
979 |
+
if text_weights is not None:
|
980 |
+
decoder_loss_weights = decoder_loss_weights * text_weights
|
981 |
+
return messages.values, decoder_loss_weights, None
|
982 |
+
|
983 |
+
def preprocess(self, image, input_text, is_training=False,
|
984 |
+
seq_len=None, pad_images=1, style=None, for_inference=True):
|
985 |
+
"""Get input tensors for the given image/text data
|
986 |
+
|
987 |
+
image: [h, w, 3] numpy uint8 array of image pixels
|
988 |
+
input_text: string input text, a list of text for a multi-turn conversation or dictionary
|
989 |
+
of inputs to use to build the prompt from a template
|
990 |
+
is_training: allow training-time preprocessing (e.g., image augmentation)
|
991 |
+
seq_len: pad input tokens to `seq_len`
|
992 |
+
pad_images: pad input images to `self.get_max_total_crops()`
|
993 |
+
style: Style to use for prompt templating
|
994 |
+
"""
|
995 |
+
if image is not None and len(tf.shape(image)) == 3:
|
996 |
+
image = tf.expand_dims(image, axis=0)
|
997 |
+
|
998 |
+
messages = self.get_messages(input_text, style, is_training, for_inference=for_inference, user_prompt_seed=None, system_prompt_seed=None)
|
999 |
+
targets, loss_masks, subsegments = self.get_tokens_input(messages, for_inference=for_inference)
|
1000 |
+
batch = self.build_multimodel_features(
|
1001 |
+
targets, loss_masks, subsegments, image, is_training)
|
1002 |
+
|
1003 |
+
# Optionally padding to get constant sized arrays
|
1004 |
+
if pad_images:
|
1005 |
+
max_crops = self.get_max_total_crops() * pad_images
|
1006 |
+
image = batch["images"]
|
1007 |
+
n = max_crops - tf.shape(batch["images"])[0]
|
1008 |
+
batch["images"] = tf.pad(image, [[0, n], [0, 0], [0, 0]], constant_values=-1)
|
1009 |
+
if self.image_padding_mask:
|
1010 |
+
m = max_crops - tf.shape(batch["image_masks"])[0]
|
1011 |
+
batch["image_masks"] = tf.pad(batch["image_masks"], [[0, m], [0, 0]], constant_values=-1)
|
1012 |
+
batch["image_input_idx"] = tf.pad(batch["image_input_idx"], [[0, n], [0, 0]], constant_values=-1)
|
1013 |
+
|
1014 |
+
if seq_len is not None:
|
1015 |
+
targets = batch["target_tokens"]
|
1016 |
+
if seq_len < len(targets):
|
1017 |
+
raise ValueError("Sequence length too short")
|
1018 |
+
n = seq_len - len(targets)
|
1019 |
+
batch["target_tokens"] = tf.pad(targets, [[0, n]], constant_values=-1)
|
1020 |
+
batch["loss_masks"] = tf.pad(batch["loss_masks"], [[0, n]], constant_values=-1)
|
1021 |
+
|
1022 |
+
batch = self.get_post_mixing_preprocessor(pack=False)._convert_example(batch)
|
1023 |
+
return batch
|
1024 |
+
|
1025 |
+
def get_user_prompt(self, style, example, is_training=True, for_inference=False, seed=None):
|
1026 |
+
"""Build a list of strings of what a user might type in to the model for the given example,
|
1027 |
+
and its responses, by applying a prompt template to the fields in `example`
|
1028 |
+
|
1029 |
+
Can return multiple strings for one message for multi-response examples
|
1030 |
+
"""
|
1031 |
+
if "style" in example:
|
1032 |
+
style = example["style"]
|
1033 |
+
|
1034 |
+
if "prompt" in example:
|
1035 |
+
# Examples have a complete user prompt pre-specified, usually for eval sets
|
1036 |
+
prompt = example["prompt"]
|
1037 |
+
|
1038 |
+
elif self.prompt_templates == "none":
|
1039 |
+
# Bare-bone prompt with not templating of instructions
|
1040 |
+
if "prompt" in example:
|
1041 |
+
prompt = example["prompt"]
|
1042 |
+
elif "refexp" in example:
|
1043 |
+
prompt = example["refexp"]
|
1044 |
+
elif "question" in example and "options" in example:
|
1045 |
+
prompt = tf.strings.join([example["question"], "\n", example["options"], "\n"])
|
1046 |
+
elif "question" in example:
|
1047 |
+
prompt = example["question"]
|
1048 |
+
else:
|
1049 |
+
prompt = ""
|
1050 |
+
|
1051 |
+
elif self.prompt_templates == "uber_model":
|
1052 |
+
if not isinstance(style, str):
|
1053 |
+
tf.debugging.assert_equal(tf.logical_or(
|
1054 |
+
style == "ai2_diagram_no_letter",
|
1055 |
+
style == "ai2_diagram",
|
1056 |
+
), True)
|
1057 |
+
prompt = tf.strings.join([example["question"], "\n", example["options"], "\n"])
|
1058 |
+
else:
|
1059 |
+
# We template long captions and pointing since they are "demo" tasks, and use
|
1060 |
+
# plain text for everything else
|
1061 |
+
if style == "long_caption":
|
1062 |
+
prompt = apply_keyword_prompt(GENERAL_PROMPTS_V1["long_caption"], example, seed)
|
1063 |
+
elif style == "pointing":
|
1064 |
+
prompt = apply_keyword_prompt(GENERAL_PROMPTS_V1["pointing"], example, seed)
|
1065 |
+
elif style == "point_count":
|
1066 |
+
prompt = apply_keyword_prompt(GENERAL_PROMPTS_V1["point_count"], example, seed)
|
1067 |
+
elif "prompt" in example:
|
1068 |
+
prompt = example["prompt"]
|
1069 |
+
elif "refexp" in example:
|
1070 |
+
prompt = example["refexp"]
|
1071 |
+
elif "question" in example and "options" in example:
|
1072 |
+
prompt = tf.strings.join([example["question"], "\n", example["options"], "\n"])
|
1073 |
+
elif "question" in example:
|
1074 |
+
prompt = example["question"]
|
1075 |
+
else:
|
1076 |
+
prompt = ""
|
1077 |
+
|
1078 |
+
elif self.prompt_templates == "uber_model_pointing":
|
1079 |
+
if style == "long_caption":
|
1080 |
+
long_captions = GENERAL_PROMPTS_V1["long_caption_no_pointing"]
|
1081 |
+
prompt = apply_keyword_prompt(GENERAL_PROMPTS_V1["long_caption"], example, seed)
|
1082 |
+
elif style == "pointing":
|
1083 |
+
prompt = apply_keyword_prompt(GENERAL_PROMPTS_V1["pointing"], example, seed)
|
1084 |
+
elif style in [
|
1085 |
+
"scifi_charts_explanation",
|
1086 |
+
"scifi_table_explanation",
|
1087 |
+
"scifi_document_explanation",
|
1088 |
+
"scifi_diagram_explanation",
|
1089 |
+
"user_qa",
|
1090 |
+
"long_caption",
|
1091 |
+
]:
|
1092 |
+
raise NotImplementedError()
|
1093 |
+
if style == "long_caption":
|
1094 |
+
prompts = GENERAL_PROMPTS_V1["long_caption"]
|
1095 |
+
elif "prompt" in example:
|
1096 |
+
prompts = tf.expand_dims(example["prompt"], axis=0)
|
1097 |
+
else:
|
1098 |
+
prompts = tf.expand_dims(example["question"], axis=0)
|
1099 |
+
suffixes = []
|
1100 |
+
for suffix in GENERAL_PROMPTS_V1["no_pointing_suffix"]:
|
1101 |
+
if not suffix[0].isspace():
|
1102 |
+
suffix = " " + suffix
|
1103 |
+
suffixes.append(suffix)
|
1104 |
+
no_point_prompts = tf.reshape(tf.strings.join([
|
1105 |
+
tf.tile(tf.expand_dims(suffixes, 1), [1, tf.shape(prompts)[1]]),
|
1106 |
+
tf.tile(prompts, [len(suffixes), 1]),
|
1107 |
+
]), [-1])
|
1108 |
+
# prefixes = []
|
1109 |
+
# for prefix in GENERAL_PROMPTS_V1["no_pointing_prefix"]:
|
1110 |
+
# if not prefix[0].isspace():
|
1111 |
+
# prefix = prefix + " "
|
1112 |
+
# prefixes.append(prompts + prefix)
|
1113 |
+
prompt = apply_keyword_prompt(no_point_prompts, example, seed, keywords=[])
|
1114 |
+
elif "prompt" in example:
|
1115 |
+
prompt = example["prompt"]
|
1116 |
+
elif "refexp" in example:
|
1117 |
+
prompt = example["refexp"]
|
1118 |
+
elif "question" in example and "options" in example:
|
1119 |
+
prompt = tf.strings.join([example["question"], "\n", example["options"], "\n"])
|
1120 |
+
elif "question" in example:
|
1121 |
+
prompt = example["question"]
|
1122 |
+
else:
|
1123 |
+
prompt = ""
|
1124 |
+
|
1125 |
+
elif self.prompt_templates == "general_instructions_v1":
|
1126 |
+
if isinstance(style, str):
|
1127 |
+
prompt = apply_keyword_prompt(GENERAL_PROMPTS_V1[STYLE_TO_GENERAL_PROMPT[style]], example, seed)
|
1128 |
+
elif isinstance(style, list):
|
1129 |
+
# This ia bit of hack to allow apply prompts to joint caption/transcript data
|
1130 |
+
# FIXME ideally we can apply the templating to multiple styles more generally
|
1131 |
+
def _apply(_style, ix):
|
1132 |
+
tmp = dict(example)
|
1133 |
+
# prevent apply_keyword_prompt for generating multiple templates
|
1134 |
+
tmp["text"] = tmp["text"][0]
|
1135 |
+
if _style == "long_caption":
|
1136 |
+
return apply_keyword_prompt(GENERAL_PROMPTS_V1["long_caption"], tmp, seed)
|
1137 |
+
elif _style == "transcript":
|
1138 |
+
return apply_keyword_prompt(GENERAL_PROMPTS_V1["transcript"], tmp, seed)
|
1139 |
+
else:
|
1140 |
+
raise NotImplementedError(_style)
|
1141 |
+
prompt = [_apply(x, ix) for ix, x in enumerate(style)]
|
1142 |
+
else:
|
1143 |
+
raise NotImplementedError()
|
1144 |
+
|
1145 |
+
elif self.prompt_templates == "zero_shot_v1":
|
1146 |
+
assert style is not None
|
1147 |
+
if not isinstance(style, str):
|
1148 |
+
# FIXME can we handle tensor style's in a better way?
|
1149 |
+
if style == "ai2_diagram":
|
1150 |
+
prompt = "Question: {question}\nAnswer with correct answer option letter only\nOptions: {options}\nAnswer:"
|
1151 |
+
prompt = apply_keyword_prompt([prompt], example, seed)
|
1152 |
+
elif style == "ai2_diagram_no_letter":
|
1153 |
+
prompt = "Question: {question}\nAnswer with correct answer option only\nOptions: {options}\nAnswer:"
|
1154 |
+
prompt = apply_keyword_prompt([prompt], example, seed)
|
1155 |
+
else:
|
1156 |
+
prompt = ""
|
1157 |
+
tf.debugging.assert_equal(prompt != "", True)
|
1158 |
+
else:
|
1159 |
+
general_style = STYLE_TO_GENERAL_PROMPT[style]
|
1160 |
+
if general_style == "short_answer":
|
1161 |
+
prompt = apply_keyword_prompt(["Question: {question} Answer with as few words as possible. Answer:"], example, seed)
|
1162 |
+
elif general_style == "multiple_choice":
|
1163 |
+
prompt = apply_keyword_prompt(["Question: {question}\nAnswer with correct answer option letter only\nOptions: {options}\nAnswer:"], example, seed)
|
1164 |
+
elif general_style == "count_bench":
|
1165 |
+
prompt = apply_keyword_prompt(["Question: How many {object} are there?\nRespond with only a number.\nAnswer:"], example, seed)
|
1166 |
+
else:
|
1167 |
+
raise NotImplementedError(general_style)
|
1168 |
+
|
1169 |
+
elif self.prompt_templates == "zero_shot_v2":
|
1170 |
+
assert style is not None
|
1171 |
+
|
1172 |
+
if self.prompt_override:
|
1173 |
+
prompt = apply_keyword_prompt([self.prompt_override], example, seed)
|
1174 |
+
elif not isinstance(style, str):
|
1175 |
+
if style == "ai2_diagram":
|
1176 |
+
prompt = "{question} Answer with correct answer option letter only. Options: {options}"
|
1177 |
+
prompt = apply_keyword_prompt([prompt], example, seed)
|
1178 |
+
elif style == "ai2_diagram_no_letter":
|
1179 |
+
prompt = "{question} Answer with correct answer option only. Options: {options}"
|
1180 |
+
prompt = apply_keyword_prompt([prompt], example, seed)
|
1181 |
+
else:
|
1182 |
+
prompt = ""
|
1183 |
+
tf.debugging.assert_equal(prompt != "", True)
|
1184 |
+
else:
|
1185 |
+
if style in ["vqa2", "gqa", "tally_qa", "okvqa", "a_okvqa_da"]:
|
1186 |
+
prompt = "Answer with a single word. {question}"
|
1187 |
+
elif style in ["text_vqa", "doc_qa", "info_qa", "chart_qa", "st_qa", "ocr_vqa", "dv_qa", "tabwmp_da", "figure_qa", "figure_qa_zero_shot", "plot_qa"]:
|
1188 |
+
prompt = "{question}\nRespond as concisely as possible, do not output anything other than the answer."
|
1189 |
+
elif STYLE_TO_GENERAL_PROMPT[style] == "multiple_choice":
|
1190 |
+
prompt = "{question} Answer with correct answer option letter only. Options: {options}"
|
1191 |
+
elif STYLE_TO_GENERAL_PROMPT[style] == "short_answer":
|
1192 |
+
prompt = "{question} Answer with as few words as possible."
|
1193 |
+
elif style == "vtabfact":
|
1194 |
+
prompt = "{question}"
|
1195 |
+
elif style == "count_bench":
|
1196 |
+
prompt = "How many {object} are there?\nRespond with only a number."
|
1197 |
+
else:
|
1198 |
+
raise NotImplementedError(style)
|
1199 |
+
prompt = apply_keyword_prompt([prompt], example, seed)
|
1200 |
+
else:
|
1201 |
+
raise NotImplementedError(self.prompt_templates)
|
1202 |
+
|
1203 |
+
if for_inference:
|
1204 |
+
return [prompt]
|
1205 |
+
else:
|
1206 |
+
return [prompt, example["text"]]
|
1207 |
+
|
1208 |
+
def get_system_prompt(self, style, example, for_inference,
|
1209 |
+
messages, seed=None):
|
1210 |
+
if isinstance(style, str) and style == "count_bench":
|
1211 |
+
style = "ok_vqa"
|
1212 |
+
|
1213 |
+
if self.system_prompt == "style":
|
1214 |
+
if isinstance(style, str):
|
1215 |
+
prefix = style + ":"
|
1216 |
+
else:
|
1217 |
+
prefix = tf.strings.join([style, ":"])
|
1218 |
+
|
1219 |
+
elif self.system_prompt == "demo_or_style":
|
1220 |
+
if isinstance(style, str):
|
1221 |
+
if style == "android_control" or style == "demo":
|
1222 |
+
# android is a special case since I hacked in prefix in the preprocessor
|
1223 |
+
prefix = ""
|
1224 |
+
elif style in ["scifi_demo", "synthetic_qa"] or style in DEMO_STYLES:
|
1225 |
+
if style == "scifi_demo":
|
1226 |
+
p_no_prompt = 0.2
|
1227 |
+
elif style == "synthetic_qa":
|
1228 |
+
p_no_prompt = 0.25
|
1229 |
+
else:
|
1230 |
+
p_no_prompt = 0.9
|
1231 |
+
if len(tf.shape(messages)) > 1:
|
1232 |
+
n_messages = tf.shape(messages)[1]
|
1233 |
+
style = tf.tile(tf.expand_dims(style, axis=0), [n_messages])
|
1234 |
+
r = tf.random.stateless_uniform([n_messages], seed, 0, 1)
|
1235 |
+
else:
|
1236 |
+
r = tf.random.stateless_uniform((), seed, 0, 1)
|
1237 |
+
prefix = tf.where(r < p_no_prompt, "", tf.strings.join([style + ":"]))
|
1238 |
+
else:
|
1239 |
+
prefix = style + ":"
|
1240 |
+
else:
|
1241 |
+
if tf.reduce_any(style == tf.constant(DEMO_STYLES + ["scifi_demo", "android_control", "demo"])):
|
1242 |
+
prefix = ""
|
1243 |
+
else:
|
1244 |
+
prefix = tf.strings.join([style, ":"])
|
1245 |
+
|
1246 |
+
elif self.system_prompt in ["long_caption_length_hint", "style_long_caption_length_hint"]:
|
1247 |
+
if seed is not None:
|
1248 |
+
raise NotImplementedError("Determinism")
|
1249 |
+
std = 25
|
1250 |
+
use_hint = tf.logical_or(
|
1251 |
+
tf.equal(style, "long_caption"), tf.equal(style, "transcript"))
|
1252 |
+
if self.system_prompt == "style_long_caption_length_hint":
|
1253 |
+
default = tf.strings.join([style, ": "])
|
1254 |
+
else:
|
1255 |
+
default = ""
|
1256 |
+
if for_inference:
|
1257 |
+
assert len(tf.shape(use_hint)) == 0
|
1258 |
+
if self.default_inference_len and use_hint:
|
1259 |
+
prefix = tf.strings.join([style, " ", str(self.default_inference_len), ": "])
|
1260 |
+
else:
|
1261 |
+
prefix = default
|
1262 |
+
else:
|
1263 |
+
std = 25
|
1264 |
+
n = tf.strings.length(messages[-1])
|
1265 |
+
n += tf.cast(tf.random.normal(n.shape)*std, tf.int32)
|
1266 |
+
hint = tf.strings.join([style, " ", tf.strings.as_string(n//15), ": "])
|
1267 |
+
use_hint = tf.logical_and(use_hint, tf.random.uniform(tf.shape(hint)) > 0.1)
|
1268 |
+
prefix = tf.where(use_hint, hint, default)
|
1269 |
+
|
1270 |
+
elif for_inference and self.system_prompt in ["style_and_length", "style_and_length_v2"]:
|
1271 |
+
v2 = self.system_prompt == "style_and_length_v2"
|
1272 |
+
if example.get("length_cond") is not None:
|
1273 |
+
# Examples have individual length conditioning
|
1274 |
+
n = tf.strings.as_string(example["length_cond"])
|
1275 |
+
else:
|
1276 |
+
inference_len = self.default_inference_len
|
1277 |
+
n = None if inference_len is None else str(inference_len)
|
1278 |
+
logging.warning(f"eval len: {n}")
|
1279 |
+
if n is not None and tf.strings.length(n) > 0: # allow empty string to signal unconditioned
|
1280 |
+
prefix = tf.strings.join([style, " ", n, ":"])
|
1281 |
+
else:
|
1282 |
+
prefix = tf.strings.join([style, ":" if v2 else " :"])
|
1283 |
+
elif self.system_prompt in ["style_and_length", "style_and_length_v2"]:
|
1284 |
+
v2 = self.system_prompt == "style_and_length_v2"
|
1285 |
+
std = 25
|
1286 |
+
logging.info(f"style prompt std={std}, percent=10")
|
1287 |
+
if seed is not None:
|
1288 |
+
seeds = tf.random.split(seed)
|
1289 |
+
p = tf.random.stateless_uniform((), seed=seeds[0])
|
1290 |
+
else:
|
1291 |
+
p = tf.random.uniform(())
|
1292 |
+
if p > 0.10:
|
1293 |
+
n = tf.strings.length(messages[-1])
|
1294 |
+
if seed is not None:
|
1295 |
+
n += tf.cast(tf.random.stateless_normal(n.shape, seed=seeds[1])*std, tf.int32)
|
1296 |
+
else:
|
1297 |
+
n += tf.cast(tf.random.normal(n.shape)*std, tf.int32)
|
1298 |
+
n = tf.strings.as_string(n//15)
|
1299 |
+
prefix = tf.strings.join([style, " ", n, ":"])
|
1300 |
+
else:
|
1301 |
+
prefix = tf.strings.join([style, ":" if v2 else " :"])
|
1302 |
+
else:
|
1303 |
+
raise NotImplementedError(self.system_prompt)
|
1304 |
+
|
1305 |
+
return prefix
|
1306 |
+
|
1307 |
+
def preprend_system_prompt(self, style, example, for_inference, messages, seed=None):
|
1308 |
+
prefix = self.get_system_prompt(style, example, for_inference, messages, seed=seed)
|
1309 |
+
separator = tf.where(tf.logical_and(
|
1310 |
+
tf.strings.length(prefix) > 0, tf.strings.length(messages[0]) > 0), " ", "")
|
1311 |
+
with_system_prompt = tf.strings.join([prefix, separator, messages[0]])
|
1312 |
+
if isinstance(messages, list):
|
1313 |
+
messages = [with_system_prompt] + messages[1:]
|
1314 |
+
else:
|
1315 |
+
messages = tf.concat([tf.expand_dims(with_system_prompt, 0), messages[1:]], axis=0)
|
1316 |
+
return messages
|
1317 |
+
|
1318 |
+
def get_messages(self, ex, style, is_training, for_inference, user_prompt_seed, system_prompt_seed):
|
1319 |
+
if isinstance(ex, list):
|
1320 |
+
messages = ex
|
1321 |
+
elif isinstance(ex, str):
|
1322 |
+
messages = [ex]
|
1323 |
+
elif "messages" in ex:
|
1324 |
+
messages = ex["messages"]
|
1325 |
+
else:
|
1326 |
+
# Apply a prompt template
|
1327 |
+
messages = self.get_user_prompt(style, ex, is_training, for_inference=for_inference, seed=user_prompt_seed)
|
1328 |
+
|
1329 |
+
# Maybe add a system prompt. The system prompt gets concatenated with the first user input
|
1330 |
+
if self.system_prompt and self.system_prompt != "none":
|
1331 |
+
if isinstance(ex, dict):
|
1332 |
+
style = ex.get("style", style)
|
1333 |
+
|
1334 |
+
if isinstance(messages, tf.RaggedTensor):
|
1335 |
+
n = tf.shape(messages)[0]
|
1336 |
+
message_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=(None,))
|
1337 |
+
seeds = tf.random.split(system_prompt_seed, n)
|
1338 |
+
for i in range(n):
|
1339 |
+
message_arr = message_arr.write(i, self.preprend_system_prompt(style, None, for_inference, messages[i], seed=seeds[i]))
|
1340 |
+
messages = tf.RaggedTensor.from_row_splits(
|
1341 |
+
values=message_arr.concat(), row_splits=messages.row_splits)
|
1342 |
+
else:
|
1343 |
+
messages = self.preprend_system_prompt(style, ex, for_inference, messages, seed=system_prompt_seed)
|
1344 |
+
|
1345 |
+
return messages
|
1346 |
+
|
1347 |
+
def get_preprocessor(self, is_training, for_inference, style=None, include_metadata=None):
|
1348 |
+
"""Build a preprocessing function that can be applied ot a tf.data.Dataset"""
|
1349 |
+
vocab = self.tokenizer
|
1350 |
+
include_response = not for_inference
|
1351 |
+
if include_metadata is None:
|
1352 |
+
include_metadata = for_inference
|
1353 |
+
|
1354 |
+
@seqio.map_over_dataset(num_seeds=2)
|
1355 |
+
def to_inputs_and_targets(ex, seeds):
|
1356 |
+
if "unconditioned" in ex:
|
1357 |
+
raise NotImplementedError()
|
1358 |
+
if "image" not in ex:
|
1359 |
+
image = None
|
1360 |
+
elif ex['image'].dtype == tf.string:
|
1361 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1362 |
+
else:
|
1363 |
+
image = ex['image']
|
1364 |
+
raw_image = image
|
1365 |
+
if image is not None and len(tf.shape(image)) == 3:
|
1366 |
+
image = tf.expand_dims(image, axis=0)
|
1367 |
+
|
1368 |
+
unconditioned = self.unconditioned
|
1369 |
+
if unconditioned and isinstance(unconditioned, float):
|
1370 |
+
assert image is not None
|
1371 |
+
if is_training and tf.random.uniform((), 0, 1, dtype=tf.float32) < unconditioned:
|
1372 |
+
image = image[:0]
|
1373 |
+
elif unconditioned:
|
1374 |
+
image = None
|
1375 |
+
|
1376 |
+
messages = self.get_messages(ex, style, is_training, for_inference, seeds[0], seeds[1])
|
1377 |
+
targets, loss_masks, subsegments = self.get_tokens_input(
|
1378 |
+
messages, for_inference, ex.get("text_weights"))
|
1379 |
+
# if "scifi" in style and style.endswith("_explanation"):
|
1380 |
+
# logging.warning(f"No loss on EOS for {style}")
|
1381 |
+
# loss_masks = tf.where(targets == self.tokenizer.eos_token_id, tf.zeros_like(loss_masks), loss_masks)
|
1382 |
+
out = self.build_multimodel_features(targets, loss_masks, subsegments, image, is_training)
|
1383 |
+
|
1384 |
+
if include_metadata:
|
1385 |
+
# FIXME remove these special cases
|
1386 |
+
if "text" in ex:
|
1387 |
+
if len(ex["text"].shape) > 0:
|
1388 |
+
# FIXME can this be variable lengths after all?
|
1389 |
+
out["metadata/captions"] = tf.strings.reduce_join(
|
1390 |
+
tf.strings.regex_replace(ex['text'], "\\s+", " "),
|
1391 |
+
separator="\n"
|
1392 |
+
)
|
1393 |
+
else:
|
1394 |
+
out["metadata/captions"] = ex["text"]
|
1395 |
+
|
1396 |
+
if "image_url" in ex:
|
1397 |
+
out["metadata/image_url"] = ex["image_url"]
|
1398 |
+
elif "url" in ex:
|
1399 |
+
out["metadata/image_url"] = ex["url"]
|
1400 |
+
if "image_id" in ex:
|
1401 |
+
out["metadata/image_id"] = ex["image_id"]
|
1402 |
+
for k, v in ex.items():
|
1403 |
+
if k.startswith("metadata"):
|
1404 |
+
out[k] = v
|
1405 |
+
if raw_image is not None and "metadata/image_size" not in out:
|
1406 |
+
img_h = tf.shape(raw_image)[0]
|
1407 |
+
img_w = tf.shape(raw_image)[1]
|
1408 |
+
out["metadata/image_size"] = [img_w, img_h]
|
1409 |
+
if "metadata/image_url" not in out and raw_image is not None:
|
1410 |
+
if len(ex["image"].shape) < 4:
|
1411 |
+
# For visualizations FIXME can we make this variable length
|
1412 |
+
out["metadata/image"] = tf.io.encode_jpeg(
|
1413 |
+
tf.image.convert_image_dtype(raw_image, tf.uint8))
|
1414 |
+
return out
|
1415 |
+
return to_inputs_and_targets
|
1416 |
+
|
1417 |
+
def get_post_mixing_preprocessor(self, pack=False):
|
1418 |
+
"""Build a feature conversion function that can be applied ot a tf.data.Dataset
|
1419 |
+
|
1420 |
+
This function applies a second stage of pre-processing, but unlike `self.get_preprocessor`
|
1421 |
+
this stage can be applied after mixing tf.data.Datasets into a mixture
|
1422 |
+
"""
|
1423 |
+
return MultiModalLMFeatureConverter(
|
1424 |
+
loss_token_weighting=self.loss_token_weighting,
|
1425 |
+
bos_id=self.tokenizer.bos_token_id,
|
1426 |
+
fix_image_input_idx=self.fix_image_input_idx,
|
1427 |
+
pack=pack,
|
1428 |
+
special_tokens=list(self.special_token_ids.values()),
|
1429 |
+
)
|
1430 |
+
|
1431 |
+
|
1432 |
+
class MultiModalLMFeatureConverter:
|
1433 |
+
|
1434 |
+
def __init__(
|
1435 |
+
self, pack: bool = False, loss_token_weighting: str=None, bos_id: int = 1,
|
1436 |
+
special_tokens=None, fix_image_input_idx=2
|
1437 |
+
):
|
1438 |
+
self.pack = pack
|
1439 |
+
self.bos_id = bos_id
|
1440 |
+
self.fix_image_input_idx = fix_image_input_idx
|
1441 |
+
self.special_tokens = tf.constant(special_tokens) if special_tokens else None
|
1442 |
+
self.loss_token_weighting = loss_token_weighting
|
1443 |
+
|
1444 |
+
def _convert_example(
|
1445 |
+
self, features: Mapping[str, tf.Tensor]
|
1446 |
+
) -> Mapping[str, tf.Tensor]:
|
1447 |
+
"""Convert an LM example into an example with model features."""
|
1448 |
+
# targets_segment_id is present only for a packed dataset.
|
1449 |
+
decoder_input_tokens = make_autoregressive_inputs(
|
1450 |
+
features["target_tokens"],
|
1451 |
+
sequence_id=features.get("targets_segment_ids", None),
|
1452 |
+
bos_id=self.bos_id,
|
1453 |
+
)
|
1454 |
+
|
1455 |
+
tf.assert_equal(
|
1456 |
+
True,
|
1457 |
+
tf.reduce_all(decoder_input_tokens[-1] != self.special_tokens),
|
1458 |
+
message="An input ends with an image special token",
|
1459 |
+
)
|
1460 |
+
|
1461 |
+
image_input_idx = features["image_input_idx"]
|
1462 |
+
if self.fix_image_input_idx == 2:
|
1463 |
+
# plus one since we have added BOS to the inputs
|
1464 |
+
image_input_idx = tf.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)
|
1465 |
+
else:
|
1466 |
+
# Some old models were trained like this; sometimes image_input_idx will go from -1 -> 0, which didn't
|
1467 |
+
# affect performance, but keep this code path for backwards compatibility with those checkpoints
|
1468 |
+
image_input_idx = image_input_idx + 1
|
1469 |
+
|
1470 |
+
d = {
|
1471 |
+
"target_tokens": features["target_tokens"],
|
1472 |
+
"input_tokens": decoder_input_tokens,
|
1473 |
+
"loss_masks": features["loss_masks"],
|
1474 |
+
"images": features["images"],
|
1475 |
+
"image_input_idx": image_input_idx
|
1476 |
+
}
|
1477 |
+
if "image_masks" in features:
|
1478 |
+
d["image_masks"] = features["image_masks"]
|
1479 |
+
|
1480 |
+
has_custom_text_weight = features.get("has_custom_loss_weight", False)
|
1481 |
+
|
1482 |
+
if "subsegment_ids" in features:
|
1483 |
+
subsegment_ids = make_autoregressive_inputs(
|
1484 |
+
features["subsegment_ids"],
|
1485 |
+
sequence_id=features.get("targets_segment_ids", None),
|
1486 |
+
bos_id=features["subsegment_ids"][0],
|
1487 |
+
)
|
1488 |
+
|
1489 |
+
# Subsegments get position ids based on the number of previous positions they can attend to
|
1490 |
+
position_ids = tf.zeros_like(subsegment_ids)
|
1491 |
+
unique_segments = tf.unique(subsegment_ids)[0]
|
1492 |
+
for i in unique_segments:
|
1493 |
+
segment_position_ids = tf.cumsum(tf.cast(subsegment_ids >= i, tf.int32)) - 1
|
1494 |
+
position_ids = tf.where(subsegment_ids == i, segment_position_ids, position_ids)
|
1495 |
+
|
1496 |
+
# Apply loss weighting, this is done here so it occurs after truncation
|
1497 |
+
if has_custom_text_weight:
|
1498 |
+
pass
|
1499 |
+
elif self.loss_token_weighting in ["subsegments", "root_subsegments"]:
|
1500 |
+
n_loss_segments = tf.shape(tf.unique(tf.boolean_mask(subsegment_ids, d["loss_masks"] > 0))[0])[0]
|
1501 |
+
n_loss_segments = tf.maximum(tf.cast(n_loss_segments, tf.float32), 1)
|
1502 |
+
weight = 1/n_loss_segments if self.loss_token_weighting == "subsegments" else tf.math.rsqrt(n_loss_segments)
|
1503 |
+
d["loss_masks"] = tf.where(d["loss_masks"] > 0, d["loss_masks"]*weight, d["loss_masks"])
|
1504 |
+
elif self.loss_token_weighting is not None:
|
1505 |
+
raise NotImplementedError(self.loss_token_weighting)
|
1506 |
+
|
1507 |
+
d["subsegment_ids"] = subsegment_ids
|
1508 |
+
d["position_ids"] = position_ids
|
1509 |
+
else:
|
1510 |
+
if self.loss_token_weighting not in [None, "subsegments", "root_subsegments"] and not has_custom_text_weight:
|
1511 |
+
raise NotImplementedError(self.loss_token_weighting)
|
1512 |
+
if self.pack:
|
1513 |
+
d["decoder_segment_ids"] = features["targets_segment_ids"]
|
1514 |
+
d["decoder_positions"] = features["targets_positions"]
|
1515 |
+
|
1516 |
+
for k in features:
|
1517 |
+
if k.startswith("metadata/"):
|
1518 |
+
d[k] = features[k]
|
1519 |
+
return d
|
1520 |
+
|
1521 |
+
def _pack_or_pad(self, ds, task_feature_lengths):
|
1522 |
+
if self.pack:
|
1523 |
+
raise NotImplementedError()
|
1524 |
+
else:
|
1525 |
+
return trim_and_pad_dataset(ds, task_feature_lengths)
|
1526 |
+
|
1527 |
+
def __call__(self, ds: tf.data.Dataset, task_feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
|
1528 |
+
"""Convert the dataset to be fed to a language model."""
|
1529 |
+
task_feature_lengths = dict(task_feature_lengths)
|
1530 |
+
|
1531 |
+
if "images" in ds.element_spec and "images" in task_feature_lengths:
|
1532 |
+
# Images should never be truncated
|
1533 |
+
ds = assert_not_truncated(ds, ["images", "image_input_idx"], task_feature_lengths["images"])
|
1534 |
+
|
1535 |
+
if any(x.startswith("metadata/") for x in ds.element_spec):
|
1536 |
+
# Metadata indicates the dataset is being used for inference, inference datasets
|
1537 |
+
# should not be truncated
|
1538 |
+
ds = assert_not_truncated(ds, ["target_tokens"], task_feature_lengths["target_tokens"])
|
1539 |
+
|
1540 |
+
if "image_masks" in ds.element_spec and "images" in task_feature_lengths:
|
1541 |
+
task_feature_lengths["image_masks"] = task_feature_lengths["images"]
|
1542 |
+
if "subsegment_ids" in ds.element_spec and "target_tokens" in task_feature_lengths:
|
1543 |
+
task_feature_lengths["subsegment_ids"] = task_feature_lengths["target_tokens"]
|
1544 |
+
if "loss_masks" not in task_feature_lengths and "target_tokens" in task_feature_lengths:
|
1545 |
+
task_feature_lengths["loss_masks"] = task_feature_lengths["target_tokens"]
|
1546 |
+
ds = self._pack_or_pad(ds, task_feature_lengths)
|
1547 |
+
|
1548 |
+
return ds.map(
|
1549 |
+
self._convert_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
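For reference, a minimal numpy sketch (not part of the diff) of how the subsegment-based loss re-weighting in `_convert_example` behaves; the helper name below is illustrative and the numpy ops stand in for the TF ones above.

import numpy as np

def reweight_loss_masks(loss_masks, subsegment_ids, mode="root_subsegments"):
    # Count the distinct subsegments that actually contribute loss tokens.
    active = np.unique(subsegment_ids[loss_masks > 0])
    n = max(len(active), 1)
    weight = 1.0 / n if mode == "subsegments" else 1.0 / np.sqrt(n)
    out = np.array(loss_masks, dtype=np.float64)
    out[out > 0] *= weight
    return out

# Two answer subsegments sharing one prompt: each loss token is scaled by 1/sqrt(2).
print(reweight_loss_masks(np.array([0., 1., 1., 1., 1.]), np.array([0, 1, 1, 2, 2])))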
preprocesssors.py
ADDED
@@ -0,0 +1,2472 @@
1 |
+
import hashlib
|
2 |
+
import json
|
3 |
+
import math
|
4 |
+
from functools import reduce
|
5 |
+
from typing import Mapping, Optional, Sequence
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import tensorflow as tf
|
9 |
+
import seqio
|
10 |
+
import gin
|
11 |
+
|
12 |
+
from .data_utils import flatten_parts, stateless_permutation, stateless_shuffle
|
13 |
+
from .. import config
|
14 |
+
|
15 |
+
|
16 |
+
def get_from_dict(data, keys):
|
17 |
+
"""Iterate nested dictionary"""
|
18 |
+
return reduce(dict.get, keys, data)
|
19 |
+
|
20 |
+
def get_blank_image():
|
21 |
+
image = tf.zeros([224, 224, 3], dtype=tf.uint8)
|
22 |
+
image = tf.expand_dims(image, 0)[:1]
|
23 |
+
return image
|
24 |
+
|
25 |
+
|
26 |
+
@seqio.utils.map_over_dataset
|
27 |
+
def rekey(x, key_map=None):
|
28 |
+
"""Replace the feature keys according to the mapping in `key_map`.
|
29 |
+
For example, if the dataset returns examples of the format:
|
30 |
+
{'foo': 'something', 'bar': 'something else'}
|
31 |
+
and key_map = {'boo': 'foo', 'spar': 'bar'} then this function will return
|
32 |
+
examples with the format
|
33 |
+
{'boo': 'something', 'spar': 'something else'}
|
34 |
+
If a mapping is to an empty key or None, set the new key to an empty string.
|
35 |
+
Args:
|
36 |
+
x: an example to process.
|
37 |
+
key_map: dictionary mapping new keys to original keys
|
38 |
+
Returns:
|
39 |
+
A preprocessed example with the format listed above.
|
40 |
+
"""
|
41 |
+
if key_map:
|
42 |
+
out = {}
|
43 |
+
for new_key, old_key in key_map.items():
|
44 |
+
if isinstance(old_key, list):
|
45 |
+
out[new_key] = get_from_dict(x, old_key)
|
46 |
+
else:
|
47 |
+
out[new_key] = x[old_key]
|
48 |
+
return out
|
49 |
+
return x
|
50 |
+
|
51 |
+
|
52 |
+
def rename(**kwargs):
|
53 |
+
@seqio.map_over_dataset
|
54 |
+
def _fn(x):
|
55 |
+
updates = {}
|
56 |
+
for new_key, old_key in kwargs.items():
|
57 |
+
if isinstance(old_key, list):
|
58 |
+
val = x[old_key[0]]
|
59 |
+
for k in old_key[1:-1]:
|
60 |
+
val = val[k]
|
61 |
+
updates[new_key] = val.pop(old_key[-1])
|
62 |
+
else:
|
63 |
+
updates[new_key] = x.pop(old_key)
|
64 |
+
x.update(updates)
|
65 |
+
return x
|
66 |
+
return _fn
|
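Illustrative usage of `rename` above (a sketch, run eagerly; the dataset literal and key names below are made up):

import tensorflow as tf

ds = tf.data.Dataset.from_tensors({"img": tf.zeros([2, 2, 3]), "cap": tf.constant("a photo")})
ds = rename(image="img", text="cap")(ds)
print(list(next(iter(ds)).keys()))  # old keys are popped, leaving 'image' and 'text'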
67 |
+
|
68 |
+
|
69 |
+
def extract_transcripts(ds):
|
70 |
+
ds = flatten_parts(ds, ["transcripts"])
|
71 |
+
def _map(ex):
|
72 |
+
return dict(
|
73 |
+
image=ex["image"],
|
74 |
+
text=ex["transcripts"],
|
75 |
+
url=ex["url"]
|
76 |
+
)
|
77 |
+
return ds.map(_map)
|
78 |
+
|
79 |
+
|
80 |
+
@seqio.map_over_dataset
|
81 |
+
def extract_caption_and_all_transcripts(ex):
|
82 |
+
transcripts = tf.random.shuffle(ex["transcripts"])[:3]
|
83 |
+
weight = 1.0 / tf.cast(tf.shape(transcripts)[0], tf.float32)
|
84 |
+
return dict(
|
85 |
+
image=ex["image"],
|
86 |
+
text=tf.concat([tf.expand_dims(ex["caption"], 0), transcripts], 0),
|
87 |
+
url=ex["url"],
|
88 |
+
text_weights=tf.pad(
|
89 |
+
tf.ones((1,), dtype=tf.float32), [[0, tf.shape(transcripts)[0]]],
|
90 |
+
constant_values=weight),
|
91 |
+
)
|
92 |
+
|
93 |
+
|
94 |
+
@seqio.map_over_dataset
|
95 |
+
def extract_all_transcripts(ex):
|
96 |
+
transcripts = tf.random.shuffle(ex["transcripts"])[:3]
|
97 |
+
weight = 3.0 / tf.cast(tf.shape(transcripts)[0], tf.float32)
|
98 |
+
return dict(
|
99 |
+
image=ex["image"],
|
100 |
+
text=transcripts,
|
101 |
+
url=ex["url"],
|
102 |
+
text_weights=tf.fill((tf.shape(transcripts)[0],), weight),
|
103 |
+
)
|
104 |
+
|
105 |
+
|
106 |
+
@seqio.map_over_dataset
|
107 |
+
def extract_transcript(ex):
|
108 |
+
transcripts = tf.random.shuffle(ex["transcripts"])
|
109 |
+
return dict(
|
110 |
+
image=ex["image"],
|
111 |
+
text=transcripts[0],
|
112 |
+
url=ex["url"],
|
113 |
+
)
|
114 |
+
|
115 |
+
|
116 |
+
@seqio.map_over_dataset
|
117 |
+
def extract_caption(ex):
|
118 |
+
caption = ex["caption"]
|
119 |
+
if len(caption.shape) > 0:
|
120 |
+
ex["text"] = caption[0]
|
121 |
+
else:
|
122 |
+
ex["text"] = caption
|
123 |
+
return ex
|
124 |
+
|
125 |
+
|
126 |
+
@seqio.map_over_dataset
|
127 |
+
def extract_joint_captions(ex):
|
128 |
+
caption = ex["caption"]
|
129 |
+
if len(caption.shape) > 0:
|
130 |
+
caption = caption[0]
|
131 |
+
_ix = tf.random.uniform((), 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)
|
132 |
+
_ix = _ix % tf.shape(ex["transcripts"])[0]
|
133 |
+
return dict(
|
134 |
+
image=ex["image"],
|
135 |
+
text=tf.stack([caption, ex["mistral_caption"], ex["transcripts"][_ix]], 0),
|
136 |
+
url=ex["url"]
|
137 |
+
)
|
138 |
+
|
139 |
+
|
140 |
+
@seqio.map_over_dataset(num_seeds=1)
|
141 |
+
def extract_caption_and_transcript(ex, seed):
|
142 |
+
caption = ex["caption"]
|
143 |
+
if len(caption.shape) > 0:
|
144 |
+
caption = caption[0]
|
145 |
+
_ix = tf.random.stateless_uniform((), seed, 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)
|
146 |
+
return dict(
|
147 |
+
image=ex["image"],
|
148 |
+
text=tf.stack([caption, ex["transcripts"][_ix]], 0),
|
149 |
+
url=ex["url"]
|
150 |
+
)
|
151 |
+
|
152 |
+
|
153 |
+
@seqio.map_over_dataset
|
154 |
+
def caption_transcript_augmented(ex, sequence_length):
|
155 |
+
caption = ex["caption"]
|
156 |
+
if len(caption.shape) > 0:
|
157 |
+
caption = caption[0]
|
158 |
+
image = ex["image"]
|
159 |
+
properties = []
|
160 |
+
|
161 |
+
do_augmentation = sequence_length["is_training"]
|
162 |
+
# do_augmentation = False
|
163 |
+
|
164 |
+
# Keep this off, it screws up OCR
|
165 |
+
# do_hflip = (tf.random.uniform(()) > 0.2 and do_augmentation)
|
166 |
+
do_hflip = False
|
167 |
+
if do_hflip:
|
168 |
+
image = image[:, ::-1]
|
169 |
+
|
170 |
+
# Mild color jitter
|
171 |
+
do_color = (tf.random.uniform(()) > 0.5 and do_augmentation)
|
172 |
+
if do_color:
|
173 |
+
image = tf.image.random_hue(image, max_delta=0.05)
|
174 |
+
image = tf.image.random_brightness(image, max_delta=0.2)
|
175 |
+
image = tf.image.random_saturation(image, 0.7, 1.3)
|
176 |
+
image = tf.image.random_contrast(image, 0.7, 1.3)
|
177 |
+
|
178 |
+
# Mild affine transformation
|
179 |
+
do_affine = (tf.random.uniform(()) > 0.5 and do_augmentation)
|
180 |
+
if do_affine and do_augmentation:
|
181 |
+
shift_x = tf.random.uniform((), -10, 10) * 0
|
182 |
+
shift_y = tf.random.uniform((), -10, 10) * 0
|
183 |
+
shear_x = tf.random.uniform((), -2, 2)
|
184 |
+
shear_y = tf.random.uniform((), -2, 2)
|
185 |
+
rotation = tf.random.uniform((), -6, 6)
|
186 |
+
max_scale = 1.1
|
187 |
+
scale = tf.random.uniform((), 0.8, max_scale)
|
188 |
+
center = tf.cast(tf.shape(image), tf.float32)/2
|
189 |
+
|
190 |
+
image = tf.keras.ops.image.affine_transform(
|
191 |
+
image,
|
192 |
+
tf.stack(get_affine_matrix(
|
193 |
+
[center[0], center[1]],
|
194 |
+
rotation,
|
195 |
+
[shift_x, shift_y],
|
196 |
+
1/scale,
|
197 |
+
[shear_x, shear_y]
|
198 |
+
) + [0., 0.]),
|
199 |
+
interpolation='bilinear',
|
200 |
+
fill_mode='constant',
|
201 |
+
fill_value=1.,
|
202 |
+
data_format='channels_last'
|
203 |
+
)
|
204 |
+
|
205 |
+
properties = tf.stack([
|
206 |
+
("[hflip]" if do_hflip else ""),
|
207 |
+
("[color]" if do_color else ""),
|
208 |
+
("[affine]" if do_affine else "")
|
209 |
+
])
|
210 |
+
properties = tf.boolean_mask(properties, tf.strings.length(properties) > 0)
|
211 |
+
prompt = tf.strings.reduce_join(properties, separator=" ")
|
212 |
+
ix = tf.random.uniform((), 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)
|
213 |
+
out = dict(
|
214 |
+
image=image,
|
215 |
+
text=tf.stack([caption, ex["transcripts"][ix]], 0),
|
216 |
+
url=ex["url"],
|
217 |
+
prompt=prompt,
|
218 |
+
)
|
219 |
+
# out["metadata/unaugmented_image"] = image
|
220 |
+
return out
|
221 |
+
|
222 |
+
|
223 |
+
def extract_caption_and_transcript_hflip(ds):
|
224 |
+
|
225 |
+
# Just in case they are ordered somehow in Matt's data
|
226 |
+
@seqio.map_over_dataset
|
227 |
+
def _shuffle_transcripts(_ex):
|
228 |
+
_ex["transcripts"] = tf.random.shuffle(_ex["transcripts"])
|
229 |
+
_ex["hflip"] = tf.random.uniform((), 0, 3, dtype=tf.int32)
|
230 |
+
return _ex
|
231 |
+
|
232 |
+
ds = _shuffle_transcripts(ds)
|
233 |
+
|
234 |
+
# Build a 3x long dataset with each individual transcript so we iterate through
|
235 |
+
# each transcript
|
236 |
+
@seqio.map_over_dataset
|
237 |
+
def _with_transcript(ex, _ix):
|
238 |
+
caption = ex["caption"]
|
239 |
+
if len(caption.shape) > 0:
|
240 |
+
caption = caption[0]
|
241 |
+
hflip = ex["hflip"] == _ix
|
242 |
+
if hflip:
|
243 |
+
ex["image"] = ex["image"][:, ::-1]
|
244 |
+
style = ["long_caption_flipped", "transcript_flipped"]
|
245 |
+
else:
|
246 |
+
style = ["long_caption", "transcript"]
|
247 |
+
return dict(
|
248 |
+
image=ex["image"],
|
249 |
+
text=tf.stack([caption, ex["transcripts"][_ix]], 0),
|
250 |
+
url=ex["url"],
|
251 |
+
style=style
|
252 |
+
)
|
253 |
+
|
254 |
+
joint_ds = _with_transcript(ds, 0)
|
255 |
+
for i in range(1, 3):
|
256 |
+
joint_ds = joint_ds.concatenate(_with_transcript(ds, i))
|
257 |
+
|
258 |
+
return joint_ds
|
259 |
+
|
260 |
+
|
261 |
+
@seqio.map_over_dataset
|
262 |
+
def extract_llava(ex, sequence_length, output_features):
|
263 |
+
tf.assert_equal(tf.shape(ex['conversations']['value'])[0], 2)
|
264 |
+
prompt = ex['conversations']['value'][0]
|
265 |
+
text = ex['conversations']['value'][1]
|
266 |
+
ex.pop('conversations')
|
267 |
+
ex["text"] = text
|
268 |
+
ex["prompt"] = prompt
|
269 |
+
return ex
|
270 |
+
|
271 |
+
|
272 |
+
def extract_localized_narrative(ds):
|
273 |
+
ds = ds.filter(lambda ex: tf.shape(ex["cap/cap_caption"])[0] > 0)
|
274 |
+
def _map(ex):
|
275 |
+
return dict(
|
276 |
+
image=ex["image"],
|
277 |
+
text=tf.strings.reduce_join(ex["cap/cap_caption"], separator="\n")
|
278 |
+
)
|
279 |
+
return ds.map(_map)
|
280 |
+
|
281 |
+
|
282 |
+
def float_to_text(val):
|
283 |
+
return tf.strings.as_string(tf.cast(val * 100, tf.int32))
|
284 |
+
|
285 |
+
|
286 |
+
@seqio.map_over_dataset
|
287 |
+
def extract_vqa(ex):
|
288 |
+
questions = ex["vqa"]["questions"]
|
289 |
+
answers = ex["vqa"]["answers"]
|
290 |
+
answers = tf.strings.reduce_join(answers, 1, separator="; ")
|
291 |
+
qas = tf.strings.reduce_join(tf.stack([questions, answers], 1), separator=" ")
|
292 |
+
return dict(
|
293 |
+
image=ex["image"],
|
294 |
+
text=tf.strings.reduce_join(qas, separator="\n")
|
295 |
+
)
|
296 |
+
|
297 |
+
|
298 |
+
@seqio.map_over_dataset
|
299 |
+
def coco_image_id_from_path(ex):
|
300 |
+
image_id = tf.strings.substr(ex["image/filename"], 0, tf.strings.length(ex["image/filename"])-4)
|
301 |
+
ex["image_id"] = tf.strings.to_number(image_id)
|
302 |
+
return ex
|
303 |
+
|
304 |
+
|
305 |
+
@seqio.map_over_dataset
|
306 |
+
def add_coco_url(ex):
|
307 |
+
"""Turns a COCO path into a URL, which can then be used in visualizations"""
|
308 |
+
path = ex["image/filename"]
|
309 |
+
if not tf.strings.regex_full_match(path, ".*/.*"):
|
310 |
+
prefix = tf.strings.regex_replace(path, "COCO_", "")
|
311 |
+
prefix = tf.strings.regex_replace(prefix, "_[0-9]+.jpg", "")
|
312 |
+
path = tf.strings.join([prefix, path], separator="/")
|
313 |
+
|
314 |
+
# images are hosted by the COCO website here
|
315 |
+
url = tf.strings.join(["https://s3.us-east-1.amazonaws.com/images.cocodataset.org/", path])
|
316 |
+
ex["metadata/image_url"] = url
|
317 |
+
return ex
|
318 |
+
|
319 |
+
|
320 |
+
def flatten_vqa(ds):
|
321 |
+
parts = ["questions", "answers"]
|
322 |
+
for k in ["id", "question_id"]:
|
323 |
+
if k in ds.element_spec:
|
324 |
+
parts.append(k)
|
325 |
+
return flatten_parts(ds, parts)
|
326 |
+
|
327 |
+
|
328 |
+
def format_gqa(ds, is_balanced=True, flatten=True):
|
329 |
+
if is_balanced:
|
330 |
+
ds = ds.filter(lambda x: tf.reduce_any(x["questions"]["is_balanced"]))
|
331 |
+
def _filter_qs(ex):
|
332 |
+
qs = ex["questions"]
|
333 |
+
mask = qs["is_balanced"]
|
334 |
+
qs = {k: tf.boolean_mask(v, mask) for k, v in qs.items()}
|
335 |
+
ex["questions"] = qs
|
336 |
+
return ex
|
337 |
+
ds = ds.map(_filter_qs)
|
338 |
+
|
339 |
+
if flatten:
|
340 |
+
ds = flatten_parts(ds, ["questions"])
|
341 |
+
|
342 |
+
def _rename(ex):
|
343 |
+
out = ex["questions"]
|
344 |
+
out["image"] = ex["image"]
|
345 |
+
out["image_id"] = ex["image_id"]
|
346 |
+
return out
|
347 |
+
return ds.map(_rename)
|
348 |
+
|
349 |
+
|
350 |
+
@seqio.map_over_dataset
|
351 |
+
def fix_doqa_url(x):
|
352 |
+
x["image_url"] = tf.strings.regex_replace(x["image_url"], "gs://", "")
|
353 |
+
return x
|
354 |
+
|
355 |
+
|
356 |
+
def _add_metadata(ex):
|
357 |
+
out = {}
|
358 |
+
if "id" in ex:
|
359 |
+
out["metadata/example_id"] = ex["id"]
|
360 |
+
elif "example_id" in ex:
|
361 |
+
out["metadata/example_id"] = ex["example_id"]
|
362 |
+
elif "question_id" in ex:
|
363 |
+
out["metadata/example_id"] = ex["question_id"]
|
364 |
+
if "image_url" in ex:
|
365 |
+
out["metadata/image_url"] = ex["image_url"]
|
366 |
+
for k, v in ex.items():
|
367 |
+
if k.startswith("metadata/"):
|
368 |
+
out[k] = v
|
369 |
+
return out
|
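A quick illustration of `_add_metadata` (the values below are made up): an id-like field is surfaced as `metadata/example_id`, the image URL is copied, and any existing `metadata/` keys pass through unchanged.

ex = {"question_id": 7, "image_url": "http://example.com/1.jpg", "metadata/split": "val"}
print(_add_metadata(ex))
# {'metadata/example_id': 7, 'metadata/image_url': 'http://example.com/1.jpg', 'metadata/split': 'val'}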
370 |
+
|
371 |
+
|
372 |
+
def image_only(ds):
|
373 |
+
return ds.filter(lambda x: x["has_image"])
|
374 |
+
|
375 |
+
|
376 |
+
def filter_difficult_direct_answer(ds):
|
377 |
+
return ds.filter(lambda x: not x["difficult_direct_answer"])
|
378 |
+
|
379 |
+
|
380 |
+
@seqio.map_over_dataset()
|
381 |
+
def format_ai2d(ex, variable_style=True):
|
382 |
+
abc = tf.constant(list("abcdefg".upper()))
|
383 |
+
out = dict(image=ex["image"])
|
384 |
+
out.update(_add_metadata(ex))
|
385 |
+
|
386 |
+
options = ex["choices"]
|
387 |
+
# >= n_options - 1 to allow for one "none of the above"-style answer
|
388 |
+
n_options = tf.shape(ex["option_is_abc"])[0]
|
389 |
+
if ex["abc_label"] and tf.reduce_sum(tf.cast(ex["option_is_abc"], tf.int32)) >= (n_options - 1):
|
390 |
+
# The image labels are always uppercase, so use uppercase in the answer options
|
391 |
+
options = tf.where(
|
392 |
+
ex["option_is_abc"],
|
393 |
+
tf.strings.upper(options),
|
394 |
+
options
|
395 |
+
)
|
396 |
+
short_options = options
|
397 |
+
style = "ai2_diagram_no_letter"
|
398 |
+
else:
|
399 |
+
short_options = abc[:tf.shape(options)[0]]
|
400 |
+
options = tf.stack([short_options, options,], 1)
|
401 |
+
options = tf.strings.reduce_join(options, axis=-1, separator=": ")
|
402 |
+
style = "ai2_diagram"
|
403 |
+
|
404 |
+
options = tf.strings.reduce_join(options, separator="\n")
|
405 |
+
out["question"] = ex["question"]
|
406 |
+
out["options"] = options
|
407 |
+
if variable_style:
|
408 |
+
out["style"] = style
|
409 |
+
if ex["answer_idx"] < 0:
|
410 |
+
out["text"] = "?"
|
411 |
+
else:
|
412 |
+
out["text"] = short_options[ex["answer_idx"]]
|
413 |
+
out["metadata/answer_idx"] = ex["answer_idx"]
|
414 |
+
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, ".*\|\|\|.*")), False)
|
415 |
+
out["metadata/option_names"] = tf.strings.reduce_join(short_options, separator="|||")
|
416 |
+
out["metadata/has_transparent_box"] = ex.get("has_transparent_box", tf.constant(False))
|
417 |
+
out["metadata/abc_label"] = ex["abc_label"]
|
418 |
+
return out
|
419 |
+
|
420 |
+
|
421 |
+
@gin.configurable()
|
422 |
+
@seqio.map_over_dataset()
|
423 |
+
def format_multiple_choice_qa(ex, option_format="abc"):
|
424 |
+
assert option_format == "abc"
|
425 |
+
abc = tf.constant(list("abcdefg".upper()))
|
426 |
+
out = dict(image=ex["image"])
|
427 |
+
out.update(_add_metadata(ex))
|
428 |
+
options = ex["choices"]
|
429 |
+
short_options = abc[:tf.shape(options)[0]]
|
430 |
+
options = tf.stack([short_options, options,], 1)
|
431 |
+
options = tf.strings.reduce_join(options, axis=-1, separator=": ")
|
432 |
+
options = tf.strings.reduce_join(options, separator="\n")
|
433 |
+
out["question"] = ex["question"]
|
434 |
+
out["options"] = options
|
435 |
+
if ex["answer_idx"] < 0:
|
436 |
+
out["text"] = "?"
|
437 |
+
else:
|
438 |
+
out["text"] = short_options[ex["answer_idx"]]
|
439 |
+
out["metadata/answer_idx"] = ex["answer_idx"]
|
440 |
+
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, ".*\|\|\|.*")), False)
|
441 |
+
out["metadata/option_names"] = tf.strings.reduce_join(short_options, separator="|||")
|
442 |
+
# out["metadata/option_names"] = tf.RaggedTensor.from_row_lengths(short_options, tf.shape(short_options))
|
443 |
+
# out["metadata/option_names"] = short_options
|
444 |
+
return out
|
445 |
+
|
446 |
+
|
447 |
+
@seqio.map_over_dataset()
|
448 |
+
def output_options(ex):
|
449 |
+
ex["metadata/options"] = ex["options"]
|
450 |
+
return ex
|
451 |
+
|
452 |
+
|
453 |
+
@seqio.map_over_dataset()
|
454 |
+
def extract_tally_qa(ex):
|
455 |
+
questions = ex.pop("questions")
|
456 |
+
ex["questions"] = questions["question"]
|
457 |
+
ex["answers"] = tf.strings.as_string(questions["answer"])
|
458 |
+
ex["question_id"] = questions["question_id"]
|
459 |
+
return ex
|
460 |
+
|
461 |
+
|
462 |
+
@seqio.map_over_dataset()
|
463 |
+
def count_bench_preprocessor(ex):
|
464 |
+
return {
|
465 |
+
"image": ex["image"],
|
466 |
+
"text": tf.strings.as_string(ex["number"]),
|
467 |
+
"object": ex["noun"],
|
468 |
+
"question": tf.strings.join([
|
469 |
+
"How many ", ex["noun"], " are there?"
|
470 |
+
]),
|
471 |
+
"metadata/count": ex["number"],
|
472 |
+
}
|
473 |
+
|
474 |
+
|
475 |
+
def filter_human(ds):
|
476 |
+
return ds.filter(lambda x: x["is_human"])
|
477 |
+
|
478 |
+
|
479 |
+
def filter_aug(ds):
|
480 |
+
return ds.filter(lambda x: not x["is_human"])
|
481 |
+
|
482 |
+
|
483 |
+
@seqio.map_over_dataset()
|
484 |
+
def reweight_chartqa(ex, human, aug):
|
485 |
+
is_human = ex["metadata/is_human"]
|
486 |
+
ex["text_weights"] = human if is_human else aug
|
487 |
+
return ex
|
488 |
+
|
489 |
+
|
490 |
+
@seqio.map_over_dataset()
|
491 |
+
def chartqa_prompting(ex):
|
492 |
+
question = tf.strings.join([ex["question"], " Answer:"])
|
493 |
+
return dict(
|
494 |
+
image=ex["image"],
|
495 |
+
question=question,
|
496 |
+
answer=ex["answer"]
|
497 |
+
)
|
498 |
+
|
499 |
+
|
500 |
+
@seqio.map_over_dataset()
|
501 |
+
def chartqa_explanation(ex):
|
502 |
+
question = tf.strings.join([ex["question"], " Explanation:"])
|
503 |
+
out = {
|
504 |
+
"image": ex["image"],
|
505 |
+
"question": question,
|
506 |
+
"answer": ex["answer"],
|
507 |
+
}
|
508 |
+
out.update({k: v for k, v in ex.items() if k.startswith("metadata/")})
|
509 |
+
return out
|
510 |
+
|
511 |
+
|
512 |
+
@seqio.map_over_dataset(num_seeds=1)
|
513 |
+
def _preprocess_scifi(ex, seed):
|
514 |
+
if "qa_pairs" in ex:
|
515 |
+
q = ex["qa_pairs"]
|
516 |
+
else:
|
517 |
+
q = ex["qa"]
|
518 |
+
ix = stateless_permutation(tf.shape(q["question"])[0], seed)
|
519 |
+
return dict(
|
520 |
+
image=ex["image"],
|
521 |
+
question=tf.gather(q["question"], ix),
|
522 |
+
explanation=tf.gather(q["explanation"], ix),
|
523 |
+
answer=tf.gather(q["answer"], ix),
|
524 |
+
)
|
525 |
+
|
526 |
+
@seqio.map_over_dataset
|
527 |
+
def scifi_explanation_only(ex):
|
528 |
+
return dict(
|
529 |
+
image=ex["image"],
|
530 |
+
question=ex["question"],
|
531 |
+
answer=ex["explanation"],
|
532 |
+
)
|
533 |
+
|
534 |
+
|
535 |
+
def filter_named_entity(ds):
|
536 |
+
@seqio.map_over_dataset
|
537 |
+
def _load_image(ex):
|
538 |
+
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
539 |
+
return ex
|
540 |
+
|
541 |
+
ds = _load_image(ds)
|
542 |
+
return ds.filter(lambda x: tf.reduce_min(tf.shape(x["image"])[:2]) >= 32)
|
543 |
+
|
544 |
+
|
545 |
+
@seqio.map_over_dataset()
|
546 |
+
def extract_named_entity(ex):
|
547 |
+
qs = ex["questions"]
|
548 |
+
return {
|
549 |
+
"image": ex["image"],
|
550 |
+
"metadata/image_url": ex["url"],
|
551 |
+
"metadata/entity": ex["entity"],
|
552 |
+
"questions": qs["question"],
|
553 |
+
"answers": qs["answer"],
|
554 |
+
}
|
555 |
+
|
556 |
+
@gin.configurable()
|
557 |
+
def extract_individual_vqa(ds, test=False, answer_mode="best"):
|
558 |
+
|
559 |
+
@seqio.map_over_dataset(num_seeds=1)
|
560 |
+
def _extract(ex, seed):
|
561 |
+
if "questions" in ex:
|
562 |
+
question = ex["questions"]
|
563 |
+
else:
|
564 |
+
question = ex["question"]
|
565 |
+
out = dict(
|
566 |
+
image=ex["image"],
|
567 |
+
question=question,
|
568 |
+
)
|
569 |
+
out.update(_add_metadata(ex))
|
570 |
+
out["metadata/question"] = question
|
571 |
+
if ex.get("answers") is not None:
|
572 |
+
out["metadata/references"] = tf.strings.reduce_join(ex["answers"], separator="\n")
|
573 |
+
elif ex.get("answer") is not None:
|
574 |
+
out["metadata/references"] = ex["answer"]
|
575 |
+
|
576 |
+
if not test:
|
577 |
+
if "answer" in ex:
|
578 |
+
answer = ex["answer"]
|
579 |
+
else:
|
580 |
+
answer = ex["answers"]
|
581 |
+
if answer.dtype in [tf.int32, tf.int64]:
|
582 |
+
answer = tf.strings.as_string(answer)
|
583 |
+
if len(answer.shape) == 1 and tf.shape(answer)[0] == 0:
|
584 |
+
answer = tf.expand_dims("", 0)
|
585 |
+
if len(answer.shape) == len(question.shape):
|
586 |
+
pass
|
587 |
+
# Handle questions with multiple answers
|
588 |
+
elif answer_mode == "random":
|
589 |
+
assert len(answer.shape) == 1
|
590 |
+
answer = answer[tf.random.stateless_uniform((), seed, 0, tf.shape(answer)[0], dtype=tf.int32)]
|
591 |
+
elif answer_mode == "best":
|
592 |
+
def _get_best(_answer):
|
593 |
+
vals, _, counts = tf.unique_with_counts(_answer)
|
594 |
+
count_thresh = tf.reduce_max(counts)
|
595 |
+
vals = tf.boolean_mask(vals, counts >= count_thresh)
|
596 |
+
return vals[tf.random.stateless_uniform((), seed, 0, tf.shape(vals)[0], dtype=tf.int32)]
|
597 |
+
if len(answer.shape) == 1:
|
598 |
+
answer = _get_best(answer)
|
599 |
+
elif isinstance(answer, tf.RaggedTensor):
|
600 |
+
n = tf.shape(answer)[0]
|
601 |
+
answer_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=())
|
602 |
+
for i in range(n):
|
603 |
+
answer_arr = answer_arr.write(i, _get_best(answer[i]))
|
604 |
+
answer = answer_arr.stack()
|
605 |
+
else:
|
606 |
+
answer = tf.map_fn(_get_best, answer)
|
607 |
+
elif answer_mode == "all_segments":
|
608 |
+
out["text"] = answer
|
609 |
+
elif answer_mode == "all_segments_weighted":
|
610 |
+
out["text"] = answer
|
611 |
+
out["text_weights"] = 1.0 / tf.cast(tf.shape(answer)[-1], tf.float32)
|
612 |
+
elif answer_mode == "all":
|
613 |
+
if len(answer.shape) == 1:
|
614 |
+
answer = stateless_shuffle(answer, seed)
|
615 |
+
answer = tf.strings.reduce_join(answer, separator="\n", axis=-1)
|
616 |
+
elif isinstance(answer, tf.RaggedTensor):
|
617 |
+
n = tf.shape(answer)[0]
|
618 |
+
answer_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=())
|
619 |
+
for i in range(n):
|
620 |
+
answer_arr = answer_arr.write(i, tf.strings.reduce_join(tf.random.shuffle(answer[i]), separator="\n", axis=-1))
|
621 |
+
answer = answer_arr.stack()
|
622 |
+
else:
|
623 |
+
answer = tf.map_fn(tf.random.shuffle, answer)
|
624 |
+
answer = tf.strings.reduce_join(answer, separator="\n", axis=-1)
|
625 |
+
else:
|
626 |
+
raise NotImplementedError()
|
627 |
+
out["text"] = answer
|
628 |
+
return out
|
629 |
+
return _extract(ds)
|
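A minimal numpy sketch of the "best" answer mode above: pick the most frequent reference answer and break ties at random (the TF code uses a stateless seed and also handles ragged batches of answers).

import numpy as np

def best_answer(answers, seed=0):
    vals, counts = np.unique(answers, return_counts=True)
    top = vals[counts == counts.max()]
    return np.random.default_rng(seed).choice(top)

print(best_answer(["2", "2", "two", "3"]))  # -> '2'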
630 |
+
|
631 |
+
|
632 |
+
@seqio.map_over_dataset()
|
633 |
+
def extract_khan_academy(ex):
|
634 |
+
return dict(
|
635 |
+
image=ex["image"],
|
636 |
+
image_url=ex["image_url"],
|
637 |
+
prompt="Answer this question",
|
638 |
+
text=ex["gptResponse"]
|
639 |
+
)
|
640 |
+
|
641 |
+
@seqio.map_over_dataset()
|
642 |
+
def extract_vaia_qa_latex_image(ex, add_short_answer=False, set_short_answer_first=False):
|
643 |
+
if ex["has_image"]:
|
644 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
645 |
+
image = tf.expand_dims(image, 0)[:1]
|
646 |
+
else:
|
647 |
+
# image = get_blank_image() # blank image
|
648 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
649 |
+
image = tf.expand_dims(image, 0)[:0]
|
650 |
+
img_h = tf.shape(image)[1]
|
651 |
+
img_w = tf.shape(image)[2]
|
652 |
+
|
653 |
+
if add_short_answer:
|
654 |
+
if set_short_answer_first:
|
655 |
+
answer = tf.strings.join(["Answer: ", ex["short_answer"], "\n\n", ex["answer"]])
|
656 |
+
else:
|
657 |
+
answer = tf.strings.join([ex["answer"], "\n\n", "Answer: ", ex["short_answer"]])
|
658 |
+
else:
|
659 |
+
answer = ex["answer"]
|
660 |
+
out = dict(
|
661 |
+
image=image, # 4-d tensor
|
662 |
+
text=answer,
|
663 |
+
prompt=tf.strings.join([ex["latex_question"], "\n"]),
|
664 |
+
)
|
665 |
+
out["metadata/images"] = image
|
666 |
+
out.update(_add_metadata(ex))
|
667 |
+
out["metadata/batch_id"] = ex["batch_id"]
|
668 |
+
out["metadata/image_size"] = [img_w, img_h]
|
669 |
+
return out
|
670 |
+
|
671 |
+
@seqio.map_over_dataset()
|
672 |
+
def extract_vqa_online(ex):
|
673 |
+
out = dict(
|
674 |
+
image=ex["image"],
|
675 |
+
prompt=tf.strings.join([ex["question"], "\n"]),
|
676 |
+
text=ex["answer"]
|
677 |
+
)
|
678 |
+
out.update(_add_metadata(ex))
|
679 |
+
out["metadata/row_id"] = ex["row_id"]
|
680 |
+
return out
|
681 |
+
|
682 |
+
|
683 |
+
@seqio.map_over_dataset()
|
684 |
+
def extract_scifi_joint(ex):
|
685 |
+
if "qa_pairs" in ex:
|
686 |
+
q = ex["qa_pairs"]
|
687 |
+
else:
|
688 |
+
q = ex["qa"]
|
689 |
+
prompts = tf.concat([["Describe this image in detail."], q["question"]], 0)
|
690 |
+
responses = tf.concat([ex["summary"][None], q["answer"]], 0)
|
691 |
+
return dict(
|
692 |
+
image=ex["image"],
|
693 |
+
prompt=prompts,
|
694 |
+
text=responses,
|
695 |
+
)
|
696 |
+
|
697 |
+
|
698 |
+
def remove_no_qa(ds):
|
699 |
+
def _filter(ex):
|
700 |
+
if "qa_pairs" in ex:
|
701 |
+
q = ex["qa_pairs"]
|
702 |
+
else:
|
703 |
+
q = ex["qa"]
|
704 |
+
return tf.shape(q["question"])[0] > 0
|
705 |
+
return ds.filter(_filter)
|
706 |
+
|
707 |
+
|
708 |
+
@seqio.map_over_dataset()
|
709 |
+
def extract_scifi_qa_exp(ex):
|
710 |
+
return dict(
|
711 |
+
image=ex["image"],
|
712 |
+
question=ex["question"], # Array of questions
|
713 |
+
answer=tf.strings.join([ex["explanation"], " Answer: ", ex["answer"]]),
|
714 |
+
)
|
715 |
+
|
716 |
+
|
717 |
+
@seqio.map_over_dataset(num_seeds=1)
|
718 |
+
def extract_scifi_qa_demo(ex, seed):
|
719 |
+
# if tf.random.stateless_uniform((), 0, 1) > 0.5:
|
720 |
+
answer = tf.strings.join([ex["explanation"], " Answer: ", ex["answer"]])
|
721 |
+
# else:
|
722 |
+
# answer = ex["explanation"]
|
723 |
+
return dict(
|
724 |
+
image=ex["image"],
|
725 |
+
question=ex["question"], # Array of questions
|
726 |
+
answer=answer,
|
727 |
+
)
|
728 |
+
|
729 |
+
|
730 |
+
@seqio.map_over_dataset()
|
731 |
+
def clock_bench_preprocessor(ex):
|
732 |
+
out = dict(
|
733 |
+
image=ex["image"],
|
734 |
+
prompt="What time is being shown?",
|
735 |
+
)
|
736 |
+
for k in ["hour", "minute", "second", "answerable"]:
|
737 |
+
out[f"metadata/{k}"] = ex[k]
|
738 |
+
return out
|
739 |
+
|
740 |
+
|
741 |
+
def deg2rad(x):
|
742 |
+
return x*math.pi/180.0
|
743 |
+
|
744 |
+
|
745 |
+
def get_affine_matrix(center, angle, translate, scale, shear):
|
746 |
+
# From https://github.com/pytorch/vision/blob/f96c42fca53230057b16941b078a0a9eee06e20f/torchvision/transforms/functional.py#L1006
|
747 |
+
rot = deg2rad(angle)
|
748 |
+
sx = deg2rad(shear[0])
|
749 |
+
sy = deg2rad(shear[1])
|
750 |
+
|
751 |
+
cx, cy = center
|
752 |
+
tx, ty = translate
|
753 |
+
|
754 |
+
# RSS without scaling
|
755 |
+
a = tf.cos(rot - sy) / tf.cos(sy)
|
756 |
+
b = -tf.cos(rot - sy) * tf.tan(sx) / tf.cos(sy) - tf.sin(rot)
|
757 |
+
c = tf.sin(rot - sy) / tf.cos(sy)
|
758 |
+
d = -tf.sin(rot - sy) * tf.tan(sx) / tf.cos(sy) + tf.cos(rot)
|
759 |
+
|
760 |
+
matrix = [a, b, 0.0, c, d, 0.0]
|
761 |
+
matrix = [x * scale for x in matrix]
|
762 |
+
# Apply inverse of center translation: RSS * C^-1
|
763 |
+
matrix[2] += matrix[0] * (-cx) + matrix[1] * (-cy)
|
764 |
+
matrix[5] += matrix[3] * (-cx) + matrix[4] * (-cy)
|
765 |
+
# Apply translation and center : T * C * RSS * C^-1
|
766 |
+
matrix[2] += cx + tx
|
767 |
+
matrix[5] += cy + ty
|
768 |
+
return matrix
|
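As a sanity check (illustrative only, run eagerly with the helpers above): with zero rotation, shear, and translation and unit scale, the matrix reduces to the 2x3 identity regardless of the center.

m = get_affine_matrix(center=[112., 112.], angle=0., translate=[0., 0.], scale=1., shear=[0., 0.])
print([float(v) for v in m])  # [1.0, 0.0, 0.0, 0.0, 1.0, 0.0]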
769 |
+
|
770 |
+
|
771 |
+
def quantize_point(coor, max_dim, mode="percent-precision-1"):
|
772 |
+
max_dim = tf.cast(max_dim, tf.float32)
|
773 |
+
coor = tf.cast(coor, tf.float32)
|
774 |
+
x = (coor / max_dim)
|
775 |
+
if mode == "percent-precision-1":
|
776 |
+
return tf.strings.as_string(x*100, precision=1)
|
777 |
+
elif mode == "zero_to_one":
|
778 |
+
return tf.strings.as_string(x, precision=3)
|
779 |
+
elif mode == "1k":
|
780 |
+
return tf.strings.as_string(x*1000, precision=0)
|
781 |
+
else:
|
782 |
+
raise NotImplementedError(mode)
|
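Illustrative outputs of the three quantization modes above for a point at x = 37 in a 200-pixel-wide image (run eagerly):

print(quantize_point(37, 200, "percent-precision-1").numpy())  # b'18.5'
print(quantize_point(37, 200, "zero_to_one").numpy())          # b'0.185'
print(quantize_point(37, 200, "1k").numpy())                   # b'185'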
783 |
+
|
784 |
+
|
785 |
+
def construct_pointing_format(label_text, alt_text, x_str, y_str):
|
786 |
+
if alt_text is None:
|
787 |
+
alt_text = label_text
|
788 |
+
np = tf.shape(x_str)[0]
|
789 |
+
if np == 0:
|
790 |
+
output = ""
|
791 |
+
elif np == 1:
|
792 |
+
output = tf.strings.join([
|
793 |
+
'<point x="', x_str[0], '" y="', y_str[0], '" alt="',
|
794 |
+
alt_text, '">', label_text, '</point>'
|
795 |
+
])
|
796 |
+
else:
|
797 |
+
ids = tf.strings.as_string(tf.range(1, np + 1, dtype=tf.int32))
|
798 |
+
xs = tf.strings.join(["x", ids, '="', x_str, '"'])
|
799 |
+
ys = tf.strings.join(["y", ids, '="', y_str, '"'])
|
800 |
+
points = tf.strings.reduce_join(tf.reshape(tf.stack([xs, ys], 1), [-1]), separator=' ', axis=-1)
|
801 |
+
output = tf.strings.join(
|
802 |
+
["<points ", points, ' alt="', alt_text, '">', label_text, "</points>"])
|
803 |
+
return output
|
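For reference, the single- and multi-point tag formats produced above look like this (eager-mode sketch; the labels are made up):

import tensorflow as tf

one = construct_pointing_format("dog", None, tf.constant(["10.5"]), tf.constant(["20.0"]))
many = construct_pointing_format("dogs", None, tf.constant(["10.5", "30.1"]), tf.constant(["20.0", "40.2"]))
print(one.numpy())   # b'<point x="10.5" y="20.0" alt="dog">dog</point>'
print(many.numpy())  # b'<points x1="10.5" y1="20.0" x2="30.1" y2="40.2" alt="dogs">dogs</points>'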
804 |
+
|
805 |
+
|
806 |
+
def order_points(x, y, seed, point_order):
|
807 |
+
if point_order == "natural":
|
808 |
+
return x, y
|
809 |
+
|
810 |
+
if point_order == "random":
|
811 |
+
ix = stateless_permutation(tf.shape(x)[0], seed)
|
812 |
+
elif point_order == "xy":
|
813 |
+
x_float, y_float = tf.strings.to_number(x), tf.strings.to_number(y)
|
814 |
+
ix = tf.argsort(x_float*100000 + y_float)
|
815 |
+
elif point_order == "yx":
|
816 |
+
x_float, y_float = tf.strings.to_number(x), tf.strings.to_number(y)
|
817 |
+
ix = tf.argsort(y_float*100000 + x_float)
|
818 |
+
else:
|
819 |
+
raise NotImplementedError(point_order)
|
820 |
+
return tf.gather(x, ix), tf.gather(y, ix)
|
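Illustrative behaviour of the "xy" ordering above (the seed is only consumed by the "random" mode, so a dummy value is passed here):

import tensorflow as tf

x = tf.constant(["10.0", "10.0", "5.0"])
y = tf.constant(["20.0", "10.0", "30.0"])
xs, ys = order_points(x, y, seed=tf.constant([0, 0]), point_order="xy")
print(xs.numpy(), ys.numpy())  # [b'5.0' b'10.0' b'10.0'] [b'30.0' b'10.0' b'20.0']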
821 |
+
|
822 |
+
|
823 |
+
@gin.configurable()
|
824 |
+
def points_to_text(x, y, w, h, seed, label=None, alt_text=None, point_mode="percent-precision-1",
|
825 |
+
point_order="xy", point_list_mode="tag"):
|
826 |
+
"""Returns a string encoding of a list of points"""
|
827 |
+
x = quantize_point(x, w, point_mode)
|
828 |
+
y = quantize_point(y, h, point_mode)
|
829 |
+
# Order the quantized points so the order matches what was generated; this can matter
|
830 |
+
# when points have the same quantized value, e.g. (10.001, 20) (10.002, 10) should be
|
831 |
+
# represented (10, 10), (10, 20), but if we sort before quantization we get (10, 20), (10, 10)
|
832 |
+
x, y = order_points(x, y, seed, point_order)
|
833 |
+
if point_list_mode == "tag":
|
834 |
+
return construct_pointing_format(label, alt_text, x, y)
|
835 |
+
elif point_list_mode == "paren":
|
836 |
+
n = tf.shape(x)[0]
|
837 |
+
return tf.strings.reduce_join(tf.strings.join([
|
838 |
+
"(", x, ", ", y, ")"
|
839 |
+
]), separator=", ")
|
840 |
+
# if n == 0:
|
841 |
+
# output = ""
|
842 |
+
# else:
|
843 |
+
# ids = tf.strings.as_string(tf.range(1, np + 1, dtype=tf.int32))
|
844 |
+
# xs = tf.strings.join(["x", ids, '="', x_str, '"'])
|
845 |
+
# ys = tf.strings.join(["y", ids, '="', y_str, '"'])
|
846 |
+
# points = tf.strings.reduce_join(tf.reshape(tf.stack([xs, ys], 1), [-1]), separator=' ', axis=-1)
|
847 |
+
# output = tf.strings.join(
|
848 |
+
# ["<points ", points, ' alt="', alt_text, '">', label_text, "</points>"])
|
849 |
+
# return output
|
850 |
+
else:
|
851 |
+
raise NotImplementedError(point_list_mode)
|
852 |
+
|
853 |
+
|
854 |
+
def points_to_answer(x, y, w, h, seed, label, is_counting, alt_text=None):
|
855 |
+
count = tf.shape(x)[0]
|
856 |
+
if is_counting:
|
857 |
+
if count == 0:
|
858 |
+
return "There are none."
|
859 |
+
else:
|
860 |
+
point_text = points_to_text(x, y, w, h, seed, label, alt_text)
|
861 |
+
return tf.strings.join([
|
862 |
+
"Counting the ", point_text,
|
863 |
+
" shows a total of ",
|
864 |
+
tf.strings.as_string(count),
|
865 |
+
"."
|
866 |
+
])
|
867 |
+
else:
|
868 |
+
if count == 0:
|
869 |
+
return "There are none."
|
870 |
+
else:
|
871 |
+
return points_to_text(x, y, w, h, seed, label, alt_text)
|
872 |
+
|
873 |
+
|
874 |
+
@seqio.map_over_dataset(num_seeds=2)
|
875 |
+
def extract_point_qa(ex, seeds, answer_type="y_major"):
|
876 |
+
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
877 |
+
img_h = tf.shape(ex["image"])[0]
|
878 |
+
img_w = tf.shape(ex["image"])[1]
|
879 |
+
|
880 |
+
questions = ex["questions"]
|
881 |
+
question = questions["question"]
|
882 |
+
n = tf.shape(question)[0]
|
883 |
+
answers = tf.TensorArray(tf.string, size=n, element_shape=())
|
884 |
+
point_text = questions["annotations"]["point_text"]
|
885 |
+
point_seeds = tf.RaggedTensor.from_row_splits(
|
886 |
+
row_splits=point_text.row_splits,
|
887 |
+
values=tf.random.split(seeds[0], num=tf.shape(point_text.values)[0])
|
888 |
+
)
|
889 |
+
for question_ix in range(n):
|
890 |
+
anno = questions["annotations"]
|
891 |
+
answer = questions["answer_with_placeholders"][question_ix]
|
892 |
+
n_anno = tf.shape(anno["point_text"][question_ix])[0]
|
893 |
+
for anno_ix in range(n_anno):
|
894 |
+
points = anno["points"][question_ix, anno_ix]
|
895 |
+
point_text = points_to_answer(
|
896 |
+
points[:, 0], points[:, 1], 100, 100,
|
897 |
+
point_seeds[question_ix, anno_ix],
|
898 |
+
anno["point_text"][question_ix, anno_ix],
|
899 |
+
False,
|
900 |
+
alt_text=anno["alt_text"][question_ix, anno_ix],
|
901 |
+
)
|
902 |
+
answer_split = tf.strings.split(answer, sep="<|POINT|>", maxsplit=1)
|
903 |
+
answer = tf.strings.join([answer_split[0], point_text, answer_split[1]])
|
904 |
+
# Make sure all placeholders were used
|
905 |
+
tf.debugging.assert_equal(tf.shape(tf.strings.split(answer, sep="<|POINT|>"))[0], 1)
|
906 |
+
answers = answers.write(question_ix, answer)
|
907 |
+
|
908 |
+
messages = tf.stack([question, answers.stack()], axis=1)
|
909 |
+
messages = tf.reshape(messages, [-1])
|
910 |
+
conversation_ids = tf.range(tf.shape(messages)[0] // 2, dtype=tf.int32)
|
911 |
+
conversation_ids = tf.repeat(conversation_ids, 2)
|
912 |
+
out = dict(
|
913 |
+
image=ex["image"],
|
914 |
+
messages=tf.RaggedTensor.from_value_rowids(messages, conversation_ids)
|
915 |
+
)
|
916 |
+
ix = stateless_permutation(tf.shape(messages)[0], seeds[1])
|
917 |
+
messages = tf.gather(messages, ix)
|
918 |
+
out.update(_add_metadata(ex))
|
919 |
+
out["metadata/image_size"] = [img_w, img_h]
|
920 |
+
return out
|
921 |
+
|
922 |
+
|
923 |
+
def select_point(mask):
|
924 |
+
bs = tf.shape(mask)[0]
|
925 |
+
valid = tf.cast(mask, tf.float32)
|
926 |
+
h, w = tf.shape(mask)[1], tf.shape(mask)[2]
|
927 |
+
ys = tf.range(h, dtype=tf.int32)
|
928 |
+
xs = tf.range(w, dtype=tf.int32)
|
929 |
+
|
930 |
+
n = tf.reduce_sum(valid, [1, 2])
|
931 |
+
cy = tf.reduce_sum(tf.cast(ys[None, :, None], tf.float32) * valid, [1, 2]) / n # [bs]
|
932 |
+
cx = tf.reduce_sum(tf.cast(xs[None, None, :], tf.float32) * valid, [1, 2]) / n # [bs]
|
933 |
+
|
934 |
+
dist_y = tf.square(tf.range(h, dtype=tf.float32)[None, :] - cy[:, None]) # [bs, h]
|
935 |
+
dist_x = tf.square(tf.range(w, dtype=tf.float32)[None, :] - cx[:, None]) # [bs, w]
|
936 |
+
dist = dist_y[:, :, None] + dist_x[:, None, :] # [batch, h, w]
|
937 |
+
dist = dist + (1 - valid) * 1e12
|
938 |
+
min_dist = tf.argmin(tf.reshape(dist, [bs, -1]), axis=-1) # [batch]
|
939 |
+
w = tf.cast(w, min_dist.dtype)
|
940 |
+
cy = tf.cast(min_dist // w, tf.float32)
|
941 |
+
cx = tf.cast(min_dist % w, tf.float32)
|
942 |
+
return cx, cy
|
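A small eager-mode check of `select_point`: when the mask centroid falls in a hole between two blobs, the returned point snaps to the nearest pixel that is actually inside the mask.

import tensorflow as tf

mask = tf.constant([[[1, 1, 0, 0, 0, 0, 0, 1, 1]]], dtype=tf.int32)  # [batch=1, h=1, w=9]
cx, cy = select_point(mask)
print(float(cx[0]), float(cy[0]))  # 1.0 0.0 -- inside the mask, not the raw centroid at x=4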
943 |
+
|
944 |
+
|
945 |
+
@seqio.map_over_dataset
|
946 |
+
def refexp_pointing(ex):
|
947 |
+
img_h = tf.shape(ex["image"])[0]
|
948 |
+
img_w = tf.shape(ex["image"])[1]
|
949 |
+
objects = ex["objects"]
|
950 |
+
|
951 |
+
# Shuffle objects so that which object gets truncated (if the sequence gets truncated) is randomized
|
952 |
+
refexps = objects['refexp']['raw']
|
953 |
+
bbox = objects["bbox"]
|
954 |
+
mask = tf.squeeze(objects["mask"], -1)
|
955 |
+
|
956 |
+
ix = tf.range(0, tf.shape(refexps)[0], dtype=tf.int32)
|
957 |
+
ix = tf.random.shuffle(ix)
|
958 |
+
refexps = tf.gather(refexps, ix)
|
959 |
+
bbox = tf.gather(bbox, ix)
|
960 |
+
mask = tf.gather(mask, ix)
|
961 |
+
|
962 |
+
cx, cy = select_point(mask)
|
963 |
+
answers = points_to_text(img_h, img_w, cx, cy)
|
964 |
+
|
965 |
+
out = {
|
966 |
+
"image": ex["image"],
|
967 |
+
"refexp": refexps.values,
|
968 |
+
"metadata/image_size": tf.stack([img_w, img_h,]),
|
969 |
+
"text": tf.repeat(answers, refexps.row_lengths()),
|
970 |
+
}
|
971 |
+
if "image_url" in ex:
|
972 |
+
out["metadata/image_url"] = ex["image_url"]
|
973 |
+
return out
|
974 |
+
|
975 |
+
|
976 |
+
@seqio.map_over_dataset
|
977 |
+
def refexp_pointing_inf(ex):
|
978 |
+
img_h = tf.shape(ex["image"])[0]
|
979 |
+
img_w = tf.shape(ex["image"])[1]
|
980 |
+
|
981 |
+
objects = ex["objects"]
|
982 |
+
mask = tf.squeeze(objects["mask"], -1)
|
983 |
+
cx, cy = select_point(mask)
|
984 |
+
answers = points_to_text(img_h, img_w, cx, cy)
|
985 |
+
|
986 |
+
refexps = objects["refexp"]["raw"]
|
987 |
+
|
988 |
+
# We can't use `mask` directly since it is variable size, and thus it
|
989 |
+
# will break batching. Here we serialize it instead
|
990 |
+
serialized_masks = tf.map_fn(tf.io.serialize_tensor, mask, fn_output_signature=tf.string)
|
991 |
+
out = {
|
992 |
+
"image": ex["image"],
|
993 |
+
"refexp": refexps,
|
994 |
+
"metadata/bbox": objects["bbox"],
|
995 |
+
"metadata/answer": answers,
|
996 |
+
"metadata/mask": serialized_masks,
|
997 |
+
"metadata/image_size": tf.stack([img_w, img_h]),
|
998 |
+
}
|
999 |
+
out.update({k: v for k, v in ex.items() if k.startswith("metadata/")})
|
1000 |
+
return out
|
1001 |
+
|
1002 |
+
@seqio.map_over_dataset
|
1003 |
+
def extract_andriod_control_inf(ex, mode):
|
1004 |
+
if mode == "ll":
|
1005 |
+
prompt = tf.strings.join(["low_level: ", ex["metadata/ll_instruction"]])
|
1006 |
+
elif mode == "hl_ll":
|
1007 |
+
prompt = tf.strings.join([
|
1008 |
+
"high_level: ", ex["metadata/hl_instruction"],
|
1009 |
+
" low_level: ", ex["metadata/ll_instruction"]
|
1010 |
+
])
|
1011 |
+
elif mode == "hl":
|
1012 |
+
prompt = tf.strings.join(["high_level: ", ex["metadata/hl_instruction"]])
|
1013 |
+
elif mode == "hl_cot":
|
1014 |
+
prompt = tf.strings.join(["high_level_cot: ", ex["metadata/hl_instruction"]])
|
1015 |
+
else:
|
1016 |
+
raise NotImplementedError()
|
1017 |
+
|
1018 |
+
out = dict(
|
1019 |
+
image=ex["image"],
|
1020 |
+
prompt=prompt,
|
1021 |
+
text=ex["metadata/target_action"]
|
1022 |
+
)
|
1023 |
+
out.update(_add_metadata(ex))
|
1024 |
+
return out
|
1025 |
+
|
1026 |
+
@seqio.map_over_dataset
|
1027 |
+
def extract_android_control(ex):
|
1028 |
+
# Each image has four tasks:
|
1029 |
+
# low level -> action
|
1030 |
+
# high+low level -> action
|
1031 |
+
# high level -> action
|
1032 |
+
# high level -> low level + action (CoT)
|
1033 |
+
out = dict(
|
1034 |
+
image=ex["image"],
|
1035 |
+
prompt=tf.stack([
|
1036 |
+
tf.strings.join(["low_level: ", ex["metadata/ll_instruction"]]),
|
1037 |
+
tf.strings.join([
|
1038 |
+
"high_level: ", ex["metadata/hl_instruction"],
|
1039 |
+
" low_level: ", ex["metadata/ll_instruction"]
|
1040 |
+
]),
|
1041 |
+
tf.strings.join(["high_level: ", ex["metadata/hl_instruction"]]),
|
1042 |
+
tf.strings.join(["high_level_cot: ", ex["metadata/hl_instruction"]]),
|
1043 |
+
]),
|
1044 |
+
text=tf.stack([
|
1045 |
+
ex["metadata/target_action"],
|
1046 |
+
ex["metadata/target_action"],
|
1047 |
+
ex["metadata/target_action"],
|
1048 |
+
tf.strings.join(["Plan: ", ex["metadata/ll_instruction"], " Action: ", ex["metadata/target_action"]]),
|
1049 |
+
])
|
1050 |
+
)
|
1051 |
+
# Only needed if visualizing
|
1052 |
+
# ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1053 |
+
# img_h = tf.shape(ex["image"])[0]
|
1054 |
+
# img_w = tf.shape(ex["image"])[1]
|
1055 |
+
# out["metadata/image_size"] = tf.stack([img_w, img_h,])
|
1056 |
+
out.update(_add_metadata(ex))
|
1057 |
+
return out
|
1058 |
+
|
1059 |
+
|
1060 |
+
@seqio.map_over_dataset(num_seeds=1)
|
1061 |
+
def refexp(ex, seed):
|
1062 |
+
img_h = tf.shape(ex["image"])[0]
|
1063 |
+
img_w = tf.shape(ex["image"])[1]
|
1064 |
+
objects = ex["objects"]
|
1065 |
+
|
1066 |
+
# Shuffle objects so that which object gets truncated (if the sequence gets truncated) is randomized
|
1067 |
+
refexps = objects['refexp']['raw']
|
1068 |
+
bbox = objects["bbox"]
|
1069 |
+
ix = stateless_permutation(tf.shape(refexps)[0], seed)
|
1070 |
+
refexps = tf.gather(refexps, ix)
|
1071 |
+
bbox = tf.gather(bbox, ix)
|
1072 |
+
|
1073 |
+
x2 = bbox[:, 0] + bbox[:, 2]
|
1074 |
+
y2 = bbox[:, 1] + bbox[:, 3]
|
1075 |
+
with tf.control_dependencies([
|
1076 |
+
tf.debugging.assert_equal(tf.reduce_any(x2 <= tf.cast(img_w, tf.float32)), True),
|
1077 |
+
tf.debugging.assert_equal(tf.reduce_any(y2 <= tf.cast(img_h, tf.float32)), True)
|
1078 |
+
]):
|
1079 |
+
answers = points_to_text(
|
1080 |
+
img_h, img_w,
|
1081 |
+
tf.reshape(tf.stack([bbox[:, 0], x2], 1), [-1]),
|
1082 |
+
tf.reshape(tf.stack([bbox[:, 1], y2], 1), [-1]))
|
1083 |
+
answers = tf.strings.reduce_join(tf.reshape(answers, [-1, 2]), separator=" ", axis=1)
|
1084 |
+
|
1085 |
+
out = {
|
1086 |
+
"image": ex["image"],
|
1087 |
+
"refexp": refexps.values,
|
1088 |
+
"metadata/bbox": bbox,
|
1089 |
+
"metadata/image_size": tf.stack([img_w, img_h,]),
|
1090 |
+
"text": tf.repeat(answers, refexps.row_lengths()),
|
1091 |
+
}
|
1092 |
+
|
1093 |
+
if "image_url" in ex:
|
1094 |
+
out["image_url"] = ex["image_url"]
|
1095 |
+
return out
|
1096 |
+
|
1097 |
+
|
1098 |
+
@seqio.map_over_dataset
|
1099 |
+
def refexp_inf(ex):
|
1100 |
+
img_h = tf.shape(ex["image"])[0]
|
1101 |
+
img_w = tf.shape(ex["image"])[1]
|
1102 |
+
out = {
|
1103 |
+
"image": ex["image"],
|
1104 |
+
"refexp": ex["objects"]["refexp"]["raw"],
|
1105 |
+
"metadata/bbox": ex["objects"]["bbox"],
|
1106 |
+
"metadata/image_size": tf.stack([img_w, img_h,]),
|
1107 |
+
}
|
1108 |
+
out.update({k: v for k, v in ex.items() if k.startswith("metadata/")})
|
1109 |
+
return out
|
1110 |
+
|
1111 |
+
|
1112 |
+
def point_text_interleaved(*args):
|
1113 |
+
raise NotImplementedError()
|
1114 |
+
|
1115 |
+
|
1116 |
+
@seqio.map_over_dataset
|
1117 |
+
def web_pointing_preprocessor(ex):
|
1118 |
+
img_h = tf.shape(ex["image"])[0]
|
1119 |
+
img_w = tf.shape(ex["image"])[1]
|
1120 |
+
|
1121 |
+
question = point_text_interleaved(
|
1122 |
+
img_h, img_w, ex["question"], ex["question_points"]["x"], ex["question_points"]["y"])
|
1123 |
+
answer = point_text_interleaved(
|
1124 |
+
img_h, img_w, ex["answer"], ex["answer_points"]["x"], ex["answer_points"]["y"])
|
1125 |
+
answer_points = tf.stack([ex["answer_points"]["x"], ex["answer_points"]["y"]], axis=1)
|
1126 |
+
return {
|
1127 |
+
"question": question,
|
1128 |
+
"answer": answer,
|
1129 |
+
"image": ex["image"],
|
1130 |
+
"metadata/image_size": [img_w, img_h],
|
1131 |
+
"metadata/question_type": ex["question_type"],
|
1132 |
+
"metadata/answer_points": tf.io.serialize_tensor(answer_points),
|
1133 |
+
"metadata/answer": answer,
|
1134 |
+
}
|
1135 |
+
|
1136 |
+
|
1137 |
+
def filter_pointing(ds):
|
1138 |
+
return ds.filter(lambda ex: tf.shape(ex["answer_points"]["x"])[0] >= 1)
|
1139 |
+
|
1140 |
+
|
1141 |
+
def filter_qa(ds):
|
1142 |
+
return ds.filter(lambda ex: tf.shape(ex["answer_points"]["x"])[0] == 0)
|
1143 |
+
|
1144 |
+
# vaia filtering
|
1145 |
+
def filter_image_only(ds):
|
1146 |
+
return ds.filter(lambda ex: ex["has_image"])
|
1147 |
+
|
1148 |
+
def filter_mc(ds):
|
1149 |
+
return ds.filter(lambda ex: ex["is_mc"])
|
1150 |
+
|
1151 |
+
def remove_is_long(ds):
|
1152 |
+
return ds.filter(lambda ex: not ex["is_long"])
|
1153 |
+
|
1154 |
+
def remove_has_multiple_parts(ds):
|
1155 |
+
return ds.filter(lambda ex: not ex["has_multiple_parts"])
|
1156 |
+
|
1157 |
+
|
1158 |
+
def _split(ds: tf.data.Dataset, keys, n_splits=2):
|
1159 |
+
def _map(ex):
|
1160 |
+
n = tf.shape(ex[keys[0]])[0]
|
1161 |
+
if n < n_splits:
|
1162 |
+
return tf.data.Dataset.from_tensors(ex)
|
1163 |
+
else:
|
1164 |
+
# import pdb; pdb.set_trace()
|
1165 |
+
bs = n // n_splits
|
1166 |
+
remainder = n - bs*n_splits
|
1167 |
+
lens = tf.concat([
|
1168 |
+
tf.ones([remainder], dtype=tf.int32),
|
1169 |
+
tf.zeros([n_splits-remainder], dtype=tf.int32),
|
1170 |
+
], axis=0) + bs
|
1171 |
+
tf.debugging.assert_equal(tf.reduce_sum(lens), n)
|
1172 |
+
ends = tf.cumsum(lens)
|
1173 |
+
|
1174 |
+
parts = []
|
1175 |
+
for split_ix in range(n_splits):
|
1176 |
+
part_ex = dict(ex)
|
1177 |
+
e = ends[split_ix]
|
1178 |
+
s = e - lens[split_ix]
|
1179 |
+
for k in keys:
|
1180 |
+
if isinstance(k, tuple):
|
1181 |
+
assert len(k) == 2
|
1182 |
+
part_ex[k[0]][k[1]] = ex[k[0]][k[1]][s:e]
|
1183 |
+
else:
|
1184 |
+
part_ex[k] = ex[k][s:e]
|
1185 |
+
parts.append(part_ex)
|
1186 |
+
|
1187 |
+
ds = tf.data.Dataset.from_tensors(parts[0])
|
1188 |
+
for sub_ds in parts[1:]:
|
1189 |
+
sub_ds = tf.data.Dataset.from_tensors(sub_ds)
|
1190 |
+
ds = ds.concatenate(sub_ds)
|
1191 |
+
return ds
|
1192 |
+
|
1193 |
+
return ds.flat_map(_map)
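# A small worked check of the split arithmetic in `_split` above (toy numbers, not
# from the original pipeline): 7 items over 3 splits should give part lengths [3, 2, 2].
def _example_split_lengths(n=7, n_splits=3):
    bs = n // n_splits
    remainder = n - bs * n_splits
    lens = tf.concat([
        tf.ones([remainder], dtype=tf.int32),
        tf.zeros([n_splits - remainder], dtype=tf.int32),
    ], axis=0) + bs
    return lens  # -> [3 2 2], summing back to n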
|
1194 |
+
|
1195 |
+
|
1196 |
+
|
1197 |
+
def split(ds, n=2):
|
1198 |
+
# return ds
|
1199 |
+
return _split(ds, [k for k in [
|
1200 |
+
"question",
|
1201 |
+
"label",
|
1202 |
+
"text",
|
1203 |
+
"entity",
|
1204 |
+
"messages"
|
1205 |
+
] if k in ds.element_spec], n_splits=n)
|
1206 |
+
|
1207 |
+
|
1208 |
+
def split_points(ds, max_points=50):
|
1209 |
+
label = "question" if "question" in ds.element_spec else "label"
|
1210 |
+
return _split(ds, [
|
1211 |
+
"question", label, "notInImage",
|
1212 |
+
("answer_points", "x"),
|
1213 |
+
("answer_points", "y"),
|
1214 |
+
])
|
1215 |
+
|
1216 |
+
|
1217 |
+
@seqio.map_over_dataset
|
1218 |
+
def fix_count_qa(ex):
|
1219 |
+
ex["label"] = ex["label"][::2]
|
1220 |
+
tf.debugging.assert_equal(tf.shape(ex["answer_points"]["x"])[0], tf.shape(ex["label"])[0])
|
1221 |
+
return ex
|
1222 |
+
|
1223 |
+
|
1224 |
+
def filter_points(ds, max_number=40):
|
1225 |
+
|
1226 |
+
def _add_valid(ex):
|
1227 |
+
valid = (
|
1228 |
+
tf.reduce_all(ex["answer_points"]["x"] >= 0.0, axis=-1) &
|
1229 |
+
tf.reduce_all(ex["answer_points"]["x"] <= 100.0, axis=-1) &
|
1230 |
+
tf.reduce_all(ex["answer_points"]["y"] >= 0.0, axis=-1) &
|
1231 |
+
tf.reduce_all(ex["answer_points"]["y"] <= 100.0, axis=-1) &
|
1232 |
+
(ex["answer_points"]["y"].row_lengths() <= max_number)
|
1233 |
+
)
|
1234 |
+
ex["valid"] = valid
|
1235 |
+
return ex
|
1236 |
+
ds = ds.map(_add_valid)
|
1237 |
+
ds = ds.filter(lambda ex: tf.reduce_any(ex["valid"]))
|
1238 |
+
return ds
|
1239 |
+
|
1240 |
+
|
1241 |
+
# def filter_points(ds, max_number=30):
|
1242 |
+
# n_points = ds["answer_points"]["x"].row_lengths()
|
1243 |
+
# parts = tf.TensorArray(tf.int32, size=tf.shape(n_points[0]), element_shape=tf.TensorShape([None]))
|
1244 |
+
# total = 0
|
1245 |
+
# on_row = 0
|
1246 |
+
# for i in range(n_points):
|
1247 |
+
# n = n_points[i]
|
1248 |
+
# if n > max_number:
|
1249 |
+
# continue
|
1250 |
+
# if n + total > max_number:
|
1251 |
+
#
|
1252 |
+
# return ds
|
1253 |
+
|
1254 |
+
|
1255 |
+
@seqio.map_over_dataset(num_seeds=2)
|
1256 |
+
def pointing_preprocessor(ex, sequence_length, seeds, with_count=False):
|
1257 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1258 |
+
img_h = tf.shape(image)[0]
|
1259 |
+
img_w = tf.shape(image)[1]
|
1260 |
+
|
1261 |
+
ix = tf.where(ex["valid"])[:, 0]
|
1262 |
+
ix = stateless_shuffle(ix, seeds[0])
|
1263 |
+
if "label" in ex:
|
1264 |
+
question = tf.strings.lower(ex["label"])
|
1265 |
+
else:
|
1266 |
+
question = ex["question"]
|
1267 |
+
question = tf.gather(question, ix) # [n_question]
|
1268 |
+
points_x = tf.gather(ex["answer_points"]["x"], ix) # [n_question, n_points[ragged]]]
|
1269 |
+
points_y = tf.gather(ex["answer_points"]["y"], ix)
|
1270 |
+
not_in_image = tf.gather(ex["notInImage"], ix) # [n_question]
|
1271 |
+
|
1272 |
+
n = tf.shape(points_x)[0]
|
1273 |
+
point_text = tf.TensorArray(dtype=tf.string, size=n, element_shape=()) # [n_question]
|
1274 |
+
point_seeds = tf.random.split(seeds[1], n)
|
1275 |
+
for i in range(n):
|
1276 |
+
answer = points_to_answer(points_x[i], points_y[i], 100, 100, point_seeds[i], question[i], with_count)
|
1277 |
+
point_text = point_text.write(i, answer)
|
1278 |
+
return {
|
1279 |
+
"image": image,
|
1280 |
+
"metadata/image_size": [img_w, img_h],
|
1281 |
+
"entity": question,
|
1282 |
+
"question": question,
|
1283 |
+
"text": point_text.stack(),
|
1284 |
+
}
|
1285 |
+
|
1286 |
+
|
1287 |
+
@seqio.map_over_dataset
|
1288 |
+
def pointing_inf_preprocessor(ex):
|
1289 |
+
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1290 |
+
img_h = tf.shape(ex["image"])[0]
|
1291 |
+
img_w = tf.shape(ex["image"])[1]
|
1292 |
+
|
1293 |
+
question = ex["question"]
|
1294 |
+
not_in_image = tf.shape(ex["answer_points"]["x"])[0] == 0
|
1295 |
+
|
1296 |
+
# points are stored in normalized format, de-normalize here
|
1297 |
+
points_x = ex["answer_points"]["x"] * tf.cast(img_w, tf.float32) / 100.0
|
1298 |
+
points_y = ex["answer_points"]["y"] * tf.cast(img_h, tf.float32) / 100.0
|
1299 |
+
|
1300 |
+
out = dict(
|
1301 |
+
image=ex["image"],
|
1302 |
+
question=question,
|
1303 |
+
entity=question,
|
1304 |
+
)
|
1305 |
+
out.update(_add_metadata(ex))
|
1306 |
+
out["metadata/not_in_image"] = not_in_image
|
1307 |
+
# We can't use `mask` directly since it is variable size, and thus it
|
1308 |
+
# will break batching. Here we serialize it instead
|
1309 |
+
serialized_masks = tf.map_fn(tf.io.serialize_tensor, ex["masks"], fn_output_signature=tf.string)
|
1310 |
+
serialized_masks = tf.strings.reduce_join(serialized_masks, separator="|||")
|
1311 |
+
out["metadata/mask"] = serialized_masks
|
1312 |
+
out["metadata/question"] = question
|
1313 |
+
out["metadata/answer_points"] = tf.io.serialize_tensor(tf.stack([points_x, points_y], 1))
|
1314 |
+
out["metadata/image_size"] = [img_w, img_h]
|
1315 |
+
|
1316 |
+
return out
|
1317 |
+
|
1318 |
+
|
1319 |
+
@seqio.map_over_dataset(num_seeds=1)
|
1320 |
+
def count_qa_preprocessor_inf(ex, sequence_length, seed):
|
1321 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1322 |
+
img_h = tf.shape(image)[0]
|
1323 |
+
img_w = tf.shape(image)[1]
|
1324 |
+
|
1325 |
+
entity = tf.strings.substr(
|
1326 |
+
ex["question"], len("How many "), tf.strings.length(ex["question"]) - len("How many "))
|
1327 |
+
entity = tf.strings.split(entity, sep=" are ", maxsplit=1)[0]
|
1328 |
+
entity = tf.strings.lower(entity)
|
1329 |
+
tf.debugging.assert_equal(tf.strings.length(entity) != 0, True)
|
1330 |
+
|
1331 |
+
return {
|
1332 |
+
"image": image,
|
1333 |
+
"metadata/image_size": [img_w, img_h],
|
1334 |
+
"metadata/count": tf.strings.to_number(ex["answer"]),
|
1335 |
+
"question": ex["question"],
|
1336 |
+
"entity": entity,
|
1337 |
+
}
|
1338 |
+
|
1339 |
+
|
1340 |
+
@seqio.map_over_dataset(num_seeds=1)
|
1341 |
+
def count_qa_preprocessor(ex, sequence_length, seed, with_count=False,
|
1342 |
+
for_inference=False):
|
1343 |
+
point_answer = ex["point_answer"]
|
1344 |
+
numbers_str = tf.strings.regex_replace(point_answer, r'\.$', '')
|
1345 |
+
numbers_str = tf.strings.regex_replace(numbers_str, r'[^\d\.\s]+', '')
|
1346 |
+
numbers_str = tf.strings.strip(numbers_str)
|
1347 |
+
numbers = tf.strings.split(numbers_str)
|
1348 |
+
float_numbers = tf.strings.to_number(numbers, out_type=tf.float32)
|
1349 |
+
coordinates = tf.reshape(float_numbers, (-1, 3))
|
1350 |
+
points_x = coordinates[:, 1]
|
1351 |
+
points_y = coordinates[:, 2]
|
1352 |
+
|
1353 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1354 |
+
img_h = tf.shape(image)[0]
|
1355 |
+
img_w = tf.shape(image)[1]
|
1356 |
+
entity = tf.strings.substr(
|
1357 |
+
ex["question"], len("How many "), tf.strings.length(ex["question"]) - len("How many "))
|
1358 |
+
entity = tf.strings.split(entity, sep=" are ", maxsplit=1)[0]
|
1359 |
+
entity = tf.strings.lower(entity)
|
1360 |
+
tf.debugging.assert_equal(tf.strings.length(entity) != 0, True)
|
1361 |
+
count = tf.strings.to_number(ex["answer"], out_type=tf.int32)
|
1362 |
+
if for_inference:
|
1363 |
+
return {
|
1364 |
+
"image": image,
|
1365 |
+
"metadata/image_size": [img_w, img_h],
|
1366 |
+
"metadata/count": count,
|
1367 |
+
"question": ex["question"],
|
1368 |
+
"entity": entity,
|
1369 |
+
}
|
1370 |
+
else:
|
1371 |
+
tf.debugging.assert_equal(count, tf.shape(points_x)[0])
|
1372 |
+
# points are already normalized so use w=1, h=1
|
1373 |
+
answer = points_to_answer(points_x, points_y, 1, 1, seed, entity, with_count)
|
1374 |
+
return {
|
1375 |
+
"image": image,
|
1376 |
+
"metadata/image_size": [img_w, img_h],
|
1377 |
+
"metadata/count": count,
|
1378 |
+
"question": ex["question"],
|
1379 |
+
"entity": entity,
|
1380 |
+
"text": answer,
|
1381 |
+
}
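# Worked example of the `point_answer` parsing above, on a hypothetical string of
# "index x y" triplets (the exact source format is an assumption here):
#   "1 10.5 20.0 2 30.0 40.5."  ->  x = [10.5, 30.0], y = [20.0, 40.5]
def _example_parse_points():
    s = tf.constant("1 10.5 20.0 2 30.0 40.5.")
    s = tf.strings.regex_replace(s, r'\.$', '')          # drop the trailing period
    s = tf.strings.strip(tf.strings.regex_replace(s, r'[^\d\.\s]+', ''))
    nums = tf.strings.to_number(tf.strings.split(s), out_type=tf.float32)
    coords = tf.reshape(nums, (-1, 3))                   # one (index, x, y) row per point
    return coords[:, 1], coords[:, 2]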
|
1382 |
+
|
1383 |
+
|
1384 |
+
@gin.configurable()
|
1385 |
+
@seqio.map_over_dataset
|
1386 |
+
def cleanup_preprocessor(ex, preprocess=False):
|
1387 |
+
if preprocess:
|
1388 |
+
ex["prompt"] = tf.strings.join(
|
1389 |
+
[
|
1390 |
+
"[[User]]: Correct the spelling and punctuation mistakes on the following transcript based on what appears in the image.\n\n{before} ",
|
1391 |
+
ex["prompt"],
|
1392 |
+
"\n[[Assistant]]: {after}"
|
1393 |
+
]
|
1394 |
+
)
|
1395 |
+
return ex
|
1396 |
+
else:
|
1397 |
+
return ex
|
1398 |
+
|
1399 |
+
|
1400 |
+
@gin.configurable()
|
1401 |
+
@seqio.map_over_dataset
|
1402 |
+
def random_text_preprocessor(ex, preprocess=False):
|
1403 |
+
ex["prompt"] = "What does the text say in this image?"
|
1404 |
+
if preprocess:
|
1405 |
+
ex["prompt"] = tf.strings.join(["[[User]]: ", ex["prompt"], "\n[[Assistant]]:"])
|
1406 |
+
return ex
|
1407 |
+
else:
|
1408 |
+
return ex
|
1409 |
+
|
1410 |
+
|
1411 |
+
@seqio.map_over_dataset(num_seeds=25)
|
1412 |
+
def clock_augmentation(ex, seeds):
|
1413 |
+
seeds = list(seeds)
|
1414 |
+
image = ex["image"]
|
1415 |
+
|
1416 |
+
# Apply shear, rotation, and scale through one affine matrix
|
1417 |
+
height = tf.cast(tf.shape(image)[0], tf.float32)
|
1418 |
+
width = tf.cast(tf.shape(image)[1], tf.float32)
|
1419 |
+
|
1420 |
+
_call_id = [0]
|
1421 |
+
|
1422 |
+
def _rng(_minval=0, _maxval=1, shape=(), dtype=tf.float32):
|
1423 |
+
return tf.random.stateless_uniform(shape, seeds.pop(), _minval, _maxval, dtype=dtype)
|
1424 |
+
|
1425 |
+
sel = _rng(0, 1)
|
1426 |
+
if sel < 0.1:
|
1427 |
+
# Straight on
|
1428 |
+
shear_x = 0.
|
1429 |
+
shear_y = 0.
|
1430 |
+
rotation = 0.
|
1431 |
+
elif sel < 0.5:
|
1432 |
+
# Normal looking
|
1433 |
+
shear_x = _rng(-10, 10)
|
1434 |
+
shear_y = _rng(-10, 10)
|
1435 |
+
rotation = _rng(-25, 25)
|
1436 |
+
else:
|
1437 |
+
# Allowed to be very wonky
|
1438 |
+
# if tf.random.stateless_uniform((), seeds.pop(), 0, 1) > 0.8:
|
1439 |
+
# image = image[:, ::-1]
|
1440 |
+
|
1441 |
+
if _rng() > 0.5:
|
1442 |
+
shear_x = _rng( -30, 30)
|
1443 |
+
shear_y = _rng( -30, 30)
|
1444 |
+
else:
|
1445 |
+
shear_x = _rng( -10, 10)
|
1446 |
+
shear_y = _rng( -10, 10)
|
1447 |
+
rng = _rng( 0, 1)
|
1448 |
+
if rng < 0.2:
|
1449 |
+
rotation = _rng( -25, 25)
|
1450 |
+
elif rng < 0.6:
|
1451 |
+
rotation = _rng( -80, 80)
|
1452 |
+
else:
|
1453 |
+
rotation = _rng( -180, 180)
|
1454 |
+
|
1455 |
+
if _rng() > 0.5:
|
1456 |
+
scale = _rng( 0.3, 2)
|
1457 |
+
else:
|
1458 |
+
scale = _rng( 0.3, 1)
|
1459 |
+
# Pad so upscaling/rotation will not move the image out of bounds
|
1460 |
+
pad = tf.cast(tf.maximum(height, width)*0.5, tf.int32)
|
1461 |
+
image = tf.pad(image, [[pad, pad], [pad, pad], [0, 0]], constant_values=1)
|
1462 |
+
height = tf.cast(tf.shape(image)[0], tf.float32)
|
1463 |
+
width = tf.cast(tf.shape(image)[1], tf.float32)
|
1464 |
+
|
1465 |
+
image = tf.keras.ops.image.affine_transform(
|
1466 |
+
image,
|
1467 |
+
tf.stack(get_affine_matrix(
|
1468 |
+
[height/2, width/2],
|
1469 |
+
rotation,
|
1470 |
+
[0, 0],
|
1471 |
+
1/scale,
|
1472 |
+
[shear_x, shear_y]
|
1473 |
+
) + [0., 0.]),
|
1474 |
+
interpolation='bilinear',
|
1475 |
+
fill_mode='constant',
|
1476 |
+
fill_value=1.,
|
1477 |
+
data_format='channels_last'
|
1478 |
+
)
|
1479 |
+
|
1480 |
+
# Crop to the non-white content, otherwise the clock could never end up at a corner of the padded image
|
1481 |
+
not_white = tf.logical_not(tf.reduce_all(image > 0.99, -1))
|
1482 |
+
no_white_ix = tf.where(not_white)
|
1483 |
+
top_left = tf.reduce_min(no_white_ix, axis=0)
|
1484 |
+
bottom_right = tf.reduce_max(no_white_ix, axis=0)
|
1485 |
+
image = tf.image.crop_to_bounding_box(
|
1486 |
+
image,
|
1487 |
+
offset_height=tf.cast(top_left[0], tf.int32),
|
1488 |
+
offset_width=tf.cast(top_left[1], tf.int32),
|
1489 |
+
target_height=tf.cast(bottom_right[0] - top_left[0] + 1, tf.int32),
|
1490 |
+
target_width=tf.cast(bottom_right[1] - top_left[1] + 1, tf.int32),
|
1491 |
+
)
|
1492 |
+
|
1493 |
+
# Translate
|
1494 |
+
height, width = tf.shape(image)[0], tf.shape(image)[1]
|
1495 |
+
translation_seed = _rng(0, 1)
|
1496 |
+
if translation_seed < 0.2:
|
1497 |
+
h_pad = _rng(0, height//2, (2,), dtype=tf.int32)
|
1498 |
+
w_pad = _rng(0, width//2, (2,), dtype=tf.int32)
|
1499 |
+
else:
|
1500 |
+
h_pad = _rng(0, height*2, (2,), dtype=tf.int32)
|
1501 |
+
w_pad = _rng(0, width*2, (2,), dtype=tf.int32)
|
1502 |
+
image = tf.pad(image, [[h_pad[0], w_pad[0]], [h_pad[1], w_pad[1]], [0, 0]],
|
1503 |
+
constant_values=1)
|
1504 |
+
|
1505 |
+
# Random background color
|
1506 |
+
# color_rng = tf.random.stateless_uniform((4,), seeds.pop(), 0, 1)
|
1507 |
+
# random_color = color_rng[:3]
|
1508 |
+
# valid = tf.reduce_all(tf.reduce_sum(tf.abs(random_color[None, None, :] - image), -1) > 0.03)
|
1509 |
+
# if color_rng[0] < 0.2 and valid:
|
1510 |
+
# image = tf.where(tf.reduce_all(image < 0.99, axis=-1, keepdims=True),
|
1511 |
+
# image, image * 0 + random_color[None, None, :])
|
1512 |
+
|
1513 |
+
# Mild color jitter
|
1514 |
+
image = tf.image.stateless_random_hue(image, max_delta=0.05, seed=seeds.pop())
|
1515 |
+
image = tf.image.stateless_random_brightness(image, max_delta=0.15, seed=seeds.pop())
|
1516 |
+
image = tf.image.stateless_random_saturation(image, 0.8, 1.2, seed=seeds.pop())
|
1517 |
+
image = tf.image.stateless_random_contrast(image, 0.8, 1.2, seed=seeds.pop())
|
1518 |
+
|
1519 |
+
# ex["metadata/unaugmented_image"] = ex["image"]
|
1520 |
+
ex["image"] = image
|
1521 |
+
return ex
|
1522 |
+
|
1523 |
+
|
1524 |
+
@seqio.map_over_dataset
|
1525 |
+
def clocks_preprocessor(ex):
|
1526 |
+
time_format = ex["time_format"]
|
1527 |
+
shows_seconds = ex["shows_seconds"]
|
1528 |
+
hour, minute, second = [tf.cast(ex[k], tf.int32) for k in ["hour", "minute", "second"]]
|
1529 |
+
if hour == 0: # Midnight of the previous day
|
1530 |
+
am_pm = "PM"
|
1531 |
+
hour_str = 12
|
1532 |
+
hour = 24
|
1533 |
+
elif hour > 12:
|
1534 |
+
am_pm = "PM"
|
1535 |
+
hour_str = hour - 12
|
1536 |
+
else:
|
1537 |
+
hour_str = hour
|
1538 |
+
am_pm = "AM"
|
1539 |
+
hour_str = tf.strings.as_string(hour_str)
|
1540 |
+
minute_str = tf.strings.as_string(minute)
|
1541 |
+
if tf.strings.length(minute_str) == 1:
|
1542 |
+
minute_str = tf.strings.join(["0", minute_str])
|
1543 |
+
|
1544 |
+
second_str = tf.strings.as_string(second)
|
1545 |
+
if tf.strings.length(second_str) == 1:
|
1546 |
+
second_str = tf.strings.join(["0", second_str])
|
1547 |
+
|
1548 |
+
prefix = "The time shown is "
|
1549 |
+
|
1550 |
+
if time_format == "The time is not shown":
|
1551 |
+
text = "The time is not shown in the image."
|
1552 |
+
hour, minute, second = -1, -1, -1
|
1553 |
+
else:
|
1554 |
+
if not shows_seconds:
|
1555 |
+
second = -1
|
1556 |
+
if time_format == "12 hour clock (without AM/PM)" and shows_seconds:
|
1557 |
+
if hour > 12:
|
1558 |
+
hour = hour - 12
|
1559 |
+
time = tf.strings.join([hour_str, ":", minute_str, ":", second_str])
|
1560 |
+
elif time_format == "12 hour clock (with AM/PM)" and shows_seconds:
|
1561 |
+
time = tf.strings.join([hour_str, ":", minute_str, ":", second_str, " ", am_pm])
|
1562 |
+
elif time_format == "12 hour clock (with AM/PM)" and not shows_seconds:
|
1563 |
+
time = tf.strings.join([hour_str, ":", minute_str, " ", am_pm])
|
1564 |
+
elif time_format == "12 hour clock (without AM/PM)" and not shows_seconds:
|
1565 |
+
if hour > 12:
|
1566 |
+
hour = hour - 12
|
1567 |
+
time = tf.strings.join([hour_str, ":", minute_str])
|
1568 |
+
else:
|
1569 |
+
time = "" # Should never occur, but needed for tf analysis
|
1570 |
+
tf.debugging.assert_equal(tf.strings.length(time) > 0, True)
|
1571 |
+
text = tf.strings.join(["The time shown is ", time])
|
1572 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1573 |
+
image = tf.image.convert_image_dtype(image, tf.float32)[:-120] # remove the black shadow at the bottom
|
1574 |
+
return {
|
1575 |
+
"image": image,
|
1576 |
+
"prompt": "What time is being shown?",
|
1577 |
+
"text": text,
|
1578 |
+
"metadata/time_format": time_format,
|
1579 |
+
"metadata/hour": hour,
|
1580 |
+
"metadata/minute": minute,
|
1581 |
+
"metadata/text": text,
|
1582 |
+
"metadata/second": second,
|
1583 |
+
}
|
1584 |
+
|
1585 |
+
|
1586 |
+
@seqio.map_over_dataset()
|
1587 |
+
def atlas_obscura_preprocessor(ex):
|
1588 |
+
out = dict(
|
1589 |
+
image=ex["image"],
|
1590 |
+
prompt="Where was this picture taken?",
|
1591 |
+
text=tf.strings.join([
|
1592 |
+
ex["place"],
|
1593 |
+
" in ",
|
1594 |
+
ex["city"]
|
1595 |
+
])
|
1596 |
+
)
|
1597 |
+
out["metadata/image_url"] = ex["image_url"]
|
1598 |
+
out["metadata/references"] = out["text"]
|
1599 |
+
return out
|
1600 |
+
|
1601 |
+
|
1602 |
+
@seqio.map_over_dataset()
|
1603 |
+
def famous_birthdays_preprocessor(ex):
|
1604 |
+
out = dict(
|
1605 |
+
image=ex["image"],
|
1606 |
+
image_url=ex["image_url"],
|
1607 |
+
prompt="Who is this?",
|
1608 |
+
text=ex["name"]
|
1609 |
+
)
|
1610 |
+
out["metadata/references"] = out["text"]
|
1611 |
+
return out
|
1612 |
+
|
1613 |
+
|
1614 |
+
@seqio.map_over_dataset()
|
1615 |
+
def mild_color_aug_preprocessor(ex):
|
1616 |
+
if "image_url" in ex: # URL won't show the augmentations
|
1617 |
+
del ex["image_url"]
|
1618 |
+
# ex["metadata/unaugmented_image"] = ex["image"]
|
1619 |
+
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1620 |
+
ex["image"] = mild_color_aug(ex["image"])
|
1621 |
+
return ex
|
1622 |
+
|
1623 |
+
|
1624 |
+
def build_text_with_points(text, points, img_h, img_w):
|
1625 |
+
points = points_to_text(img_h, img_w, points[:, 0], points[:, 1])
|
1626 |
+
parts = tf.strings.split(text, sep="<ANS>")
|
1627 |
+
with_points = tf.strings.reduce_join(tf.reshape(tf.stack([
|
1628 |
+
parts,
|
1629 |
+
tf.pad(points, [[0, 1]], constant_values=""),
|
1630 |
+
], 1), [-1]), separator="")
|
1631 |
+
return tf.strings.split(with_points, "\n\n")
|
1632 |
+
|
1633 |
+
|
1634 |
+
@seqio.map_over_dataset()
|
1635 |
+
def synth_count_preprocessor(example):
|
1636 |
+
image_shape = tf.shape(example["image"])
|
1637 |
+
h, w = image_shape[0], image_shape[1]
|
1638 |
+
questions = build_text_with_points(example["questions"], example["question_points"], h, w)
|
1639 |
+
answers = build_text_with_points(example["answers"], example["answer_points"], h, w)
|
1640 |
+
keep_q = tf.strings.regex_full_match(questions, "How many.*")
|
1641 |
+
keep_ans = tf.strings.regex_full_match(answers, "There are [0-9]+.*")
|
1642 |
+
keep = tf.logical_and(keep_q, keep_ans)
|
1643 |
+
questions = tf.boolean_mask(questions, keep)
|
1644 |
+
answers = tf.boolean_mask(answers, keep)
|
1645 |
+
ix = tf.range(0, tf.shape(answers)[0], dtype=tf.int32)
|
1646 |
+
ix = tf.random.shuffle(ix)
|
1647 |
+
return dict(
|
1648 |
+
image=example["image"],
|
1649 |
+
prompt=tf.gather(questions, ix),
|
1650 |
+
text=tf.gather(answers, ix),
|
1651 |
+
)
|
1652 |
+
|
1653 |
+
|
1654 |
+
def synth_count_inf_preprocessor(ds):
|
1655 |
+
|
1656 |
+
@seqio.map_over_dataset(num_seeds=1)
|
1657 |
+
def get_two(example, seed):
|
1658 |
+
image_shape = tf.shape(example["image"])
|
1659 |
+
h, w = image_shape[0], image_shape[1]
|
1660 |
+
questions = build_text_with_points(example["questions"], example["question_points"], h, w)
|
1661 |
+
answers = build_text_with_points(example["answers"], example["answer_points"], h, w)
|
1662 |
+
keep_q = tf.strings.regex_full_match(questions, "How many.*")
|
1663 |
+
keep_ans = tf.strings.regex_full_match(answers, "There are [0-9]+.*")
|
1664 |
+
keep = tf.logical_and(keep_q, keep_ans)
|
1665 |
+
questions = tf.boolean_mask(questions, keep)
|
1666 |
+
answers = tf.boolean_mask(answers, keep)
|
1667 |
+
|
1668 |
+
ix = stateless_permutation(tf.shape(answers)[0], seed)[:2]
|
1669 |
+
return {
|
1670 |
+
"image": example["image"],
|
1671 |
+
"prompt": tf.gather(questions, ix),
|
1672 |
+
"metadata/references": tf.gather(answers, ix),
|
1673 |
+
}
|
1674 |
+
|
1675 |
+
ds = get_two(ds)
|
1676 |
+
return flatten_parts(ds, ["prompt", "metadata/references"])
|
1677 |
+
|
1678 |
+
|
1679 |
+
def mild_color_aug(image):
|
1680 |
+
image = tf.image.random_hue(image, max_delta=0.05)
|
1681 |
+
image = tf.image.random_brightness(image, max_delta=0.15)
|
1682 |
+
image = tf.image.random_saturation(image, 0.7, 1.3)
|
1683 |
+
image = tf.image.random_contrast(image, 0.8, 1.2)
|
1684 |
+
return image
|
1685 |
+
|
1686 |
+
|
1687 |
+
@seqio.map_over_dataset()
|
1688 |
+
def name_entity_augmentation(ex, p_high_color=0.7):
|
1689 |
+
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1690 |
+
image = ex["image"]
|
1691 |
+
image = tf.image.convert_image_dtype(image, tf.float32)
|
1692 |
+
|
1693 |
+
# Horizontal flip
|
1694 |
+
if tf.random.uniform((), 0, 1) > 0.85:
|
1695 |
+
image = image[:, ::-1]
|
1696 |
+
|
1697 |
+
# Random crop
|
1698 |
+
height = tf.cast(tf.shape(image)[0], tf.float32)
|
1699 |
+
width = tf.cast(tf.shape(image)[1], tf.float32)
|
1700 |
+
crop_rng = tf.random.uniform((), 0, 1)
|
1701 |
+
if crop_rng < 0.2:
|
1702 |
+
pass
|
1703 |
+
else:
|
1704 |
+
if crop_rng < 0.4:
|
1705 |
+
h_crop = height * 0.15
|
1706 |
+
w_crop = width * 0.15
|
1707 |
+
else:
|
1708 |
+
h_crop = height * 0.4
|
1709 |
+
w_crop = width * 0.4
|
1710 |
+
crop_h = tf.cast(tf.random.uniform((2,), 0, h_crop/2), tf.int32)
|
1711 |
+
crop_w = tf.cast(tf.random.uniform((2,), 0, w_crop/2), tf.int32)
|
1712 |
+
image = image[crop_h[0]:-crop_h[1]-1, crop_w[0]:-crop_w[1]-1]
|
1713 |
+
height = tf.cast(tf.shape(image)[0], tf.float32)
|
1714 |
+
width = tf.cast(tf.shape(image)[1], tf.float32)
|
1715 |
+
|
1716 |
+
if tf.random.uniform(()) > p_high_color:
|
1717 |
+
image = tf.image.random_hue(image, max_delta=0.05)
|
1718 |
+
image = tf.image.random_brightness(image, max_delta=0.15)
|
1719 |
+
image = tf.image.random_saturation(image, 0.7, 1.3)
|
1720 |
+
image = tf.image.random_contrast(image, 0.8, 1.2)
|
1721 |
+
else:
|
1722 |
+
image = tf.image.random_hue(image, max_delta=0.1)
|
1723 |
+
image = tf.image.random_brightness(image, max_delta=0.3)
|
1724 |
+
image = tf.image.random_saturation(image, 0.0, 2.0)
|
1725 |
+
image = tf.image.random_contrast(image, 0.2, 1.5)
|
1726 |
+
|
1727 |
+
# Apply shear, rotation, and scale through one affine matrix
|
1728 |
+
sel = tf.random.uniform((), 0, 1)
|
1729 |
+
if sel < 0.1:
|
1730 |
+
pass
|
1731 |
+
else:
|
1732 |
+
if sel < 0.15: # Scale only
|
1733 |
+
shear_x = 0
|
1734 |
+
shear_y = 0
|
1735 |
+
rotation = 0
|
1736 |
+
elif sel < 0.7: # Mild
|
1737 |
+
shear_x = tf.random.uniform((), -2, 2)
|
1738 |
+
shear_y = tf.random.uniform((), -2, 2)
|
1739 |
+
rotation = tf.random.uniform((), -5, 5)
|
1740 |
+
else: # Severe
|
1741 |
+
shear_x = tf.random.uniform((), -10, 10)
|
1742 |
+
shear_y = tf.random.uniform((), -10, 10)
|
1743 |
+
rotation = tf.random.uniform((), -20, 20)
|
1744 |
+
|
1745 |
+
max_scale = 1.2
|
1746 |
+
scale = tf.random.uniform((), 0.4, max_scale)
|
1747 |
+
|
1748 |
+
# Pad so upscaling/rotation will not move the image out of bounds
|
1749 |
+
pad = tf.cast(tf.maximum(height, width)*0.2, tf.int32)
|
1750 |
+
image = tf.pad(image, [[pad, pad], [pad, pad], [0, 0]], constant_values=1)
|
1751 |
+
|
1752 |
+
image = tf.keras.ops.image.affine_transform(
|
1753 |
+
image,
|
1754 |
+
tf.stack(get_affine_matrix(
|
1755 |
+
[height/2, width/2],
|
1756 |
+
rotation,
|
1757 |
+
[0, 0],
|
1758 |
+
1/scale,
|
1759 |
+
[shear_x, shear_y]
|
1760 |
+
) + [0., 0.]),
|
1761 |
+
interpolation='bilinear',
|
1762 |
+
fill_mode='constant',
|
1763 |
+
fill_value=1.,
|
1764 |
+
data_format='channels_last'
|
1765 |
+
)
|
1766 |
+
|
1767 |
+
# Crop to the non-white content, otherwise the content could never end up at a corner of the padded image
|
1768 |
+
not_white = tf.logical_not(tf.reduce_all(image > 0.99, -1))
|
1769 |
+
no_white_ix = tf.where(not_white)
|
1770 |
+
top_left = tf.reduce_min(no_white_ix, axis=0)
|
1771 |
+
bottom_right = tf.reduce_max(no_white_ix, axis=0)
|
1772 |
+
|
1773 |
+
# There is a small chance the crop contains nothing but white space; skip the crop in that case
|
1774 |
+
if (
|
1775 |
+
(bottom_right[0] - top_left[0]) > 1 and (bottom_right[1] - top_left[1]) > 1
|
1776 |
+
):
|
1777 |
+
image = tf.image.crop_to_bounding_box(
|
1778 |
+
image,
|
1779 |
+
offset_height=tf.cast(top_left[0], tf.int32),
|
1780 |
+
offset_width=tf.cast(top_left[1], tf.int32),
|
1781 |
+
target_height=tf.cast(bottom_right[0] - top_left[0] + 1, tf.int32),
|
1782 |
+
target_width=tf.cast(bottom_right[1] - top_left[1] + 1, tf.int32),
|
1783 |
+
)
|
1784 |
+
|
1785 |
+
# Translate
|
1786 |
+
height, width = tf.shape(image)[0], tf.shape(image)[1]
|
1787 |
+
if tf.random.uniform((), 0, 1) < 0.1:
|
1788 |
+
h_pad = tf.zeros((2,), dtype=tf.int32)
|
1789 |
+
w_pad = tf.zeros((2,), dtype=tf.int32)
|
1790 |
+
elif tf.random.uniform((), 0, 1) < 0.8:
|
1791 |
+
h_pad = tf.random.uniform((2,), 0, 50, dtype=tf.int32)
|
1792 |
+
w_pad = tf.random.uniform((2,), 0, 50, dtype=tf.int32)
|
1793 |
+
else:
|
1794 |
+
pad = tf.cast(tf.maximum(height, width), tf.int32)
|
1795 |
+
h_pad = tf.random.uniform((2,), 0, pad, dtype=tf.int32)
|
1796 |
+
w_pad = tf.random.uniform((2,), 0, pad, dtype=tf.int32)
|
1797 |
+
image = tf.pad(image, [[h_pad[0], w_pad[0]], [h_pad[1], w_pad[1]], [0, 0]],
|
1798 |
+
constant_values=1)
|
1799 |
+
|
1800 |
+
if "image_url" in ex: # URL won't show the augmentations
|
1801 |
+
del ex["image_url"]
|
1802 |
+
# ex["metadata/unaugmented_image"] = ex["image"]
|
1803 |
+
ex["image"] = image
|
1804 |
+
return ex
|
1805 |
+
|
1806 |
+
|
1807 |
+
@seqio.map_over_dataset()
|
1808 |
+
def wiki_art_preprocessor(ex):
|
1809 |
+
out = dict(
|
1810 |
+
image=ex["image"],
|
1811 |
+
prompt="What is this?",
|
1812 |
+
text=ex["question"]
|
1813 |
+
)
|
1814 |
+
out["metadata/title"] = ex["title"]
|
1815 |
+
out["metadata/gt"] = ex["question"]
|
1816 |
+
out["metadata/artist"] = ex["artist"]
|
1817 |
+
out["metadata/painting_url"] = ex["painting_url"]
|
1818 |
+
# if "metadata/unaugmented_image" in ex:
|
1819 |
+
# out["metadata/unaugmented_image"] = ex["metadata/unaugmented_image"]
|
1820 |
+
return out
|
1821 |
+
|
1822 |
+
@seqio.map_over_dataset()
|
1823 |
+
def oscar_preprocessor(ex):
|
1824 |
+
out = dict(
|
1825 |
+
image=ex["image"],
|
1826 |
+
prompt=ex["question"]
|
1827 |
+
)
|
1828 |
+
out.update(_add_metadata(ex))
|
1829 |
+
out["metadata/question"] = ex["question"]
|
1830 |
+
out["metadata/answer"] = ex["answer"]
|
1831 |
+
out["metadata/category"] = ex["category"]
|
1832 |
+
return out
|
1833 |
+
|
1834 |
+
|
1835 |
+
@seqio.map_over_dataset()
|
1836 |
+
def tulu_preprocessor(ex):
|
1837 |
+
return {
|
1838 |
+
"messages": ex["messages"]["content"],
|
1839 |
+
}
|
1840 |
+
# logging.info("Debugging tulue")
|
1841 |
+
# return {"messages": ex["messages"]["content"], "text_weights": 1e-6}
|
1842 |
+
|
1843 |
+
|
1844 |
+
WIKI_DATA_QUESTION = "What is this? Respond with just a proper name."
|
1845 |
+
|
1846 |
+
|
1847 |
+
@seqio.map_over_dataset()
|
1848 |
+
def extract_wiki_data(ex):
|
1849 |
+
return dict(
|
1850 |
+
image=ex["image"],
|
1851 |
+
image_url=ex["image_url"],
|
1852 |
+
prompt=[
|
1853 |
+
WIKI_DATA_QUESTION,
|
1854 |
+
"What is this? Respond with the proper name of the main focus of the image and a few details about it."
|
1855 |
+
],
|
1856 |
+
text=[
|
1857 |
+
tf.strings.strip(tf.strings.regex_replace(ex["question"], r"\(.*\)", "")),
|
1858 |
+
ex["gptResponse"],
|
1859 |
+
]
|
1860 |
+
)
|
1861 |
+
|
1862 |
+
|
1863 |
+
@seqio.map_over_dataset()
|
1864 |
+
def extract_wiki_data_name(ex):
|
1865 |
+
target = tf.strings.strip(tf.strings.regex_replace(ex["question"], r"\(.*\)", ""))
|
1866 |
+
out = dict(
|
1867 |
+
image=ex["image"],
|
1868 |
+
image_url=ex["image_url"],
|
1869 |
+
prompt=WIKI_DATA_QUESTION,
|
1870 |
+
text=target,
|
1871 |
+
)
|
1872 |
+
out["metadata/references"] = target
|
1873 |
+
return out
|
1874 |
+
|
1875 |
+
|
1876 |
+
@seqio.map_over_dataset()
|
1877 |
+
def extract_wiki_data_describe(ex):
|
1878 |
+
out = dict(
|
1879 |
+
image=ex["image"],
|
1880 |
+
image_url=ex["image_url"],
|
1881 |
+
prompt="What is this? Respond with the proper name of the main focus of the image and a few details about it.",
|
1882 |
+
)
|
1883 |
+
out["metadata/references"] = ex["gptResponse"]
|
1884 |
+
return out
|
1885 |
+
|
1886 |
+
|
1887 |
+
@gin.configurable()
|
1888 |
+
def format_multiple_style_qa(ds, types=['multiple_choice', 'short_answer'], styles=['ai2_diagram', 'vqa2'], default_style='vqa2',
|
1889 |
+
strip_instruction=False):
|
1890 |
+
def _extract(ex):
|
1891 |
+
prompt = ex["question"]
|
1892 |
+
out = dict(image=ex["image"])
|
1893 |
+
out.update(_add_metadata(ex))
|
1894 |
+
|
1895 |
+
out["text"] = ex["answer"]
|
1896 |
+
out["metadata/references"] = ex["answer"]
|
1897 |
+
|
1898 |
+
if ex["metadata/question_type"] == 'multiple_choice':
|
1899 |
+
style = styles[0]
|
1900 |
+
else:
|
1901 |
+
style = styles[1]
|
1902 |
+
if strip_instruction:
|
1903 |
+
if ex["metadata/question_type"] == "multiple_choice":
|
1904 |
+
# parts = tf.strings.split(prompt, "\n")
|
1905 |
+
# parts 1 is blank and part -1 is the instruction
|
1906 |
+
# prompt = tf.strings.reduce_join(tf.concat([parts[:1], parts[2:-1]], 0), separator="\n")
|
1907 |
+
prompt = prompt
|
1908 |
+
else:
|
1909 |
+
prompt = tf.strings.split(prompt, "\n")[0]
|
1910 |
+
|
1911 |
+
out["style"] = style
|
1912 |
+
out["prompt"] = prompt
|
1913 |
+
return out
|
1914 |
+
ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
1915 |
+
return ds
|
1916 |
+
|
1917 |
+
|
1918 |
+
@gin.configurable()
|
1919 |
+
def extract_mmmu(ds, types=['multiple-choice', 'open'], styles=['ai2_diagram', 'vqa2'], default_style='ai2_diagram', option_format="abc"):
|
1920 |
+
assert option_format == "abc"
|
1921 |
+
keys_tensor = tf.constant(types, dtype=tf.string)
|
1922 |
+
values_tensor = tf.constant(styles, dtype=tf.string)
|
1923 |
+
table = tf.lookup.StaticHashTable(
|
1924 |
+
tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor),
|
1925 |
+
default_value=tf.constant(default_style, dtype=tf.string),
|
1926 |
+
)
|
1927 |
+
def _extract(ex):
|
1928 |
+
out = dict(image=tf.expand_dims(ex["image_1"], 0))
|
1929 |
+
out.update(_add_metadata(ex))
|
1930 |
+
style = table.lookup(ex["metadata/question_type"])
|
1931 |
+
out["style"] = style
|
1932 |
+
out["text"] = ex["answer"]
|
1933 |
+
out["metadata/references"] = ex["answer"]
|
1934 |
+
|
1935 |
+
if style == styles[0]:
|
1936 |
+
abc = tf.constant(list("abcdefghi".upper()))
|
1937 |
+
options = ex["options"]
|
1938 |
+
num_options = tf.shape(options)[0]
|
1939 |
+
dummy_options = tf.tile(tf.constant([""], dtype=tf.string), [9 - num_options])
|
1940 |
+
out["metadata/options"] = tf.concat([options, dummy_options], axis=0)
|
1941 |
+
out["metadata/options"] = tf.ensure_shape(out["metadata/options"], [9])
|
1942 |
+
|
1943 |
+
short_options = abc[:num_options]
|
1944 |
+
options = tf.stack([short_options, options,], 1)
|
1945 |
+
options = tf.strings.reduce_join(options, axis=-1, separator=": ")
|
1946 |
+
options = tf.strings.reduce_join(options, separator="\n")
|
1947 |
+
out["prompt"] = tf.strings.join([ex["question"], "\n", options, "\n"])
|
1948 |
+
if tf.reduce_sum(tf.cast(tf.strings.regex_full_match(options, "<img='(.*?)'>"), tf.int32)) > 1:
|
1949 |
+
# Following LLaVa, don't use any images if there are multiple image paths
|
1950 |
+
# I think the rationale is that this means the images are answer options
|
1951 |
+
out["image"] = out["image"][:0]
|
1952 |
+
else:
|
1953 |
+
out["metadata/options"] = tf.constant([""] * 9, dtype=tf.string)
|
1954 |
+
out["prompt"] = ex["question"]
|
1955 |
+
out["image"] = out["image"][:0]
|
1956 |
+
return out
|
1957 |
+
ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
1958 |
+
return ds
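# Worked example (toy options, not MMMU data) of the "A: ..." option formatting used
# in `extract_mmmu` above:
def _example_format_options():
    abc = tf.constant(list("abcdefghi".upper()))
    options = tf.constant(["cat", "dog", "bird"])
    short_options = abc[:3]
    lines = tf.strings.reduce_join(
        tf.stack([short_options, options], 1), axis=-1, separator=": ")
    return tf.strings.reduce_join(lines, separator="\n")  # "A: cat\nB: dog\nC: bird"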
|
1959 |
+
|
1960 |
+
@gin.configurable()
|
1961 |
+
def extract_mmmu_cot(ds, types=['multiple-choice', 'open'], styles=['ai2_diagram', 'vqa2'], default_style='ai2_diagram', option_format="abc"):
|
1962 |
+
assert option_format == "abc"
|
1963 |
+
keys_tensor = tf.constant(types, dtype=tf.string)
|
1964 |
+
values_tensor = tf.constant(styles, dtype=tf.string)
|
1965 |
+
table = tf.lookup.StaticHashTable(
|
1966 |
+
tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor),
|
1967 |
+
default_value=tf.constant(default_style, dtype=tf.string),
|
1968 |
+
)
|
1969 |
+
def _extract(ex):
|
1970 |
+
# out = dict(image=tf.expand_dims(ex["image_with_question"], 0))
|
1971 |
+
out = dict(image=tf.expand_dims(ex["image_1"], 0))
|
1972 |
+
out.update(_add_metadata(ex))
|
1973 |
+
style = table.lookup(ex["metadata/question_type"])
|
1974 |
+
# out["style"] = style
|
1975 |
+
out["text"] = ex["answer"]
|
1976 |
+
out["metadata/question"] = ex["question"]
|
1977 |
+
out["metadata/references"] = ex["answer"]
|
1978 |
+
|
1979 |
+
if style == styles[0]:
|
1980 |
+
abc = tf.constant(list("abcdefghi".upper()))
|
1981 |
+
options = ex["options"]
|
1982 |
+
num_options = tf.shape(options)[0]
|
1983 |
+
dummy_options = tf.tile(tf.constant([""], dtype=tf.string), [9 - num_options])
|
1984 |
+
out["metadata/options"] = tf.concat([options, dummy_options], axis=0)
|
1985 |
+
out["metadata/options"] = tf.ensure_shape(out["metadata/options"], [9])
|
1986 |
+
|
1987 |
+
short_options = abc[:num_options]
|
1988 |
+
options = tf.stack([short_options, options,], 1)
|
1989 |
+
options = tf.strings.reduce_join(options, axis=-1, separator=": ")
|
1990 |
+
options = tf.strings.reduce_join(options, separator="\n")
|
1991 |
+
out["prompt"] = tf.strings.join([ex["question"], "\n", options, "\n"])
|
1992 |
+
# out["prompt"] = ex["question"]
|
1993 |
+
if tf.reduce_sum(tf.cast(tf.strings.regex_full_match(options, "<img='(.*?)'>"), tf.int32)) > 1:
|
1994 |
+
# Following LLaVa, don't use any images if there are multiple image paths
|
1995 |
+
# I think the rationale is that this means the images are answer options
|
1996 |
+
out["image"] = out["image"][:0]
|
1997 |
+
else:
|
1998 |
+
out["metadata/options"] = tf.constant([""] * 9, dtype=tf.string)
|
1999 |
+
out["prompt"] = ex["question"]
|
2000 |
+
# out["image"] = out["image"][:0]
|
2001 |
+
return out
|
2002 |
+
ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
2003 |
+
return ds
|
2004 |
+
|
2005 |
+
|
2006 |
+
@seqio.map_over_dataset
|
2007 |
+
def reformat_math_vista(ex):
|
2008 |
+
query = ex["query"]
|
2009 |
+
query = tf.strings.split(query, sep="Question:")[-1]
|
2010 |
+
query = tf.strings.strip(tf.strings.split(query, sep="Hint:")[0])
|
2011 |
+
ex["query"] = query
|
2012 |
+
return ex
|
2013 |
+
|
2014 |
+
|
2015 |
+
@seqio.map_over_dataset
|
2016 |
+
def extract_math_vista(ex, styles=['ai2_diagram', 'vqa2']):
|
2017 |
+
out = dict(image=ex["image"])
|
2018 |
+
out.update(_add_metadata(ex))
|
2019 |
+
|
2020 |
+
is_mc = ex["metadata/question_type"] == 'multi_choice'
|
2021 |
+
if is_mc:
|
2022 |
+
style = styles[0]
|
2023 |
+
abc = tf.constant(list("abcdefghi".upper()))
|
2024 |
+
options = ex["choices"]
|
2025 |
+
num_options = tf.shape(options)[0]
|
2026 |
+
dummy_options = tf.tile(tf.constant([""], dtype=tf.string), [9 - num_options])
|
2027 |
+
out["metadata/options"] = tf.concat([options, dummy_options], axis=0)
|
2028 |
+
out["metadata/options"] = tf.ensure_shape(out["metadata/options"], [9])
|
2029 |
+
|
2030 |
+
if ex["metadata/split"] != "test":
|
2031 |
+
short_options = abc[:num_options]
|
2032 |
+
answer_short_option = tf.boolean_mask(short_options, options == ex["answer"])[0]
|
2033 |
+
out["text"] = answer_short_option
|
2034 |
+
else:
|
2035 |
+
out["text"] = ex["answer"]
|
2036 |
+
else:
|
2037 |
+
style = styles[1]
|
2038 |
+
out["metadata/options"] = tf.constant([""] * 9, dtype=tf.string)
|
2039 |
+
out["text"] = ex["answer"]
|
2040 |
+
out["style"] = style
|
2041 |
+
out["prompt"] = ex["query"]
|
2042 |
+
out["metadata/query"] = ex["query"]
|
2043 |
+
out["metadata/references"] = ex["answer"]
|
2044 |
+
return out
|
2045 |
+
|
2046 |
+
|
2047 |
+
NO_POINT_PREFIX = [
|
2048 |
+
"No pointing: ",
|
2049 |
+
"No pointing: ",
|
2050 |
+
"no pointing:\n",
|
2051 |
+
"No pointing:\n",
|
2052 |
+
"Not pointing:\n",
|
2053 |
+
"No Points: ",
|
2054 |
+
"No Points: ",
|
2055 |
+
"NO POINTING\n",
|
2056 |
+
"No pontiing\n",
|
2057 |
+
"No Points:\n ",
|
2058 |
+
"No pointing\n",
|
2059 |
+
"Do not point. ",
|
2060 |
+
"Refrain from pointing. ",
|
2061 |
+
"Avoid generating points . ",
|
2062 |
+
"For this question, do not use points. ",
|
2063 |
+
"Refrain from using points:\n",
|
2064 |
+
"Don't include points in your response. ",
|
2065 |
+
"Don't point. ",
|
2066 |
+
"Don't use points. ",
|
2067 |
+
"Please don't use points.\n\n",
|
2068 |
+
"Please don't use points.\n\n",
|
2069 |
+
"Respond without using points. ",
|
2070 |
+
"Respond without pointing:\n",
|
2071 |
+
"Do not generate ponits: ",
|
2072 |
+
"Do not point. ",
|
2073 |
+
"Do not point\n",
|
2074 |
+
"no pointing\n\n",
|
2075 |
+
"Answer without points: ",
|
2076 |
+
"Answer this question without pointing: ",
|
2077 |
+
"Answer without poiints. ",
|
2078 |
+
"answer without points: ",
|
2079 |
+
"answer with text only, do not points\n"
|
2080 |
+
]
|
2081 |
+
assert all(x[-1].isspace() for x in NO_POINT_PREFIX)
|
2082 |
+
NO_POINT_PREFIX_TF = tf.constant(NO_POINT_PREFIX)
|
2083 |
+
|
2084 |
+
|
2085 |
+
def prefix_how_many(messages, seed):
|
2086 |
+
question = messages[0]
|
2087 |
+
if tf.strings.regex_full_match(tf.strings.lower(question), "how many.*"):
|
2088 |
+
ix = tf.random.stateless_uniform((), seed, 0, len(NO_POINT_PREFIX), tf.int32)
|
2089 |
+
question = tf.strings.join([NO_POINT_PREFIX_TF[ix], question])
|
2090 |
+
return tf.concat([tf.expand_dims(question, 0), messages[1:]], axis=0)
|
2091 |
+
else:
|
2092 |
+
return messages
|
2093 |
+
|
2094 |
+
|
2095 |
+
@seqio.map_over_dataset(num_seeds=1)
|
2096 |
+
def prefix_how_many_messages(ex, seed):
|
2097 |
+
messages = ex["messages"]
|
2098 |
+
n = tf.shape(messages)[0]
|
2099 |
+
seeds = tf.random.split(seed, n)
|
2100 |
+
message_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=(None,))
|
2101 |
+
for i in range(n):
|
2102 |
+
message_arr = message_arr.write(i, prefix_how_many(messages[i], seeds[i]))
|
2103 |
+
ex["messages"] = tf.RaggedTensor.from_row_splits(
|
2104 |
+
values=message_arr.concat(), row_splits=messages.row_splits)
|
2105 |
+
return ex
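# A toy sketch of the ragged reconstruction used in `prefix_how_many_messages` above:
# per-conversation rows are transformed, concatenated, and re-assembled with the
# original row_splits (the upper-casing below is illustrative only).
def _example_ragged_roundtrip():
    messages = tf.ragged.constant([["How many dogs?", "Two."], ["Hi", "Hello"]])
    values = tf.strings.upper(messages.flat_values)
    return tf.RaggedTensor.from_row_splits(values, messages.row_splits)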
|
2106 |
+
|
2107 |
+
|
2108 |
+
def filter_single_turn(ds):
|
2109 |
+
@seqio.map_over_dataset
|
2110 |
+
def _filter(ex):
|
2111 |
+
multi_turn = ex["messages"].row_lengths() > 2
|
2112 |
+
ex["messages"] = tf.ragged.boolean_mask(ex["messages"], multi_turn)
|
2113 |
+
return ex
|
2114 |
+
|
2115 |
+
ds = _filter(ds)
|
2116 |
+
ds = ds.filter(lambda x: tf.shape(x["messages"])[0] > 0)
|
2117 |
+
return ds
|
2118 |
+
|
2119 |
+
|
2120 |
+
@seqio.map_over_dataset(num_seeds=1)
|
2121 |
+
def extract_cockatoo_qa_v2(ex, seed):
|
2122 |
+
messages = tf.RaggedTensor.from_value_rowids(ex["messages"], ex["conversation_ids"])
|
2123 |
+
ix = stateless_permutation(tf.shape(messages)[0], seed)
|
2124 |
+
messages = tf.gather(messages, ix)
|
2125 |
+
out = dict(
|
2126 |
+
image=ex["image"],
|
2127 |
+
messages=messages
|
2128 |
+
)
|
2129 |
+
out.update(_add_metadata(ex))
|
2130 |
+
return out
|
2131 |
+
|
2132 |
+
|
2133 |
+
def format_mmbench(ds):
|
2134 |
+
|
2135 |
+
def _trim(ex):
|
2136 |
+
num_passes = tf.shape(ex["id"])[0]
|
2137 |
+
ex["choices"] = ex["choices"][:num_passes, :num_passes]
|
2138 |
+
ex["answer"] = ex["answer"][:num_passes]
|
2139 |
+
return ex
|
2140 |
+
|
2141 |
+
ds = ds.map(_trim)
|
2142 |
+
ds = flatten_parts(ds, ["id", "query", "choices", "answer"])
|
2143 |
+
|
2144 |
+
def _extract(ex):
|
2145 |
+
out = dict(image=ex["image"])
|
2146 |
+
out.update(_add_metadata(ex))
|
2147 |
+
out["prompt"] = ex["query"]
|
2148 |
+
out["text"] = ex["answer"]
|
2149 |
+
options = ex["choices"]
|
2150 |
+
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, ".*\|\|\|.*")), False)
|
2151 |
+
out["metadata/options"] = tf.strings.reduce_join(options, separator="|||")
|
2152 |
+
out["metadata/question"] = ex["question"]
|
2153 |
+
out["metadata/references"] = ex["answer"]
|
2154 |
+
return out
|
2155 |
+
|
2156 |
+
ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
2157 |
+
return ds
|
2158 |
+
|
2159 |
+
|
2160 |
+
@seqio.map_over_dataset
|
2161 |
+
def extract_lvis(ex, class_name_file="gs://oe-training-chrisc/cockatoo/data/lvis_class_names.json"):
|
2162 |
+
with tf.io.gfile.GFile(class_name_file) as f:
|
2163 |
+
class_names = json.load(f)
|
2164 |
+
class_names_arr = [None]*len(class_names)
|
2165 |
+
for k, v in class_names.items():
|
2166 |
+
class_names_arr[int(k)] = v
|
2167 |
+
assert all(x is not None for x in class_names_arr)
|
2168 |
+
class_names_arr = tf.constant(class_names_arr)
|
2169 |
+
|
2170 |
+
return dict(
|
2171 |
+
image=ex["image"],
|
2172 |
+
bbox=ex["objects"]["bbox"],
|
2173 |
+
label=tf.gather(class_names_arr, ex["objects"]["label"]),
|
2174 |
+
)
|
2175 |
+
|
2176 |
+
|
2177 |
+
def extract_open_images_boxes(ds):
|
2178 |
+
# ds = ds.filter(lambda ex: tf.logical_or(
|
2179 |
+
# tf.shape(ex["cap/cap_caption"])[0] > 0,
|
2180 |
+
# tf.shape(ex["detection/bbox"])[0] > 0
|
2181 |
+
# ))
|
2182 |
+
ds = ds.filter(lambda ex: tf.shape(ex["cap/cap_caption"])[0] > 0)
|
2183 |
+
|
2184 |
+
@seqio.map_over_dataset
|
2185 |
+
def _map(ex):
|
2186 |
+
bbox = tf.reshape(ex["detection/bbox"], (-1, 4))
|
2187 |
+
bbox = tf.stack([
|
2188 |
+
bbox[:, 2],
|
2189 |
+
bbox[:, 0],
|
2190 |
+
bbox[:, 3],
|
2191 |
+
bbox[:, 1]
|
2192 |
+
], 1)
|
2193 |
+
return dict(
|
2194 |
+
image=tf.image.decode_jpeg(ex["image"]),
|
2195 |
+
bbox=bbox,
|
2196 |
+
label=ex["detection/label"],
|
2197 |
+
caption=tf.strings.reduce_join(ex["cap/cap_caption"], separator="\n")
|
2198 |
+
)
|
2199 |
+
|
2200 |
+
return _map(ds)
|
2201 |
+
|
2202 |
+
|
2203 |
+
@seqio.map_over_dataset
|
2204 |
+
def region_captions_to_dense(ex):
|
2205 |
+
if "captions" in ex:
|
2206 |
+
captions = ex["captions"]["text"]
|
2207 |
+
boxes = ex["captions"]["bbox"]
|
2208 |
+
else:
|
2209 |
+
captions = ex["label"]
|
2210 |
+
boxes = ex["bbox"]
|
2211 |
+
|
2212 |
+
|
2213 |
+
sh = tf.cast(tf.shape(ex["image"])[:2], tf.float32)
|
2214 |
+
# image_h, image_w = sh[0], sh[1]
|
2215 |
+
w = boxes[:, 2] - boxes[:, 0]
|
2216 |
+
h = boxes[:, 3] - boxes[:, 1]
|
2217 |
+
|
2218 |
+
cx = tf.cast(boxes[:, 0] + w/2, tf.float32)
|
2219 |
+
cy = tf.cast(boxes[:, 1] + h/2, tf.float32)
|
2220 |
+
# w = w / image_w
|
2221 |
+
# h = h / image_h
|
2222 |
+
coor = tf.strings.reduce_join(
|
2223 |
+
float_to_text(tf.stack([cx, cy, w, h], 1)), separator=",", axis=1)
|
2224 |
+
|
2225 |
+
area = w*h
|
2226 |
+
if tf.random.uniform(()) < 0.5:
|
2227 |
+
coor_text = "before"
|
2228 |
+
captions = tf.strings.join([coor, captions], separator=": ")
|
2229 |
+
else:
|
2230 |
+
coor_text = "after"
|
2231 |
+
captions = tf.strings.join([captions, coor], separator=": ")
|
2232 |
+
|
2233 |
+
ix = tf.random.uniform((), 0, 6, tf.int32)
|
2234 |
+
center = boxes
|
2235 |
+
if ix == 0:
|
2236 |
+
order_text = "left"
|
2237 |
+
sort_by = boxes[:, 0]
|
2238 |
+
elif ix == 1:
|
2239 |
+
order_text = "right"
|
2240 |
+
sort_by = -boxes[:, 2]
|
2241 |
+
elif ix == 2:
|
2242 |
+
order_text = "top"
|
2243 |
+
sort_by = boxes[:, 1]
|
2244 |
+
elif ix == 3:
|
2245 |
+
order_text = "bottom"
|
2246 |
+
sort_by = -boxes[:, 3]
|
2247 |
+
elif ix == 4:
|
2248 |
+
order_text = "largest"
|
2249 |
+
sort_by = area
|
2250 |
+
else:
|
2251 |
+
order_text = "smallest"
|
2252 |
+
sort_by = -area
|
2253 |
+
ixs = tf.argsort(sort_by)
|
2254 |
+
captions = tf.gather(captions, ixs)
|
2255 |
+
text = tf.strings.join([
|
2256 |
+
order_text,
|
2257 |
+
coor_text,
|
2258 |
+
tf.strings.reduce_join(captions, separator="\n")
|
2259 |
+
], separator="; ")
|
2260 |
+
|
2261 |
+
if "caption" in ex:
|
2262 |
+
if tf.random.uniform(()) > 0.5:
|
2263 |
+
text = tf.strings.join([text, "\ncaption: ", ex["caption"]])
|
2264 |
+
else:
|
2265 |
+
text = tf.strings.join(["caption: ", ex["caption"], "\n", text])
|
2266 |
+
|
2267 |
+
return dict(
|
2268 |
+
image=ex["image"],
|
2269 |
+
text=text
|
2270 |
+
)
|
2271 |
+
|
2272 |
+
|
2273 |
+
@seqio.map_over_dataset()
|
2274 |
+
def join_captions(ex):
|
2275 |
+
text = tf.random.shuffle(ex['text'])
|
2276 |
+
ex["text"] = tf.strings.reduce_join(text, separator="\n")
|
2277 |
+
return ex
|
2278 |
+
|
2279 |
+
|
2280 |
+
@seqio.map_over_dataset(num_seeds=1)
|
2281 |
+
def extract_figureqa(ex, seed):
|
2282 |
+
questions = ex["questions"]
|
2283 |
+
n = stateless_permutation(tf.shape(questions["question"])[0], seed)
|
2284 |
+
return dict(
|
2285 |
+
image=ex["image"],
|
2286 |
+
questions=tf.gather(questions["question"], n),
|
2287 |
+
question_id=tf.gather(questions["question_id"], n),
|
2288 |
+
answer=tf.gather(tf.strings.as_string(questions["answer"]), n)
|
2289 |
+
)
|
2290 |
+
|
2291 |
+
|
2292 |
+
@seqio.map_over_dataset
|
2293 |
+
def convert_figureqa_answer(ex):
|
2294 |
+
keys_tensor = tf.constant(["0", "1"])
|
2295 |
+
values_tensor = tf.constant(["no", "yes"])
|
2296 |
+
table = tf.lookup.StaticHashTable(
|
2297 |
+
tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor),
|
2298 |
+
default_value=tf.constant("nan", dtype=tf.string),
|
2299 |
+
)
|
2300 |
+
answer = table.lookup(ex["answer"])
|
2301 |
+
ex["answer"] = answer
|
2302 |
+
return ex
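# Minimal sketch of the lookup-table mapping used above: "0"/"1" answers become
# "no"/"yes", anything else falls back to the default value.
def _example_figureqa_lookup():
    table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(
            tf.constant(["0", "1"]), tf.constant(["no", "yes"])),
        default_value=tf.constant("nan", dtype=tf.string),
    )
    return table.lookup(tf.constant(["1", "0", "2"]))  # -> ["yes", "no", "nan"]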
|
2303 |
+
|
2304 |
+
|
2305 |
+
@seqio.map_over_dataset()
|
2306 |
+
def build_question_with_hint(ex):
|
2307 |
+
hint = ex["hint"]
|
2308 |
+
if tf.strings.length(hint) > 0:
|
2309 |
+
ex["question"] = tf.strings.join([hint, ex["question"]], separator="\n")
|
2310 |
+
return ex
|
2311 |
+
|
2312 |
+
@seqio.map_over_dataset()
|
2313 |
+
def build_question_with_context(ex):
|
2314 |
+
context = ex["context"]
|
2315 |
+
if tf.strings.length(context) > 0:
|
2316 |
+
ex["question"] = tf.strings.join([context, ex["question"]], separator="\n")
|
2317 |
+
return ex
|
2318 |
+
|
2319 |
+
|
2320 |
+
def max_words(ds, max_words):
|
2321 |
+
return ds.filter(lambda x: x["n_words"] <= max_words)
|
2322 |
+
|
2323 |
+
|
2324 |
+
@seqio.map_over_dataset
|
2325 |
+
def format_pdfa_eng_wds(example):
|
2326 |
+
return dict(
|
2327 |
+
image=example["image"],
|
2328 |
+
text=tf.strings.reduce_join(example["lines"]["text"], separator="\n"),
|
2329 |
+
)
|
2330 |
+
|
2331 |
+
|
2332 |
+
@gin.configurable()
|
2333 |
+
def accuracy_conditioned_joint(ds, sequence_length, is_eval=False, eval_quality=17,
|
2334 |
+
transcript_quality=None):
|
2335 |
+
# v2: Transcripts no longer get a quality score
|
2336 |
+
is_training = sequence_length.get('is_training', True)
|
2337 |
+
if not is_training:
|
2338 |
+
if is_eval:
|
2339 |
+
prompt = f"quality {eval_quality}:"
|
2340 |
+
else:
|
2341 |
+
prompt = f"quality 17:"
|
2342 |
+
|
2343 |
+
@seqio.map_over_dataset
|
2344 |
+
def _with_prompt(ex):
|
2345 |
+
out = dict(
|
2346 |
+
image=ex["image"],
|
2347 |
+
url=ex["url"],
|
2348 |
+
prompt=prompt,
|
2349 |
+
)
|
2350 |
+
if "text" in ex:
|
2351 |
+
out["text"] = ex["text"]
|
2352 |
+
elif "caption" in ex:
|
2353 |
+
out["text"] = ex["caption"]
|
2354 |
+
return out
|
2355 |
+
return _with_prompt(ds)
|
2356 |
+
|
2357 |
+
elif is_eval:
|
2358 |
+
raise ValueError("is_eval=True is only supported when is_training=False")
|
2359 |
+
|
2360 |
+
# each transcript
|
2361 |
+
@seqio.map_over_dataset
|
2362 |
+
def _with_transcript(ex):
|
2363 |
+
if tf.shape(ex["edited_captions"]["caption"])[0] > 0:
|
2364 |
+
edited_caption = ex["edited_captions"]["caption"][0]
|
2365 |
+
n = ex["edited_captions"]["n_edits"][0]
|
2366 |
+
else:
|
2367 |
+
edited_caption = ""
|
2368 |
+
n = 0
|
2369 |
+
text = [
|
2370 |
+
ex["caption"],
|
2371 |
+
ex["transcripts"][tf.random.uniform((), 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)],
|
2372 |
+
edited_caption
|
2373 |
+
]
|
2374 |
+
edit_quality = 17 - n
|
2375 |
+
prompt = [
|
2376 |
+
"quality 17:",
|
2377 |
+
"" if transcript_quality is None else f"quality: {edit_quality}:",
|
2378 |
+
tf.strings.join(["quality ", tf.strings.as_string(edit_quality), ":"])
|
2379 |
+
]
|
2380 |
+
return dict(
|
2381 |
+
image=ex["image"],
|
2382 |
+
text=tf.stack(text, 0),
|
2383 |
+
url=ex["url"],
|
2384 |
+
prompt=tf.stack(prompt, 0),
|
2385 |
+
style=["long_caption", "transcript", "long_caption"]
|
2386 |
+
)
|
2387 |
+
return _with_transcript(ds)
|
2388 |
+
|
2389 |
+
|
2390 |
+
def select_dense_caption_sample(ds, samples=200):
|
2391 |
+
def compute_hash(string: str) -> str:
|
2392 |
+
return hashlib.sha256(string.encode("utf-8")).hexdigest()
|
2393 |
+
|
2394 |
+
with tf.io.gfile.GFile("gs://oe-training-chrisc/cockatoo/data/dense-caption-eval-v0-final-data.json") as f:
|
2395 |
+
data = json.load(f)
|
2396 |
+
for ex in data:
|
2397 |
+
ex["image_id"] = compute_hash(ex["image"])
|
2398 |
+
data.sort(key=lambda x: x["image_id"])
|
2399 |
+
np.random.RandomState(12312).shuffle(data)
|
2400 |
+
keep = tf.constant([x["image"] for x in data[:samples]])
|
2401 |
+
|
2402 |
+
def _keep(ex):
|
2403 |
+
return tf.reduce_any(ex["url"] == keep)
|
2404 |
+
ds = ds.filter(_keep)
|
2405 |
+
ds = tf.data.experimental.assert_cardinality(samples)(ds)
|
2406 |
+
return ds
|
2407 |
+
|
2408 |
+
@seqio.map_over_dataset()
|
2409 |
+
def charxiv_preprocessor(ex):
|
2410 |
+
question_names = ["descriptive_q1", "descriptive_q2", "descriptive_q3", "descriptive_q4", "reasoning_q"]
|
2411 |
+
answer_names = ["descriptive_a1", "descriptive_a2", "descriptive_a3", "descriptive_a4", "reasoning_a"]
|
2412 |
+
|
2413 |
+
questions = [ex[name] for name in question_names]
|
2414 |
+
answers = [ex[name] for name in answer_names]
|
2415 |
+
|
2416 |
+
return dict(
|
2417 |
+
image=ex["image"],
|
2418 |
+
question=tf.stack(questions, 0),
|
2419 |
+
answer=tf.stack(answers, 0)
|
2420 |
+
)
|
2421 |
+
|
2422 |
+
@seqio.map_over_dataset()
|
2423 |
+
def charxiv_descriptive_preprocessor(ex):
|
2424 |
+
question_names = ["descriptive_q1", "descriptive_q2", "descriptive_q3", "descriptive_q4"]
|
2425 |
+
answer_names = ["descriptive_a1", "descriptive_a2", "descriptive_a3", "descriptive_a4"]
|
2426 |
+
|
2427 |
+
questions = [ex[name] for name in question_names]
|
2428 |
+
answers = [ex[name] for name in answer_names]
|
2429 |
+
|
2430 |
+
return dict(
|
2431 |
+
image=ex["image"],
|
2432 |
+
question=tf.stack(questions, 0),
|
2433 |
+
answer=tf.stack(answers, 0)
|
2434 |
+
)
|
2435 |
+
|
2436 |
+
@seqio.map_over_dataset()
|
2437 |
+
def charxiv_reasoning_preprocessor(ex):
|
2438 |
+
return dict(
|
2439 |
+
image=ex["image"],
|
2440 |
+
question=ex["reasoning_q"],
|
2441 |
+
answer=ex["reasoning_a"]
|
2442 |
+
)
|
2443 |
+
|
2444 |
+
@seqio.map_over_dataset()
|
2445 |
+
def tablevqa_preprocessor(ex):
|
2446 |
+
return dict(
|
2447 |
+
image=ex["image"],
|
2448 |
+
question=ex["question"],
|
2449 |
+
answer=ex["gt"]
|
2450 |
+
)
|
2451 |
+
|
2452 |
+
@seqio.map_over_dataset()
|
2453 |
+
def vtabfact_preprocessor(ex):
|
2454 |
+
return dict(
|
2455 |
+
image=ex["image"],
|
2456 |
+
question=tf.strings.join([ex["question"], "Answer with yes or no."], separator="\n"),
|
2457 |
+
answer=ex["gt"]
|
2458 |
+
)
|
2459 |
+
|
2460 |
+
@seqio.map_over_dataset()
|
2461 |
+
def nutrition_fact_preprocessor(ex):
|
2462 |
+
question_names = ["descriptive_q", "reasoning_q"]
|
2463 |
+
answer_names = ["descriptive_a", "reasoning_a"]
|
2464 |
+
|
2465 |
+
questions = [ex[name] for name in question_names]
|
2466 |
+
answers = [ex[name] for name in answer_names]
|
2467 |
+
|
2468 |
+
return dict(
|
2469 |
+
image=ex["image"],
|
2470 |
+
question=tf.stack(questions, 0),
|
2471 |
+
answer=tf.stack(answers, 0)
|
2472 |
+
)
|
prompts.py
ADDED
@@ -0,0 +1,385 @@
1 |
+
import re
|
2 |
+
|
3 |
+
import tensorflow as tf
|
4 |
+
|
5 |
+
IMAGE_PROMPT = "<|image|>"
|
6 |
+
|
7 |
+
|
8 |
+
GENERAL_PROMPTS_V1 = {
|
9 |
+
"short_answer": [
|
10 |
+
"Answer this question very briefly\n{question}",
|
11 |
+
"{question} Answer with a few words",
|
12 |
+
"{question} Response very briefly",
|
13 |
+
"{question} Answer directly without any details, explanation, or elaboration",
|
14 |
+
"I have a question about this image, please answer it very briefly: {question}",
|
15 |
+
"Question: {question} Short Answer:",
|
16 |
+
"Question: {question}\nShort Answer:",
|
17 |
+
'{question}\nAnswer the question as briefly as possible.',
|
18 |
+
'Answer very briefly:\n{question}',
|
19 |
+
'The question "{question}" can be answered using the image. A short answer is',
|
20 |
+
"{question} Based on the image, respond to this question with a short answer:",
|
21 |
+
"{question} Short answer:",
|
22 |
+
"{question} A short answer to the question is",
|
23 |
+
"Give a short, matter-of-fact answer to this question: {question}",
|
24 |
+
"Give me a simple, direct answer to this question, do not elaborate or explain your answer:\n{question}"
|
25 |
+
],
|
26 |
+
"short_caption": [
|
27 |
+
'Caption the image with 1 or two sentences',
|
28 |
+
'Write a very short description of this image.',
|
29 |
+
'Briefly describe the image.',
|
30 |
+
'Look at this image, and then summarize it in a sentence or two.',
|
31 |
+
'Write a brief caption describing the image',
|
32 |
+
'Brief Caption:',
|
33 |
+
'A short image caption:',
|
34 |
+
'A short image description',
|
35 |
+
'Briefly describe the content of the image.',
|
36 |
+
'Can you give me one sentence summary of the picture?',
|
37 |
+
'How would you describe this image in a sentence or two?',
|
38 |
+
],
|
39 |
+
"long_caption": [
|
40 |
+
'Describe this image.',
|
41 |
+
'Describe this image',
|
42 |
+
'describe the image',
|
43 |
+
'Write a long description of this image.',
|
44 |
+
'caption the picture',
|
45 |
+
'Caption',
|
46 |
+
'caption',
|
47 |
+
'Construct a long caption for this image',
|
48 |
+
'Generate a caption',
|
49 |
+
'Create a detailed caption',
|
50 |
+
'Write a long caption',
|
51 |
+
'Describe this image in detail',
|
52 |
+
'Describe this',
|
53 |
+
'describe this',
|
54 |
+
'Caption this',
|
55 |
+
'What can be seen in this image?',
|
56 |
+
'What do you see in the image?',
|
57 |
+
'Look at this photo carefully and then tell me about it in detail',
|
58 |
+
'Write a long description of this image',
|
59 |
+
'Tell me about this picture.',
|
60 |
+
'Write a paragraph about this image.',
|
61 |
+
'Look at this image carefully and then describe it in detail',
|
62 |
+
'Generate a long caption about this image.'
|
63 |
+
],
|
64 |
+
"long_caption_no_pointing": [
|
65 |
+
'Describe this image in detail, but without any pointing.',
|
66 |
+
'Write a long description of this image, do not produce any points.',
|
67 |
+
'Tell me about this picture, use plain text only.',
|
68 |
+
'Generate a plain text description of this caption',
|
69 |
+
"What is in this image?\nNo pointing\nGive lots of detail"
|
70 |
+
"Write a long caption.\nDo not use image coordinates\nOutput a full paragraph"
|
71 |
+
],
|
72 |
+
"transcript": [
|
73 |
+
'Describe this image as if you are a person speaking',
|
74 |
+
'Imagine you are a person talking about this image. Generate a transcript of what you would say.',
|
75 |
+
"Generate an audio transcript of a person describing this image",
|
76 |
+
"Create a transcript of a human describing this image out load",
|
77 |
+
"Describe this in this style of a human talking",
|
78 |
+
],
|
79 |
+
"refexp": [
|
80 |
+
'What region does \"{refexp}\" refer to?',
|
81 |
+
],
|
82 |
+
"count_bench": [
|
83 |
+
'How many {object} are there?',
|
84 |
+
],
|
85 |
+
"refexp_pointing": [
|
86 |
+
'Where is the \"{refexp}\"?',
|
87 |
+
'Point to {refexp}',
|
88 |
+
'point at {refexp}',
|
89 |
+
'Find the {refexp}.',
|
90 |
+
'Which object in the image does \"{refexp}\" refer to?',
|
91 |
+
'Locate the object \"{refexp}\" refers to.',
|
92 |
+
'Point to the object that best matches the expression:\n{refexp}\n',
|
93 |
+
'What object could be described as: {refexp}.\nPoint:',
|
94 |
+
'Referring Expression: {refexp}.\nPoint:',
|
95 |
+
'Expression: {refexp}\nPoint to the refexp',
|
96 |
+
'Task: Point to the object that best matches the expression.\nExpression: {refexp}\nPoint:',
|
97 |
+
'Instruction: Locate the object that matches the expression by returning a point.\nReferring Expression: {refexp}\n',
|
98 |
+
'Help me find an object in this image by pointing to the {refexp}',
|
99 |
+
'What point of the image might the expression \'{refexp}\' refer to?',
|
100 |
+
],
|
101 |
+
"plain": ["{question}"],
|
102 |
+
"multiple_choice": [
|
103 |
+
"{question}\n{options}\nReturn only the letter of the best answer option",
|
104 |
+
"Answer this question by naming one of the provided options:\n{question}\n{options}",
|
105 |
+
"{question}\n{options}\nWhat option best answers the question?",
|
106 |
+
"{question}\n{options}\nReturn the best answer option",
|
107 |
+
"Look at the options, then return the letter of the option that best answers the question.\nQuesiton: {question}\nOptions: {options}",
|
108 |
+
"{question}? Select an answer option from:\n{options}",
|
109 |
+
"{question}\nSelect an answer option from:\n{options}\n\n",
|
110 |
+
"Question: {question}? Options: {options} Answer:",
|
111 |
+
"Answer the question by selecting an answer options\nQuestion: {question}\nOptions: {options}",
|
112 |
+
"{question}?\n{options}\nReturn only the letter of the correct answer",
|
113 |
+
"Help me answer this question: \"{question}\", by stating which of the following options is correct\n{options}."
|
114 |
+
],
|
115 |
+
"binary": ["{question}\nAnswer with 'yes' or 'no'"],
|
116 |
+
"pointing": [
|
117 |
+
"Point to {entity}\nPlease say 'This isn't in the image.' if it is not in the image.",
|
118 |
+
"Point to all occurrences of \"{entity}\"",
|
119 |
+
"Point to any {entity} in the image",
|
120 |
+
"Point to any {entity} in the image.",
|
121 |
+
"Point: Where are the {entity}",
|
122 |
+
"Show me where the {entity} are",
|
123 |
+
"Can you show me where the {entity} are?",
|
124 |
+
"Show me where the {entity} are",
|
125 |
+
"Show me where a {entity} is",
|
126 |
+
"Show me where a {entity} is.",
|
127 |
+
"If there are any {entity} in the image? Show me where they are.",
|
128 |
+
"Where are the {entity}?",
|
129 |
+
"Generate a list of points showing where the {entity} are.",
|
130 |
+
"Find the \"{entity}\".",
|
131 |
+
"Find a \"{entity}\".",
|
132 |
+
"Locate all {entity}.",
|
133 |
+
"Locate an {entity}.",
|
134 |
+
"Locate a {entity}.",
|
135 |
+
"Locate every {entity}.",
|
136 |
+
"Locate {entity}.",
|
137 |
+
"Locate the {entity}.",
|
138 |
+
"Object: {entity}\nInstruction: Point to the object.",
|
139 |
+
"find {entity}",
|
140 |
+
"find {entity}.",
|
141 |
+
"Point to every {entity}",
|
142 |
+
"find any {entity} in the picture",
|
143 |
+
"Find the {entity}",
|
144 |
+
"Find any {entity}",
|
145 |
+
"Point to a {entity}",
|
146 |
+
"Point to an {entity}",
|
147 |
+
"Look for {entity} in the image and show me where they are.",
|
148 |
+
"Help me find an object in the image by pointing to them.\nObject: {entity}.",
|
149 |
+
"I am looking for {entity}, where can they be found in the image?",
|
150 |
+
"Can you see any {entity} in the image? Point to them.",
|
151 |
+
"Point out each {entity} in the image.",
|
152 |
+
"Point out every {entity} in the image.",
|
153 |
+
"Point to the {entity} in the image.",
|
154 |
+
"Locate each {entity} in the image.",
|
155 |
+
"Can you point out all {entity} in this image?",
|
156 |
+
"Please find {entity} and show me where they are.",
|
157 |
+
"If there are any {entity} present, indicate their positions.",
|
158 |
+
"If there is a {entity} present, indicate its positions.",
|
159 |
+
"show me all visible {entity}",
|
160 |
+
],
|
161 |
+
"point_count": [
|
162 |
+
"How many {entity} are there?",
|
163 |
+
"How many {entity}?",
|
164 |
+
"How many {entity}.",
|
165 |
+
"how many {entity}.",
|
166 |
+
"how many {entity}?",
|
167 |
+
"How many {entity} are there in the image?",
|
168 |
+
"Tell me how many {entity} there are",
|
169 |
+
"Tell me how many {entity} there are and point to them.",
|
170 |
+
"how many {entity}",
|
171 |
+
"Tell me where each {entity} is.",
|
172 |
+
"Tell me how many {entity} are in the image",
|
173 |
+
"count {entity}",
|
174 |
+
"count every {entity}",
|
175 |
+
"count each {entity}",
|
176 |
+
"count {entity}.",
|
177 |
+
"Count the {entity}.",
|
178 |
+
"How many {entity} do you see?",
|
179 |
+
"How many {entity} are visible?",
|
180 |
+
"Count all the {entity}",
|
181 |
+
"how mmny {entity}?",
|
182 |
+
"Count every {entity} in the picture.",
|
183 |
+
"Count all the {entity}",
|
184 |
+
"Count each {entity}",
|
185 |
+
"Point to and count the {entity} in the picture.",
|
186 |
+
"Point and count {entity}",
|
187 |
+
"Point to every {entity}",
|
188 |
+
"Locate the {entity} and count them",
|
189 |
+
"Locate every {entity} and count them",
|
190 |
+
"Find all the {entity}. How many are there?",
|
191 |
+
"Find each {entity}. How many are there?",
|
192 |
+
"Point at {entity} and then tell me the count.",
|
193 |
+
"What is the total number of {entity} in the image?",
|
194 |
+
"In all the picture, how many {entity} are there?",
|
195 |
+
"Point at the {entity} and then count them.",
|
196 |
+
"Point to all the visible {entity} output the total count.",
|
197 |
+
"Point to all the {entity} visible and output the total count. \nPlease say 'This isn't in the image.' if it is not in the image.",
|
198 |
+
"Point to all occurrences of \"{entity}\" and output the total count.",
|
199 |
+
"Show me where the {entity} are and output the total count.",
|
200 |
+
"Where are the {entity}? How many are there?",
|
201 |
+
"Generate list of points showing where the {entity} are and output the total count.",
|
202 |
+
"Object: {entity}\nInstruction: Point to the object and output the total count.",
|
203 |
+
"find any {entity} in the picture and output the total count.",
|
204 |
+
"Can you see any {entity} in the image? Point to them and output the total count.",
|
205 |
+
"Can you point out all {entity} in this image? How many are there?",
|
206 |
+
"If there are any {entity} present, indicate their positions and output the total count.",
|
207 |
+
"How many {entity} are there in the image? Point to them and output the total count.",
|
208 |
+
"How many {entity} are there in the image?",
|
209 |
+
"Give me the count of {entity} in the image.",
|
210 |
+
"How many {entity} are visible in the image?",
|
211 |
+
"How many {entity} are there?",
|
212 |
+
"In the image, how many {entity} are there?",
|
213 |
+
"Can you count the number of {entity} in the image?",
|
214 |
+
"Can you count every {entity} in the picture?",
|
215 |
+
"Can you see any {entity} in the image? How many are there?",
|
216 |
+
"Are there any {entity} in the image? How many are there?",
|
217 |
+
"If you see any {entity} in the image, give me the count. Otherwise, say 'This isn't in the image.'",
|
218 |
+
"Object: {entity}\nInstruction: How many are there?",
|
219 |
+
],
|
220 |
+
|
221 |
+
# vaia
|
222 |
+
"detailed_solution": [
|
223 |
+
"Answer the question providing a step by step solution and answer in the end.\n"
|
224 |
+
"Provide a step-by-step solution to the question, ending with your final answer.\n",
|
225 |
+
"Please provide a step-by-step solution to the question shown in the image.\n",
|
226 |
+
"Give a detailed explanation for the question, concluding with your final answer.\n",
|
227 |
+
"Solve the problem presented in the question with a thorough explanation. Give me your final answer at the end.\n",
|
228 |
+
"Please analyze the question and provide a complete solution, finishing with your final answer.\n",
|
229 |
+
"Work through the problem, offering detailed reasoning before stating your final answer.\n",
|
230 |
+
"Interpret the question and guide me through the solution, concluding with your answer.\n",
|
231 |
+
"Review the question and deliver a well-explained solution, making sure to include your final answer.\n",
|
232 |
+
"Examine the question: provide a detailed explanation followed by your final answer.\n"
|
233 |
+
],
|
234 |
+
|
235 |
+
# vaia first answer with short_answer
|
236 |
+
"detailed_solution_answer_first": [
|
237 |
+
"Answer the question directly, then provide a step-by-step solution.\n",
|
238 |
+
"Please provide the answer first, followed by a step-by-step solution to the question shown in the image.\n",
|
239 |
+
"Give the final answer first, then provide a detailed explanation for the question.\n",
|
240 |
+
"Provide the final answer, then solve the problem presented in the question with a thorough explanation.\n",
|
241 |
+
"First, give the final answer, then analyze the question and provide a complete solution.\n",
|
242 |
+
"State the final answer first, then work through the problem, offering detailed reasoning.\n",
|
243 |
+
"Provide the final answer, then interpret the question and guide me through the solution.\n",
|
244 |
+
"Give the final answer first, then review the question and deliver a well-explained solution.\n",
|
245 |
+
"First, provide the final answer, then examine the question and give a detailed explanation.\n"
|
246 |
+
],
|
247 |
+
|
248 |
+
# vqa_online
|
249 |
+
"detailed_answer": [
|
250 |
+
"Answer the question providing a step-by-step explanation and answer in the end.\n",
|
251 |
+
"Provide a step-by-step explanation to the question, ending with your final answer.\n",
|
252 |
+
"Please provide a step-by-step explanation to the question shown in the image.\n",
|
253 |
+
"Give a detailed explanation for the question, concluding with your final answer.\n",
|
254 |
+
"Address the problem presented in the question with a thorough explanation. Give me your final answer at the end.\n",
|
255 |
+
"Please analyze the question and provide a complete explanation, finishing with your final answer.\n",
|
256 |
+
"Work through the problem, offering detailed reasoning before stating your final answer.\n",
|
257 |
+
"Interpret the question and guide me through the explanation, concluding with your answer.\n",
|
258 |
+
"Review the question and deliver a well-explained answer, making sure to include your final answer.\n",
|
259 |
+
"Examine the question: provide a detailed explanation followed by your final answer.\n"
|
260 |
+
],
|
261 |
+
}
|
262 |
+
|
263 |
+
GENERAL_PROMPTS_V1["pointing_tag"] = [txt + " Make the alt text and the inside of the tag the target label." for txt in GENERAL_PROMPTS_V1["pointing"]]
|
264 |
+
|
265 |
+
STYLE_TO_GENERAL_PROMPT = {
|
266 |
+
"vqa2": "short_answer",
|
267 |
+
"coco_captioning": "short_caption",
|
268 |
+
"gqa": "short_answer",
|
269 |
+
"ocr_vqa": "short_answer",
|
270 |
+
"tally_qa": "short_answer",
|
271 |
+
"text_vqa": "short_answer",
|
272 |
+
"okvqa": "short_answer",
|
273 |
+
"chart_qa": "short_answer",
|
274 |
+
"doc_qa": "short_answer",
|
275 |
+
"info_qa": "short_answer",
|
276 |
+
"science_qa": "multiple_choice",
|
277 |
+
"ai2_diagram": "multiple_choice",
|
278 |
+
"a_okvqa_mc": "multiple_choice",
|
279 |
+
"a_okvqa_da": "short_answer",
|
280 |
+
"long_caption": "long_caption",
|
281 |
+
"web_pointing": "plain",
|
282 |
+
"count_bench": "count_bench",
|
283 |
+
"refexp": "refexp",
|
284 |
+
"refexp_pointing": "refexp_pointing",
|
285 |
+
"vtabfact": "binary",
|
286 |
+
"vwtq": "short_answer",
|
287 |
+
"vwtq_syn": "short_answer",
|
288 |
+
"fintabnetqa": "short_answer",
|
289 |
+
"scifi_charts": "short_answer",
|
290 |
+
"scifi_charts_qa": "short_answer",
|
291 |
+
"charxiv_descriptive": "short_answer",
|
292 |
+
"charxiv_reasoning": "short_answer",
|
293 |
+
"pointing": "pointing",
|
294 |
+
"pointing_tag": "pointing_tag",
|
295 |
+
"point_count": "point_count",
|
296 |
+
"plain": "plain",
|
297 |
+
}
|
298 |
+
|
299 |
+
|
300 |
+
# def maybe_format_options(example, option_style="basic"):
|
301 |
+
# abc = tf.constant(list("abcdefg".upper()))
|
302 |
+
# if option_style == "random-v1":
|
303 |
+
# letter_option_sep = [": ", ". ", ")"]
|
304 |
+
# option_sep = ["\n", "\n", "\n", " ", ". ", ".\n", "; ", ", "]
|
305 |
+
# option_sep = tf.constant(option_sep)[tf.random.uniform((), 0, len(option_sep), tf.int32)]
|
306 |
+
# elif option_style == "basic":
|
307 |
+
# letter_option_sep = ": "
|
308 |
+
# option_sep = "\n"
|
309 |
+
# else:
|
310 |
+
# raise NotImplementedError(option_style)
|
311 |
+
#
|
312 |
+
# options = example["options"]
|
313 |
+
# short_options = abc[:tf.shape(options)[0]]
|
314 |
+
# sep = tf.constant(letter_option_sep)[tf.random.uniform((), 0, len(letter_option_sep), tf.int32)]
|
315 |
+
#
|
316 |
+
# options = tf.stack([short_options, options,], 1)
|
317 |
+
#
|
318 |
+
# options = tf.strings.reduce_join(options, axis=-1, separator=sep)
|
319 |
+
#
|
320 |
+
# options = tf.strings.reduce_join(options, separator=option_sep)
|
321 |
+
# example["options"] = options
|
322 |
+
# tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, ".*\|\|\|.*")), False)
|
323 |
+
# example["metadata/option_names"] = tf.strings.reduce_join(short_options, separator="|||")
|
324 |
+
#
|
325 |
+
# if "answer_idx" in example:
|
326 |
+
# if example["answer_idx"] < 0:
|
327 |
+
# example["text"] = "?"
|
328 |
+
# else:
|
329 |
+
# example["text"] = short_options[example["answer_idx"]]
|
330 |
+
# example["metadata/answer_idx"] = example["answer_idx"]
|
331 |
+
# return example
|
332 |
+
|
333 |
+
|
334 |
+
def apply_keyword_prompt(prompts, example, seed=None, weights=None, keywords=None):
|
335 |
+
if isinstance(prompts, list):
|
336 |
+
assert keywords is None
|
337 |
+
all_keywords = [sorted(re.findall("{([^{}]+)}", x)) for x in prompts]
|
338 |
+
keywords = all_keywords[0]
|
339 |
+
assert len(keywords) == len(set(keywords)), f"Repeated keywords in {keywords}"
|
340 |
+
assert all(keywords == x for x in all_keywords), f"Inconsistent keywords in prompts {all_keywords}"
|
341 |
+
assert not any("{" not in word[1:-1] and "}" in word[1:-1] for word in keywords)
|
342 |
+
|
343 |
+
for k in keywords:
|
344 |
+
assert k in example, f"Example missing expected field {k}, example={example}"
|
345 |
+
prompts = tf.constant(prompts)
|
346 |
+
|
347 |
+
multiple = False
|
348 |
+
if "text" in example and len(example["text"].shape) > 0:
|
349 |
+
multiple = True
|
350 |
+
|
351 |
+
if weights is not None:
|
352 |
+
weights = tf.expand_dims(tf.math.log(weights), 0)
|
353 |
+
|
354 |
+
if seed is None:
|
355 |
+
raise ValueError()
|
356 |
+
|
357 |
+
if not multiple:
|
358 |
+
if weights is None:
|
359 |
+
prompt = prompts[tf.random.stateless_uniform((), seed, 0, len(prompts), dtype=tf.int32)]
|
360 |
+
else:
|
361 |
+
prompt = prompts[tf.random.stateless_categorical(weights, 1, seed, 0, len(prompts), dtype=tf.int32)][0, 0]
|
362 |
+
for keyword in keywords:
|
363 |
+
# We use split not regex_replace because regex_replace has issues with
|
364 |
+
# value strings with backslashes
|
365 |
+
res = tf.strings.split(prompt, "{"+keyword+"}", maxsplit=2)
|
366 |
+
prompt = tf.strings.join([res[0], example[keyword], res[1]])
|
367 |
+
return prompt
|
368 |
+
else:
|
369 |
+
n_prompts = tf.shape(example["text"])[0]
|
370 |
+
if weights is None:
|
371 |
+
ix = tf.random.stateless_uniform(
|
372 |
+
(n_prompts,), seed, 0, tf.shape(prompts)[0], dtype=tf.int32)
|
373 |
+
else:
|
374 |
+
ix = tf.random.stateless_categorical(
|
375 |
+
weights, tf.shape(prompts)[0], seed, 0, len(prompts), dtype=tf.int32)[0]
|
376 |
+
prompt = tf.gather(prompts, ix)
|
377 |
+
out = tf.TensorArray(dtype=tf.string, size=n_prompts, element_shape=())
|
378 |
+
for i in range(n_prompts):
|
379 |
+
modified = prompt[i]
|
380 |
+
for keyword in keywords:
|
381 |
+
res = tf.strings.split(modified, "{"+keyword+"}", maxsplit=2)
|
382 |
+
modified = tf.strings.join([res[0], example[keyword][i], res[1]])
|
383 |
+
out = out.write(i, modified)
|
384 |
+
return out.stack()
|
385 |
+
|
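
apply_keyword_prompt picks one template with a stateless random draw and splices the example's fields into the `{keyword}` slots via `tf.strings.split`/`tf.strings.join` (rather than regex replacement, which mishandles backslashes). A short usage sketch against the `GENERAL_PROMPTS_V1["short_answer"]` templates above; the example field and seed values are made up:

import tensorflow as tf

example = {"question": tf.constant("What color is the bus?")}
prompt = apply_keyword_prompt(
    GENERAL_PROMPTS_V1["short_answer"],
    example,
    seed=tf.constant([3, 7], dtype=tf.int32),  # stateless RNG seed of shape [2]
)
print(prompt.numpy().decode())  # e.g. "Question: What color is the bus? Short Answer:"
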
seqio_tokenizer.py
ADDED
@@ -0,0 +1,659 @@
1 |
+
# Copyright 2023 The SeqIO Authors.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
"""Vocabularies."""
|
16 |
+
|
17 |
+
import abc
|
18 |
+
import dataclasses
|
19 |
+
import functools
|
20 |
+
import hashlib
|
21 |
+
import threading
|
22 |
+
from typing import Any, ClassVar, Dict, Iterable, Optional, Sequence, Union, List, Tuple
|
23 |
+
|
24 |
+
import numpy as np
|
25 |
+
from absl import logging
|
26 |
+
import tensorflow.compat.v2 as tf
|
27 |
+
|
28 |
+
from sentencepiece import sentencepiece_model_pb2
|
29 |
+
import sentencepiece as sentencepiece_processor
|
30 |
+
|
31 |
+
PAD_ID = -1 # -1 for llama tokenizer
|
32 |
+
|
33 |
+
|
34 |
+
class Vocabulary(metaclass=abc.ABCMeta):
|
35 |
+
"""Abstract class for all vocabularies.
|
36 |
+
|
37 |
+
Subclasses must implement methods for converting between strings and tokens
|
38 |
+
both in pure python (`_encode`/`_decode`) and in TensorFlow
|
39 |
+
(`_encode_tf`/`_decode_tf`).
|
40 |
+
|
41 |
+
Subclasses are responsible for reserving PAD_ID=0 as well as optionally
|
42 |
+
reserving EOS_ID and UNK_ID
|
43 |
+
|
44 |
+
`_base_vocab_size` should account for PAD, EOS, and UNK but not `extra_ids`.
|
45 |
+
"""
|
46 |
+
|
47 |
+
def __init__(self, extra_ids: int = 0):
|
48 |
+
"""Vocabulary constructor.
|
49 |
+
|
50 |
+
Args:
|
51 |
+
extra_ids: The number of extra IDs to reserve.
|
52 |
+
"""
|
53 |
+
self._extra_ids = extra_ids or 0
|
54 |
+
|
55 |
+
@property
|
56 |
+
def bos_token_id(self) -> Optional[int]:
|
57 |
+
raise NotImplementedError("need to implement bos_id")
|
58 |
+
|
59 |
+
@property
|
60 |
+
@abc.abstractmethod
|
61 |
+
def eos_token_id(self) -> Optional[int]:
|
62 |
+
raise NotImplementedError("need to implement eos_id")
|
63 |
+
|
64 |
+
@property
|
65 |
+
def pad_id(self) -> int:
|
66 |
+
return PAD_ID
|
67 |
+
|
68 |
+
@property
|
69 |
+
@abc.abstractmethod
|
70 |
+
def unk_id(self) -> Optional[int]:
|
71 |
+
raise NotImplementedError("need to implement unk_id")
|
72 |
+
|
73 |
+
@property
|
74 |
+
def extra_ids(self) -> int:
|
75 |
+
return self._extra_ids
|
76 |
+
|
77 |
+
@property
|
78 |
+
def vocab_size(self) -> int:
|
79 |
+
"""Vocabulary size, including extra ids."""
|
80 |
+
return self._base_vocab_size + self.extra_ids
|
81 |
+
|
82 |
+
@property
|
83 |
+
@abc.abstractmethod
|
84 |
+
def _base_vocab_size(self) -> int:
|
85 |
+
"""Vocabulary size, excluding extra ids but including PAD/EOS/UNK."""
|
86 |
+
# TODO(fjord): add a check that pad_id and unk_id (if present)
|
87 |
+
# are less than _base_vocab_size.
|
88 |
+
raise NotImplementedError
|
89 |
+
|
90 |
+
@abc.abstractmethod
|
91 |
+
def _encode(self, s: str) -> Sequence[int]:
|
92 |
+
raise NotImplementedError
|
93 |
+
|
94 |
+
def encode(self, s: Union[Sequence[int], str]) -> Sequence[int]:
|
95 |
+
"""Tokenizes string to an int sequence, without adding EOS."""
|
96 |
+
return self._encode(s)
|
97 |
+
|
98 |
+
@abc.abstractmethod
|
99 |
+
def _decode(self, ids):
|
100 |
+
raise NotImplementedError
|
101 |
+
|
102 |
+
def decode(self, ids: Iterable[int], truncate_at_eos=True):
|
103 |
+
"""Detokenizes int32 iterable to a string, up through first EOS."""
|
104 |
+
clean_ids = list(ids)
|
105 |
+
|
106 |
+
if self.unk_id is not None:
|
107 |
+
vocab_size = self._base_vocab_size
|
108 |
+
clean_ids = [self.unk_id if i >= vocab_size else i for i in clean_ids]
|
109 |
+
|
110 |
+
if truncate_at_eos and (self.eos_token_id is not None and self.eos_token_id in clean_ids):
|
111 |
+
clean_ids = clean_ids[: clean_ids.index(self.eos_token_id) + 1]
|
112 |
+
|
113 |
+
return self._decode(clean_ids)
|
114 |
+
|
115 |
+
@abc.abstractmethod
|
116 |
+
def _encode_tf(self, s: tf.Tensor) -> tf.Tensor:
|
117 |
+
raise NotImplementedError
|
118 |
+
|
119 |
+
def encode_tf(self, s: tf.Tensor) -> tf.Tensor:
|
120 |
+
"""Tokenizes string Scalar to an int32 Tensor, without adding EOS."""
|
121 |
+
return self._encode_tf(s)
|
122 |
+
|
123 |
+
@abc.abstractmethod
|
124 |
+
def _decode_tf(self, ids: tf.Tensor) -> tf.Tensor:
|
125 |
+
raise NotImplementedError
|
126 |
+
|
127 |
+
def decode_tf(self, ids: tf.Tensor) -> tf.Tensor:
|
128 |
+
"""Detokenizes int32 batched Tensor through first EOS."""
|
129 |
+
clean_ids = ids
|
130 |
+
|
131 |
+
if self.unk_id is not None:
|
132 |
+
base_vocab_size = self._base_vocab_size
|
133 |
+
clean_ids = tf.where(
|
134 |
+
tf.less(clean_ids, base_vocab_size), clean_ids, self.unk_id
|
135 |
+
)
|
136 |
+
|
137 |
+
if self.eos_id is not None:
|
138 |
+
# Replace everything after the first eos_id with pad_id.
|
139 |
+
after_eos = tf.cumsum(
|
140 |
+
tf.cast(tf.equal(clean_ids, self.eos_id), tf.int32),
|
141 |
+
exclusive=True,
|
142 |
+
axis=-1,
|
143 |
+
)
|
144 |
+
clean_ids = tf.where(tf.cast(after_eos, tf.bool), self.pad_id, clean_ids)
|
145 |
+
|
146 |
+
return self._decode_tf(clean_ids)
|
147 |
+
|
148 |
+
|
149 |
+
class PassThroughVocabulary(Vocabulary):
|
150 |
+
"""Vocabulary that passes through inputs unchanged."""
|
151 |
+
|
152 |
+
def __init__(self, size: int, eos_id: Optional[Any] = None):
|
153 |
+
"""PassThroughVocabulary constructor.
|
154 |
+
|
155 |
+
Args:
|
156 |
+
size: the full size of the vocabulary.
|
157 |
+
eos_id: the end-of-sequence token.
|
158 |
+
"""
|
159 |
+
self._size = size
|
160 |
+
self._eos_id = eos_id
|
161 |
+
super().__init__()
|
162 |
+
|
163 |
+
@property
|
164 |
+
def _base_vocab_size(self):
|
165 |
+
return self._size
|
166 |
+
|
167 |
+
def _encode(self, s: Sequence[Any]) -> Sequence[Any]:
|
168 |
+
return s
|
169 |
+
|
170 |
+
def _decode(self, ids: Sequence[Any]) -> Sequence[Any]:
|
171 |
+
return ids
|
172 |
+
|
173 |
+
def _encode_tf(self, s: tf.Tensor) -> tf.Tensor:
|
174 |
+
return s
|
175 |
+
|
176 |
+
def _decode_tf(self, ids: tf.Tensor) -> tf.Tensor:
|
177 |
+
return ids
|
178 |
+
|
179 |
+
@property
|
180 |
+
def eos_id(self) -> Optional[Any]:
|
181 |
+
return self._eos_id
|
182 |
+
|
183 |
+
@property
|
184 |
+
def unk_id(self) -> Optional[Any]:
|
185 |
+
return None
|
186 |
+
|
187 |
+
def __eq__(self, other):
|
188 |
+
if not isinstance(other, PassThroughVocabulary):
|
189 |
+
return False
|
190 |
+
return self._size == other._size and self.eos_id == other.eos_id
|
191 |
+
|
192 |
+
def __str__(self) -> str:
|
193 |
+
return f"PassThroughVocabulary(size={self._size}, eos_id={self.eos_id})"
|
194 |
+
|
195 |
+
|
196 |
+
class UnigramVocabulary(Vocabulary):
|
197 |
+
"""Vocabulary that does table-lookup of unigrams."""
|
198 |
+
|
199 |
+
def __init__(self, unigrams: Sequence[str]):
|
200 |
+
"""UnigramVocabulary constructor.
|
201 |
+
|
202 |
+
Args:
|
203 |
+
unigrams: the collection of in-vocabulary tokens. This collection should
|
204 |
+
not include PAD or UNK, which are automatically assigned ids and managed
|
205 |
+
as possible decode tokens.
|
206 |
+
"""
|
207 |
+
|
208 |
+
super().__init__()
|
209 |
+
unigrams_as_list = list(unigrams)
|
210 |
+
self._unigram_by_id = ["PAD"] + unigrams_as_list + ["UNK"]
|
211 |
+
self._id_by_unigram = {u: i for i, u in enumerate(self._unigram_by_id)}
|
212 |
+
initializer = tf.lookup.KeyValueTensorInitializer(
|
213 |
+
keys=tf.constant(["PAD"] + unigrams_as_list),
|
214 |
+
# One extra value because the leading 0 corresponds to PAD
|
215 |
+
values=tf.constant(range(len(unigrams) + 1), dtype=tf.int64),
|
216 |
+
)
|
217 |
+
self._id_by_unigram_tf = tf.lookup.StaticVocabularyTable(
|
218 |
+
initializer, num_oov_buckets=1
|
219 |
+
)
|
220 |
+
self._unigram_by_id_tf = tf.constant(self._unigram_by_id)
|
221 |
+
|
222 |
+
def _encode(self, s: str) -> Sequence[int]:
|
223 |
+
return [self._id_by_unigram.get(s, self.unk_id)]
|
224 |
+
|
225 |
+
def _encode_tf(self, s: tf.Tensor) -> tf.Tensor:
|
226 |
+
tf_ids = self._id_by_unigram_tf.lookup(s)
|
227 |
+
return tf.expand_dims(tf.dtypes.cast(tf_ids, tf.int32), -1)
|
228 |
+
|
229 |
+
def _decode(self, ids: Sequence[int]) -> str:
|
230 |
+
return " ".join(self._unigram_by_id[id] for id in ids)
|
231 |
+
|
232 |
+
def _decode_tf(self, ids: tf.Tensor) -> tf.Tensor:
|
233 |
+
return self._unigram_by_id_tf[ids[0]]
|
234 |
+
|
235 |
+
@property
|
236 |
+
def _base_vocab_size(self):
|
237 |
+
return len(self._unigram_by_id)
|
238 |
+
|
239 |
+
@property
|
240 |
+
def eos_id(self):
|
241 |
+
return None
|
242 |
+
|
243 |
+
@property
|
244 |
+
def unk_id(self):
|
245 |
+
return self._base_vocab_size - 1
|
246 |
+
|
247 |
+
|
248 |
+
class SentencePieceVocabulary(Vocabulary):
|
249 |
+
"""Wrapper for nlp/sentencepiece encoder.
|
250 |
+
|
251 |
+
Assumes the model was built using flags to reserve ID=0 for padding, ID=1 for
|
252 |
+
EOS, and ID=2 for UNK.
|
253 |
+
|
254 |
+
If using extra ids, you can represent them in string-form as `<extra_id_0>`,
|
255 |
+
`<extra_id_1>`, etc. They will be indexed starting from the end of the
|
256 |
+
vocabulary to match how the masking preprocessors are set up.
|
257 |
+
|
258 |
+
IMPORTANT NOTE: these placeholders only work properly when they are used at
|
259 |
+
word starts (e.g., "I like peanut butter and <extra_id_0> sandwiches." or
|
260 |
+
"I like peanut butter and <extra_id_0>ly sandwiches" are both okay, but
|
261 |
+
"I like peanut butter and jel<extra_id_0> sandwiches" is not.).
|
262 |
+
"""
|
263 |
+
|
264 |
+
@dataclasses.dataclass
|
265 |
+
class _ModelContext:
|
266 |
+
tokenizer: sentencepiece_processor.SentencePieceProcessor
|
267 |
+
sp_model: bytes
|
268 |
+
|
269 |
+
_load_model_lock: ClassVar[threading.Lock] = threading.Lock()
|
270 |
+
|
271 |
+
def __init__(
|
272 |
+
self,
|
273 |
+
sentencepiece_model_file: str,
|
274 |
+
extra_ids: int = 0,
|
275 |
+
normalizer_spec_overrides: Optional[
|
276 |
+
sentencepiece_model_pb2.NormalizerSpec
|
277 |
+
] = None,
|
278 |
+
reverse_extra_ids: bool = False,
|
279 |
+
extra_tokens: Tuple[str] = None,
|
280 |
+
hack_to_t5_start_tokens: bool = False,
|
281 |
+
):
|
282 |
+
"""Create a SentencePieceVocabulary.
|
283 |
+
|
284 |
+
Optionally, specify a number of extra ids to add to the end of the
|
285 |
+
vocabulary for use as sentinels.
|
286 |
+
|
287 |
+
Args:
|
288 |
+
sentencepiece_model_file: path of the sentence piece model.
|
289 |
+
extra_ids: number of extra ids to include.
|
290 |
+
normalizer_spec_overrides: If not None, this proto will be merged into the
|
291 |
+
model's normalizer and denormalizer specs. Thus, any options set on this
|
292 |
+
object will override the values of those options in the loaded model.
|
293 |
+
reverse_extra_ids: if True, extra_ids are numbered in descending order, so
|
294 |
+
the first extra_id has the highest number. This is done for
|
295 |
+
compatibility with span_corruption mask generation in T5.
|
296 |
+
"""
|
297 |
+
self._sentencepiece_model_file = sentencepiece_model_file
|
298 |
+
self._normalizer_spec_overrides = normalizer_spec_overrides
|
299 |
+
self._reverse_extra_ids = reverse_extra_ids
|
300 |
+
self._model: Optional[SentencePieceVocabulary._ModelContext] = None
|
301 |
+
self._extra_tokens = extra_tokens
|
302 |
+
self._hack_to_t5_start_tokens = hack_to_t5_start_tokens
|
303 |
+
super().__init__(extra_ids=extra_ids)
|
304 |
+
|
305 |
+
def __getstate__(self):
|
306 |
+
state = self.__dict__.copy()
|
307 |
+
# Gin config makes a deep copy of the keyword arguments of configurables.
|
308 |
+
# When a SentencePieceVocabulary vocabulary is used as a keyword argument
|
309 |
+
# in a Gin configurable, it must be picklable. We therefore remove
|
310 |
+
# _model; it will be initialized lazily as needed.
|
311 |
+
del state["_model"]
|
312 |
+
return state
|
313 |
+
|
314 |
+
def __setstate__(self, state):
|
315 |
+
self.__dict__.update(state)
|
316 |
+
self._model = None
|
317 |
+
|
318 |
+
def load_model(self) -> None:
|
319 |
+
_ = self._model_context()
|
320 |
+
|
321 |
+
def _model_context(
|
322 |
+
self,
|
323 |
+
) -> _ModelContext:
|
324 |
+
"""Loads model if not yet loaded and returns the model context.
|
325 |
+
|
326 |
+
Returns:
|
327 |
+
The model context as a tuple of (tokenizer, sp_model).
|
328 |
+
"""
|
329 |
+
if self._model:
|
330 |
+
return self._model
|
331 |
+
|
332 |
+
normalizer_spec_overrides_serialized = (
|
333 |
+
self._normalizer_spec_overrides.SerializeToString(deterministic=True)
|
334 |
+
if self._normalizer_spec_overrides
|
335 |
+
else None
|
336 |
+
)
|
337 |
+
|
338 |
+
self._model = self._load_model(
|
339 |
+
self._sentencepiece_model_file,
|
340 |
+
self._extra_ids,
|
341 |
+
normalizer_spec_overrides_serialized,
|
342 |
+
self._reverse_extra_ids,
|
343 |
+
extra_tokens=self._extra_tokens,
|
344 |
+
hack_to_t5_start_tokens=self._hack_to_t5_start_tokens,
|
345 |
+
)
|
346 |
+
return self._model
|
347 |
+
|
348 |
+
@classmethod
|
349 |
+
@functools.lru_cache(maxsize=None)
|
350 |
+
def _load_model(
|
351 |
+
cls,
|
352 |
+
sentencepiece_model_file: str,
|
353 |
+
extra_ids: int,
|
354 |
+
normalizer_spec_overrides_serialized: Optional[bytes] = None,
|
355 |
+
reverse_extra_ids: bool = True,
|
356 |
+
extra_tokens: Tuple[str] = None,
|
357 |
+
hack_to_t5_start_tokens=False,
|
358 |
+
) -> _ModelContext:
|
359 |
+
"""Load SPM, Python tokenizer, and cache results to the class definition."""
|
360 |
+
# SentencePieceProcessor::LoadFromSerializedProto is not thread-safe.
|
361 |
+
# Without a lock, users may randomly see SIGSEGV on
|
362 |
+
# sentencepiece::ModelInterface::pad_piece when using the vocabulary in
|
363 |
+
# SeqIO preprocessors.
|
364 |
+
with cls._load_model_lock:
|
365 |
+
# Handle cases where SP can't load the file, but gfile can.
|
366 |
+
with tf.io.gfile.GFile(sentencepiece_model_file, "rb") as f:
|
367 |
+
sp_model = f.read()
|
368 |
+
model = sentencepiece_model_pb2.ModelProto.FromString(sp_model)
|
369 |
+
|
370 |
+
if hack_to_t5_start_tokens:
|
371 |
+
# PAD token would still be 0, same as BOS, for consistency with previous behavior!
|
372 |
+
unk = model.pieces[0]
|
373 |
+
bos = model.pieces[1]
|
374 |
+
eos = model.pieces[2]
|
375 |
+
model.pieces.remove(unk)
|
376 |
+
model.pieces.remove(bos)
|
377 |
+
model.pieces.remove(eos)
|
378 |
+
model.pieces.insert(0, bos) # BOS is token 0
|
379 |
+
model.pieces.insert(1, eos) # EOS is token 1
|
380 |
+
model.pieces.insert(2, unk) # UNK is token 2
|
381 |
+
|
382 |
+
# Add placeholder strings for extra IDs.
|
383 |
+
if extra_ids:
|
384 |
+
# By default, we add them in reverse order to match span corruption.
|
385 |
+
if reverse_extra_ids:
|
386 |
+
extra_id_tokens = reversed(range(extra_ids))
|
387 |
+
else:
|
388 |
+
extra_id_tokens = range(extra_ids)
|
389 |
+
|
390 |
+
for i in extra_id_tokens:
|
391 |
+
model.pieces.add(
|
392 |
+
piece=f"▁<extra_id_{i}>",
|
393 |
+
score=0.0,
|
394 |
+
type=sentencepiece_model_pb2.ModelProto.SentencePiece.USER_DEFINED,
|
395 |
+
)
|
396 |
+
|
397 |
+
if extra_tokens:
|
398 |
+
for s in extra_tokens:
|
399 |
+
model.pieces.add(
|
400 |
+
piece=f"▁"+s,
|
401 |
+
score=0.0,
|
402 |
+
type=sentencepiece_model_pb2.ModelProto.SentencePiece.USER_DEFINED,
|
403 |
+
)
|
404 |
+
|
405 |
+
if normalizer_spec_overrides_serialized is not None:
|
406 |
+
normalizer_spec_overrides = (
|
407 |
+
sentencepiece_model_pb2.NormalizerSpec.FromString(
|
408 |
+
normalizer_spec_overrides_serialized
|
409 |
+
)
|
410 |
+
)
|
411 |
+
|
412 |
+
model.normalizer_spec.MergeFrom(normalizer_spec_overrides)
|
413 |
+
model.denormalizer_spec.MergeFrom(normalizer_spec_overrides)
|
414 |
+
sp_model = model.SerializeToString()
|
415 |
+
# Load Python tokenizer and ensure the EOS and PAD IDs are correct.
|
416 |
+
tokenizer = sentencepiece_processor.SentencePieceProcessor()
|
417 |
+
tokenizer.LoadFromSerializedProto(sp_model)
|
418 |
+
if tokenizer.pad_id() != PAD_ID:
|
419 |
+
logging.warning(
|
420 |
+
(
|
421 |
+
"T5 library uses PAD_ID=%s, which is different from the "
|
422 |
+
"sentencepiece vocabulary, which defines pad_id=%s"
|
423 |
+
),
|
424 |
+
PAD_ID,
|
425 |
+
tokenizer.pad_id(),
|
426 |
+
)
|
427 |
+
|
428 |
+
return cls._ModelContext(tokenizer=tokenizer, sp_model=sp_model)
|
429 |
+
|
430 |
+
@property
|
431 |
+
def num_extra_tokens(self):
|
432 |
+
if self._extra_tokens:
|
433 |
+
return len(self._extra_tokens)
|
434 |
+
return 0
|
435 |
+
|
436 |
+
@property
|
437 |
+
def bos_id(self) -> Optional[int]:
|
438 |
+
return self.tokenizer.bos_id()
|
439 |
+
|
440 |
+
@property
|
441 |
+
def bos_token_id(self) -> Optional[int]:
|
442 |
+
return self.tokenizer.bos_id()
|
443 |
+
|
444 |
+
@property
|
445 |
+
def eos_token_id(self) -> Optional[int]:
|
446 |
+
return self.tokenizer.eos_id()
|
447 |
+
|
448 |
+
@property
|
449 |
+
def eos_id(self) -> Optional[int]:
|
450 |
+
return self.tokenizer.eos_id()
|
451 |
+
|
452 |
+
@property
|
453 |
+
def unk_id(self) -> Optional[int]:
|
454 |
+
return self.tokenizer.unk_id()
|
455 |
+
|
456 |
+
@property
|
457 |
+
def sp_model(self) -> Optional[bytes]:
|
458 |
+
"""Retrieve the SPM."""
|
459 |
+
return self._model_context().sp_model
|
460 |
+
|
461 |
+
@property
|
462 |
+
def sentencepiece_model_file(self) -> str:
|
463 |
+
return self._sentencepiece_model_file
|
464 |
+
|
465 |
+
@property
|
466 |
+
def tokenizer(self) -> sentencepiece_processor.SentencePieceProcessor:
|
467 |
+
"""Returns the Python tokenizer."""
|
468 |
+
return self._model_context().tokenizer
|
469 |
+
|
470 |
+
@property
|
471 |
+
def tf_tokenizer(self):
|
472 |
+
"""Instantiate and return a TF tokenizer."""
|
473 |
+
import tensorflow_text as tf_text # import here to keep the dependency optional
|
474 |
+
return tf_text.SentencepieceTokenizer(model=self.sp_model)
|
475 |
+
|
476 |
+
@property
|
477 |
+
def vocab_size(self):
|
478 |
+
return self._base_vocab_size
|
479 |
+
|
480 |
+
@property
|
481 |
+
def _base_vocab_size(self):
|
482 |
+
"""Number of ids (including 0=PAD, 1=EOS, and 2=UNK).
|
483 |
+
|
484 |
+
Returns:
|
485 |
+
an integer, the vocabulary size
|
486 |
+
"""
|
487 |
+
return self.tokenizer.GetPieceSize()
|
488 |
+
|
489 |
+
def _encode(self, s):
|
490 |
+
"""Encode a python string as a list of integers.
|
491 |
+
|
492 |
+
Args:
|
493 |
+
s: a string
|
494 |
+
|
495 |
+
Returns:
|
496 |
+
a list of integers (not terminated by EOS)
|
497 |
+
"""
|
498 |
+
return self.tokenizer.EncodeAsIds(s)
|
499 |
+
|
500 |
+
def _decode(self, ids):
|
501 |
+
"""Decode a list of integers to a python string.
|
502 |
+
|
503 |
+
Args:
|
504 |
+
ids: a list of integers (not terminated by EOS)
|
505 |
+
|
506 |
+
Returns:
|
507 |
+
a string
|
508 |
+
"""
|
509 |
+
# convert all the extra ids (sentinels) to UNK=2
|
510 |
+
unk_id = self.tokenizer.unk_id()
|
511 |
+
piece_size = self.tokenizer.GetPieceSize()
|
512 |
+
ids = [unk_id if i >= piece_size else int(i) for i in ids]
|
513 |
+
return self.tokenizer.DecodeIds(ids)
|
514 |
+
|
515 |
+
def _encode_tf(self, s):
|
516 |
+
"""Encode a tf.Scalar string to a tf.Tensor.
|
517 |
+
|
518 |
+
This will be necessary for on-the-fly tokenization.
|
519 |
+
|
520 |
+
Args:
|
521 |
+
s: a tf.Scalar with dtype tf.string
|
522 |
+
|
523 |
+
Returns:
|
524 |
+
a 1d tf.Tensor with dtype tf.int32
|
525 |
+
"""
|
526 |
+
return self.tf_tokenizer.tokenize(s)
|
527 |
+
|
528 |
+
def _decode_tf(self, ids):
|
529 |
+
"""Decode in TensorFlow.
|
530 |
+
|
531 |
+
Args:
|
532 |
+
ids: a 1d or 2d tf.Tensor with dtype tf.int32
|
533 |
+
|
534 |
+
Returns:
|
535 |
+
a 1d or 2d tf.Tensor with dtype tf.string
|
536 |
+
"""
|
537 |
+
return self.tf_tokenizer.detokenize(ids)
|
538 |
+
|
539 |
+
def __eq__(self, other):
|
540 |
+
if not isinstance(other, SentencePieceVocabulary):
|
541 |
+
return False
|
542 |
+
try:
|
543 |
+
their_md5 = hashlib.md5(other.sp_model).hexdigest()
|
544 |
+
# If other has no sp_model attribute, we can't test for equality
|
545 |
+
except AttributeError:
|
546 |
+
return False
|
547 |
+
if self.sp_model is None:
|
548 |
+
return False
|
549 |
+
our_md5 = hashlib.md5(self.sp_model).hexdigest()
|
550 |
+
return our_md5 == their_md5
|
551 |
+
|
552 |
+
def __str__(self) -> str:
|
553 |
+
return (
|
554 |
+
f"SentencePieceVocabulary(file={self.sentencepiece_model_file}, "
|
555 |
+
f"extra_ids={self._extra_ids}, "
|
556 |
+
f"spm_md5={hashlib.md5(self.sp_model).hexdigest()})"
|
557 |
+
)
|
558 |
+
|
559 |
+
@property
|
560 |
+
def adds_space(self):
|
561 |
+
return True
|
562 |
+
|
563 |
+
|
564 |
+
class HfTokenizerWrapper:
|
565 |
+
def __init__(self, tokenizer, bos_token_id=None, adds_space=False):
|
566 |
+
"""
|
567 |
+
tokenizer: Tokenizer to wrap
|
568 |
+
bos_token_id: BOS token id to use if not `tokenizer.bos_token_id`
|
569 |
+
adds_space: If concatenating independently tokenized pieces of text, will the tokens
|
570 |
+
already include a separating space?
|
571 |
+
"""
|
572 |
+
self.adds_space = adds_space
|
573 |
+
self.tokenizer = tokenizer
|
574 |
+
if bos_token_id is None:
|
575 |
+
self.bos_token_id = tokenizer.bos_token_id
|
576 |
+
else:
|
577 |
+
self.bos_token_id = bos_token_id
|
578 |
+
self.eos_token_id = self.tokenizer.eos_token_id
|
579 |
+
self.pad_id = -1
|
580 |
+
|
581 |
+
def encode(self, x: str):
|
582 |
+
return self.tokenizer.encode(x, add_special_tokens=False)
|
583 |
+
|
584 |
+
def decode(self, x: List[int], truncate_at_eos=True):
|
585 |
+
x = [int(t) for t in x]
|
586 |
+
|
587 |
+
if self.eos_token_id == self.bos_token_id and (len(x) > 0 and x[0] == self.eos_token_id):
|
588 |
+
# Assume an EOS at the start is functioning as BOS
|
589 |
+
x = x[1:]
|
590 |
+
|
591 |
+
if truncate_at_eos:
|
592 |
+
# Follow seqio and automatically cut off at EOS
|
593 |
+
try:
|
594 |
+
eos_ix = x.index(self.eos_token_id)
|
595 |
+
x = x[:eos_ix]
|
596 |
+
except ValueError:
|
597 |
+
pass
|
598 |
+
return self.tokenizer.decode(x, skip_special_tokens=True)
|
599 |
+
|
600 |
+
|
601 |
+
def vocab_size(self):
|
602 |
+
return len(self.tokenizer)
|
603 |
+
|
604 |
+
def encode_tf(self, x):
|
605 |
+
if isinstance(x, str) or len(x.shape) == 0:
|
606 |
+
def _enc(_data):
|
607 |
+
_data = _data.item() if isinstance(_data, np.ndarray) else _data
|
608 |
+
return self.tokenizer.encode(_data.decode("utf-8"), add_special_tokens=False, return_tensors="np")[0].astype(np.int32)
|
609 |
+
return tf.ensure_shape(tf.numpy_function(_enc, [x], tf.int32, stateful=False), [None])
|
610 |
+
|
611 |
+
flattened = tf.reshape(x, [-1])
|
612 |
+
|
613 |
+
def _enc(_data):
|
614 |
+
tokens = [self.tokenizer.encode(x.decode("utf-8"), add_special_tokens=False, return_tensors="np")[0].astype(np.int32)
|
615 |
+
for x in _data]
|
616 |
+
if len(tokens) == 0:
|
617 |
+
return np.zeros((0,), dtype=np.int32), np.zeros((0,), dtype=np.int32)
|
618 |
+
else:
|
619 |
+
return np.concatenate(tokens, 0), np.array([len(x) for x in tokens]).astype(np.int32)
|
620 |
+
if not (isinstance(x, str) or x.dtype == tf.string):
|
621 |
+
raise ValueError("Input be a string or a string numpy array")
|
622 |
+
text, lens = tf.numpy_function(_enc, [flattened], (tf.int32, tf.int32), stateful=False)
|
623 |
+
lens = tf.ensure_shape(lens, [None])
|
624 |
+
text = tf.ensure_shape(text, [None])
|
625 |
+
if len(x.shape) == 2:
|
626 |
+
n = x.shape[1]
|
627 |
+
assert n is not None
|
628 |
+
return tf.RaggedTensor.from_nested_row_lengths(
|
629 |
+
text,
|
630 |
+
[tf.ones(tf.shape(x)[0], dtype=lens.dtype)*n, lens]
|
631 |
+
)
|
632 |
+
else:
|
633 |
+
return tf.RaggedTensor.from_row_lengths(text, lens)
|
634 |
+
|
635 |
+
|
636 |
+
class OLMoTokenizerWrapper(HfTokenizerWrapper):
|
637 |
+
|
638 |
+
def encode(self, x: str):
|
639 |
+
return self.tokenizer.encode(x, add_special_tokens=False)
|
640 |
+
|
641 |
+
def encode_tf(self, x):
|
642 |
+
if isinstance(x, str) or len(x.shape) == 0:
|
643 |
+
def _enc(_data):
|
644 |
+
return np.asarray(self.tokenizer.encode(_data.numpy().decode("utf-8"), add_special_tokens=False), dtype=np.int32)
|
645 |
+
out = tf.py_function(_enc, (x,), tf.int32)
|
646 |
+
return tf.ensure_shape(out, [None])
|
647 |
+
else:
|
648 |
+
def _enc(_data):
|
649 |
+
tokens = [self.tokenizer.encode(x.decode("utf-8"), add_special_tokens=False)
|
650 |
+
for x in _data.numpy()]
|
651 |
+
if len(tokens) == 0:
|
652 |
+
return np.zeros((0,), dtype=np.int32), np.zeros((0,), dtype=np.int32)
|
653 |
+
else:
|
654 |
+
return np.concatenate(tokens, 0), np.array([len(x) for x in tokens])
|
655 |
+
text, lens = tf.py_function(_enc, (x,), (tf.int32, tf.int32))
|
656 |
+
lens = tf.ensure_shape(lens, [None])
|
657 |
+
text = tf.ensure_shape(text, [None])
|
658 |
+
return tf.RaggedTensor.from_row_lengths(text, lens)
|
659 |
+
|
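
HfTokenizerWrapper adapts a Hugging Face tokenizer to the seqio-style interface used elsewhere in this repo: `encode_tf` routes through `tf.numpy_function`, so graph-mode `tf.data` pipelines can call the Python tokenizer and get back int32 ids (a RaggedTensor for batched string inputs). A rough usage sketch; the `transformers` dependency and the "gpt2" checkpoint are illustrative assumptions, not requirements of this module:

import tensorflow as tf
from transformers import AutoTokenizer

tok = HfTokenizerWrapper(AutoTokenizer.from_pretrained("gpt2"))

# Scalar string -> 1-D int32 token ids.
ids = tok.encode_tf(tf.constant("a cat on a mat"))

# The same call works in graph mode inside a tf.data pipeline.
ds = tf.data.Dataset.from_tensor_slices(tf.constant(["first caption", "second caption"]))
ds = ds.map(lambda s: tok.encode_tf(s))
for t in ds:
    print(tok.decode(t.numpy()))  # round-trips back to text
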
tasks.py
ADDED
@@ -0,0 +1,2548 @@
# Module that can be imported to register all tasks
import dataclasses
import functools
import logging
import os
from collections import OrderedDict
from typing import List, Dict, Any, Optional, Mapping

import seqio
from seqio import dataset_providers
import tensorflow as tf
import tensorflow_datasets as tfds

from .data_utils import _strip_metadata, build_tokenizer
from .preprocesssors import *
from .preprocesssors import _preprocess_scifi


@dataclasses.dataclass
class TaskSpec:
    name: str
    source: seqio.DataSourceInterface
    preprocessors: List
    style: str
    inference_preprocessors: List = None
    inference_only: bool = False
    decode_image: bool = False
    shuffle_after: Optional[int] = None
    ignore_errors: bool = False


MULTITASK_TFDS_DATA_DIR = "/weka/oe-training-default/mm-olmo/tensorflow_datasets"

TASKS: Dict[str, TaskSpec] = {}


def add_task(
    name,
    source: seqio.DataSourceInterface,
    preprocessors: List,
    style: str,
    inf_preprocessor=None,
    inf_only=False,
    decode_image=False,
    shuffle_after=None,
    ignore_errors=False
):
    TASKS[name] = TaskSpec(
        name, source, preprocessors, style, inf_preprocessor, inf_only, decode_image,
        shuffle_after, ignore_errors)


@seqio.map_over_dataset
def add_image_size(ex):
    if ex["image"].dtype == tf.string:
        ex["image"] = tf.image.decode_image(ex["image"], channels=3, expand_animations=False)
    img_h = tf.shape(ex["image"])[0]
    img_w = tf.shape(ex["image"])[1]
    ex["metadata/image_size"] = [img_w, img_h]
    return ex
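

# Minimal sketch (hypothetical task and TFDS names, illustration only) of how a
# dataset would be registered with the TASKS registry defined above; the real
# registrations follow further below.
#
#   add_task(
#       "my_toy_captioning",
#       source=seqio.TfdsDataSource(
#           tfds_name="my_toy_dataset:1.0.0",
#           tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
#       ),
#       preprocessors=[rename(text="caption")],
#       style="long_caption",
#   )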


@dataclasses.dataclass
class TaskDatasetBuilder:
    """tf.data.Dataset builder for a task after shuffling, sharding, and initial model
    pre-processing have been applied"""
    # This class is a simplified and customized version of seqio.Task
    #
    # The main differences are:
    # 1: Does not prefetch by default, which wastes a small amount of RAM if we are using the
    #    dataset in a mixture that can just have its own top-level prefetch
    # 2: Reduces the threshold for in-memory caching, which is way too high for image datasets
    #    by default
    # 3: Can customize when shuffling occurs to help minimize RAM usage; in general shuffling
    #    should happen before building image crops and tokenization so the shuffle and
    #    dataset checkpoint take less memory
    # 4: Doesn't decode images until after shuffling for the same reason
    # 5: Supports splitting with the "tfds_split" mode so we never have to fall back to example
    #    sharding (not the default at the moment since it's not well tested)
    # 6: Removes caching/output feature spec stuff from seqio that we don't need

    name: str
    source: Any
    preprocessors: List
    keep_metadata: bool
    shuffle_after: int
    sharding: str = "tfds_split"
    decode_image: bool = False
    ignore_errors: bool = False

    def get_dataset(
        self,  # pytype: disable=signature-mismatch  # overriding-default-value-checks
        sequence_length: Optional[Mapping[str, int]] = None,
        split: str = tfds.Split.TRAIN,
        shuffle: bool = True,
        shuffle_buffer_size: Optional[int] = 1000,
        seed: Optional[int] = None,
        shard_info: Optional[seqio.ShardInfo] = None,
        num_epochs: Optional[int] = 1,
        try_in_mem_cache: bool = True,
        trim_output_features: bool = True
    ) -> tf.data.Dataset:
        source = self.source

        if self.sharding == "seqio":
            if source.supports_arbitrary_sharding:
                shard_data_source = True
            elif shard_info:
                # Whether we should shard at the source or on the examples from the source.
                shard_data_source = (
                    len(source.list_shards(split=split)) >= shard_info.num_shards
                )
                logging.info(
                    "Sharding at the %s: %d of %d",
                    "data source" if shard_data_source else "examples",
                    shard_info.index + 1,
                    shard_info.num_shards,
                )
            else:
                # Call get_dataset on the source without a shard_info.
                shard_data_source = True
                shard_info = None

            if "image" in source.tfds_dataset.info.features:
                if not self.decode_image:
                    source.tfds_dataset._decoders = dict(image=tfds.decode.SkipDecoding())

            if shard_data_source:
                ds = source.get_dataset(
                    split=split, shuffle=shuffle, seed=seed, shard_info=shard_info)
            else:
                ds = source.get_dataset(split=split, shuffle=shuffle, seed=seed)
                ds = ds.shard(shard_info.num_shards, shard_info.index)
        elif self.sharding == "tfds_split":
            # Shard with `tfds.even_splits`, which seems to be recommended for multi-host training
            # https://github.com/tensorflow/datasets/blob/master/docs/splits.md#tfdseven_splits--multi-host-training
            assert isinstance(self.source, seqio.TfdsDataSource)
            loader: seqio.LazyTfdsLoader = self.source.tfds_dataset
            dataset, data_dir = loader.get_split_params(split)
            shard_split = loader._map_split(split)
            if shard_info and shard_info.num_shards > 1:
                shard_split = tfds.even_splits(
                    shard_split, n=shard_info.num_shards, drop_remainder=False)[shard_info.index]
            read_config = loader.read_config
            read_config.shuffle_seed = seed
            read_config.skip_prefetch = True
            read_config.input_context = None
            # Don't decode images until after shuffling to save RAM
            if "image" in loader.info.features:
                decoders = dict(image=tfds.decode.SkipDecoding())
            else:
                decoders = None
            ds = tfds.load(
                dataset,
                split=shard_split,
                data_dir=data_dir,
                shuffle_files=shuffle,
                download=True,
                try_gcs=True,
                read_config=read_config,
                decoders=decoders
            )
        else:
            raise NotImplementedError(self.sharding)

        num_shards = shard_info.num_shards if shard_info else 1
        if try_in_mem_cache and (
            source.num_input_examples(split)
            and source.num_input_examples(split) < 10000 * num_shards
        ):
            logging.info(f"Automatically caching small dataset in memory: {self.name}:{split}")
            ds = ds.cache()

        # We repeat before calling any (potentially) stochastic
        # preprocessors in order to take new samples each epoch.
        if num_epochs != 1:
            ds = ds.repeat(num_epochs)

        preprocessors = [
            seqio.add_kwargs_to_transform(
                _fn,
                sequence_length=sequence_length,
                output_features=None,
            ) for _fn in self.preprocessors
        ]

        with seqio.utils.map_seed_manager(seed):
            for fn in preprocessors[:self.shuffle_after]:
                ds = fn(ds)

            # Strip metadata before shuffling if possible so it doesn't waste space
            if not self.keep_metadata:
                ds = _strip_metadata(ds)

            if shuffle:
                if shuffle_buffer_size is None:
                    raise ValueError("Shuffle is true, but shuffle_buffer_size is None")
                else:
                    ds = ds.shuffle(shuffle_buffer_size, seed=seed)

            for fn in preprocessors[self.shuffle_after:]:
                ds = fn(ds)

        if self.ignore_errors:
            ds = ds.ignore_errors(log_warning=True)

        if trim_output_features:
            ds = seqio.trim_dataset(ds, sequence_length, sequence_length)

        return ds
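

# Illustrative sketch (not used by this module) of the `tfds.even_splits`
# sharding that the "tfds_split" mode above relies on: each host reads only
# its own contiguous slice of the split.
#
#   subsplits = tfds.even_splits("train", n=4, drop_remainder=False)
#   # subsplits[i] is the sub-split host i should load, e.g.
#   # ds = tfds.load("some_dataset", split=subsplits[i], data_dir=...)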


def get_task(preprocessor, name, is_training, for_inference,
             include_metadata=None, style_override=None) -> TaskDatasetBuilder:
    """Get a builder for task `name` that is pre-processed by `preprocessor`"""

    task_spec = TASKS[name]
    if for_inference is None:
        for_inference = task_spec.inference_only
    elif task_spec.inference_only and not for_inference:
        raise ValueError(f"Inference-only task {task_spec.name} can only be used in inference mode")

    if include_metadata is None:
        include_metadata = for_inference

    if preprocessor is not None:
        style = style_override if style_override else task_spec.style
        preprocessor = preprocessor.get_preprocessor(
            is_training, for_inference, style, include_metadata)
        preprocessor = [preprocessor]
    else:
        preprocessor = []
    task_preprocessors = task_spec.preprocessors
    if for_inference and task_spec.inference_preprocessors is not None:
        task_preprocessors = task_spec.inference_preprocessors
    if isinstance(task_spec.source, seqio.TfdsDataSource):
        from seqio.utils import _TFDS_DATA_DIR_OVERRIDE
        if _TFDS_DATA_DIR_OVERRIDE:
            # Stop annoying override warnings flooding the log
            task_spec.source.tfds_dataset._data_dir = None

    return TaskDatasetBuilder(
        task_spec.name,
        task_spec.source,
        task_preprocessors + preprocessor,
        keep_metadata=include_metadata,
        shuffle_after=(task_spec.shuffle_after if task_spec.shuffle_after
                       else len(task_spec.preprocessors)),
        sharding="seqio",
        decode_image=task_spec.decode_image,
        ignore_errors=task_spec.ignore_errors,
    )

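
def _example_usage():  # illustrative sketch only; never called by this module
    """Hedged example of building a registered task (assumes the TFDS data
    directories configured above are reachable)."""
    builder = get_task(
        preprocessor=None,  # skip model-specific preprocessing
        name="coco_captioning_karpathy",
        is_training=False,
        for_inference=True,
    )
    ds = builder.get_dataset(
        split="validation",
        shuffle=False,
        num_epochs=1,
        trim_output_features=False,  # no sequence_length is given here
    )
    for ex in ds.take(1):
        logging.info("example keys: %s", list(ex.keys()))
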
254 |
+
add_task(
|
255 |
+
"coco_caption_2017",
|
256 |
+
source=seqio.TfdsDataSource(
|
257 |
+
tfds_name="coco_all:1.0.1",
|
258 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
259 |
+
),
|
260 |
+
preprocessors=[
|
261 |
+
functools.partial(rekey, key_map={
|
262 |
+
"image/filename": ["image/filename"],
|
263 |
+
"image": ["image"],
|
264 |
+
"text": ["captions", "text"]
|
265 |
+
}),
|
266 |
+
functools.partial(flatten_parts, parts=["text"]),
|
267 |
+
],
|
268 |
+
inf_preprocessor=[
|
269 |
+
functools.partial(rekey, key_map={
|
270 |
+
"image/filename": ["image/filename"],
|
271 |
+
"image": ["image"],
|
272 |
+
"text": ["captions", "text"]
|
273 |
+
})
|
274 |
+
],
|
275 |
+
style="coco_captioning",
|
276 |
+
)
|
277 |
+
|
278 |
+
|
279 |
+
add_task(
|
280 |
+
"coco_captioning_karpathy",
|
281 |
+
source=seqio.TfdsDataSource(
|
282 |
+
tfds_name="coco_captioning_karpathy:1.0.2",
|
283 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
284 |
+
splits={"train": "train", "validation": "val", "test": "test"}
|
285 |
+
),
|
286 |
+
preprocessors=[
|
287 |
+
rename(text="captions"),
|
288 |
+
functools.partial(flatten_parts, parts=["text"]),
|
289 |
+
],
|
290 |
+
inf_preprocessor=[add_coco_url],
|
291 |
+
style="coco_captioning",
|
292 |
+
)
|
293 |
+
|
294 |
+
|
295 |
+
add_task(
|
296 |
+
"synth_counting",
|
297 |
+
source=seqio.TfdsDataSource(
|
298 |
+
tfds_name="synth_counting:0.0.3",
|
299 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
300 |
+
splits={"train": "train[5120:]", "validation": "train[:5120]"}
|
301 |
+
),
|
302 |
+
preprocessors=[synth_count_preprocessor],
|
303 |
+
inf_preprocessor=[synth_count_inf_preprocessor],
|
304 |
+
style="synth_counting",
|
305 |
+
)
|
306 |
+
|
307 |
+
|
308 |
+
add_task(
|
309 |
+
"khan_academy",
|
310 |
+
source=seqio.TfdsDataSource(
|
311 |
+
tfds_name="khan_academy:1.0.0",
|
312 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
313 |
+
splits={"train": "train[1024:]", "validation": "train[:1024]"}
|
314 |
+
),
|
315 |
+
preprocessors=[extract_khan_academy],
|
316 |
+
style="khan_academy",
|
317 |
+
)
|
318 |
+
|
319 |
+
for name, src in [
|
320 |
+
("vaia_qa_latex_image_math_subset", seqio.TfdsDataSource(
|
321 |
+
tfds_name=f"vaia_qa_latex_image_short_answer:0.1.2",
|
322 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
323 |
+
splits={"train": "train", "validation": "validation"}
|
324 |
+
)),
|
325 |
+
("vaia_qa_latex_image_all", seqio.TfdsDataSource(
|
326 |
+
tfds_name=f"vaia_qa_latex_image_short_answer:0.1.3",
|
327 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
328 |
+
splits={"train": "train", "validation": "validation"}
|
329 |
+
)),
|
330 |
+
]:
|
331 |
+
add_task(
|
332 |
+
f"{name}_short_answer",
|
333 |
+
source=src,
|
334 |
+
preprocessors=[
|
335 |
+
remove_is_long,
|
336 |
+
remove_has_multiple_parts,
|
337 |
+
functools.partial(extract_vaia_qa_latex_image, add_short_answer=True),
|
338 |
+
],
|
339 |
+
style="vaia_qa",
|
340 |
+
)
|
341 |
+
add_task(
|
342 |
+
f"{name}_short_answer_first",
|
343 |
+
source=src,
|
344 |
+
preprocessors=[
|
345 |
+
remove_is_long,
|
346 |
+
remove_has_multiple_parts,
|
347 |
+
functools.partial(extract_vaia_qa_latex_image, add_short_answer=True, set_short_answer_first=True),
|
348 |
+
],
|
349 |
+
style="vaia_qa_short_answer_first",
|
350 |
+
)
|
351 |
+
add_task(
|
352 |
+
f"{name}_mc_only_short_answer",
|
353 |
+
source=src,
|
354 |
+
preprocessors=[
|
355 |
+
remove_is_long,
|
356 |
+
remove_has_multiple_parts,
|
357 |
+
filter_mc,
|
358 |
+
functools.partial(extract_vaia_qa_latex_image, add_short_answer=True),
|
359 |
+
],
|
360 |
+
style="vaia_qa_short_answer",
|
361 |
+
)
|
362 |
+
add_task(
|
363 |
+
f"{name}_mc_only_short_answer_first",
|
364 |
+
source=src,
|
365 |
+
preprocessors=[
|
366 |
+
remove_is_long,
|
367 |
+
remove_has_multiple_parts,
|
368 |
+
filter_mc,
|
369 |
+
functools.partial(extract_vaia_qa_latex_image, add_short_answer=True, set_short_answer_first=True),
|
370 |
+
],
|
371 |
+
style="vaia_qa_short_answer_first",
|
372 |
+
)
|
373 |
+
add_task(
|
374 |
+
f"{name}_image_only_short_answer",
|
375 |
+
source=src,
|
376 |
+
preprocessors=[
|
377 |
+
image_only,
|
378 |
+
remove_is_long,
|
379 |
+
remove_has_multiple_parts,
|
380 |
+
functools.partial(extract_vaia_qa_latex_image, add_short_answer=True),
|
381 |
+
],
|
382 |
+
style="vaia_qa_short_answer",
|
383 |
+
)
|
384 |
+
add_task(
|
385 |
+
f"{name}_image_only_short_answer_first",
|
386 |
+
source=src,
|
387 |
+
preprocessors=[
|
388 |
+
image_only,
|
389 |
+
remove_is_long,
|
390 |
+
remove_has_multiple_parts,
|
391 |
+
functools.partial(extract_vaia_qa_latex_image, add_short_answer=True, set_short_answer_first=True),
|
392 |
+
],
|
393 |
+
style="vaia_qa_short_answer_first",
|
394 |
+
)
|
395 |
+
|
396 |
+
add_task(
|
397 |
+
"vqa_online",
|
398 |
+
source=seqio.TfdsDataSource(
|
399 |
+
tfds_name="vqa_online:1.0.1",
|
400 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
401 |
+
splits={"train": "train", "validation": "validation", "test": "validation"}
|
402 |
+
),
|
403 |
+
preprocessors=[
|
404 |
+
build_question_with_context,
|
405 |
+
extract_vqa_online,
|
406 |
+
],
|
407 |
+
style="vqa_online",
|
408 |
+
)
|
409 |
+
|
410 |
+
add_task(
|
411 |
+
"vqa_online_gpt_longQ_longA",
|
412 |
+
source=seqio.TfdsDataSource(
|
413 |
+
tfds_name="vqa_online_gpt_parsed:1.1.0",
|
414 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
415 |
+
splits={"train": "train", "validation": "validation", "test": "validation"}
|
416 |
+
),
|
417 |
+
preprocessors=[
|
418 |
+
rename(question="question_long", answer="answer_long"),
|
419 |
+
extract_vqa_online,
|
420 |
+
],
|
421 |
+
style="vqa_online",
|
422 |
+
)
|
423 |
+
|
424 |
+
|
425 |
+
add_task(
|
426 |
+
"famous_birthdays",
|
427 |
+
source=seqio.TfdsDataSource(
|
428 |
+
tfds_name="famous_birth_days:1.0.0",
|
429 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
430 |
+
splits={"train": "train[5120:]", "validation": "train[:5120]"}
|
431 |
+
),
|
432 |
+
preprocessors=[
|
433 |
+
famous_birthdays_preprocessor,
|
434 |
+
functools.partial(name_entity_augmentation, p_high_color=0.0),
|
435 |
+
],
|
436 |
+
style="famous_birthdays",
|
437 |
+
)
|
438 |
+
|
439 |
+
|
440 |
+
add_task(
|
441 |
+
"wiki_art",
|
442 |
+
source=seqio.TfdsDataSource(
|
443 |
+
tfds_name="wiki_art:1.0.0",
|
444 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
445 |
+
splits={"train": "train[5120:]", "validation": "train[:5120]"}
|
446 |
+
),
|
447 |
+
preprocessors=[name_entity_augmentation, wiki_art_preprocessor],
|
448 |
+
style="wiki_art",
|
449 |
+
)
|
450 |
+
|
451 |
+
add_task(
|
452 |
+
"wiki_art_no_aug",
|
453 |
+
source=seqio.TfdsDataSource(
|
454 |
+
tfds_name="wiki_art:1.0.0",
|
455 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
456 |
+
splits={"train": "train[5120:]", "validation": "train[:5120]"}
|
457 |
+
),
|
458 |
+
preprocessors=[wiki_art_preprocessor],
|
459 |
+
style="wiki_art",
|
460 |
+
)
|
461 |
+
|
462 |
+
add_task(
|
463 |
+
"atlas_obscura",
|
464 |
+
source=seqio.TfdsDataSource(
|
465 |
+
tfds_name="atlas_obscura:1.0.0",
|
466 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
467 |
+
splits={"train": "train[5120:]", "validation": "train[:5120]"}
|
468 |
+
),
|
469 |
+
preprocessors=[
|
470 |
+
atlas_obscura_preprocessor,
|
471 |
+
mild_color_aug_preprocessor
|
472 |
+
],
|
473 |
+
style="atlas_obscura",
|
474 |
+
)
|
475 |
+
|
476 |
+
|
477 |
+
add_task(
|
478 |
+
"clocks",
|
479 |
+
source=seqio.TfdsDataSource(
|
480 |
+
tfds_name="clocks:1.0.1",
|
481 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
482 |
+
),
|
483 |
+
preprocessors=[
|
484 |
+
clocks_preprocessor,
|
485 |
+
clock_augmentation
|
486 |
+
],
|
487 |
+
style="clocks",
|
488 |
+
shuffle_after=0
|
489 |
+
)
|
490 |
+
|
491 |
+
|
492 |
+
add_task(
|
493 |
+
"count_bench",
|
494 |
+
source=seqio.TfdsDataSource(
|
495 |
+
tfds_name="count_bench:1.0.0",
|
496 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
497 |
+
),
|
498 |
+
preprocessors=[
|
499 |
+
count_bench_preprocessor,
|
500 |
+
],
|
501 |
+
style="count_bench",
|
502 |
+
)
|
503 |
+
|
504 |
+
|
505 |
+
add_task(
|
506 |
+
"tulu_v2_sft",
|
507 |
+
source=seqio.TfdsDataSource(
|
508 |
+
tfds_name="allenai__tulu_v2_sft_mixture:1.0.0",
|
509 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
510 |
+
),
|
511 |
+
preprocessors=[tulu_preprocessor],
|
512 |
+
style="tulu_v2",
|
513 |
+
)
|
514 |
+
|
515 |
+
|
516 |
+
# Pointing / Point+Count datasets
|
517 |
+
for is_count in [True, False]:
|
518 |
+
if is_count:
|
519 |
+
task = "point_count"
|
520 |
+
else:
|
521 |
+
task = "pointing"
|
522 |
+
add_task(
|
523 |
+
task,
|
524 |
+
source=seqio.TfdsDataSource(
|
525 |
+
tfds_name="pointing:1.0.1",
|
526 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
527 |
+
splits={"train": "train", "validation": "validation"}
|
528 |
+
),
|
529 |
+
preprocessors=[
|
530 |
+
filter_points,
|
531 |
+
functools.partial(pointing_preprocessor, with_count=is_count),
|
532 |
+
split
|
533 |
+
],
|
534 |
+
style=task,
|
535 |
+
)
|
536 |
+
add_task(
|
537 |
+
task + "_eval", # pointing validation set
|
538 |
+
source=seqio.TfdsDataSource(
|
539 |
+
tfds_name="pointing:1.0.2",
|
540 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
541 |
+
),
|
542 |
+
preprocessors=[
|
543 |
+
filter_points,
|
544 |
+
functools.partial(pointing_preprocessor, with_count=is_count),
|
545 |
+
split
|
546 |
+
],
|
547 |
+
style=task,
|
548 |
+
)
|
549 |
+
add_task(
|
550 |
+
task + "_high_freq",
|
551 |
+
source=seqio.TfdsDataSource(
|
552 |
+
tfds_name="count_qa:0.0.2",
|
553 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
554 |
+
splits=dict(
|
555 |
+
train="train[2048:]",
|
556 |
+
validation="train[:2048]"
|
557 |
+
)
|
558 |
+
),
|
559 |
+
preprocessors=[
|
560 |
+
filter_points,
|
561 |
+
fix_count_qa,  # Fix a tfrecord bug (TODO: fix the underlying records)
|
562 |
+
functools.partial(pointing_preprocessor, with_count=is_count),
|
563 |
+
split,
|
564 |
+
],
|
565 |
+
style=task,
|
566 |
+
)
|
567 |
+
add_task(
|
568 |
+
"fast_flickr_count_qa_" + task,
|
569 |
+
source=seqio.TfdsDataSource(
|
570 |
+
tfds_name="fast_flickr_count_qa:1.0.4",
|
571 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
572 |
+
),
|
573 |
+
preprocessors=[
|
574 |
+
functools.partial(count_qa_preprocessor, with_count=is_count),
|
575 |
+
],
|
576 |
+
inf_preprocessor=[
|
577 |
+
functools.partial(count_qa_preprocessor, with_count=is_count, for_inference=True),
|
578 |
+
],
|
579 |
+
style=task,
|
580 |
+
)
|
581 |
+
|
582 |
+
|
583 |
+
add_task(
|
584 |
+
"countbench_qa",
|
585 |
+
source=seqio.TfdsDataSource(
|
586 |
+
tfds_name="countbench_qa:1.2.0",
|
587 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
588 |
+
),
|
589 |
+
inf_only=True,
|
590 |
+
preprocessors=[
|
591 |
+
count_qa_preprocessor_inf,
|
592 |
+
],
|
593 |
+
style="point_count",
|
594 |
+
)
|
595 |
+
|
596 |
+
|
597 |
+
add_task(
|
598 |
+
f"pointing_test", # pointing set with segmentation ground truths
|
599 |
+
source=seqio.TfdsDataSource(
|
600 |
+
tfds_name="pointing:1.0.3",
|
601 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
602 |
+
),
|
603 |
+
preprocessors=[
|
604 |
+
pointing_inf_preprocessor
|
605 |
+
],
|
606 |
+
style=task,
|
607 |
+
inf_only=True,
|
608 |
+
)
|
609 |
+
|
610 |
+
|
611 |
+
add_task(
|
612 |
+
"point_qa",
|
613 |
+
source=seqio.TfdsDataSource(
|
614 |
+
tfds_name="point_qa:0.0.5",
|
615 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
616 |
+
splits=dict(
|
617 |
+
train="train[512:]",
|
618 |
+
validation="train[:512]"
|
619 |
+
)
|
620 |
+
),
|
621 |
+
preprocessors=[extract_point_qa, split],
|
622 |
+
style="point_qa",
|
623 |
+
)
|
624 |
+
|
625 |
+
add_task(
|
626 |
+
"clocks_no_aug",
|
627 |
+
source=seqio.TfdsDataSource(
|
628 |
+
tfds_name="clocks:1.0.1",
|
629 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
630 |
+
),
|
631 |
+
preprocessors=[
|
632 |
+
clocks_preprocessor
|
633 |
+
],
|
634 |
+
style="clocks",
|
635 |
+
)
|
636 |
+
|
637 |
+
|
638 |
+
add_task(
|
639 |
+
"clock_bench",
|
640 |
+
source=seqio.TfdsDataSource(
|
641 |
+
tfds_name="clock_bench:1.0.0",
|
642 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
643 |
+
),
|
644 |
+
preprocessors=[
|
645 |
+
clock_bench_preprocessor
|
646 |
+
],
|
647 |
+
inf_only=True,
|
648 |
+
style="clocks",
|
649 |
+
)
|
650 |
+
|
651 |
+
add_task(
|
652 |
+
"wiki_data",
|
653 |
+
source=seqio.TfdsDataSource(
|
654 |
+
tfds_name="cockatoo_wiki:1.0.0",
|
655 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
656 |
+
splits={"train": "train[10240:]", "validation": "train[:5120]", "test": "train[5120:10240]"}
|
657 |
+
),
|
658 |
+
preprocessors=[extract_wiki_data],
|
659 |
+
style="wiki_data",
|
660 |
+
)
|
661 |
+
|
662 |
+
|
663 |
+
add_task(
|
664 |
+
"wiki_data_name",
|
665 |
+
source=seqio.TfdsDataSource(
|
666 |
+
tfds_name="cockatoo_wiki:1.0.0",
|
667 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
668 |
+
splits={"train": "train[10240:]", "validation": "train[:5120]", "test": "train[5120:10240]"}
|
669 |
+
),
|
670 |
+
preprocessors=[
|
671 |
+
extract_wiki_data_name,
|
672 |
+
mild_color_aug_preprocessor
|
673 |
+
],
|
674 |
+
style="wiki_data",
|
675 |
+
)
|
676 |
+
|
677 |
+
add_task(
|
678 |
+
"wiki_data_describe",
|
679 |
+
source=seqio.TfdsDataSource(
|
680 |
+
tfds_name="cockatoo_wiki:1.0.0",
|
681 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
682 |
+
splits={"train": "train[10240:]", "validation": "train[:5120]", "test": "train[5120:10240]"}
|
683 |
+
),
|
684 |
+
preprocessors=[extract_wiki_data_describe],
|
685 |
+
inf_only=True,
|
686 |
+
style="wiki_data",
|
687 |
+
)
|
688 |
+
|
700 |
+
|
701 |
+
|
702 |
+
for name, src in [
|
703 |
+
("scifi_charts", seqio.TfdsDataSource(
|
704 |
+
tfds_name="sci_fi_charts:1.0.6",
|
705 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
706 |
+
splits={"train": "train[1024:]", "validation": "train[:1024]"}
|
707 |
+
)),
|
708 |
+
("scifi_table", seqio.TfdsDataSource(
|
709 |
+
tfds_name="sci_fi_table:1.0.3",
|
710 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
711 |
+
splits={"train": "train[1024:]", "validation": "train[:1024]"}
|
712 |
+
)),
|
713 |
+
("scifi_document", seqio.TfdsDataSource(
|
714 |
+
tfds_name="sci_fi_document:1.0.3",
|
715 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
716 |
+
splits={"train": "train[1024:]", "validation": "train[:1024]"}
|
717 |
+
)),
|
718 |
+
("scifi_diagram", seqio.TfdsDataSource(
|
719 |
+
tfds_name="sci_fi_diagram:1.0.0",
|
720 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
721 |
+
splits={"train": "train[1024:]", "validation": "train[:1024]"}
|
722 |
+
)),
|
723 |
+
("scifi_natural", seqio.TfdsDataSource(
|
724 |
+
tfds_name="sci_fi_natural:1.0.1",
|
725 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
726 |
+
splits={"train": "train[128:]", "validation": "train[:128]"}
|
727 |
+
)),
|
728 |
+
("scifi_nutrition", seqio.TfdsDataSource(
|
729 |
+
tfds_name="sci_fi_nutrition:1.0.0",
|
730 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
731 |
+
splits={"train": "train[128:]", "validation": "train[:128]"}
|
732 |
+
))
|
733 |
+
]:
|
734 |
+
add_task(
|
735 |
+
name + "_qa",
|
736 |
+
source=src,
|
737 |
+
preprocessors=[
|
738 |
+
remove_no_qa,
|
739 |
+
_preprocess_scifi,
|
740 |
+
extract_individual_vqa,
|
741 |
+
],
|
742 |
+
inf_preprocessor=[
|
743 |
+
remove_no_qa, _preprocess_scifi,
|
744 |
+
functools.partial(flatten_parts, parts=["question", "answer"]),
|
745 |
+
extract_individual_vqa,
|
746 |
+
],
|
747 |
+
style=name,
|
748 |
+
)
|
749 |
+
add_task(
|
750 |
+
name + "_qa_split",
|
751 |
+
source=src,
|
752 |
+
preprocessors=[
|
753 |
+
remove_no_qa,
|
754 |
+
_preprocess_scifi,
|
755 |
+
extract_individual_vqa,
|
756 |
+
split
|
757 |
+
],
|
758 |
+
inf_preprocessor=[
|
759 |
+
remove_no_qa, _preprocess_scifi,
|
760 |
+
functools.partial(flatten_parts, parts=["question", "answer"]),
|
761 |
+
extract_individual_vqa,
|
762 |
+
],
|
763 |
+
style=name,
|
764 |
+
)
|
765 |
+
add_task(
|
766 |
+
name + "_qa_exp",
|
767 |
+
source=src,
|
768 |
+
preprocessors=[
|
769 |
+
remove_no_qa,
|
770 |
+
_preprocess_scifi,
|
771 |
+
extract_scifi_qa_exp,
|
772 |
+
extract_individual_vqa,
|
773 |
+
],
|
774 |
+
inf_preprocessor=[
|
775 |
+
remove_no_qa, _preprocess_scifi,
|
776 |
+
extract_scifi_qa_exp,
|
777 |
+
functools.partial(flatten_parts, parts=["question", "answer"]),
|
778 |
+
extract_individual_vqa,
|
779 |
+
],
|
780 |
+
style=name + "_qa_exp",
|
781 |
+
)
|
782 |
+
add_task(
|
783 |
+
name + "_qa_exp_split",
|
784 |
+
source=src,
|
785 |
+
preprocessors=[
|
786 |
+
remove_no_qa,
|
787 |
+
_preprocess_scifi,
|
788 |
+
extract_scifi_qa_exp,
|
789 |
+
extract_individual_vqa,
|
790 |
+
split,
|
791 |
+
],
|
792 |
+
inf_preprocessor=[
|
793 |
+
remove_no_qa, _preprocess_scifi,
|
794 |
+
extract_scifi_qa_exp,
|
795 |
+
functools.partial(flatten_parts, parts=["question", "answer"]),
|
796 |
+
extract_individual_vqa,
|
797 |
+
],
|
798 |
+
style=name + "_qa_exp",
|
799 |
+
)
|
800 |
+
add_task(
|
801 |
+
name + "_exp",
|
802 |
+
source=src,
|
803 |
+
preprocessors=[
|
804 |
+
remove_no_qa,
|
805 |
+
_preprocess_scifi,
|
806 |
+
scifi_explanation_only,
|
807 |
+
extract_individual_vqa,
|
808 |
+
split
|
809 |
+
],
|
810 |
+
style=name + "_exp"
|
811 |
+
)
|
812 |
+
add_task(
|
813 |
+
name + "_demo",
|
814 |
+
source=src,
|
815 |
+
preprocessors=[
|
816 |
+
remove_no_qa,
|
817 |
+
_preprocess_scifi,
|
818 |
+
extract_scifi_qa_demo,
|
819 |
+
extract_individual_vqa,
|
820 |
+
split
|
821 |
+
],
|
822 |
+
style="scifi_demo"
|
823 |
+
)
|
824 |
+
|
825 |
+
|
826 |
+
add_task(
|
827 |
+
"chart_qa_scifi",
|
828 |
+
source=seqio.TfdsDataSource(
|
829 |
+
tfds_name="chart_qa:1.0.2",
|
830 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
831 |
+
splits={"train": "train", "validation": "val", "test": "test"}
|
832 |
+
),
|
833 |
+
preprocessors=[
|
834 |
+
rename(question="query", answer="label", **{"metadata/is_human": "is_human"}),
|
835 |
+
extract_individual_vqa,
|
836 |
+
],
|
837 |
+
style="scifi_charts_qa_exp",
|
838 |
+
)
|
839 |
+
|
840 |
+
|
841 |
+
add_task(
|
842 |
+
"chart_qa_prompting",
|
843 |
+
source=seqio.TfdsDataSource(
|
844 |
+
tfds_name="chart_qa:1.0.2",
|
845 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
846 |
+
splits={"train": "train", "validation": "val", "test": "test"}
|
847 |
+
),
|
848 |
+
preprocessors=[
|
849 |
+
rename(question="query", answer="label", **{"metadata/is_human": "is_human"}),
|
850 |
+
chartqa_prompting,
|
851 |
+
extract_individual_vqa,
|
852 |
+
],
|
853 |
+
style="chart_qa",
|
854 |
+
)
|
855 |
+
|
856 |
+
|
857 |
+
add_task(
|
858 |
+
"chart_qa_prompting_explanation",
|
859 |
+
source=seqio.TfdsDataSource(
|
860 |
+
tfds_name="chart_qa:1.0.2",
|
861 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
862 |
+
splits={"train": "train", "validation": "val", "test": "test"}
|
863 |
+
),
|
864 |
+
preprocessors=[
|
865 |
+
rename(question="query", answer="label", **{"metadata/is_human": "is_human"}),
|
866 |
+
chartqa_explanation,
|
867 |
+
extract_individual_vqa,
|
868 |
+
],
|
869 |
+
style="chart_qa",
|
870 |
+
)
|
871 |
+
|
872 |
+
|
873 |
+
|
874 |
+
add_task(
|
875 |
+
"coco_captioning_karpathy_multi",
|
876 |
+
source=seqio.TfdsDataSource(
|
877 |
+
tfds_name="coco_captioning_karpathy:1.0.2",
|
878 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
879 |
+
splits={"train": "train", "validation": "val", "test": "test"}
|
880 |
+
),
|
881 |
+
preprocessors=[rename(text="captions")],
|
882 |
+
inf_preprocessor=[add_coco_url],
|
883 |
+
style="coco_captioning",
|
884 |
+
)
|
885 |
+
|
886 |
+
|
887 |
+
add_task(
|
888 |
+
"coco_caption_2017_grouped",
|
889 |
+
source=seqio.TfdsDataSource(
|
890 |
+
tfds_name="coco_all:1.0.1",
|
891 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
892 |
+
),
|
893 |
+
preprocessors=[
|
894 |
+
functools.partial(
|
895 |
+
rekey, key_map={
|
896 |
+
"image/filename": ["image/filename"],
|
897 |
+
"image": ["image"],
|
898 |
+
"text": ["captions", "text"]
|
899 |
+
}),
|
900 |
+
join_captions
|
901 |
+
],
|
902 |
+
style="coco_captioning_multiple",
|
903 |
+
)
|
904 |
+
|
905 |
+
|
906 |
+
add_task(
|
907 |
+
"llava_pretrain",
|
908 |
+
source=seqio.TfdsDataSource(
|
909 |
+
tfds_name="llava_pretrain:1.0.0",
|
910 |
+
tfds_data_dir="gs://mm-olmo-datasets/",
|
911 |
+
splits=dict(
|
912 |
+
train="train[4096:]",
|
913 |
+
validation="train[:4096]"
|
914 |
+
)
|
915 |
+
),
|
916 |
+
preprocessors=[extract_llava],
|
917 |
+
style="web_caption"
|
918 |
+
)
|
919 |
+
|
920 |
+
|
921 |
+
add_task(
|
922 |
+
"rohun_images",
|
923 |
+
source=seqio.TfdsDataSource(
|
924 |
+
tfds_name="rohun_images:1.0.0",
|
925 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
926 |
+
),
|
927 |
+
preprocessors=[],
|
928 |
+
style="long_caption",
|
929 |
+
inf_only=True
|
930 |
+
)
|
931 |
+
|
932 |
+
|
933 |
+
add_task(
|
934 |
+
"dense_caption_eval",
|
935 |
+
source=seqio.TfdsDataSource(
|
936 |
+
tfds_name="dense_captioning_eval:1.0.0",
|
937 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
938 |
+
splits=dict(validation="train")
|
939 |
+
),
|
940 |
+
preprocessors=[],
|
941 |
+
style="long_caption",
|
942 |
+
inf_only=True
|
943 |
+
)
|
944 |
+
|
945 |
+
|
946 |
+
add_task(
|
947 |
+
"dense_caption_eval_dbg",
|
948 |
+
source=seqio.TfdsDataSource(
|
949 |
+
tfds_name="dense_captioning_eval:1.0.0",
|
950 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
951 |
+
splits=dict(validation="train")
|
952 |
+
),
|
953 |
+
preprocessors=[
|
954 |
+
lambda ds: ds.filter(lambda x: x["url"] == "https://explore-multimodal-datasets.s3.us-west-2.amazonaws.com/eval-set/v0/eval-set/a211be07e2c9c722ef75093026a608856bd07ad935ebdedea6f2944b1f2d2b0e.jpg")
|
955 |
+
],
|
956 |
+
style="long_caption",
|
957 |
+
inf_only=True
|
958 |
+
)
|
959 |
+
|
960 |
+
|
961 |
+
add_task(
|
962 |
+
"dense_caption_sample",
|
963 |
+
source=seqio.TfdsDataSource(
|
964 |
+
tfds_name="dense_captioning_eval:1.0.0",
|
965 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
966 |
+
splits=dict(
|
967 |
+
validation="train"
|
968 |
+
)
|
969 |
+
),
|
970 |
+
preprocessors=[select_dense_caption_sample],
|
971 |
+
style="long_caption",
|
972 |
+
)
|
973 |
+
|
974 |
+
|
975 |
+
add_task(
|
976 |
+
"cockatoo_1per_caption_287k",
|
977 |
+
source=seqio.TfdsDataSource(
|
978 |
+
tfds_name="cockatoo_1per_caption_287k:1.0.5",
|
979 |
+
tfds_data_dir="gs://mm-olmo-data/",
|
980 |
+
splits=dict(
|
981 |
+
train="train[5120:]",
|
982 |
+
validation="train[:5120]"
|
983 |
+
)
|
984 |
+
),
|
985 |
+
preprocessors=[
|
986 |
+
rename(text="caption"),
|
987 |
+
],
|
988 |
+
style="long_caption"
|
989 |
+
)
|
990 |
+
|
991 |
+
|
992 |
+
def _filter_large_ratio(ds):
|
993 |
+
return ds.filter(
|
994 |
+
lambda x: tf.shape(x["image"])[0] > tf.shape(x["image"])[1]*2
|
995 |
+
)
|
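# (Keeps only examples whose image height is more than twice its width, i.e.
# strongly portrait images, for the cockatoo_dbg debugging task below.)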
996 |
+
|
997 |
+
|
998 |
+
add_task(
|
999 |
+
f"cockatoo_dbg",
|
1000 |
+
source= seqio.TfdsDataSource(
|
1001 |
+
tfds_name="cockatoo_476k:1.0.5",
|
1002 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1003 |
+
splits=dict(
|
1004 |
+
train="train[5120:]",
|
1005 |
+
validation="train[:5120]"
|
1006 |
+
)
|
1007 |
+
)
|
1008 |
+
,
|
1009 |
+
preprocessors=[
|
1010 |
+
_filter_large_ratio,
|
1011 |
+
extract_caption_and_transcript
|
1012 |
+
],
|
1013 |
+
style=["long_caption", "transcript"]
|
1014 |
+
)
|
1015 |
+
|
1016 |
+
|
1017 |
+
for name, src in [
|
1018 |
+
("712k_sept6", seqio.TfdsDataSource(
|
1019 |
+
tfds_name="cockatoo_712k_sept6:1.0.5",
|
1020 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1021 |
+
splits=dict(
|
1022 |
+
train="train[5120:]",
|
1023 |
+
validation="train[:5120]"
|
1024 |
+
)
|
1025 |
+
)),
|
1026 |
+
("476k", seqio.TfdsDataSource(
|
1027 |
+
tfds_name="cockatoo_476k:1.0.5",
|
1028 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1029 |
+
splits=dict(
|
1030 |
+
train="train[5120:]",
|
1031 |
+
validation="train[:5120]"
|
1032 |
+
)
|
1033 |
+
)),
|
1034 |
+
("476k_gpt_captions", seqio.TfdsDataSource(
|
1035 |
+
tfds_name="cockatoo_476k_gpt_captions:1.0.5",
|
1036 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1037 |
+
splits=dict(
|
1038 |
+
train="train[5120:]",
|
1039 |
+
validation="train[:5120]"
|
1040 |
+
)
|
1041 |
+
)),
|
1042 |
+
("100k_of_476k_gpt_captions", seqio.TfdsDataSource(
|
1043 |
+
tfds_name="cockatoo_476k_gpt_captions:1.0.5",
|
1044 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1045 |
+
splits=dict(
|
1046 |
+
train="train[5120:105120]",
|
1047 |
+
validation="train[:5120]"
|
1048 |
+
)
|
1049 |
+
)),
|
1050 |
+
("200k_of_476k_gpt_captions", seqio.TfdsDataSource(
|
1051 |
+
tfds_name="cockatoo_476k_gpt_captions:1.0.5",
|
1052 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1053 |
+
splits=dict(
|
1054 |
+
train="train[5120:205120]",
|
1055 |
+
validation="train[:5120]"
|
1056 |
+
)
|
1057 |
+
)),
|
1058 |
+
("300k_of_476k_gpt_captions", seqio.TfdsDataSource(
|
1059 |
+
tfds_name="cockatoo_476k_gpt_captions:1.0.5",
|
1060 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1061 |
+
splits=dict(
|
1062 |
+
train="train[5120:305120]",
|
1063 |
+
validation="train[:5120]"
|
1064 |
+
)
|
1065 |
+
)),
|
1066 |
+
("400k_of_476k_gpt_captions", seqio.TfdsDataSource(
|
1067 |
+
tfds_name="cockatoo_476k_gpt_captions:1.0.5",
|
1068 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1069 |
+
splits=dict(
|
1070 |
+
train="train[5120:405120]",
|
1071 |
+
validation="train[:5120]"
|
1072 |
+
)
|
1073 |
+
)),
|
1074 |
+
("400k_of_476k", seqio.TfdsDataSource(
|
1075 |
+
tfds_name="cockatoo_476k:1.0.5",
|
1076 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1077 |
+
splits=dict(
|
1078 |
+
train="train[5120:405120]",
|
1079 |
+
validation="train[:5120]"
|
1080 |
+
)
|
1081 |
+
)),
|
1082 |
+
("300k_of_476k", seqio.TfdsDataSource(
|
1083 |
+
tfds_name="cockatoo_476k:1.0.5",
|
1084 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1085 |
+
splits=dict(
|
1086 |
+
train="train[5120:305120]",
|
1087 |
+
validation="train[:5120]"
|
1088 |
+
)
|
1089 |
+
)),
|
1090 |
+
("200k_of_476k", seqio.TfdsDataSource(
|
1091 |
+
tfds_name="cockatoo_476k:1.0.5",
|
1092 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1093 |
+
splits=dict(
|
1094 |
+
train="train[5120:205120]",
|
1095 |
+
validation="train[:5120]"
|
1096 |
+
)
|
1097 |
+
)),
|
1098 |
+
("100k_of_476k", seqio.TfdsDataSource(
|
1099 |
+
tfds_name="cockatoo_476k:1.0.5",
|
1100 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1101 |
+
splits=dict(
|
1102 |
+
train="train[5120:105120]",
|
1103 |
+
validation="train[:5120]"
|
1104 |
+
)
|
1105 |
+
)),
|
1106 |
+
("276k", seqio.TfdsDataSource(
|
1107 |
+
tfds_name="cockatoo:1.0.5",
|
1108 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1109 |
+
splits=dict(
|
1110 |
+
train="train[5120:]",
|
1111 |
+
validation="train[:5120]"
|
1112 |
+
)
|
1113 |
+
)),
|
1114 |
+
("180k", seqio.TfdsDataSource(
|
1115 |
+
tfds_name="cockatoo:1.0.3",
|
1116 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1117 |
+
splits=dict(
|
1118 |
+
train="train[4096:]",
|
1119 |
+
validation="train[:4096]"
|
1120 |
+
)
|
1121 |
+
)),
|
1122 |
+
("84k_claude_captions", seqio.TfdsDataSource(
|
1123 |
+
tfds_name="cockatoo_84k_claude_captions:1.0.0",
|
1124 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1125 |
+
splits=dict(
|
1126 |
+
train="train[1000:]",
|
1127 |
+
validation="train[:1000]"
|
1128 |
+
)
|
1129 |
+
)),
|
1130 |
+
]:
|
1131 |
+
add_task(
|
1132 |
+
f"cockatoo_{name}",
|
1133 |
+
source=src,
|
1134 |
+
preprocessors=[extract_caption],
|
1135 |
+
style="long_caption"
|
1136 |
+
)
|
1137 |
+
|
1138 |
+
add_task(
|
1139 |
+
f"cockatoo_and_transcript_{name}",
|
1140 |
+
source=src,
|
1141 |
+
preprocessors=[extract_caption_and_transcript],
|
1142 |
+
style=["long_caption", "transcript"]
|
1143 |
+
)
|
1144 |
+
|
1145 |
+
add_task(
|
1146 |
+
f"cockatoo_and_transcript_stratified_{name}",
|
1147 |
+
source=src,
|
1148 |
+
preprocessors=[
|
1149 |
+
extract_caption_and_transcript,
|
1150 |
+
# put this here to hack seqio into repeating the dataset after
|
1151 |
+
# `extract_caption_and_transcript` which will properly stratify the transcripts
|
1152 |
+
seqio.CacheDatasetPlaceholder(),
|
1153 |
+
],
|
1154 |
+
style=["long_caption", "transcript"]
|
1155 |
+
)
|
1156 |
+
add_task(
|
1157 |
+
f"cockatoo_and_all_transcripts_{name}",
|
1158 |
+
source=src,
|
1159 |
+
preprocessors=[extract_caption_and_all_transcripts],
|
1160 |
+
style=["long_caption", "transcript", "transcript", "transcript"]
|
1161 |
+
)
|
1162 |
+
|
1163 |
+
add_task(
|
1164 |
+
f"cockatoo_all_transcripts_{name}",
|
1165 |
+
source=src,
|
1166 |
+
preprocessors=[extract_all_transcripts],
|
1167 |
+
style="transcript"
|
1168 |
+
)
|
1169 |
+
add_task(
|
1170 |
+
f"cockatoo_transcripts_{name}",
|
1171 |
+
source=src,
|
1172 |
+
preprocessors=[extract_transcript],
|
1173 |
+
style="transcript"
|
1174 |
+
)
|
1175 |
+
|
1176 |
+
|
1177 |
+
TFRECORD_IMAGE_TEXT_FEATURES = {
|
1178 |
+
'image': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
|
1179 |
+
'text':tf.io.FixedLenFeature(shape=(), dtype=tf.string),
|
1180 |
+
}
|
1181 |
+
|
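# Sketch (illustration only): a record matching TFRECORD_IMAGE_TEXT_FEATURES is
# parsed from a serialized tf.train.Example into encoded image bytes plus a
# text string, e.g.
#
#   parsed = tf.io.parse_single_example(serialized_example,
#                                       TFRECORD_IMAGE_TEXT_FEATURES)
#   # parsed["image"] -> encoded image bytes, parsed["text"] -> caption bytes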
1182 |
+
|
1183 |
+
add_task(
|
1184 |
+
"laion400m",
|
1185 |
+
source=seqio.TFExampleDataSource(
|
1186 |
+
split_to_filepattern={
|
1187 |
+
"train": os.path.join("gs://unified-io-2-us-east/", "pretrain-datasets", "laion400m", "1.0.0", "laion400m-train*"),
|
1188 |
+
},
|
1189 |
+
feature_description=TFRECORD_IMAGE_TEXT_FEATURES,
|
1190 |
+
),
|
1191 |
+
preprocessors=[
|
1192 |
+
functools.partial(rekey, key_map={
|
1193 |
+
"image": ["image"],
|
1194 |
+
"text": ["text"]
|
1195 |
+
}),
|
1196 |
+
],
|
1197 |
+
style="laion",
|
1198 |
+
)
|
1199 |
+
|
1200 |
+
|
1201 |
+
add_task(
|
1202 |
+
"laion_2B",
|
1203 |
+
source=seqio.TFExampleDataSource(
|
1204 |
+
split_to_filepattern={
|
1205 |
+
"train": os.path.join(MULTITASK_TFDS_DATA_DIR, "laion2b_en", "1.0.0", "laion2b_en-train*"),
|
1206 |
+
},
|
1207 |
+
feature_description=TFRECORD_IMAGE_TEXT_FEATURES,
|
1208 |
+
),
|
1209 |
+
preprocessors=[
|
1210 |
+
functools.partial(rekey, key_map={
|
1211 |
+
"image": ["image"],
|
1212 |
+
"text": ["text"]
|
1213 |
+
}),
|
1214 |
+
],
|
1215 |
+
style="laion",
|
1216 |
+
)
|
1217 |
+
|
1218 |
+
|
1219 |
+
add_task(
|
1220 |
+
"region_caption_vg",
|
1221 |
+
source=seqio.TfdsDataSource(
|
1222 |
+
tfds_name="vg:1.0.1",
|
1223 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1224 |
+
),
|
1225 |
+
preprocessors=[region_captions_to_dense],
|
1226 |
+
style="region_captions",
|
1227 |
+
)
|
1228 |
+
|
1229 |
+
|
1230 |
+
add_task(
|
1231 |
+
"pdfa_eng_wds",
|
1232 |
+
source=seqio.TfdsDataSource(
|
1233 |
+
tfds_name="pdfa_eng_wds:1.0.0",
|
1234 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1235 |
+
),
|
1236 |
+
preprocessors=[
|
1237 |
+
functools.partial(max_words, max_words=400),
|
1238 |
+
format_pdfa_eng_wds
|
1239 |
+
],
|
1240 |
+
style="pdfa_eng_wds",
|
1241 |
+
)
|
1242 |
+
|
1243 |
+
|
1244 |
+
add_task(
|
1245 |
+
"idl_words",
|
1246 |
+
source=seqio.TfdsDataSource(
|
1247 |
+
tfds_name="idl_words:1.0.0",
|
1248 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1249 |
+
),
|
1250 |
+
preprocessors=[],
|
1251 |
+
style="idl_words",
|
1252 |
+
)
|
1253 |
+
|
1254 |
+
|
1255 |
+
|
1256 |
+
open_image_v6_keys_to_features = {
|
1257 |
+
'image': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
|
1258 |
+
'image_id': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
|
1259 |
+
'detection/label':tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1260 |
+
'detection/bbox':tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.float32, allow_missing=True),
|
1261 |
+
'detection/num':tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
|
1262 |
+
'vrd/sub_label': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1263 |
+
'vrd/obj_label': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1264 |
+
'vrd/sub_bbox':tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.float32, allow_missing=True),
|
1265 |
+
'vrd/obj_bbox':tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.float32, allow_missing=True),
|
1266 |
+
'vrd/relation': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1267 |
+
'vrd/num':tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
|
1268 |
+
'cap/cap_caption': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1269 |
+
'cap/num':tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
|
1270 |
+
'seg/masks': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1271 |
+
'seg/num':tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
|
1272 |
+
'seg/label': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1273 |
+
'seg/bbox': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.float32, allow_missing=True),
|
1274 |
+
}
|
1275 |
+
|
1276 |
+
|
1277 |
+
add_task(
|
1278 |
+
"localized_narratives_v6",
|
1279 |
+
source=seqio.TFExampleDataSource(
|
1280 |
+
split_to_filepattern={
|
1281 |
+
"train": os.path.join(MULTITASK_TFDS_DATA_DIR, "open_image_v6", "1.0.0", "open_image_v6-train*"),
|
1282 |
+
},
|
1283 |
+
feature_description=open_image_v6_keys_to_features,
|
1284 |
+
),
|
1285 |
+
preprocessors=[extract_localized_narrative],
|
1286 |
+
style="localized_narratives",
|
1287 |
+
)
|
1288 |
+
|
1289 |
+
|
1290 |
+
add_task(
|
1291 |
+
"lvis_objects",
|
1292 |
+
# A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
|
1293 |
+
source=seqio.TfdsDataSource(
|
1294 |
+
tfds_name="lvis:1.2.0",
|
1295 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1296 |
+
),
|
1297 |
+
preprocessors=[
|
1298 |
+
extract_lvis,
|
1299 |
+
region_captions_to_dense,
|
1300 |
+
],
|
1301 |
+
style="lvis_objects",
|
1302 |
+
)
|
1303 |
+
|
1304 |
+
|
1305 |
+
add_task(
|
1306 |
+
"open_images_with_objects",
|
1307 |
+
# A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
|
1308 |
+
source=seqio.TFExampleDataSource(
|
1309 |
+
split_to_filepattern={
|
1310 |
+
"train": os.path.join(MULTITASK_TFDS_DATA_DIR, "open_image_v6", "1.0.0", "open_image_v6-train*"),
|
1311 |
+
},
|
1312 |
+
feature_description=open_image_v6_keys_to_features,
|
1313 |
+
),
|
1314 |
+
preprocessors=[
|
1315 |
+
extract_open_images_boxes,
|
1316 |
+
region_captions_to_dense,
|
1317 |
+
],
|
1318 |
+
style="visual_narratives_with_objects",
|
1319 |
+
)
|
1320 |
+
|
1321 |
+
|
1322 |
+
add_task(
|
1323 |
+
"cockatoo_with_acc_476k_gpt_captions",
|
1324 |
+
source=seqio.TfdsDataSource(
|
1325 |
+
tfds_name="cockatoo_with_acc_476k_gpt_captions:1.0.0",
|
1326 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1327 |
+
splits=dict(
|
1328 |
+
train="train[5120:]",
|
1329 |
+
validation="train[:5120]"
|
1330 |
+
)
|
1331 |
+
),
|
1332 |
+
preprocessors=[accuracy_conditioned_joint],
|
1333 |
+
inf_preprocessor=[functools.partial(accuracy_conditioned_joint, is_eval=True)],
|
1334 |
+
style=None
|
1335 |
+
)
|
1336 |
+
|
1337 |
+
|
1338 |
+
add_task(
|
1339 |
+
"dense_caption_eval_with_acc",
|
1340 |
+
source=seqio.TfdsDataSource(
|
1341 |
+
tfds_name="dense_captioning_eval:1.0.0",
|
1342 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1343 |
+
splits=dict(validation="train")
|
1344 |
+
),
|
1345 |
+
preprocessors=[functools.partial(accuracy_conditioned_joint, is_eval=True)],
|
1346 |
+
style="long_caption",
|
1347 |
+
inf_only=True
|
1348 |
+
)
|
1349 |
+
|
1350 |
+
# ************************
|
1351 |
+
# VQA Datasets
|
1352 |
+
# ************************
|
1353 |
+
|
1354 |
+
add_task(
|
1355 |
+
"science_qa_img",
|
1356 |
+
source=seqio.TfdsDataSource(
|
1357 |
+
tfds_name="science_qa:1.0.0",
|
1358 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1359 |
+
splits={"train": "train", "validation": "val", "test": "test"}
|
1360 |
+
),
|
1361 |
+
preprocessors=[
|
1362 |
+
image_only,
|
1363 |
+
rename(answer_idx="answer"),
|
1364 |
+
build_question_with_hint,
|
1365 |
+
format_multiple_choice_qa
|
1366 |
+
],
|
1367 |
+
style="science_qa",
|
1368 |
+
)
|
1369 |
+
|
1370 |
+
|
1371 |
+
add_task(
|
1372 |
+
"tabwmp_da",
|
1373 |
+
source=seqio.TfdsDataSource(
|
1374 |
+
tfds_name="tab_mwp:1.0.0",
|
1375 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1376 |
+
splits={"train": "train", "validation": "dev", "test": "test"}
|
1377 |
+
),
|
1378 |
+
preprocessors=[
|
1379 |
+
rename(text="answer")
|
1380 |
+
],
|
1381 |
+
style="tabwmp_da",
|
1382 |
+
)
|
1383 |
+
|
1384 |
+
|
1385 |
+
add_task(
|
1386 |
+
"figure_qa",
|
1387 |
+
source=seqio.TfdsDataSource(
|
1388 |
+
tfds_name="figure_qa:1.0.2",
|
1389 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1390 |
+
splits={"train": "train1", "validation": "validation1", "test": "no_annot_test1"}
|
1391 |
+
),
|
1392 |
+
preprocessors=[extract_figureqa, extract_individual_vqa],
|
1393 |
+
style="figure_qa",
|
1394 |
+
)
|
1395 |
+
|
1396 |
+
add_task(
|
1397 |
+
"figure_qa_zero_shot",
|
1398 |
+
source=seqio.TfdsDataSource(
|
1399 |
+
tfds_name="figure_qa:1.0.2",
|
1400 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1401 |
+
splits={"train": "train1", "validation": "validation1", "test": "no_annot_test1"}
|
1402 |
+
),
|
1403 |
+
preprocessors=[extract_figureqa, convert_figureqa_answer, extract_individual_vqa],
|
1404 |
+
style="figure_qa",
|
1405 |
+
)
|
1406 |
+
|
1407 |
+
|
1408 |
+
add_task(
|
1409 |
+
"plot_qa",
|
1410 |
+
source=seqio.TfdsDataSource(
|
1411 |
+
tfds_name="plot_qa:1.0.0",
|
1412 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1413 |
+
),
|
1414 |
+
preprocessors=[extract_figureqa, extract_individual_vqa],
|
1415 |
+
inf_preprocessor=[
|
1416 |
+
extract_figureqa,
|
1417 |
+
functools.partial(flatten_parts, parts=["questions", "answer", "question_id"]),
|
1418 |
+
extract_individual_vqa
|
1419 |
+
],
|
1420 |
+
style="plot_qa",
|
1421 |
+
)
|
1422 |
+
|
1423 |
+
|
1424 |
+
add_task(
|
1425 |
+
"ai2_diagram",
|
1426 |
+
source=seqio.TfdsDataSource(
|
1427 |
+
tfds_name="ai2_diagram:1.0.2",
|
1428 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1429 |
+
splits={"train": "train[1024:]", "validation": "train[:1024]", "test": "test"}
|
1430 |
+
),
|
1431 |
+
preprocessors=[
|
1432 |
+
rename(choices="answer_texts", answer_idx="correct_answer"),
|
1433 |
+
format_multiple_choice_qa
|
1434 |
+
],
|
1435 |
+
style="ai2_diagram",
|
1436 |
+
)
|
1437 |
+
|
1438 |
+
|
1439 |
+
add_task(
|
1440 |
+
"ai2_diagram_v2",
|
1441 |
+
source=seqio.TfdsDataSource(
|
1442 |
+
tfds_name="ai2_diagram_v2:1.0.1",
|
1443 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1444 |
+
),
|
1445 |
+
preprocessors=[
|
1446 |
+
rename(choices="answer_texts", answer_idx="correct_answer"),
|
1447 |
+
format_ai2d
|
1448 |
+
],
|
1449 |
+
style="ai2_diagram",
|
1450 |
+
)
|
1451 |
+
|
1452 |
+
|
1453 |
+
add_task(
|
1454 |
+
"ai2_diagram_v2_transparent",
|
1455 |
+
source=seqio.TfdsDataSource(
|
1456 |
+
tfds_name="ai2_diagram_v2_transparent:1.0.5",
|
1457 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1458 |
+
),
|
1459 |
+
preprocessors=[
|
1460 |
+
rename(choices="answer_texts", answer_idx="correct_answer"),
|
1461 |
+
format_ai2d
|
1462 |
+
],
|
1463 |
+
style="ai2_diagram",
|
1464 |
+
)
|
1465 |
+
|
1466 |
+
# ai2_diagram_v2 mixed with additional abc label questions with a transparent box.
|
1467 |
+
# Shares the same image split as ai2_diagram_v2.
|
1468 |
+
add_task(
|
1469 |
+
"ai2_diagram_v2_mix_transparent",
|
1470 |
+
source=seqio.TfdsDataSource(
|
1471 |
+
tfds_name="ai2_diagram_v2_mix_transparent:1.0.6",
|
1472 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1473 |
+
splits={
|
1474 |
+
"train": "train_mix",
|
1475 |
+
"validation": "validation_mix",
|
1476 |
+
"test": "test_mix", # test should only use either transparent or opaque
|
1477 |
+
# "test": "test_opaque",
|
1478 |
+
}
|
1479 |
+
),
|
1480 |
+
preprocessors=[
|
1481 |
+
rename(choices="answer_texts", answer_idx="correct_answer"),
|
1482 |
+
format_ai2d
|
1483 |
+
],
|
1484 |
+
style="ai2_diagram",
|
1485 |
+
)
|
1486 |
+
|
1487 |
+
add_task(
|
1488 |
+
"ai2_diagram_v2_mix_transparent_one_style",
|
1489 |
+
source=seqio.TfdsDataSource(
|
1490 |
+
tfds_name="ai2_diagram_v2_mix_transparent:1.0.6",
|
1491 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1492 |
+
splits={
|
1493 |
+
"train": "train_mix",
|
1494 |
+
"validation": "validation_mix",
|
1495 |
+
"test": "test_mix", # test should only use either transparent or opaque
|
1496 |
+
# "test": "test_opaque",
|
1497 |
+
}
|
1498 |
+
),
|
1499 |
+
preprocessors=[
|
1500 |
+
rename(choices="answer_texts", answer_idx="correct_answer"),
|
1501 |
+
functools.partial(format_ai2d, variable_style=False),
|
1502 |
+
],
|
1503 |
+
style="ai2_diagram",
|
1504 |
+
)
|
1505 |
+
|
1506 |
+
|
1507 |
+
for src, test_sets in [
|
1508 |
+
["refclef_unc", ["testA", "testB", "testC", "testAB", "testBC"]],
|
1509 |
+
["refcoco_unc", ["testA", "testB"]],
|
1510 |
+
["refcocoplus_unc", ["testA", "testB"]],
|
    ["refcocog_umd", ["test"]],
]:
    if "coco" in src:
        add_url = [add_coco_url]
    else:
        add_url = []
    splits = {x: x for x in test_sets}
    splits.update({"train": "train", "validation": "val"})
    add_task(
        src,
        source=seqio.TfdsDataSource(
            tfds_name=f"{src}:1.0.2",
            tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
            splits=splits
        ),
        preprocessors=[refexp],
        inf_preprocessor=add_url + [
            refexp_inf,
            # Flatten objects
            functools.partial(flatten_parts, parts=["refexp", "metadata/bbox"]),
            # Flatten expressions
            functools.partial(flatten_parts, parts=["refexp"])
        ],
        style="refexp",
        decode_image=True,
    )
    add_task(
        src + "_pointing",
        source=seqio.TfdsDataSource(
            tfds_name=f"{src}:1.0.2",
            tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
            splits=splits
        ),
        preprocessors=[refexp_pointing],
        inf_preprocessor=add_url + [
            refexp_pointing_inf,
            functools.partial(flatten_parts, parts=["refexp", "metadata/bbox", "metadata/mask", "metadata/answer"]),
            functools.partial(flatten_parts, parts=["refexp"])
        ],
        decode_image=True,
        style="refexp_pointing",
    )


# FIXME
add_task(
    "ai2_diagram_test",
    source=seqio.TfdsDataSource(
        tfds_name="ai2_diagram:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train[1024:]", "validation": "train[:1024]", "test": "test"}
    ),
    preprocessors=[
        rename(choices="answer_texts", answer_idx="correct_answer"),
        format_multiple_choice_qa
    ],
    style="ai2_diagram",
)


add_task(
    "gqa",
    source=seqio.TfdsDataSource(
        tfds_name="gqa:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        functools.partial(format_gqa, is_balanced=True),
        extract_individual_vqa,
    ],
    inf_preprocessor=[
        functools.partial(format_gqa, is_balanced=True),
        extract_individual_vqa,
    ],
    style="gqa",
)


add_task(
    "gqa_multi",
    source=seqio.TfdsDataSource(
        tfds_name="gqa:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        functools.partial(format_gqa, is_balanced=True, flatten=False),
        extract_individual_vqa,
    ],
    inf_preprocessor=[
        functools.partial(format_gqa, is_balanced=True, flatten=False),
        extract_individual_vqa,
    ],
    style="gqa",
)


add_task(
    "text_vqa",
    source=seqio.TfdsDataSource(
        tfds_name="text_vqa:1.0.3",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(
            rekey, key_map={
                "image": ["image"],
                "questions": ["question"],
                "answers": ["answers"],
                "id": ["question_id"]
            }),
        extract_individual_vqa,
    ],
    style="text_vqa",
)


add_task(
    "okvqa",
    source=seqio.TfdsDataSource(
        tfds_name="ok_vqa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        rename(example_id="question_id"),
        add_coco_url,
        extract_individual_vqa,
    ],
    style="okvqa",
)

add_task(
    "a_okvqa_da",
    source=seqio.TfdsDataSource(
        tfds_name="a_ok_vqa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(**{
            "example_id": "question_id",
            "answers": "direct_answers",
            "metadata/difficult_direct_answer": "difficult_direct_answer"
        }),
        extract_individual_vqa,
    ],
    inf_preprocessor=[
        filter_difficult_direct_answer,
        rename(**{
            "example_id": "question_id",
            "answers": "direct_answers",
            "metadata/difficult_direct_answer": "difficult_direct_answer"
        }),
        add_coco_url,
        extract_individual_vqa,
    ],
    style="a_okvqa_da",
)


add_task(
    "a_okvqa_mc",
    source=seqio.TfdsDataSource(
        tfds_name="a_ok_vqa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(**{
            "example_id": "question_id",
            "metadata/difficult_direct_answer": "difficult_direct_answer",
            "answer_idx": "correct_choice_idx"
        }),
        add_coco_url,
        format_multiple_choice_qa,
    ],
    style="a_okvqa_mc",
)


add_task(
    "dv_qa",
    source=seqio.TfdsDataSource(
        tfds_name="dv_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val_easy"}
    ),
    preprocessors=[
        extract_figureqa,
        extract_individual_vqa,
    ],
    inf_preprocessor=[
        extract_figureqa,
        flatten_vqa,
        extract_individual_vqa
    ],
    style="dv_qa",
)


@seqio.map_over_dataset
def add_image_question_example_id(ex):
    key = tf.strings.join([ex["question"], "\n\n", ex["image"]])
    ex["metadata/example_id"] = tf.strings.to_hash_bucket(key, 2**30)
    return ex


add_task(
    "chart_qa",
    source=seqio.TfdsDataSource(
        tfds_name="chart_qa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(question="query", answer="label", **{"metadata/is_human": "is_human"}),
        add_image_question_example_id,
        extract_individual_vqa,
    ],
    style="chart_qa",
)


add_task(
    "chart_qa_ex",
    source=seqio.TfdsDataSource(
        tfds_name="chart_qa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(question="query", answer="label", **{"metadata/is_human": "is_human"}),
        extract_individual_vqa,
    ],
    style="scifi_charts_qa_exp",
)


add_task(
    "chart_qa_weighted",
    source=seqio.TfdsDataSource(
        tfds_name="chart_qa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(question="query", answer="label", **{"metadata/is_human": "is_human"}),
        extract_individual_vqa,
        functools.partial(reweight_chartqa, human=2*20901/(20901+7398), aug=2*7398/(20901+7398)),
    ],
    style="chart_qa",
)


add_task(
    "chart_qa_human",
    source=seqio.TfdsDataSource(
        tfds_name="chart_qa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(question="query", answer="label"),
        add_image_question_example_id,
        filter_human,
        extract_individual_vqa,
    ],
    style="chart_qa",
)


add_task(
    "chart_qa_aug",
    source=seqio.TfdsDataSource(
        tfds_name="chart_qa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(question="query", answer="label"),
        filter_aug,
        extract_individual_vqa,
    ],
    style="chart_qa",
)


add_task(
    "doc_qa",
    source=seqio.TfdsDataSource(
        tfds_name="doc_qa:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[fix_doqa_url, extract_individual_vqa],
    style="doc_qa",
)


add_task(
    "ocr_qa",
    source=seqio.TfdsDataSource(
        tfds_name="ocr_vqa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[extract_individual_vqa],
    inf_preprocessor=[flatten_vqa, extract_individual_vqa],
    style="ocr_vqa",
)


add_task(
    "st_qa",
    source=seqio.TfdsDataSource(
        tfds_name="st_vqa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train[1024:]", "validation": "train[:1024]", "test": "test"}
    ),
    preprocessors=[extract_individual_vqa],
    inf_preprocessor=[extract_individual_vqa],
    style="st_qa",
)


add_task(
    "tally_qa",
    source=seqio.TfdsDataSource(
        tfds_name="tally_qa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "test"}
    ),
    preprocessors=[
        extract_tally_qa,
        extract_individual_vqa
    ],
    inf_preprocessor=[
        extract_tally_qa,
        flatten_vqa,
        extract_individual_vqa
    ],
    style="tally_qa",
)


add_task(
    "info_qa",
    source=seqio.TfdsDataSource(
        tfds_name="info_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[extract_individual_vqa],
    style="info_qa",
)

add_task(
    "android_control",
    source=seqio.TfdsDataSource(
        tfds_name="android_control:2.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[extract_android_control],
    style="android_control",
)

for mode in ["ll", "hl", "hl_ll", "hl_cot"]:
    add_task(
        f"android_control_{mode}",
        source=seqio.TfdsDataSource(
            tfds_name="android_control:2.0.0",
            tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
            splits={"train": "train", "validation": "val", "test": "test"}
        ),
        preprocessors=[functools.partial(extract_andriod_control_inf, mode=mode)],
        style="android_control",
    )


map_coco_vqa = functools.partial(rekey, key_map={
    "image": ["image"],
    "questions": ["vqa", "questions"],
    "answers": ["vqa", "answers"],
    "id": ["vqa", "id"],
    "metadata/image_url": ["metadata/image_url"],
})


add_task(
    "coco_2017_vqa",
    source=seqio.TfdsDataSource(
        tfds_name="coco_all:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        add_coco_url,
        map_coco_vqa,
        flatten_vqa,
        extract_individual_vqa
    ],
    style="vqa2",
)


add_task(
    "cockatoo_qa",
    source=seqio.TfdsDataSource(
        tfds_name="cockatoo_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits=dict(
            train="train[5120:]",
            validation="train[:5120]"
        )
    ),
    preprocessors=[rename(text="answer")],
    style=None,
)


add_task(
    "synthetic_qa_v3",
    source=seqio.TfdsDataSource(
        tfds_name="synthetic_qa_v3:0.0.4",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits=dict(
            train="train[2048:]",
            validation="train[:2048]"
        )
    ),
    preprocessors=[extract_cockatoo_qa_v2, prefix_how_many_messages],
    style="synthetic_qa",
)


add_task(
    "synthetic_qa_v3_style_tag",
    source=seqio.TfdsDataSource(
        tfds_name="synthetic_qa_v3:0.0.4",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits=dict(
            train="train[2048:]",
            validation="train[:2048]"
        )
    ),
    preprocessors=[extract_cockatoo_qa_v2, prefix_how_many_messages],
    style="llm_qa",
)


add_task(
    "synthetic_qa_v3_as_user_qa",
    source=seqio.TfdsDataSource(
        tfds_name="synthetic_qa_v3:0.0.4",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits=dict(
            train="train[2048:]",
            validation="train[:2048]"
        )
    ),
    preprocessors=[extract_cockatoo_qa_v2, prefix_how_many_messages],
    style="user_qa",
)


add_task(
    "synthetic_qa_v3_multi_turn",
    source=seqio.TfdsDataSource(
        tfds_name="synthetic_qa_v3:0.0.4",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits=dict(
            train="train[2048:]",
            validation="train[:2048]"
        )
    ),
    preprocessors=[extract_cockatoo_qa_v2, filter_single_turn, prefix_how_many_messages],
    style="synthetic_qa",
)


NE_SHARDS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

for i in NE_SHARDS:
    add_task(
        f"named_entity{i}",
        source=seqio.TfdsDataSource(
            tfds_name=f"named_entities_qa_{i}_of_18:1.0.0",
            tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
            splits=dict(
                train="train[1024:]",
                validation="train[:1024]"
            )
        ),
        preprocessors=[filter_named_entity, extract_named_entity, extract_individual_vqa],
        inf_preprocessor=[
            filter_named_entity,
            extract_named_entity,
            flatten_vqa,
            extract_individual_vqa
        ],
        style="named_entity",
        ignore_errors=True
    )


add_task(
    "user_qa",
    source=seqio.TfdsDataSource(
        tfds_name="user_qa:0.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits=dict(
            train="train[2048:]",
            validation="train[:2048]"
        )
    ),
    preprocessors=[extract_cockatoo_qa_v2, prefix_how_many_messages],
    style="user_qa",
)

add_task(
    "user_questions_for_elo",
    source=seqio.TfdsDataSource(
        tfds_name="user_questions_for_elo:0.0.3",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[functools.partial(extract_individual_vqa, test=True)],
    inf_only=True,
    style="demo",
)


def _filter_by_id(ds, prediction_file, max_seq_len):
    with open(prediction_file) as f:
        predictions = json.load(f)
    is_long = []
    lens = []
    tokenizer = build_tokenizer("hf-Qwen/Qwen2-7B")
    for pred in predictions:
        n_tokens = len(tokenizer.encode(pred["prediction"]))
        lens.append(n_tokens)
        if n_tokens >= max_seq_len:
            is_long.append(pred["example_id"])
    is_long = tf.constant(is_long)
    logging.info(f"Filtering for {len(is_long)} ids")
    return ds.filter(lambda ex: tf.reduce_any(ex["example_id"] == is_long))


add_task(
    "user_questions_for_elo",
    source=seqio.TfdsDataSource(
        tfds_name="user_questions_for_elo:0.0.3",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[functools.partial(extract_individual_vqa, test=True)],
    inf_only=True,
    style="demo",
)


add_task(
    "user_questions_for_elo_long",
    source=seqio.TfdsDataSource(
        tfds_name="user_questions_for_elo:0.0.3",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(_filter_by_id, prediction_file="/weka/oe-training-default/chrisc/cockatoo/models/uber-model-v11/70b-335-30k-3.2-resume8k-noopt/predictions-ck20000-user_questions_for_elo-test/predictions.json", max_seq_len=230),
        functools.partial(extract_individual_vqa, test=True)
    ],
    inf_only=True,
    style="demo",
)


add_task(
    "coco_2014_vqa",
    source=seqio.TfdsDataSource(
        tfds_name="coco_2014_all:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        add_coco_url,
        map_coco_vqa,
        flatten_vqa,
        extract_individual_vqa
    ],
    inf_preprocessor=[
        add_coco_url,
        map_coco_vqa,
        flatten_vqa,
        extract_individual_vqa
    ],
    style="vqa2",
)


add_task(
    "coco_2014_vqa_multi",
    source=seqio.TfdsDataSource(
        tfds_name="coco_2014_all:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        add_coco_url,
        map_coco_vqa,
        extract_individual_vqa
    ],
    inf_preprocessor=[
        add_coco_url,
        map_coco_vqa,
        flatten_vqa,
        extract_individual_vqa
    ],
    style="vqa2",
)


add_task(
    "coco_2017_vqa_multi",
    source=seqio.TfdsDataSource(
        tfds_name="coco_all:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        add_coco_url,
        map_coco_vqa,
        extract_individual_vqa
    ],
    inf_preprocessor=[
        add_coco_url,
        map_coco_vqa,
        flatten_vqa,
        extract_individual_vqa
    ],
    style="vqa2",
)


add_task(
    "vqa_v2_test",
    source=seqio.TfdsDataSource(
        tfds_name="coco_test_all:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(rekey, key_map={
            "image": ["image"],
            "questions": ["vqa", "questions"],
            "answers": ["vqa", "answers"],
            "id": ["vqa", "id"],
        }),
        flatten_vqa,
        functools.partial(extract_individual_vqa, test=True)
    ],
    style="vqa2",
    inf_only=True
)

# ************************
# Eval-only Datasets
# ************************

add_task(
    "seed_bench_test",
    source=seqio.TfdsDataSource(
        tfds_name="seed_bench:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        format_multiple_choice_qa,
    ],
    style="a_okvqa_mc",
    inf_only=True
)


add_task(
    "pope_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="pope:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        add_coco_url,
        extract_individual_vqa
    ],
    style="vqa2",
    inf_only=True
)


MME_SOURCE = seqio.TfdsDataSource(
    tfds_name="mme:1.0.0",
    tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
)


add_task(
    "mme_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=MME_SOURCE,
    preprocessors=[
        functools.partial(flatten_parts, parts=["questions", "answers"]),
        rename(question="questions", answer="answers"),
        extract_individual_vqa,
    ],
    style="vqa2",
    inf_only=True
)

add_task(
    "real_world_qa_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="real_world_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(
            format_multiple_style_qa,
            types=['multiple_choice', 'short_answer'],
            styles=['a_okvqa_mc', 'vqa2'],
            default_style="a_okvqa_mc",
        ),
    ],
    style=None,
    inf_only=True
)

add_task(
    "real_world_qa_no_instruction",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="real_world_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(
            functools.partial(format_multiple_style_qa, strip_instruction=True),
            types=['multiple_choice', 'short_answer'],
            styles=['a_okvqa_mc', 'vqa2'],
            default_style="a_okvqa_mc",
        ),
    ],
    style=None,
    inf_only=True
)

add_task(
    "real_world_qa_dbg",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="real_world_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(
            format_multiple_style_qa,
            types=['multiple_choice', 'short_answer'],
            styles=['user_qa', 'user_qa'],
            default_style="user_qa",
        ),
    ],
    style=None,
    inf_only=True
)


add_task(
    "mmmu",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="mmmu:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "dev"},
    ),
    preprocessors=[
        rename(img_type="metadata/img_type"),
        functools.partial(
            extract_mmmu,
            types=['multiple-choice', 'open'],
            styles=['a_okvqa_mc', 'vqa2'],
            default_style="a_okvqa_mc",
        ),
    ],
    style=None,
)


add_task(
    "mmmu_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="mmmu:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "validation", "test": "test"},
    ),
    preprocessors=[
        rename(img_type="metadata/img_type"),
        extract_mmmu,
    ],
    style=None,
    inf_only=True
)

for style in ["vaia_qa", "vaia_qa_short_answer_first", "vqa_online"]:
    add_task(
        f"mmmu_test_{style}",
        # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
        source=seqio.TfdsDataSource(
            tfds_name="mmmu:1.0.0",
            # tfds_name="mmmu_khan_academy:1.0.1",
            tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
            splits={"validation": "validation", "test": "test", "dev": "dev"},
        ),
        preprocessors=[
            rename(img_type="metadata/img_type"),
            extract_mmmu_cot,
        ],
        style=style,
        inf_only=True
    )


add_task(
    "math_vista_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="math_vista:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "testmini", "test": "test"},
    ),
    preprocessors=[
        functools.partial(rekey, key_map={
            "id": ["id"],
            "query": ["query"],
            "image": ["image"],
            "choices": ["choices"],
            "answer": ["answer"],
            "metadata/question_type": ["question_type"],
            "metadata/answer_type": ["answer_type"],
            "metadata/precision": ["precision"],
            "metadata/split": ["metadata/split"],
        }),
        functools.partial(extract_math_vista, styles=['a_okvqa_mc', 'vqa2']),
    ],
    style=None,
    inf_only=True
)


add_task(
    "math_vista_v2",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="math_vista:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "testmini", "test": "test"},
    ),
    preprocessors=[
        functools.partial(rekey, key_map={
            "id": ["id"],
            "query": ["query"],
            "image": ["image"],
            "choices": ["choices"],
            "answer": ["answer"],
            "metadata/question_type": ["question_type"],
            "metadata/answer_type": ["answer_type"],
            "metadata/precision": ["precision"],
            "metadata/split": ["metadata/split"],
        }),
        reformat_math_vista,
        functools.partial(
            extract_math_vista,
            styles=['a_okvqa_mc', 'vqa2'],
        ),
    ],
    style=None,
    inf_only=True
)


MM_BENCH_SRC = seqio.TfdsDataSource(
    tfds_name="mmbench:1.0.0",
    tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    splits={"validation": "dev", "test": "test"},
)

add_task(
    "mmbench_test",
    source=MM_BENCH_SRC,
    preprocessors=[format_mmbench],
    style="a_okvqa_mc",
    inf_only=True
)


add_task(
    "sugar_crepe_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="sugar_crepe:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        add_coco_url,
        functools.partial(flatten_parts, parts=["choices", "answer_idx", "metadata/answer_type"]),
        format_multiple_choice_qa,
    ],
    style="a_okvqa_mc",
    inf_only=True
)


add_task(
    "blink_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="blink:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(rekey, key_map={
            "id": ["id"],
            "question": ["prompt"],
            "image": ["image_concat"],
            "choices": ["choices"],
            "answer_idx": ["answer_idx"],
            "metadata/subtask": ["metadata/subtask"],
            "metadata/question": ["question"],
        }),
        format_multiple_choice_qa,
        output_options,
    ],
    style="a_okvqa_mc",
    inf_only=True
)

add_task(
    "oscarbench_qa",
    source=seqio.TfdsDataSource(
        tfds_name="oscarbench_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "val"}
    ),
    preprocessors=[oscar_preprocessor],
    style="oscarbench_qa"
)

add_task(
    "charxiv",
    source=seqio.TfdsDataSource(
        tfds_name="charxiv:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "validation", "test": "test"}
    ),
    preprocessors=[charxiv_preprocessor, extract_individual_vqa],
    inf_preprocessor=[
        charxiv_preprocessor,
        functools.partial(flatten_parts, parts=["question", "answer"]),
        extract_individual_vqa,
    ],
    style="charxiv",
)

add_task(
    "charxiv_descriptive",
    source=seqio.TfdsDataSource(
        tfds_name="charxiv:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "validation", "test": "test"}
    ),
    preprocessors=[charxiv_descriptive_preprocessor, extract_individual_vqa],
    inf_preprocessor=[
        charxiv_descriptive_preprocessor,
        functools.partial(flatten_parts, parts=["question", "answer"]),
        extract_individual_vqa,
    ],
    style="charxiv_descriptive",
)

add_task(
    "charxiv_reasoning",
    source=seqio.TfdsDataSource(
        tfds_name="charxiv:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "validation", "test": "test"}
    ),
    preprocessors=[charxiv_reasoning_preprocessor, extract_individual_vqa],
    style="charxiv_reasoning",
)

for tablevqa_name in ["fintabnetqa", "vwtq", "vwtq_syn"]:
    add_task(
        tablevqa_name,
        source=seqio.TfdsDataSource(
            tfds_name=f"{tablevqa_name}:1.0.0",
            tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
            splits={"validation": "test[:125]", "test": "test"}
        ),
        preprocessors=[tablevqa_preprocessor, extract_individual_vqa],
        style=tablevqa_name,
    )

add_task(
    "vtabfact",
    source=seqio.TfdsDataSource(
        tfds_name="vtabfact:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "test[:125]", "test": "test"}
    ),
    preprocessors=[vtabfact_preprocessor, extract_individual_vqa],
    style="vtabfact",
)

add_task(
    "nutrition_fact",
    source=seqio.TfdsDataSource(
        tfds_name="nutrition_fact:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "test", "test": "test"}
    ),
    preprocessors=[nutrition_fact_preprocessor, extract_individual_vqa],
    inf_preprocessor=[
        nutrition_fact_preprocessor,
        functools.partial(flatten_parts, parts=["question", "answer"]),
        extract_individual_vqa,
    ],
    style="nutrition_fact",
    inf_only=True
)

for k in ["chart_qa", "info_qa", "doc_qa", "text_vqa", "coco_2014_vqa",
          "ai2_diagram_v2_mix_transparent", "chart_qa_human"]:
    TASKS[k + "_demo"] = dataclasses.replace(TASKS[k], style="demo")
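The final loop above clones existing task specs into "_demo" variants by overriding only the `style` field; this works because `TASKS` maps task names to dataclass instances, so `dataclasses.replace` copies an instance with selected fields swapped out. A small self-contained sketch of that pattern, using a hypothetical `TaskSpec` stand-in (the real spec class is defined earlier in tasks.py and carries more fields):

import dataclasses

@dataclasses.dataclass
class TaskSpec:  # hypothetical stand-in for the real task spec class
    name: str
    style: str
    inf_only: bool = False

TASKS = {"chart_qa": TaskSpec(name="chart_qa", style="chart_qa")}

# Same pattern as the loop above: copy the spec, overriding only `style`.
TASKS["chart_qa_demo"] = dataclasses.replace(TASKS["chart_qa"], style="demo")

print(TASKS["chart_qa_demo"])  # TaskSpec(name='chart_qa', style='demo', inf_only=False)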
torch_util.py
ADDED
@@ -0,0 +1,183 @@
import gc
import os
import logging
from typing import Optional, TypeVar, List, Tuple

import torch
import torch.distributed as dist

T = TypeVar("T")


log = logging.getLogger(__name__)


def seed_all(seed: int):
    """Seed all rng objects."""
    import random

    import numpy as np

    if seed < 0 or seed > 2**32 - 1:
        raise ValueError(f"Seed {seed} is invalid. It must be on [0; 2^32 - 1]")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # torch.manual_seed may call manual_seed_all but calling it again here
    # to make sure it gets called at least once
    torch.cuda.manual_seed_all(seed)


def is_distributed() -> bool:
    return dist.is_available() and dist.is_initialized()


def get_node_rank() -> int:
    return int(os.environ.get("NODE_RANK") or (get_global_rank() - get_local_rank()) // get_local_world_size())


def get_world_size() -> int:
    if is_distributed():
        return dist.get_world_size()
    else:
        return 1


def get_local_world_size() -> int:
    return int(os.environ.get("LOCAL_WORLD_SIZE") or 1)


def get_global_rank() -> int:
    if is_distributed():
        return int(os.environ.get("RANK") or dist.get_rank())
    else:
        return 0


def get_local_rank() -> int:
    return int(os.environ.get("LOCAL_RANK") or 0)


def get_fs_local_rank() -> int:
    """Get the local rank per filesystem, meaning that, regardless of the number of nodes,
    if all ranks share the same filesystem then `get_fs_local_rank()` will be equivalent to `get_global_rank()`,
    but if nodes do not share the same filesystem then `get_fs_local_rank()` will be equivalent to `get_local_rank()`.
    """
    if os.environ.get("OLMO_SHARED_FS"):
        return int(os.environ.get("FS_LOCAL_RANK") or get_global_rank())
    else:
        return int(os.environ.get("FS_LOCAL_RANK") or get_local_rank())


def move_to_device(o: T, device: torch.device) -> T:
    if isinstance(o, torch.Tensor):
        return o.to(device)  # type: ignore[return-value]
    elif isinstance(o, dict):
        return {k: move_to_device(v, device) for k, v in o.items()}  # type: ignore[return-value]
    elif isinstance(o, list):
        return [move_to_device(x, device) for x in o]  # type: ignore[return-value]
    elif isinstance(o, tuple):
        return tuple((move_to_device(x, device) for x in o))  # type: ignore[return-value]
    else:
        return o


def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False):
    """
    Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf``
    is ``True`` and to replace ``float("inf")`` with the maximum value of the dtype when ``check_pos_inf`` is ``True``.
    """
    if check_neg_inf:
        x.masked_fill_(x == float("-inf"), torch.finfo(x.dtype).min)
    if check_pos_inf:
        x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)


def get_default_device() -> torch.device:
    if torch.cuda.is_available() and torch.cuda.is_initialized():
        return torch.device("cuda")
    else:
        return torch.device("cpu")


def barrier() -> None:
    if is_distributed():
        dist.barrier()


def peak_gpu_memory(reset: bool = False) -> Optional[float]:
    """
    Get the peak GPU memory usage in MB across all ranks.
    Only rank 0 will get the final result.
    """
    if not torch.cuda.is_available():
        return None

    device = torch.device("cuda")
    peak_mb = torch.cuda.max_memory_allocated(device) / 1000000
    if is_distributed():
        peak_mb_tensor = torch.tensor(peak_mb, device=device)
        dist.reduce(peak_mb_tensor, 0, dist.ReduceOp.MAX)
        peak_mb = peak_mb_tensor.item()

    if reset:
        # Reset peak stats.
        torch.cuda.reset_max_memory_allocated(device)

    return peak_mb


V = TypeVar("V", bool, int, float)


def synchronize_value(value: V, device: torch.device) -> V:
    if dist.is_available() and dist.is_initialized():
        value_tensor = torch.tensor(value, device=device)
        dist.broadcast(value_tensor, 0)
        return value_tensor.item()  # type: ignore
    else:
        return value


def synchronize_flag(flag: bool, device: torch.device) -> bool:
    return synchronize_value(flag, device)


def gc_cuda():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def listinstr(lst, s, delimiter=None):
    assert isinstance(lst, list)
    for item in lst:
        if delimiter:
            if all(x in s for x in item.split(delimiter)):
                return True
        else:
            if item in s:
                return True
    return False


def freeze_module(module: torch.nn.Module, exclude_params: Optional[List[str]] = None):
    for name, param in module.named_parameters():
        if exclude_params is not None and listinstr(exclude_params, name):
            continue
        param.requires_grad = False


def freeze_parameters_by_name(model: torch.nn.Module, freeze_names: Tuple[str]):
    for name in freeze_names:
        try:
            module_or_param = model.get_submodule(name)
        except AttributeError:
            try:
                module_or_param = model.get_parameter(name)
            except AttributeError:
                log.warning(f"Could not find module or parameter with name {name}")
                # Skip names that resolve to neither a submodule nor a parameter.
                continue
        if isinstance(module_or_param, torch.nn.Module):
            freeze_module(module_or_param)
        else:
            module_or_param.requires_grad = False
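torch_util.py is a small collection of device and distributed helpers. A minimal usage sketch follows; the flat `from torch_util import ...` path is an assumption for illustration (inside the package these are imported as `from .torch_util import ...`):

import torch
from torch_util import seed_all, get_default_device, move_to_device, freeze_module, peak_gpu_memory

seed_all(0)
device = get_default_device()

model = torch.nn.Linear(8, 2).to(device)
freeze_module(model, exclude_params=["bias"])   # freeze everything except the bias

batch = {"inputs": torch.randn(4, 8), "masks": [torch.ones(4, 8)]}
batch = move_to_device(batch, device)           # recurses through dicts/lists/tuples

print(peak_gpu_memory())                        # None on CPU-only machines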
util.py
CHANGED
@@ -33,7 +33,7 @@ from .exceptions import (
     OLMoNetworkError,
     OLMoThreadError,
 )
-from .torch_util import get_global_rank, get_local_rank, get_node_rank, is_distributed
+# from .torch_util import get_global_rank, get_local_rank, get_node_rank, is_distributed
 
 try:
     from functools import cache
utils.py
ADDED
@@ -0,0 +1,195 @@
import dataclasses
import hashlib
import sys
import typing
import warnings
import socket
from typing import Optional, Any, Dict
import os
import logging
import absl.flags
from flax.traverse_util import flatten_dict

from ml_collections import ConfigDict, config_flags
from ml_collections.config_dict import placeholder
from mlxu import function_args_to_config

_log_extra_fields: Dict[str, Any] = {}


def is_float_printable(x):
    try:
        f"{x:0.2f}"
        return True
    except (ValueError, TypeError):
        return False


def compute_hash(string: str) -> str:
    """Computes the hash of a string."""
    return hashlib.sha256(string.encode("utf-8")).hexdigest()


def pop_metadata(data):
    meta = {k: data.pop(k) for k in list(data) if k.startswith("metadata")}
    return data, meta


def setup_logging():
    handler: logging.Handler
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter(
        "[%(levelname)-.1s %(asctime)s %(filename)s:%(lineno)s] %(message)s",
        datefmt="%H:%M:%S"
    )
    handler.setFormatter(formatter)
    logging.basicConfig(handlers=[handler], level=logging.INFO)

    logging.captureWarnings(True)
    logging.getLogger("urllib3").setLevel(logging.ERROR)


def get_maybe_optional_type(field_type):
    if type(None) in typing.get_args(field_type):
        # Handle optional type
        args = [x for x in typing.get_args(field_type) if x != type(None)]
        assert len(args) == 1
        field_type = args[0]
    return field_type


def config_from_dataclass(dataclass, defaults_to_none=False) -> ConfigDict:
    """Build a `ConfigDict` matching the possibly nested dataclass

    dataclass: A dataclass instance or a dataclass type; if an instance, defaults
        will be set to the values in the instance, if a class, defaults will be
        set to the field defaults, or None if the field is required
    defaults_to_none: Make all defaults None
    """
    out = {}
    fields = dataclasses.fields(dataclass)
    for field in fields:
        if not field.init:
            continue

        if defaults_to_none:
            default = None
        elif hasattr(dataclass, field.name):
            default = getattr(dataclass, field.name)
        elif field.default is dataclasses.MISSING:
            default = None
        else:
            default = field.default

        field_type = get_maybe_optional_type(field.type)

        if hasattr(field_type, "__dataclass_fields__"):
            if not defaults_to_none and default is None:
                pass
            else:
                out[field.name] = config_from_dataclass(
                    default or field.type, defaults_to_none=defaults_to_none)
        else:
            if default is None:
                assert not field_type == typing.Any
                origin = getattr(field_type, "__origin__", None)
                if origin is not None:
                    field_type = origin
                out[field.name] = placeholder(field_type)
            else:
                out[field.name] = default
    return ConfigDict(out)


def dataclass_with_none(cls):
    """Build an instance of possibly nested dataclass `cls` with all attributes None"""
    fields = dataclasses.fields(cls)
    args = {}
    for field in fields:
        if not field.init:
            pass
        elif dataclasses.is_dataclass(field.type):
            args[field.name] = dataclass_with_none(field.type)
        else:
            args[field.name] = None
    return cls(**args)


def dataclass_from_config(cls, config: Dict):
    """Build an instance of `cls` with attributes from `config`"""
    fields = dataclasses.fields(cls)
    args = set(x.name for x in fields)
    for k in config.keys():
        if k not in args:
            raise ValueError(f"Config has unknown arg {k} for {cls}")
    args = {}
    for field in fields:
        if not field.init:
            continue

        field_type = get_maybe_optional_type(field.type)
        if hasattr(field_type, "__dataclass_fields__"):
            if config.get(field.name) is None:
                args[field.name] = None
            elif hasattr(field_type, "from_dict"):
                src = config[field.name]
                if isinstance(src, ConfigDict):
                    src = src.to_dict()
                args[field.name] = field_type.from_dict(src)
            else:
                args[field.name] = dataclass_from_config(field_type, config[field.name])
        elif field.name in config:
            if isinstance(config[field.name], ConfigDict):
                args[field.name] = config[field.name].to_dict()
            else:
                args[field.name] = config[field.name]
    return cls(**args)


def update_dataclass(obj, updates):
    """Sets attributes in `obj` to match non-None fields in `updates`"""
    fields = dataclasses.fields(obj)
    for field in fields:
        if not field.init:
            continue
        update = updates.get(field.name)
        if update is None:
            continue
        current_value = getattr(obj, field.name)
        if dataclasses.is_dataclass(current_value):
            update_dataclass(current_value, update)
        else:
            if isinstance(update, (ConfigDict, dict)):
                assert all(x is None for x in flatten_dict(update).values())
            else:
                setattr(obj, field.name, update)


def log_metrics_to_console(prefix: str, metrics: Dict[str, float]):
    # Stolen from the OLMo codebase
    def format_value(value: float) -> str:
        if isinstance(value, str):
            return value
        if value < 0.0001:
            return str(value)  # scientific notation
        elif value > 1000:
            return f"{int(value):,d}"
        elif value > 100:
            return f"{value:.1f}"
        elif value > 10:
            return f"{value:.2f}"
        elif value > 1:
            return f"{value:.3f}"
        else:
            return f"{value:.4f}"

    logging.info(
        f"{prefix}\n"
        + "\n".join(
            [
                f"  {name}={format_value(value)}"
                for name, value in metrics.items()
                if not name.startswith("optim/")  # there's too many optimizer metrics
            ]
        )
    )
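utils.py mainly bridges dataclass-based configs and ml_collections `ConfigDict`s. A small round-trip sketch of `config_from_dataclass` and `dataclass_from_config`, using hypothetical `OptimizerConfig`/`TrainConfig` dataclasses and an assumed flat `from utils import ...` path:

import dataclasses
from utils import config_from_dataclass, dataclass_from_config  # import path is an assumption

@dataclasses.dataclass
class OptimizerConfig:            # hypothetical example config
    learning_rate: float = 1e-4
    weight_decay: float = 0.0

@dataclasses.dataclass
class TrainConfig:                # hypothetical example config
    optimizer: OptimizerConfig = dataclasses.field(default_factory=OptimizerConfig)
    seed: int = 0

# Dataclass instance -> ConfigDict, with the dataclass values as defaults.
cfg = config_from_dataclass(TrainConfig())
cfg.optimizer.learning_rate = 3e-4

# ConfigDict -> dataclass instance, picking up the override.
train_cfg = dataclass_from_config(TrainConfig, cfg)
print(train_cfg.optimizer.learning_rate)  # 0.0003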