Muennighoff committed on
Commit 18652d8
Parent(s): d13896f
Cp over files
- beam_search.py +1087 -0
- config_molmoe.py +9 -5
- constants.py +571 -0
- data_factory.py +222 -0
- data_utils.py +827 -0
- dataset_sizes.py +262 -0
- exceptions.py +50 -0
- iterable_dataset.py +266 -0
- modeling_molmoe.py +4 -4
- multimodal_preprocessor.py +1549 -0
- preprocesssors.py +2472 -0
- prompts.py +385 -0
- seqio_tokenizer.py +659 -0
- tasks.py +2548 -0
- torch_util.py +183 -0
- util.py +1 -1
- utils.py +195 -0
beam_search.py
ADDED
@@ -0,0 +1,1087 @@
"""
This is a self-contained and flexible beam search implementation adapted from
AllenNLP's beam search: https://github.com/allenai/allennlp/blob/main/allennlp/nn/beam_search.py
"""

import copy
import warnings
from abc import abstractmethod
from inspect import signature
from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, cast

import torch

__all__ = [
    "Sampler",
    "DeterministicSampler",
    "MultinomialSampler",
    "TopKSampler",
    "TopPSampler",
    "GumbelSampler",
    "FinalSequenceScorer",
    "SequenceLogProbabilityScorer",
    "LengthNormalizedSequenceLogProbabilityScorer",
    "Constraint",
    "RepeatedNGramBlockingConstraint",
    "BeamSearch",
]

StateType = Dict[str, torch.Tensor]
StepFunctionTypeWithTimestep = Callable[[torch.Tensor, StateType, int], Tuple[torch.Tensor, StateType]]
StepFunctionTypeNoTimestep = Callable[[torch.Tensor, StateType], Tuple[torch.Tensor, StateType]]

StepFunctionType = TypeVar("StepFunctionType", StepFunctionTypeWithTimestep, StepFunctionTypeNoTimestep)
"""
The type of step function that can be passed to [`BeamSearch.search`](#search).

This can either be [`StepFunctionTypeWithTimestep`](#stepfunctiontypewithtimestep)
or [`StepFunctionTypeNoTimestep`](#stepfunctiontypenotimestep).
"""

ConstraintStateType = List[List[Dict[str, Any]]]


class Sampler:
    """
    An abstract class that can be used to sample candidates (either nodes or beams)
    within `BeamSearch`.

    A `Sampler` just has three methods, `init_state()`, `sample_nodes()` and `sample_beams()`.

    `init_state()` takes three arguments:

    - a tensor of starting log probs with shape `(batch_size, num_classes)`,
    - the batch size, an int,
    - and the number of classes, also an int.

    It returns a state dictionary with any state tensors needed for subsequent
    calls to `sample_nodes()` and `sample_beams()`.

    By default this method just returns an empty dictionary.

    Both `sample_nodes()` and `sample_beams()` should take three arguments:

    - tensor of normalized log probabilities with shape `(batch_size, num_examples)`,
    - an integer representing the number of samples to take for each example in the batch,
    - and a state dictionary which could contain any tensors needed for the `Sampler` to keep
      track of state.

    For `sample_nodes()`, `num_examples = num_classes`, but for `sample_beams`,
    `num_examples = beam_size * per_node_beam_size`.

    The return value should be a tuple containing:

    - a tensor of log probabilities of the sampled examples with shape `(batch_size, num_samples)`,
    - a tensor of indices of the sampled examples with shape `(batch_size, num_samples)`,
    - and the updated state dictionary.

    A default implementation of `sample_beams` is provided, which just deterministically
    picks the `k` examples with highest log probability.
    """

    def init_state(
        self, start_class_log_probabilities: torch.Tensor, batch_size: int, num_classes: int
    ) -> StateType:
        del start_class_log_probabilities, batch_size, num_classes
        return {}

    @abstractmethod
    def sample_nodes(
        self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        raise NotImplementedError

    def sample_beams(
        self, log_probs: torch.Tensor, beam_size: int, state: StateType
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        del state
        selected_log_probs, selected_indices = torch.topk(log_probs, beam_size, dim=-1)
        return selected_log_probs, selected_indices, {}


class DeterministicSampler(Sampler):
    """
    A `Sampler` that just deterministically returns the `k` nodes or beams with highest
    log probability.
    """

    def sample_nodes(
        self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        del state
        selected_log_probs, selected_indices = torch.topk(log_probs, per_node_beam_size, dim=-1)
        return selected_log_probs, selected_indices, {}


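# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the committed file): a
# minimal check of the `Sampler` contract documented above, using the built-in
# `DeterministicSampler`. Per the docstring, `sample_nodes` takes
# `(batch_size, num_classes)` log-probs and returns `(batch_size,
# per_node_beam_size)` log-probs and indices plus an updated state dict.
def _example_sampler_contract() -> None:
    sampler = DeterministicSampler()
    # (batch_size=2, num_classes=5) normalized log probabilities
    log_probs = torch.log_softmax(torch.randn(2, 5), dim=-1)
    state = sampler.init_state(log_probs, batch_size=2, num_classes=5)
    top_lps, top_ids, state = sampler.sample_nodes(log_probs, per_node_beam_size=3, state=state)
    assert top_lps.shape == (2, 3) and top_ids.shape == (2, 3)
# ---------------------------------------------------------------------------

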
class MultinomialSampler(Sampler):
    """
    A `Sampler` which samples nodes from the given multinomial distribution. Beams are sampled
    in the default, deterministic way.

    :param temperature: A `temperature` below 1.0 produces a sharper probability distribution and a `temperature`
        above 1.0 produces a flatter probability distribution.
    :param with_replacement: Whether to sample with replacement.

    """

    def __init__(
        self,
        temperature: float = 1.0,
        with_replacement: bool = False,
    ) -> None:
        self.temperature = temperature
        self.with_replacement = with_replacement

    def sample_nodes(
        self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        if self.temperature != 1.0:
            _probabilities = torch.nn.functional.softmax(log_probs / self.temperature, dim=-1)
        else:
            _probabilities = log_probs.exp()

        selected_indices = torch.multinomial(_probabilities, per_node_beam_size, replacement=self.with_replacement)

        return torch.gather(log_probs, 1, selected_indices), selected_indices, state


class TopKSampler(Sampler):
    """
    A `Sampler` which redistributes the probability mass function for nodes among the
    top `k` choices, then samples from that subset after re-normalizing the probabilities.

    Beams are sampled in the default, deterministic way.

    :param k: The number of top choices to be selected from.
    :param temperature: A `temperature` below 1.0 produces a sharper probability distribution and a `temperature`
        above 1.0 produces a flatter probability distribution.
    :param with_replacement: If set to `True`, samples will be selected with replacement from the top k choices.
    """

    def __init__(
        self,
        k: int = 1,
        temperature: float = 1.0,
        with_replacement: bool = False,
    ):
        self.k = k
        self.temperature = temperature or 1.0
        self.with_replacement = with_replacement

    def sample_nodes(
        self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        if not per_node_beam_size <= self.k <= log_probs.size()[1]:
            raise ValueError(
                "k must be a positive integer no less than per_node_beam_size and no greater than vocabulary size"
            )

        # shape (both): (batch_size, k)
        top_k_log_probs, top_k_indices = log_probs.topk(self.k, dim=-1)

        # Apply temperature if necessary.
        # shape: (batch_size, k)
        if self.temperature != 1.0:
            top_k_log_probs = top_k_log_probs / self.temperature

        # Re-normalize the subset.
        # shape: (batch_size, k)
        normalized_top_k_probs = torch.nn.functional.softmax(top_k_log_probs, dim=-1)

        # Sample from the re-normalized subset.
        # NOTE: These indices are not indices into `log_probs`, they are indices into `top_k_log_probs`.
        # shape: (batch_size, per_node_beam_size)
        sampled_indices = torch.multinomial(
            normalized_top_k_probs, per_node_beam_size, replacement=self.with_replacement
        )

        # Convert `sampled_indices` back to indices in the original `log_probs` tensor.
        # shape: (batch_size, per_node_beam_size)
        indices = top_k_indices.gather(-1, sampled_indices)

        return log_probs.gather(1, indices), indices, state


class TopPSampler(Sampler):
    """
    A `Sampler` which redistributes the probability mass function for nodes among
    the top choices with a cumulative probability of at least `p`, then samples from that subset
    after re-normalizing the probabilities.

    Beams are sampled in the default, deterministic way.

    :param p:
        The cumulative probability cutoff threshold. A higher value of `p` will result in more possible
        examples to sample from. If `with_replacement` is `False` and the number of possible samples is
        insufficient to sample without replacement from when calling `sample_nodes`, then the top
        `per_node_beam_size` examples will be chosen.
    :param temperature:
        A `temperature` below 1.0 produces a sharper probability distribution and a `temperature`
        above 1.0 produces a flatter probability distribution.
    :param with_replacement:
        If set to `True`, samples will be selected with replacement from the top choices.

    """

    def __init__(
        self,
        p: float = 0.9,
        temperature: float = 1.0,
        with_replacement: bool = False,
    ):
        if p < 0.0 or p > 1.0:
            raise ValueError("p must be a positive float no greater than 1.0")
        self.p = p
        self.temperature = temperature or 1.0
        self.with_replacement = with_replacement

    def sample_nodes(
        self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        if not per_node_beam_size <= log_probs.size()[1]:
            raise ValueError("per_node_beam_size cannot be greater than vocabulary size")

        # First apply temperature coefficient:
        if self.temperature != 1.0:
            _log_probs = torch.nn.functional.log_softmax(log_probs / self.temperature, dim=-1)
        else:
            _log_probs = log_probs

        # Sort the probabilities in descending order to then find cumulative sum
        log_probs_descending, sorting_indices = torch.sort(_log_probs, descending=True)

        # shape: (batch_size, num_classes)
        probabilities_descending = log_probs_descending.exp()
        probabilities_summed = torch.cumsum(probabilities_descending, dim=-1)

        # Create a mask for filtering out probabilities that don't make the top `p`.
        # shape: (batch_size, num_classes)
        exclusion_mask = probabilities_summed >= self.p

        # We want to include the first index where probabilities_summed >= p, so we shift over one.
        exclusion_mask[..., 1:] = exclusion_mask[..., :-1].clone()
        exclusion_mask[..., 0] = False

        # Make sure there's at least `per_node_beam_size` options to be selected.
        if not self.with_replacement:
            exclusion_mask[..., :per_node_beam_size] = False

        log_probs_descending[exclusion_mask] = torch.finfo(log_probs.dtype).min

        # Now re-normalize the included log probs.
        # shape: (batch_size, num_classes)
        filtered_probabilities = torch.nn.functional.softmax(log_probs_descending, dim=-1)

        # Sample from the re-normalized subset.
        # NOTE: These indices are not indices into `log_probs`, they are indices into `log_probs_descending`.
        # shape: (batch_size, per_node_beam_size)
        sampled_indices = torch.multinomial(
            filtered_probabilities, per_node_beam_size, replacement=self.with_replacement
        )

        # Convert `sampled_indices` back to indices in the original `log_probs` tensor.
        # shape: (batch_size, per_node_beam_size)
        selected_indices = sorting_indices.gather(-1, sampled_indices)

        # Return (selected log probabilities, selected classes)
        # shape: (len(log_probs),1) , (len(log_probs), 1)
        return torch.gather(log_probs, 1, selected_indices), selected_indices, state


class GumbelSampler(Sampler):
    """
    A `Sampler` which uses the Gumbel-Top-K trick to sample without replacement. See
    [*Stochastic Beams and Where to Find Them: The Gumbel-Top-k Trick for Sampling
    Sequences Without Replacement*, W Kool, H Van Hoof and M Welling, 2019]
    (https://api.semanticscholar.org/CorpusID:76662039).

    :param temperature: A `temperature` below 1.0 produces a sharper probability distribution and a `temperature`
        above 1.0 produces a flatter probability distribution.
    """

    def __init__(self, temperature: float = 1.0):
        self.temperature = temperature

    def init_state(
        self, start_class_log_probabilities: torch.Tensor, batch_size: int, num_classes: int
    ) -> StateType:
        # shape: (batch_size, num_classes)
        zeros = start_class_log_probabilities.new_zeros((batch_size, num_classes))

        # shape: (batch_size, num_classes)
        G_phi_S = self.gumbel_with_max(start_class_log_probabilities, zeros)

        return {"G_phi_S": G_phi_S}

    def sample_nodes(
        self,
        log_probs: torch.Tensor,
        per_node_beam_size: int,
        state: StateType,
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        # First apply temperature coefficient:
        # shape: (batch_size * beam_size, num_classes)
        if self.temperature != 1.0:
            _log_probs = torch.nn.functional.log_softmax(log_probs / self.temperature, dim=-1)
        else:
            _log_probs = log_probs

        # shape: (group_size,)
        phi_S = state["phi_S"]

        # shape: (group_size, num_classes)
        phi_S = phi_S.unsqueeze(-1).expand_as(_log_probs)

        # shape: (group_size, num_classes)
        phi_S_new = phi_S + _log_probs

        # shape: (group_size, 1)
        G_phi_S = state["G_phi_S"].unsqueeze(-1)

        # shape: (group_size, num_classes)
        G_phi_S_new = self.gumbel_with_max(phi_S_new, G_phi_S)

        # Replace NaNs with very negative number.
        # shape: (group_size, num_classes)
        # G_phi_S_new[G_phi_S_new.isnan()] = torch.finfo(G_phi_S_new.dtype).min

        # shape (both): (group_size, per_node_beam_size)
        top_G_phi_S_new, top_indices = torch.topk(G_phi_S_new, per_node_beam_size, dim=-1)

        # shape: (group_size, per_node_beam_size)
        top_log_probs = log_probs.gather(1, top_indices)

        return top_log_probs, top_indices, {"G_phi_S": top_G_phi_S_new}

    def sample_beams(
        self,
        log_probs: torch.Tensor,
        beam_size: int,
        state: StateType,
    ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        """
        Returns the beams with the highest perturbed log probabilities.
        """
        # shape (log_probs): (batch_size, beam_size * per_node_beam_size)

        batch_size = log_probs.size()[0]

        # shape: (batch_size * beam_size, per_node_beam_size)
        G_phi_S = state["G_phi_S"]

        # shape: (batch_size, beam_size * per_node_beam_size)
        G_phi_S = G_phi_S.reshape_as(log_probs)

        # shape (both): (batch_size, beam_size)
        G_phi_S_new, selected_indices = torch.topk(G_phi_S, beam_size, dim=-1)

        # shape: (batch_size, beam_size)
        selected_log_probs = log_probs.gather(1, selected_indices)

        # Now sort the selected beams by their true log prob.
        # shape (all): (batch_size, beam_size)
        selected_log_probs, sort_indices = selected_log_probs.sort(dim=-1, descending=True)
        selected_indices = selected_indices.gather(1, sort_indices)
        G_phi_S_new = G_phi_S_new.gather(1, sort_indices)

        # shape: (batch_size * beam_size,)
        G_phi_S_new = G_phi_S_new.reshape(batch_size * beam_size)

        # shape: (batch_size * beam_size,)
        phi_S = selected_log_probs.reshape(batch_size * beam_size)

        return selected_log_probs, selected_indices, {"G_phi_S": G_phi_S_new, "phi_S": phi_S}

    def gumbel(self, phi) -> torch.Tensor:
        """
        Sample `Gumbel(phi)`.

        `phi` should have shape `(batch_size, num_classes)`.
        """
        return -torch.log(-torch.log(torch.rand_like(phi))) + phi

    def gumbel_with_max(self, phi, T) -> torch.Tensor:
        """
        Sample `Gumbel(phi)` conditioned on the maximum value being equal to `T`.

        `phi` should have shape `(batch_size, num_classes)` and `T` should have
        shape `(batch_size, 1)`.
        """
        # Shape: (batch_size, num_classes)
        G_phi = self.gumbel(phi)

        # Now we find the maximum from these samples.
        # Shape: (batch_size, )
        Z, _ = G_phi.max(dim=-1)

        # Shape: (batch_size, num_classes)
        v = T - G_phi + torch.log1p(-torch.exp(G_phi - Z.unsqueeze(-1)))

        # Shape: (batch_size, num_classes)
        return T - torch.nn.functional.relu(v) - torch.log1p(torch.exp(-v.abs()))


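# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the committed file): the
# Gumbel-Top-k idea that `GumbelSampler` builds on. Perturbing log-probabilities
# with independent Gumbel noise and taking the top-k yields k distinct samples,
# i.e. sampling without replacement; `gumbel_with_max` above additionally
# conditions the noise so the running maximum matches the parent beam's
# perturbed score.
def _example_gumbel_top_k(log_probs: torch.Tensor, k: int) -> torch.Tensor:
    # log_probs: (batch_size, num_classes); returns k indices per row, all distinct.
    gumbel_noise = -torch.log(-torch.log(torch.rand_like(log_probs)))
    return torch.topk(log_probs + gumbel_noise, k, dim=-1).indices
# ---------------------------------------------------------------------------

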
class FinalSequenceScorer:
    """
    An abstract class that can be used to score the final generated sequences found
    by beam search. Given the predicted sequences and the corresponding log probabilities of
    those sequences, the class calculates and returns the final score of the sequences.

    The default implementation scores the sequences using the sum of the log probabilities of
    the sequence, which is passed as input.
    """

    @abstractmethod
    def score(self, predictions: torch.Tensor, log_probabilities: torch.Tensor, end_index: int) -> torch.Tensor:
        """
        Score the final predictions found by beam search.
        Returns a tensor of the final sequence scores of shape `(batch_size, beam_size)`.

        :param predictions: A tensor containing the initial predictions with shape `(batch_size, beam_size, max_steps)`.
        :param log_probabilities: A tensor containing the log probabilities of the sequence, defined as the sum
            of the log probabilities per token, with shape `(batch_size, beam_size)`.
        :param end_index: The index of the end symbol.

        """
        raise NotImplementedError


class SequenceLogProbabilityScorer(FinalSequenceScorer):
    """
    A :class:`FinalSequenceScorer` which scores the sequences by the sum of the log probabilities
    across the sequence's tokens.
    """

    def score(self, predictions: torch.Tensor, log_probabilities: torch.Tensor, end_index: int) -> torch.Tensor:
        del predictions, end_index
        # The sum of the sequence log probabilities is the input parameter, so just
        # return it.
        return log_probabilities


class LengthNormalizedSequenceLogProbabilityScorer(FinalSequenceScorer):
    """
    A :class:`FinalSequenceScorer` which scores the sequences by the average log probability of the
    tokens in the sequence. It optionally includes a length penalty which promotes
    or demotes sequences based on their lengths. The final score for a sequence will
    be `(sequence_log_probability) / (sequence_length ** length_penalty)`. The sequence length
    here includes the end token.

    :param length_penalty: The length penalty to use. A value of 1.0 means no length penalty is used.
        A value > 1.0 favors longer sequences, and < 1.0 favors shorter sequences.
    """

    def __init__(self, length_penalty: float = 1.0):
        super().__init__()
        self.length_penalty = length_penalty

    def score(self, predictions: torch.Tensor, log_probabilities: torch.Tensor, end_index: int) -> torch.Tensor:
        # shape: (batch_size, beam_size)
        lengths = (predictions != end_index).long().sum(dim=2)

        # If the sequence ended during beam search, the `log_probabilities` will include
        # the transition to the end token. Therefore, in such situations, `lengths` is
        # actually off by 1. This corrects for that.
        # shape: (batch_size, beam_size)
        is_end_token = predictions[:, :, -1] == end_index
        lengths += is_end_token.long()

        # shape: (batch_size, beam_size)
        average_log_probs = log_probabilities / (lengths**self.length_penalty)
        return average_log_probs


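# ---------------------------------------------------------------------------
# Illustrative note (editor's addition, not part of the committed file): the
# effect of the formula in `LengthNormalizedSequenceLogProbabilityScorer`. With
# the default `length_penalty=1.0`, a 4-token sequence with summed log-prob -6.0
# scores -6.0 / 4 = -1.5, while a 2-token sequence with summed log-prob -4.0
# scores -4.0 / 2 = -2.0, so the longer sequence is preferred even though its
# raw (summed) log probability is lower.
# ---------------------------------------------------------------------------

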
class Constraint:
    """
    An abstract class that can be used to enforce constraints on the output predictions
    by manipulating the class log probabilities during beam search.

    A `Constraint` just has three methods that need to be implemented by subclasses:
    `init_state()`, `apply()` and `_update_state()`.

    `init_state()` takes one argument:

    - the batch size, an int

    It returns a constraint state, which is a nested list of dictionaries, with any state needed for subsequent
    calls to `apply()` and `update_state()`. The length of the outer list should be equal to `batch_size`.
    Each inner list should be of length 1.

    `apply()` takes two arguments:

    - the constraint state, which is a nested list of dictionaries. The length of the outer list is `batch_size`
      and the length of each inner list is `beam_size` except on the first time `apply()` is called when it is 1.
    - `class_log_probabilities`, a tensor of shape `(batch_size, beam_size, num_classes)` that contains the
      log probabilities for the classes during search. The first time `apply()` is called, `beam_size = 1`.

    The `apply()` method should return new `class_log_probabilities` that enforce the constraint
    for this step of beam search. For instance, it may prevent a specific class from being selected by setting
    the corresponding log probability to a negligible value such as `float("-inf")` or
    `torch.finfo(class_log_probabilities.dtype).min`.

    `_update_state()` takes two arguments:

    - the copied parent constraint state, which is a nested list of dictionaries. `state[i][j]` contains the
      copied state for the parent of `last_prediction[i, j]`. It is unique to that batch and beam, so it can be
      directly edited in-place without affecting the others.
    - `last_prediction`, a tensor of shape `(batch_size, beam_size)` containing the predictions from the last
      step of beam search.

    The `_update_state()` function should return a new constraint state, a nested list of dictionaries of
    length `batch_size` and inner list of length `beam_size`, one for each of the predictions in `last_prediction`.

    """

    @abstractmethod
    def init_state(
        self,
        batch_size: int,
    ) -> ConstraintStateType:
        raise NotImplementedError

    @abstractmethod
    def apply(
        self,
        state: ConstraintStateType,
        class_log_probabilities: torch.Tensor,
    ) -> torch.Tensor:
        raise NotImplementedError

    @staticmethod
    def _copy_state(
        state: ConstraintStateType,
        batch_size: int,
        beam_size: int,
        last_backpointer: Optional[torch.Tensor] = None,
    ) -> ConstraintStateType:
        """
        Copies the `state`. This method copies the data in `state` using `copy.deepcopy()`. If this
        is not appropriate for your constraint, you will need to implement the copying yourself.
        """
        new_state = []
        for i in range(batch_size):
            batch_state = []
            for j in range(beam_size):
                if last_backpointer is None:
                    # This is the first prediction, so the backpointer is 0
                    backpointer = 0
                else:
                    backpointer = last_backpointer[i, j].item()
                batch_state.append(copy.deepcopy(state[i][backpointer]))  # type: ignore
            new_state.append(batch_state)
        return new_state

    def update_state(
        self,
        state: ConstraintStateType,
        last_prediction: torch.Tensor,
        last_backpointer: Optional[torch.Tensor] = None,
    ) -> ConstraintStateType:
        batch_size, beam_size = last_prediction.size()
        new_state = self._copy_state(state, batch_size, beam_size, last_backpointer)
        return self._update_state(new_state, last_prediction)

    @abstractmethod
    def _update_state(
        self,
        state: ConstraintStateType,
        last_prediction: torch.Tensor,
    ) -> ConstraintStateType:
        raise NotImplementedError


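# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the committed file): a
# minimal `Constraint` that permanently blocks a fixed set of token ids,
# following the `init_state()` / `apply()` / `_update_state()` contract
# documented above. (`RepeatedNGramBlockingConstraint` below is the constraint
# that actually ships with this file.)
class _ExampleBannedTokensConstraint(Constraint):
    def __init__(self, banned_token_ids: List[int]) -> None:
        self.banned_token_ids = banned_token_ids

    def init_state(self, batch_size: int) -> ConstraintStateType:
        # No per-beam state is needed; one empty dict per (batch, beam) slot.
        return [[{}] for _ in range(batch_size)]

    def apply(self, state: ConstraintStateType, class_log_probabilities: torch.Tensor) -> torch.Tensor:
        # shape: (batch_size, beam_size, num_classes)
        class_log_probabilities[:, :, self.banned_token_ids] = torch.finfo(class_log_probabilities.dtype).min
        return class_log_probabilities

    def _update_state(self, state: ConstraintStateType, last_prediction: torch.Tensor) -> ConstraintStateType:
        # Nothing to track between steps.
        return state
# ---------------------------------------------------------------------------

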
class RepeatedNGramBlockingConstraint(Constraint):
    def __init__(self, ngram_size: int, **kwargs) -> None:
        super().__init__(**kwargs)
        self.ngram_size = ngram_size

    def init_state(
        self,
        batch_size: int,
    ) -> ConstraintStateType:
        return [[{"seen_ngrams": {}, "current_prefix": []}] for _ in range(batch_size)]

    def apply(
        self,
        state: ConstraintStateType,
        class_log_probabilities: torch.Tensor,
    ) -> torch.Tensor:
        for i, batch in enumerate(state):
            for j, beam in enumerate(batch):
                current_prefix = tuple(beam["current_prefix"])
                seen_ngrams = beam["seen_ngrams"]
                try:
                    disallowed_indices = seen_ngrams[current_prefix]
                    class_log_probabilities[i, j, disallowed_indices] = torch.finfo(
                        class_log_probabilities.dtype
                    ).min
                except KeyError:
                    # We have not seen this prefix before, so there is no index
                    # that needs to be blocked
                    pass
        return class_log_probabilities

    def _update_state(
        self,
        state: ConstraintStateType,
        last_prediction: torch.Tensor,
    ) -> ConstraintStateType:
        for i, batch in enumerate(state):
            for j, beam in enumerate(batch):
                prediction = last_prediction[i, j].item()
                prefix = beam["current_prefix"]
                seen_ngrams = beam["seen_ngrams"]

                if len(prefix) == self.ngram_size - 1:
                    # This is a new ngram that we have to remember
                    if tuple(prefix) not in seen_ngrams:
                        seen_ngrams[tuple(prefix)] = []
                    seen_ngrams[tuple(prefix)].append(prediction)

                # Create the new prefix, removing the oldest index if the prefix
                # is too long
                prefix.append(prediction)
                if len(prefix) == self.ngram_size:
                    prefix.pop(0)
        return state


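# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the committed file):
# blocking repeated trigrams during decoding by passing the constraint to
# `BeamSearch` (defined below). The `eos_token_id` argument is a placeholder
# for the model's actual EOS id.
def _example_blocked_ngram_search(eos_token_id: int) -> "BeamSearch":
    return BeamSearch(
        end_index=eos_token_id,
        beam_size=5,
        constraints=[RepeatedNGramBlockingConstraint(ngram_size=3)],
    )
# ---------------------------------------------------------------------------

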
class BeamSearch:
    """
    Implements the beam search algorithm for decoding the most likely sequences.

    :param end_index: The index of the "stop" or "end" token in the vocabulary. Usually the EOS token ID.

    :param max_steps: The maximum number of decoding steps to take, i.e. the maximum length
        of the predicted sequences.

    :param beam_size: The width of the beam used.

    :param per_node_beam_size: The maximum number of candidates to consider per node, at each step in the search.
        If not given, this just defaults to `beam_size`. Setting this parameter
        to a number smaller than `beam_size` may give better results, as it can introduce
        more diversity into the search. See
        [*Beam Search Strategies for Neural Machine Translation*, Freitag and Al-Onaizan, 2017]
        (https://api.semanticscholar.org/CorpusID:2229477).

    :param sampler: An optional `Sampler` which is used to pick next candidate nodes and beams.
        If not specified, `DeterministicSampler` will be used, which just takes the
        `per_node_beam_size` most likely nodes and the `beam_size` most likely beams.

        Using the [`GumbelSampler`](#gumbelsampler), on the other hand, will give you
        [Stochastic Beam Search](https://api.semanticscholar.org/CorpusID:76662039).

    :param min_steps: The minimum number of decoding steps to take, i.e. the minimum length of
        the predicted sequences. This does not include the start or end tokens. If `None`,
        no minimum is enforced.

    :param final_sequence_scorer: An optional `FinalSequenceScorer` which is used to score the final generated sequences.
        The output from this module is what is returned by the `search` method. If not
        specified, `SequenceLogProbabilityScorer` will be used, which scores the sequences
        by the sum of the token log probabilities.

    :param constraints: An optional list of `Constraint`s which should be applied during beam search. If not
        provided, no constraints will be enforced.

    """

    def __init__(
        self,
        end_index: int,
        *,
        max_steps: int = 50,
        beam_size: int = 10,
        per_node_beam_size: Optional[int] = None,
        sampler: Optional[Sampler] = None,
        min_steps: Optional[int] = None,
        final_sequence_scorer: Optional[FinalSequenceScorer] = None,
        constraints: Optional[List[Constraint]] = None,
        distributed_model: bool = False
    ) -> None:
        if not max_steps > 0:
            raise ValueError("max_steps must be positive")
        if not beam_size > 0:
            raise ValueError("beam_size must be positive")
        if per_node_beam_size is not None and not per_node_beam_size > 0:
            raise ValueError("per_node_beam_size must be positive")
        if min_steps is not None:
            if not min_steps >= 0:
                raise ValueError("min_steps must be non-negative")
            if not min_steps <= max_steps:
                raise ValueError("min_steps must be less than or equal to max_steps")

        self._end_index = end_index
        self.max_steps = max_steps
        self.beam_size = beam_size
        self.per_node_beam_size = per_node_beam_size or beam_size
        self.sampler = sampler or DeterministicSampler()
        self.min_steps = min_steps or 0
        self.final_sequence_scorer = final_sequence_scorer or SequenceLogProbabilityScorer()
        self.constraints = constraints or []
        self.distributed_model = distributed_model

    @staticmethod
    def _reconstruct_sequences(predictions, backpointers):
        # Reconstruct the sequences.
        # shape: [(batch_size, beam_size, 1)]
        reconstructed_predictions = [predictions[-1].unsqueeze(2)]

        if not backpointers:
            return reconstructed_predictions

        # shape: (batch_size, beam_size)
        cur_backpointers = backpointers[-1]

        for timestep in range(len(predictions) - 2, 0, -1):
            # shape: (batch_size, beam_size, 1)
            cur_preds = predictions[timestep].gather(1, cur_backpointers).unsqueeze(2)

            reconstructed_predictions.append(cur_preds)

            # shape: (batch_size, beam_size)
            cur_backpointers = backpointers[timestep - 1].gather(1, cur_backpointers)

        # shape: (batch_size, beam_size, 1)
        final_preds = predictions[0].gather(1, cur_backpointers).unsqueeze(2)

        reconstructed_predictions.append(final_preds)

        return reconstructed_predictions

    def search(
        self,
        start_predictions: torch.Tensor,
        start_state: StateType,
        step: StepFunctionType,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Given a starting state and a step function, apply beam search to find the
        most likely target sequences.

        Returns a tuple of `(predictions, final_scores)`, where `predictions`
        has shape `(batch_size, beam_size, max_steps)` and `final_scores`
        has shape `(batch_size, beam_size)`.

        .. note::
            If your step function returns `-inf` for some log probabilities
            (like if you're using a masked log-softmax) then some of the "best"
            sequences returned may also have `-inf` log probability. Specifically
            this happens when the beam size is smaller than the number of actions
            with finite log probability (non-zero probability) returned by the step function.
            Therefore if you're using a mask you may want to check the results from `search`
            and potentially discard sequences with non-finite log probability.

        :param start_predictions: A tensor containing the initial predictions with shape `(batch_size,)`.
            Usually the initial predictions are just the index of the "start" token
            in the target vocabulary.

        :param start_state: The initial state passed to the `step` function. Each value of the state dict
            should be a tensor of shape `(batch_size, *)`, where `*` means any other
            number of dimensions.

        :param step: A function that is responsible for computing the next most likely tokens,
            given the current state and the predictions from the last time step.
            The function should accept two or three arguments:

            - a tensor of shape `(group_size,)` representing the index of the predicted
              tokens from the last time step,
            - the current state, a `StateType`, and
            - optionally, the timestep, an `int`.

            The `group_size` will be `batch_size * beam_size`, except in the initial
            step, for which it will just be `batch_size`.

            The function is expected to return a tuple, where the first element
            is a tensor of shape `(group_size, vocab_size)` containing
            the log probabilities of the tokens for the next step, and the second
            element is the updated state. The tensor in the state should have shape
            `(group_size, *)`, where `*` means any other number of dimensions.

        """
        step_signature = signature(step)
        if len(step_signature.parameters) < 3:
            # If the step function we're given does not take the time step argument, wrap it
            # in one that does.
            old_step = cast(StepFunctionTypeNoTimestep, step)

            def new_step(last_predictions: torch.Tensor, state: Dict[str, torch.Tensor], time_step: int):
                del time_step
                return old_step(last_predictions, state)

            return self._search(start_predictions, start_state, new_step)
        else:
            return self._search(start_predictions, start_state, cast(StepFunctionTypeWithTimestep, step))

    def _search(
        self,
        start_predictions: torch.Tensor,
        start_state: StateType,
        step: StepFunctionTypeWithTimestep,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        batch_size = start_predictions.size()[0]

        # List of (batch_size, beam_size) tensors. One for each time step. Does not
        # include the start symbols, which are implicit.
        predictions: List[torch.Tensor] = []

        # List of (batch_size, beam_size) tensors. One for each time step. None for
        # the first. Stores the index n for the parent prediction, i.e.
        # predictions[t-1][i][n], that it came from.
        backpointers: List[torch.Tensor] = []

        constraint_states = [constraint.init_state(batch_size) for constraint in self.constraints]

        # Calculate the first timestep. This is done outside the main loop
        # because we are going from a single decoder input (the output from the
        # encoder) to the top `beam_size` decoder outputs. On the other hand,
        # within the main loop we are going from the `beam_size` elements of the
        # beam to `beam_size`^2 candidates from which we will select the top
        # `beam_size` elements for the next iteration.
        # shape: (batch_size, num_classes)
        start_class_log_probabilities, state = step(start_predictions, start_state, 0)

        num_classes = start_class_log_probabilities.size()[1]

        # Make sure `per_node_beam_size` is not larger than `num_classes`.
        if self.per_node_beam_size > num_classes:
            raise ValueError(
                f"Vocab size ({num_classes:d}) too small "
                f"relative to per_node_beam_size ({self.per_node_beam_size:d}).\n"
                f"Please decrease beam_size or per_node_beam_size."
            )

        sampler_state = self.sampler.init_state(start_class_log_probabilities, batch_size, num_classes)

        # Apply all constraints.
        if self.constraints:
            # shape: (batch_size, 1, num_classes)
            expanded_start_class_log_probabilities = start_class_log_probabilities.unsqueeze(1)
            for constraint, constraint_state in zip(self.constraints, constraint_states):
                expanded_start_class_log_probabilities = constraint.apply(
                    constraint_state, expanded_start_class_log_probabilities
                )
            start_class_log_probabilities = expanded_start_class_log_probabilities.squeeze(1)

        # Prevent selecting the end symbol if there is any min_steps constraint
        if self.min_steps >= 1:
            start_class_log_probabilities[:, self._end_index] = torch.finfo(
                start_class_log_probabilities.dtype
            ).min

        # Get the initial predicted classes and their log probabilities.
        # shape: (batch_size, beam_size), (batch_size, beam_size)
        (
            start_top_log_probabilities,
            start_predicted_classes,
            sampler_state,
        ) = self.sampler.sample_beams(start_class_log_probabilities, self.beam_size, sampler_state)

        if (
            self.beam_size == 1 and
            (start_predicted_classes == self._end_index).all() and
            not self.distributed_model
        ):
            warnings.warn(
                "Empty sequences predicted. You may want to increase the beam size or ensure "
                "your step function is working properly.",
                RuntimeWarning,
            )
            return start_predicted_classes.unsqueeze(-1), start_top_log_probabilities

        # The log probabilities for the last time step.
        # shape: (batch_size, beam_size)
        last_log_probabilities = start_top_log_probabilities

        # shape: [(batch_size, beam_size)]
        predictions.append(start_predicted_classes)

        # Log probability tensor that mandates that the end token is selected.
        # shape: (batch_size * beam_size, num_classes)
        log_probs_after_end = start_class_log_probabilities.new_full(
            (batch_size * self.beam_size, num_classes),
            torch.finfo(start_class_log_probabilities.dtype).min,
        )
        log_probs_after_end[:, self._end_index] = 0.0

        # Set the same state for each element in the beam.
        self._update_initial_state(state, batch_size)

        for i, constraint in enumerate(self.constraints):
            constraint_states[i] = constraint.update_state(constraint_states[i], start_predicted_classes)

        for timestep in range(self.max_steps - 1):
            # shape: (batch_size * beam_size,)
            last_predictions = predictions[-1].reshape(batch_size * self.beam_size)

            # If every predicted token from the last step is `self._end_index`,
            # then we can stop early.
            # FIXME for distributed model we cannot stop early unless all devices are done,
            # for now we just always run to the max limit, ideally we should check all devices
            if not self.distributed_model and (last_predictions == self._end_index).all():
                # finished
                break
            # Take a step. This gets the predicted log probs of the next classes
            # and updates the state.
            # shape: (batch_size * beam_size, num_classes)
            class_log_probabilities, state = step(last_predictions, state, timestep + 1)

            # Apply all constraints.
            if self.constraints:
                # shape: (batch_size, beam_size, num_classes)
                reshaped_class_log_probabilities = class_log_probabilities.view(batch_size, self.beam_size, -1)
                for constraint, constraint_state in zip(self.constraints, constraint_states):
                    reshaped_class_log_probabilities = constraint.apply(
                        constraint_state, reshaped_class_log_probabilities
                    )
                # shape: (batch_size * beam_size, num_classes)
                class_log_probabilities = reshaped_class_log_probabilities.view(batch_size * self.beam_size, -1)

            # The `timestep`-th iteration of the for loop is generating the `timestep + 2`-th token
            # of the sequence (because `timestep` is 0-indexed and we generated the first token
            # before the for loop). Here we block the end index if the search is not allowed to
            # terminate on this iteration.
            if timestep + 2 <= self.min_steps:
                class_log_probabilities[:, self._end_index] = torch.finfo(class_log_probabilities.dtype).min

            # shape: (batch_size * beam_size, num_classes)
            last_predictions_expanded = last_predictions.unsqueeze(-1).expand(
                batch_size * self.beam_size, num_classes
            )

            # Here we are finding any beams where we predicted the end token in
            # the previous timestep and replacing the distribution with a
            # one-hot distribution, forcing the beam to predict the end token
            # this timestep as well.
            # shape: (batch_size * beam_size, num_classes)
            cleaned_log_probabilities = torch.where(
                last_predictions_expanded == self._end_index,
                log_probs_after_end,
                class_log_probabilities,
            )

            # shape (both): (batch_size * beam_size, per_node_beam_size)
            top_log_probabilities, predicted_classes, sampler_state = self.sampler.sample_nodes(
                cleaned_log_probabilities, self.per_node_beam_size, sampler_state
            )

            # Here we expand the last log probabilities to (batch_size * beam_size, per_node_beam_size)
            # so that we can add them to the current log probs for this timestep.
            # This lets us maintain the log probability of each element on the beam.
            # shape: (batch_size * beam_size, per_node_beam_size)
            expanded_last_log_probabilities = (
                last_log_probabilities.unsqueeze(2)
                .expand(batch_size, self.beam_size, self.per_node_beam_size)
                .reshape(batch_size * self.beam_size, self.per_node_beam_size)
            )

            # shape: (batch_size * beam_size, per_node_beam_size)
            summed_top_log_probabilities = top_log_probabilities + expanded_last_log_probabilities

            # shape: (batch_size, beam_size * per_node_beam_size)
            reshaped_summed = summed_top_log_probabilities.reshape(
                batch_size, self.beam_size * self.per_node_beam_size
            )

            # shape: (batch_size, beam_size * per_node_beam_size)
            reshaped_predicted_classes = predicted_classes.reshape(
                batch_size, self.beam_size * self.per_node_beam_size
            )

            # Keep only the top `beam_size` beam indices.
            # shape (both): (batch_size, beam_size)
            (
                restricted_beam_log_probs,
                restricted_beam_indices,
                sampler_state,
            ) = self.sampler.sample_beams(reshaped_summed, self.beam_size, sampler_state)

            # Use the beam indices to extract the corresponding classes.
            # shape: (batch_size, beam_size)
            restricted_predicted_classes = reshaped_predicted_classes.gather(1, restricted_beam_indices)

            predictions.append(restricted_predicted_classes)

            # shape: (batch_size, beam_size)
            last_log_probabilities = restricted_beam_log_probs

            # The beam indices come from a `beam_size * per_node_beam_size` dimension where the
            # indices with a common ancestor are grouped together. Hence
            # dividing by per_node_beam_size gives the ancestor. (Note that this is integer
            # division as the tensor is a LongTensor.)
            # shape: (batch_size, beam_size)
            backpointer = torch.divide(restricted_beam_indices, self.per_node_beam_size, rounding_mode="trunc")
            backpointers.append(backpointer)

            # Keep only the pieces of the state tensors corresponding to the
            # ancestors created this iteration.
            self._update_state(state, backpointer)

            for i, constraint in enumerate(self.constraints):
                constraint_states[i] = constraint.update_state(
                    constraint_states[i], restricted_predicted_classes, last_backpointer=backpointer
                )

        # Warn about "-inf" log probabilities if not using any constraints (negligible
        # log probabilities are expected when using constraints).
        if not self.constraints and (
            not torch.isfinite(last_log_probabilities).all()
            or (last_log_probabilities == torch.finfo(last_log_probabilities.dtype).min).any()
        ):
            warnings.warn(
                "Negligible log probabilities encountered ('-inf' or equivalent). "
                "Some final sequences may not make sense. "
                "This can happen when the beam size is larger than the number of valid (non-zero "
                "probability) transitions that the step function produces.",
                RuntimeWarning,
            )

        reconstructed_predictions = self._reconstruct_sequences(predictions, backpointers)

        # shape: (batch_size, beam_size, max_steps)
        all_predictions = torch.cat(list(reversed(reconstructed_predictions)), 2)

        # Calculate the final sequence scores
        # shape: (batch_size, beam_size)
        final_scores = self.final_sequence_scorer.score(all_predictions, last_log_probabilities, self._end_index)

        # Sort the sequences based on the final scores so the best scoring
        # sequence is at index 0
        sorted_final_scores, sorted_indices = torch.sort(final_scores, dim=1, descending=True)
        sorted_all_predictions = torch.gather(
            all_predictions, 1, sorted_indices.unsqueeze(-1).expand_as(all_predictions)
        )

        return sorted_all_predictions, sorted_final_scores

    def _update_initial_state(self, state: StateType, batch_size: int):
        """
        Expand tensors in a state dictionary from `(batch_size, *)` to `(batch_size * beam_size, *)`.
        """
        for key, state_tensor in state.items():
            if state_tensor is None:
                continue
            # shape: (batch_size * beam_size, *)
            _, *last_dims = state_tensor.size()
            state[key] = (
                state_tensor.unsqueeze(1)
                .expand(batch_size, self.beam_size, *last_dims)
                .reshape(batch_size * self.beam_size, *last_dims)
            )

    def _update_state(self, state: StateType, backpointer: torch.Tensor):
        batch_size = backpointer.size()[0]

        for key, state_tensor in state.items():
            if state_tensor is None:
                continue
            _, *last_dims = state_tensor.size()
            # shape: (batch_size, beam_size, *)
            expanded_backpointer = backpointer.view(batch_size, self.beam_size, *([1] * len(last_dims))).expand(
                batch_size, self.beam_size, *last_dims
            )
            # shape: (batch_size * beam_size, *)
            state[key] = (
                state_tensor.reshape(batch_size, self.beam_size, *last_dims)
                .gather(1, expanded_backpointer)
                .reshape(batch_size * self.beam_size, *last_dims)
            )
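The step-function contract documented in `search()` can be exercised end to end with a toy model. The following is a hedged sketch, not part of the commit: the "bigram table" model, the token ids, and the assumption that the module is importable as `beam_search` are all made up for illustration.

import torch
from beam_search import BeamSearch

# Toy vocabulary: 0 = EOS, 1..4 = ordinary tokens; a fixed transition table plays the role of the model.
transition_log_probs = torch.log_softmax(torch.randn(5, 5), dim=-1)

def step(last_predictions: torch.Tensor, state: dict, timestep: int):
    # last_predictions: (group_size,) token ids from the previous step.
    # Returns (group_size, vocab_size) log-probs and the (unchanged) state, as required by `search()`.
    return transition_log_probs[last_predictions], state

searcher = BeamSearch(end_index=0, max_steps=10, beam_size=3)
start = torch.zeros(2, dtype=torch.long) + 1      # (batch_size,) start token ids
predictions, scores = searcher.search(start, {}, step)
print(predictions.shape, scores.shape)            # (2, 3, <=10) and (2, 3)
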
config_molmoe.py
CHANGED
@@ -27,11 +27,15 @@ import gin
 
 #from olmo.aliases import PathOrStr
 from .aliases import PathOrStr
-from olmo.exceptions import OLMoConfigurationError
-from
-
-from
-
+#from olmo.exceptions import OLMoConfigurationError
+from .exceptions import OLMoConfigurationError
+#from olmo.util import StrEnum, resource_path
+from .util import StrEnum, resource_path
+
+#from olmo.mm_data.data_utils import build_tokenizer
+from .data_utils import build_tokenizer
+#from olmo.multimodal_preprocessor import MultiModalPreprocessor
+from .multimodal_preprocessor import MultiModalPreprocessor
 
 __all__ = [
     "ActivationType",
constants.py
ADDED
@@ -0,0 +1,571 @@
DEFAULT_IMAGE_PATCH_TOKEN = f"<im_patch>"
DEFAULT_IM_START_TOKEN = f"<im_start>"
DEFAULT_IM_END_TOKEN = f"<im_end>"
DEFAULT_IM_COL_TOKEN = f"<im_col>"
IMAGE_PROMPT = "<|image|>"

EXTRA_TOKENS = (DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_COL_TOKEN, IMAGE_PROMPT)


VIT_STANDARD_CONFIGS = {
    "dinov2-large": {
        "image_emb_dim": 1024,
        "image_mlp_dim": 4096,
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_num_layers': 24,
        'image_num_heads': 16,
        'image_num_key_value_heads': 16,
        'image_head_dim': 64,
        'image_mlp_activations': 'gelu',
        'image_default_input_size': (224, 224),
        'image_num_pos': 257,
        'image_norm_eps': 1e-6,
        "image_model_type": "dino"
    },
    "SigLIP-So400m-14-384": {
        "image_emb_dim": 1152,
        'image_num_layers': 27,
        "image_mlp_dim": 4304,
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_num_heads': 16,
        'image_num_key_value_heads': 16,
        'image_head_dim': 72,
        'image_mlp_activations': 'gelu',
        # Although it is called "384" that seems to be an error of the author's
        # part, it actually only handles 378 inputs
        'image_default_input_size': (378, 378),
        'image_num_pos': 729,  # note not CLS token
        'image_norm_eps': 1e-6,
        "image_model_type": "siglip",
        "resize_mode": "siglip"
    },
    "DFN5B-CLIP-ViT-H-14-378": {
        "image_emb_dim": 1280,
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_num_layers': 32,
        'image_num_heads': 16,
        'image_num_key_value_heads': 16,
        'image_head_dim': 80,
        'image_mlp_dim': 5120,
        'image_dropout_rate': 0.0,
        'image_mlp_activations': 'quick_gelu',
        'image_default_input_size': (378, 378),
        'image_num_pos': 730,
        'image_norm_eps': 1e-5,
        "image_model_type": "openai",
        "resize_mode": "no_aspect_ratio"
    },
    'ViT-L/14-336': {
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_emb_dim': 1024,
        'image_num_heads': 16,
        'image_num_layers': 23,
        'image_head_dim': 64,
        'image_mlp_dim': 4096,
        'image_mlp_activations': 'quick_gelu',
        'image_dropout_rate': 0.0,
        'image_num_pos': 577,
        'image_default_input_size': (336, 336),
        'image_norm_eps': 1e-5,
        'image_num_key_value_heads': 16,
        "image_model_type": "openai"
    },
    'EVA02-L-14-336': {
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_emb_dim': 1024,
        'image_num_heads': 16,
        'image_num_layers': 24,
        'image_head_dim': 64,
        'image_mlp_dim': 2730,
        'image_mlp_activations': 'silu',
        'image_dropout_rate': 0.0,
        'image_num_pos': 577,
        'image_default_input_size': (336, 336),
        'image_norm_eps': 1e-6,
        'image_num_key_value_heads': 16,
        "image_model_type": "eva"
    },
    'ViT-L/14': {
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_emb_dim': 1024,
        'image_num_heads': 16,
        # Note the original model has 24 layers, but we don't use the last layer
        'image_num_layers': 23,
        'image_head_dim': 64,
        'image_mlp_dim': 4096,
        'image_mlp_activations': 'quick_gelu',
        'image_dropout_rate': 0.0,
        'image_num_pos': 257,
        'image_default_input_size': (224, 224),
        'image_norm_eps': 1e-5,
        'image_num_key_value_heads': 16,
        "image_model_type": "openai"
    },
    'debug': {
        'image_patch_size': 14,
        'image_pos_patch_size': 14,
        'image_emb_dim': 1024,
        'image_num_heads': 16,
        'image_num_layers': 1,
        'image_head_dim': 64,
        'image_mlp_dim': 1024,
        'image_mlp_activations': 'quick_gelu',
        'image_dropout_rate': 0.0,
        'image_num_pos': 577,
        'image_default_input_size': (336, 336),
        'image_norm_eps': 1e-5,
        'image_num_key_value_heads': 16,
        "image_model_type": "openai"
    }
}

OPEN_LLM_STANDARD_CONFIGS = {
    "qwen1.5_7b": {
        'vocab_size': 151936,
        'hidden_size': 4096,
        'intermediate_size': 11008,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 32,
        'max_sequence_length': 2048,
        'max_position_embeddings': 32768,
        'rope_theta': 1000000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        "qkv_bias": True,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "hf-Qwen/Qwen1.5-7B",
    },
    "qwen1.5_14b": {
        'vocab_size': 152064,
        'hidden_size': 5120,
        'intermediate_size': 13696,
        'num_hidden_layers': 40,
        'num_attention_heads': 40,
        'num_key_value_heads': 40,
        'max_sequence_length': 2048,
        'max_position_embeddings': 32768,
        'rope_theta': 1000000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        "qkv_bias": True,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "hf-Qwen/Qwen1.5-14B",
    },
    "qwen1.5_32b": {
        "vocab_size": 152064,
        "hidden_size": 5120,
        "intermediate_size": 27392,
        "num_hidden_layers": 64,
        "num_attention_heads": 40,
        "num_key_value_heads": 8,
        'max_sequence_length': 2048,
        'max_position_embeddings': 32768,
        "rope_theta": 1000000.0,
        'initializer_range': 0.02,
        "rms_norm_eps": 1e-6,
        "qkv_bias": True,
        "tie_word_embeddings": False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "hf-Qwen/Qwen1.5-32B",
    },
    'llama_7b': {
        'vocab_size': 32000,
        'hidden_size': 4096,
        'intermediate_size': 11008,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 32,
        'max_sequence_length': 2048,
        'max_position_embeddings': 8192,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "llama"
    },
    'yi_6b': {
        'vocab_size': 64000,
        'hidden_size': 4096,
        'intermediate_size': 11008,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 4,
        'max_sequence_length': 4096,
        'max_position_embeddings': 4096,
        'rope_theta': 5000000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "yi"
    },
    'yi_9b': {
        'vocab_size': 64000,
        'hidden_size': 4096,
        'intermediate_size': 11008,
        'num_hidden_layers': 48,
        'num_attention_heads': 32,
        'num_key_value_heads': 4,
        'max_sequence_length': 4096,
        'max_position_embeddings': 4096,
        'rope_theta': 10000,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-06,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "yi"
    },
    'yi_34b': {
        'vocab_size': 64000,
        'hidden_size': 7168,
        'intermediate_size': 20480,
        'num_hidden_layers': 60,
        'num_attention_heads': 56,
        'num_key_value_heads': 8,
        'max_sequence_length': 4096,
        'max_position_embeddings': 4096,
        'rope_theta': 5000000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "yi"
    },
    "olmo_1b": {
        'vocab_size': 50304,
        'hidden_size': 2048,
        'intermediate_size': 8192,
        'num_hidden_layers': 16,
        'num_attention_heads': 16,
        'num_key_value_heads': 16,
        'max_sequence_length': 4096,
        'max_position_embeddings': 32768,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': True,
        'hidden_act': 'silu',
        'norm_module': 'OlmoLayerNorm',
        "tokenizer": "hf-allenai/OLMo-1B"
    },
    "olmo_7b": {
        'vocab_size': 50304,
        'hidden_size': 4096,
        'intermediate_size': 22016//2,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 32,
        'max_sequence_length': 4096,
        'max_position_embeddings': 32768,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'OlmoLayerNorm',
        "tokenizer": "hf-allenai/OLMo-7B",
    },
    "olmo_1.7_7b": {
        'vocab_size': 50304,
        'hidden_size': 4096,
        'intermediate_size': 22016//2,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 32,
        'max_sequence_length': 4096,
        'max_position_embeddings': 32768,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        "qkv_clip": 8,
        'norm_module': 'OlmoLayerNorm',
        "tokenizer": "hf-allenai/OLMo-1.7-7B",
    },
    'mistral_7b': {
        'vocab_size': 32000,
        'hidden_size': 4096,
        'intermediate_size': 14336,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 8,
        'max_sequence_length': 4096,
        'max_position_embeddings': 32768,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "mistral"
    },
    'mistral0.3_7b': {
        'vocab_size': 32768,
        'hidden_size': 4096,
        'intermediate_size': 14336,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 8,
        'max_sequence_length': 4096,
        'max_position_embeddings': 32768,
        'rope_theta': 1000000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "mistral0.3"
    },
    "mistral0.2_22b": {
        'vocab_size': 32000,
        'hidden_size': 6144,
        'intermediate_size': 16384,
        'num_hidden_layers': 56,
        'num_attention_heads': 48,
        'num_key_value_heads': 8,
        'max_sequence_length': 4096,
        'max_position_embeddings': 32768,
        'rope_theta': 1000000,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "mistral"
    },
    'llama_13b': {
        'vocab_size': 32000,
        'hidden_size': 5120,
        'intermediate_size': 13824,
        'num_hidden_layers': 40,
        'num_attention_heads': 40,
        'num_key_value_heads': 40,
        'max_sequence_length': 2048,
        'max_position_embeddings': 8192,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        "norm_module": 'RMSNorm',
        'rope_theta': 10000.0,
        "tokenizer": "llama"
    },
    'llama_70b': {
        'vocab_size': 32000,
        'hidden_size': 8192,
        'intermediate_size': 28672,
        'num_hidden_layers': 80,
        'num_attention_heads': 64,
        'num_key_value_heads': 8,
        'max_sequence_length': 8192,
        'max_position_embeddings': 8192,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        "tokenizer": "llama"
    },
    'llama_70bflash': {
        'vocab_size': 32000,
        'hidden_size': 8192,
        'intermediate_size': 28672,
        'num_hidden_layers': 80,
        'num_attention_heads': 64,
        'num_key_value_heads': 8,
        'max_sequence_length': 8192,
        'max_position_embeddings': 8192,
        'rope_theta': 10000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'scan_attention': True,
        'scan_mlp': True,
        'hidden_act': 'silu',
        "tokenizer": "llama"
    },
    'llama3_8b': {
        'vocab_size': 128256,
        'hidden_size': 4096,
        'intermediate_size': 14336,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 8,
        'max_sequence_length': 8192,
        'max_position_embeddings': 8192,
        'rope_theta': 500000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "hf-meta-llama/Meta-Llama-3-8B",

    },
    'llama3_70b': {
        'vocab_size': 128256,
        'hidden_size': 8192,
        'intermediate_size': 28672,
        'num_hidden_layers': 80,
        'num_attention_heads': 64,
        'num_key_value_heads': 8,
        'max_sequence_length': 8192,
        'max_position_embeddings': 8192,
        'rope_theta': 500000.0,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "hf-meta-llama/Meta-Llama-3-70B",
    },
    'open_llama_3b': {
        'vocab_size': 32000,
        'hidden_size': 3200,
        'intermediate_size': 8640,
        'num_hidden_layers': 26,
        'num_attention_heads': 32,
        'max_sequence_length': 2048,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        'max_position_embeddings': 2048,
        'num_key_value_heads': 32,
        'rope_theta': 10000.0,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "llama"
    },
    'gemma_2b': {
        'vocab_size': 256000,
        'hidden_size': 2048,
        'intermediate_size': 16384,
        'num_hidden_layers': 18,
        'num_attention_heads': 8,
        'max_sequence_length': 8192,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        'max_position_embeddings': 8192,
        'num_key_value_heads': 1,
        'rope_theta': 10000.0,
        'tie_word_embeddings': True,
        'normalize_input_embeds': True,
        'norm_module': 'GemmaRMSNorm',
        'hidden_act': 'gelu',
        "tokenizer": "gemma"
    },
    'gemma_7b': {
        'vocab_size': 256000,
        'hidden_size': 3072,
        'intermediate_size': 24576,
        'num_hidden_layers': 28,
        'num_attention_heads': 16,
        'max_sequence_length': 8192,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        'max_position_embeddings': 8192,
        'num_key_value_heads': 16,
        'rope_theta': 10000.0,
        'tie_word_embeddings': True,
        'normalize_input_embeds': True,
        'norm_module': 'GemmaRMSNorm',
        'hidden_act': 'gelu',
        "tokenizer": "gemma"
    },
    'tiny_llama_1b': {
        'vocab_size': 32000,
        'hidden_size': 2048,
        'intermediate_size': 5632,
        'num_hidden_layers': 22,
        'num_attention_heads': 32,
        'max_sequence_length': 2048,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'max_position_embeddings': 2048,
        'num_key_value_heads': 4,
        'rope_theta': 10000.0,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "llama"
    },
    'debug': {  # A small model for debugging
        'vocab_size': 32000,
        'hidden_size': 512,
        'intermediate_size': 512,
        'num_hidden_layers': 1,
        'num_attention_heads': 8,
        'max_sequence_length': 4096,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-5,
        'max_position_embeddings': 4096,
        'num_key_value_heads': 8,
        'rope_theta': 10000.0,
        'tie_word_embeddings': False,
        'hidden_act': 'silu',
        'norm_module': 'RMSNorm',
        "tokenizer": "llama"
    },
    'gemma2_9b': {
        'vocab_size': 256000,
        'hidden_size': 3584,
        'head_dim': 256,
        'intermediate_size': 14336,
        'num_hidden_layers': 42,
        'num_attention_heads': 16,
        'max_sequence_length': 8192,
        "query_pre_attn_scalar": 224,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        'max_position_embeddings': 8192,
        'num_key_value_heads': 8,
        'rope_theta': 10000.0,
        'tie_word_embeddings': False,
        'normalize_input_embeds': True,
        'norm_module': 'GemmaRMSNorm',
        'hidden_act': 'gelu_tanh',
        "tokenizer": "hf-google/gemma-2-9b",
        "attn_logit_softcapping": 50.0,
        "final_logit_softcapping": 30.0,
    },
    'gemma2_27b': {
        'vocab_size': 256000,
        'hidden_size': 4608,
        'head_dim': 128,
        'intermediate_size': 36864,
        'num_hidden_layers': 46,
        'num_attention_heads': 32,
        'max_sequence_length': 8192,
        "query_pre_attn_scalar": 144,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-6,
        'max_position_embeddings': 8192,
        'num_key_value_heads': 16,
        'rope_theta': 10000.0,
        'tie_word_embeddings': False,
        'normalize_input_embeds': True,
        'norm_module': 'GemmaRMSNorm',
        'hidden_act': 'gelu_tanh',
        "tokenizer": "hf-google/gemma-2-27b",
        "attn_logit_softcapping": 50.0,
        "final_logit_softcapping": 30.0,
    },
}
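A small usage sketch (not part of the commit) of how these lookup tables might be consumed when assembling a model config; the merging helper is an assumption for illustration only:

from .constants import VIT_STANDARD_CONFIGS, OPEN_LLM_STANDARD_CONFIGS

def build_model_kwargs(vit_name: str, llm_name: str) -> dict:
    """Hypothetical helper: merge a vision-tower preset with an LLM preset."""
    kwargs = dict(VIT_STANDARD_CONFIGS[vit_name])        # e.g. "SigLIP-So400m-14-384"
    kwargs.update(OPEN_LLM_STANDARD_CONFIGS[llm_name])   # e.g. "qwen1.5_7b"
    return kwargs

print(build_model_kwargs("SigLIP-So400m-14-384", "qwen1.5_7b")["tokenizer"])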
data_factory.py
ADDED
@@ -0,0 +1,222 @@
'''
Dataset factory to load data from huggingface and others.
'''
import dataclasses
import logging
from typing import List, Optional

import numpy as np
import tensorflow as tf

from .data_utils import add_segment_ids
from .dataset_sizes import get_dataset_size
from .tasks import get_task
from .multimodal_preprocessor import MultiModalPreprocessor
import seqio

from .torch_util import get_global_rank

log = logging.getLogger(__name__)


@dataclasses.dataclass
class SeqioDataset:
    mixture_or_task_name: str
    seq_len: int
    global_batch_size: int
    max_crops: int = None
    is_training: bool = False
    for_inference: bool = False
    split: str = 'train'
    shuffle: bool = True
    num_epochs: int = None
    drop_remainder: bool = True
    seed: int = None
    pack: bool = False
    use_custom_packing_ops: bool = False
    use_memory_cache: bool = False
    shuffle_buffer_size: Optional[int] = None
    different_host_mixture_seeds: bool = True
    disable_autotune: bool = True
    trim_output_features: bool = True

    @classmethod
    def from_dict(cls, data):
        return cls(**data)

    def get_task_feature_lengths_dict(self, max_crops):
        if self.max_crops is not None:
            assert self.max_crops >= max_crops
            max_crops = self.max_crops
        return dict(
            target_tokens=self.seq_len,
            loss_masks=self.seq_len,
            images=max_crops,
            image_positions=max_crops,
            image_input_idx=max_crops,
            is_training=self.is_training
        )

    def build(self, preprocessor: MultiModalPreprocessor, shard_id, num_shards):
        shard_info = seqio.ShardInfo(index=shard_id, num_shards=num_shards)
        task_feature_lengths_dict = self.get_task_feature_lengths_dict(
            preprocessor.get_max_total_crops())

        seed = self.seed
        assert seed is not None

        batch_size = self.global_batch_size // num_shards

        if isinstance(self.mixture_or_task_name, (dict, list, tuple)):
            if isinstance(self.mixture_or_task_name, dict):
                items = self.mixture_or_task_name.items()
            else:
                items = self.mixture_or_task_name
            task_list = []
            for task, weight in items:
                task = get_task(preprocessor, task, self.is_training, self.for_inference)
                task_list.append((task, weight))
            mixture_or_task = task_list
        else:
            mixture_or_task = get_task(
                preprocessor, self.mixture_or_task_name, self.is_training, self.for_inference)

        in_memory_shuffle = self.shuffle
        if not self.drop_remainder:
            # Used if we want to evaluate on an eval dataset without dropping any examples.
            # To do this, we pad the dataset with dummy examples marked as invalid in their
            # metadata so we can still get fixed-sized batches.
            assert self.num_epochs is not None
            assert not self.pack
            assert not isinstance(mixture_or_task, list), "Inference datasets cannot be mixtures"
            logging.info(
                f"Initializing inf. dataset {mixture_or_task.name}: replica_batch_size={batch_size}"
                f' seed={seed}, sharding={shard_info.index}/{shard_info.num_shards}'
            )
            ds = mixture_or_task.get_dataset(
                sequence_length=task_feature_lengths_dict,
                split=self.split,
                shuffle=in_memory_shuffle,
                num_epochs=self.num_epochs,
                seed=seed,
                try_in_mem_cache=self.use_memory_cache,
                trim_output_features=self.trim_output_features
            )

            try:
                n = len(ds)
            except TypeError:
                dataset_len = get_dataset_size(self.mixture_or_task_name, self.split)
                logging.info(f"Setting dataset len to {dataset_len} based on DATASET_SIZES")
                n = dataset_len
                ds = tf.data.experimental.assert_cardinality(n)(ds)

            remainder = n % self.global_batch_size
            if remainder > 0:
                n_to_pad = self.global_batch_size - remainder
            else:
                n_to_pad = 0
            assert "metadata/valid" not in ds.element_spec
            def add_valid(x):
                x["metadata/valid"] = True
                return x
            def add_invalid(x):
                x["metadata/valid"] = False
                return x
            ds = ds.map(add_valid)
            if n_to_pad > 0:
                to_pad = ds.take(1).map(add_invalid).cache().repeat(n_to_pad)
                ds = ds.concatenate(to_pad)

            # Shard after padding to ensure shards are the same length
            ds = ds.shard(num_shards=num_shards, index=shard_id)

            ds = preprocessor.get_post_mixing_preprocessor()(
                ds, task_feature_lengths=task_feature_lengths_dict)
            data_iter = ds.batch(batch_size, drop_remainder=True, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            # Make it possible for client to get the size of the batched/sharded dataset with `len()`
            new_len = (n + n_to_pad) // self.global_batch_size
            data_iter = tf.data.experimental.assert_cardinality(new_len)(data_iter)
        else:
            if isinstance(mixture_or_task, list):
                total_rate = sum(x[1] for x in mixture_or_task)
                mixture_or_task = [(task, r/total_rate) for task, r in mixture_or_task]
                sorted_tasks: List[seqio.Task] = sorted(mixture_or_task, key=lambda x: -x[1])

                if self.different_host_mixture_seeds and shard_info:
                    # If each process has the same seed they will draw from the datasets in the same
                    # order, which can make the global batches very non-random if there are
                    # many processes each with a small batch size. To fix this, we give each host
                    # a different seed based on its rank to use when mixing
                    mix_seed = seed + shard_info.index*4397
                else:
                    mix_seed = seed

                logging.info(
                    f"Initializing mixture: replica_batch_size={batch_size} seed={seed}, "
                    f"mix_seed={mix_seed}, sharding={shard_info.index}/{shard_info.num_shards} rates:"
                )
                for task, rate in sorted_tasks:
                    logging.info(f"\t{task.name}: {rate:0.4f}")

                datasets = []
                rates = []
                for task, rate in sorted_tasks:
                    assert rate > 0
                    datasets.append(task.get_dataset(
                        task_feature_lengths_dict,
                        split=self.split,
                        shuffle=self.shuffle,
                        seed=seed,
                        shard_info=shard_info,
                        num_epochs=self.num_epochs,
                        try_in_mem_cache=self.use_memory_cache,
                        trim_output_features=self.trim_output_features
                    ))
                    rates.append(rate)

                # If any of the sub-tasks have subsegment_ids, we need to ensure all the tasks have
                # a subsegment_ids field so they can be mixed
                if any("subsegment_ids" in ds.element_spec for ds in datasets):
                    for ix, ds in enumerate(datasets):
                        if "subsegment_ids" not in ds.element_spec:
                            datasets[ix] = add_segment_ids(ds)

                ds = tf.data.Dataset.sample_from_datasets(
                    datasets, rates, seed=mix_seed, stop_on_empty_dataset=False)
            else:
                logging.info(
                    f"Initializing dataset {mixture_or_task.name}: replica_batch_size={batch_size}"
                    f' seed={seed}, sharding={shard_info.index}/{shard_info.num_shards}'
                )
                ds = mixture_or_task.get_dataset(
                    task_feature_lengths_dict,
                    split=self.split,
                    shuffle=self.shuffle,
                    seed=seed,
                    shard_info=shard_info,
                    num_epochs=self.num_epochs,
                    try_in_mem_cache=self.use_memory_cache,
                    trim_output_features=self.trim_output_features
                )
            data_iter = preprocessor.get_post_mixing_preprocessor()(
                ds, task_feature_lengths=task_feature_lengths_dict)
            data_iter = data_iter.batch(batch_size, drop_remainder=True, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            ds = ds.prefetch(2)

        # Following https://github.com/google-research/big_vision/blob/b8dab6e4de3436849415f37c591399c93b1eaf39/big_vision/input_pipeline.py#L228
        # These options try to stop tf datasets from eating all our RAM if we are using a
        # large mixture
        # This options are used by default in some google codebases
        # For example: (https://github.com/google-research/big_vision/blob/b8dab6e4de3436849415f37c591399c93b1eaf39/big_vision/input_pipeline.py#L228)
        # They don't seem to harm throughput and can save RAM so we use them as well
        options = tf.data.Options()
        options.experimental_optimization.inject_prefetch = False
        options.threading.max_intra_op_parallelism = 1
        if self.disable_autotune:
            # Following https://www.tensorflow.org/datasets/performances
            # This reduces RAM and checkpoint size by a lot
            options.autotune.enabled = False
        data_iter = data_iter.with_options(options)

        return data_iter
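A hedged usage sketch (not part of the file) of how `SeqioDataset` is meant to be driven; the task name and the `MultiModalPreprocessor` constructor arguments are placeholders, not the real API:

# Hypothetical driver code; the task name and preprocessor construction are assumptions.
from .data_factory import SeqioDataset
from .multimodal_preprocessor import MultiModalPreprocessor

preprocessor = MultiModalPreprocessor(tokenizer="hf-Qwen/Qwen1.5-7B")  # assumed signature
dataset = SeqioDataset.from_dict(dict(
    mixture_or_task_name="some_registered_task",  # placeholder task name
    seq_len=2048,
    global_batch_size=128,
    is_training=True,
    seed=0,
))
data_iter = dataset.build(preprocessor, shard_id=0, num_shards=8)
for batch in data_iter.take(1):
    print({k: v.shape for k, v in batch.items()})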
data_utils.py
ADDED
@@ -0,0 +1,827 @@
1 |
+
import abc
|
2 |
+
import dataclasses
|
3 |
+
import functools
|
4 |
+
import os
|
5 |
+
from os import environ
|
6 |
+
from typing import Mapping, Optional, Sequence, List
|
7 |
+
from absl import logging
|
8 |
+
import clu
|
9 |
+
import gin
|
10 |
+
from pathlib import Path
|
11 |
+
|
12 |
+
import seqio
|
13 |
+
from seqio import utils
|
14 |
+
from seqio.feature_converters import _check_exact_match, _check_lengths
|
15 |
+
|
16 |
+
import tensorflow as tf
|
17 |
+
from tensorflow.python.ops import control_flow_ops
|
18 |
+
from tensorflow.python.ops.image_ops_impl import _ImageDimensions, _CheckAtLeast3DImage, _assert, _is_tensor
|
19 |
+
|
20 |
+
from tensorflow.python.framework import ops
|
21 |
+
from tensorflow.python.ops import array_ops
|
22 |
+
from transformers import PreTrainedTokenizerFast
|
23 |
+
|
24 |
+
from . import seqio_tokenizer as vocab
|
25 |
+
from .constants import *
|
26 |
+
from .utils import pop_metadata
|
27 |
+
from .util import is_url
|
28 |
+
|
29 |
+
DEFAULT_EXTRA_IDS = 0
|
30 |
+
OutputFeaturesType = Mapping[str, utils.Feature]
|
31 |
+
|
32 |
+
|
33 |
+
def build_tokenizer(
|
34 |
+
tokenizer_type, has_extra_token=True,
|
35 |
+
adds_space=False,
|
36 |
+
olmo_bos_token_id=1, olmo_eos_token_id=2,
|
37 |
+
tokenizer_dir="gs://mm-olmo/tokenizer",
|
38 |
+
pad_tokenizer_to=None, cache={},
|
39 |
+
):
|
40 |
+
cache_key = (tokenizer_type, has_extra_token, adds_space, olmo_bos_token_id,
|
41 |
+
olmo_eos_token_id, pad_tokenizer_to)
|
42 |
+
if cache_key in cache:
|
43 |
+
return cache[cache_key]
|
44 |
+
|
45 |
+
if tokenizer_type == 'llama':
|
46 |
+
tok = vocab.SentencePieceVocabulary(
|
47 |
+
os.path.join(tokenizer_dir, "llama_tokenizer.model"),
|
48 |
+
extra_ids=DEFAULT_EXTRA_IDS,
|
49 |
+
reverse_extra_ids=True,
|
50 |
+
extra_tokens=EXTRA_TOKENS if has_extra_token else None,
|
51 |
+
)
|
52 |
+
elif tokenizer_type == 'yi':
|
53 |
+
tok = vocab.SentencePieceVocabulary(
|
54 |
+
os.path.join(tokenizer_dir, "yi_tokenizer.model"),
|
55 |
+
extra_ids=DEFAULT_EXTRA_IDS,
|
56 |
+
reverse_extra_ids=True,
|
57 |
+
extra_tokens=EXTRA_TOKENS if has_extra_token else None,
|
58 |
+
)
|
59 |
+
elif tokenizer_type == 'mistral':
|
60 |
+
tok = vocab.SentencePieceVocabulary(
|
61 |
+
os.path.join(tokenizer_dir, "mistral_tokenizer.model"),
|
62 |
+
extra_ids=DEFAULT_EXTRA_IDS,
|
63 |
+
reverse_extra_ids=True,
|
64 |
+
extra_tokens=EXTRA_TOKENS if has_extra_token else None,
|
65 |
+
)
|
66 |
+
|
67 |
+
elif tokenizer_type == "mistral0.3":
|
68 |
+
tok = vocab.SentencePieceVocabulary(
|
69 |
+
os.path.join(tokenizer_dir, "mistral0.3_tokenizer.model.v3"),
|
70 |
+
extra_ids=DEFAULT_EXTRA_IDS,
|
71 |
+
reverse_extra_ids=True,
|
72 |
+
extra_tokens=EXTRA_TOKENS if has_extra_token else None,
|
73 |
+
)
|
74 |
+
elif tokenizer_type == 'gemma':
|
75 |
+
tok = vocab.SentencePieceVocabulary(
|
76 |
+
os.path.join(tokenizer_dir, "gemma_tokenizer.model"),
|
77 |
+
extra_ids=DEFAULT_EXTRA_IDS,
|
78 |
+
reverse_extra_ids=True,
|
79 |
+
extra_tokens=EXTRA_TOKENS if has_extra_token else None,
|
80 |
+
)
|
81 |
+
elif tokenizer_type.startswith("hf-"):
|
82 |
+
# FIXME When using the beaker image "sanghol/mm-olmo" for hosting endpoints,
|
83 |
+
# we should set the cache_dir, otherwise FileNotFound errors will be raised
|
84 |
+
cache_dir = None if tokenizer_dir is None or is_url(tokenizer_dir) else tokenizer_dir
|
85 |
+
from transformers import AutoTokenizer
|
86 |
+
|
87 |
+
extra_tokens = list(EXTRA_TOKENS)
|
88 |
+
if pad_tokenizer_to is not None:
|
89 |
+
tokenizer = AutoTokenizer.from_pretrained(tokenizer_type[3:], token=environ.get("HF_ACCESS_TOKEN"), cache_dir=cache_dir)
|
90 |
+
n_extra_tokens = pad_tokenizer_to - len(tokenizer)
|
91 |
+
# This handles a case where the LLM embedding matrix is larger than the vocab size
|
92 |
+
# We need the extra tokens in `EXTRA_TOKENS` to be assigned id's higher than the embedding
|
93 |
+
# matrix size, not the vocab size, since we will concat the embedding and matrix with
|
94 |
+
# the special token embedding matrix, so we pad the vocab with additional special tokens
|
95 |
+
if n_extra_tokens > 0:
|
96 |
+
logging.info(f"Padding tokenizer with {n_extra_tokens} tokens")
|
97 |
+
extra_tokens = [f"|<EXTRA_TOKENS_{i}>|" for i in range(n_extra_tokens)] + extra_tokens
|
98 |
+
|
99 |
+
bos_token_id = None
|
100 |
+
|
101 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
102 |
+
tokenizer_type[3:], additional_special_tokens=extra_tokens,
|
103 |
+
token=environ.get("HF_ACCESS_TOKEN"),
|
104 |
+
cache_dir=cache_dir,
|
105 |
+
)
|
106 |
+
if ("qwen2" in tokenizer_type.lower()) or ("olmo" in tokenizer_type.lower()):
|
107 |
+
# These tokenizers do not have a BOS, and instead use EOS as a generic seperator token.
|
108 |
+
# In this case we will use EOS as BOS
|
109 |
+
assert tokenizer.bos_token_id is None
|
110 |
+
bos_token_id = tokenizer.eos_token_id
|
111 |
+
|
112 |
+
if pad_tokenizer_to is not None:
|
113 |
+
for ix, tok in enumerate(EXTRA_TOKENS):
|
114 |
+
ids = tokenizer.encode(tok, add_special_tokens=False)
|
115 |
+
assert ids == [pad_tokenizer_to + ix]
|
116 |
+
|
117 |
+
tok = vocab.HfTokenizerWrapper(tokenizer, bos_token_id=bos_token_id, adds_space=adds_space)
|
118 |
+
elif tokenizer_type.startswith("olmo-"):
|
119 |
+
from olmo.tokenizer import Tokenizer
|
120 |
+
assert Path(tokenizer_type[5:]).is_file()
|
121 |
+
tokenizer = Tokenizer.from_file(
|
122 |
+
tokenizer_type[5:],
|
123 |
+
eos_token_id=olmo_eos_token_id,
|
124 |
+
pad_token_id=-1,
|
125 |
+
)
|
126 |
+
tok = vocab.OLMoTokenizerWrapper(tokenizer, bos_token_id=olmo_bos_token_id, adds_space=adds_space)
|
127 |
+
else:
|
128 |
+
raise NotImplementedError(tokenizer_type)
|
129 |
+
cache[cache_key] = tok
|
130 |
+
return tok
|
131 |
+
|
132 |
+
|
133 |
+
def get_special_token_ids(tokenizer):
|
134 |
+
if isinstance(tokenizer, (vocab.HfTokenizerWrapper, vocab.OLMoTokenizerWrapper)):
|
135 |
+
ids = tokenizer.encode("".join(EXTRA_TOKENS))
|
136 |
+
if len(ids) == len(EXTRA_TOKENS) + 1:
|
137 |
+
ids = ids[1:]
|
138 |
+
elif ("gemma_tokenizer" in tokenizer._sentencepiece_model_file or
|
139 |
+
"yi_tokenizer" in tokenizer._sentencepiece_model_file
|
140 |
+
):
|
141 |
+
# Not sure why ATM, but the LLaMa tokenizer will add an extra space token
|
142 |
+
# if this string starts with a space, while the gemma one needs the leading space
|
143 |
+
ids = tokenizer.encode(" " + " ".join(EXTRA_TOKENS))
|
144 |
+
else:
|
145 |
+
ids = tokenizer.encode(" ".join(EXTRA_TOKENS))
|
146 |
+
|
147 |
+
assert len(ids) == len(EXTRA_TOKENS)
|
148 |
+
return {k: i for k, i in zip(EXTRA_TOKENS, ids)}
|
149 |
+
|
150 |
+
|
151 |
+
def _append_to_innermost_axis(
|
152 |
+
tensor: tf.Tensor, scalar: tf.Tensor,
|
153 |
+
) -> tf.Tensor:
|
154 |
+
"""Appends `scalar` to each slice in the innermost axis of `tensor`.
|
155 |
+
|
156 |
+
>>> _append_to_innermost_axis([1, 2, 3], -1)
|
157 |
+
[1, 2, 3, -1]
|
158 |
+
>>> _append_to_innermost_axis([[1, 2], [3, 4]], -1)
|
159 |
+
[[1, 2, -1], [3, 4, -1]]
|
160 |
+
>>> _append_to_innermost_axis(tf.ragged.constant([[1, 2], [3]]), -1)
|
161 |
+
[[1, 2, -1], [3, -1]]
|
162 |
+
|
163 |
+
Args:
|
164 |
+
tensor: The tensor that should have a value appended.
|
165 |
+
scalar: The value to append.
|
166 |
+
|
167 |
+
Returns:
|
168 |
+
A copy of `tensor` with `scalar` appended to each slice along
|
169 |
+
the innermost axis.
|
170 |
+
"""
|
171 |
+
if isinstance(tensor, tf.RaggedTensor):
|
172 |
+
if tensor.shape.rank > 2:
|
173 |
+
return tensor.with_values(
|
174 |
+
_append_to_innermost_axis(tensor.values, scalar)
|
175 |
+
)
|
176 |
+
else:
|
177 |
+
return tf.concat([tensor, tf.fill([tensor.nrows(), 1], scalar)], axis=1)
|
178 |
+
else:
|
179 |
+
ndims = tf.rank(tensor)
|
180 |
+
paddings = tf.concat(
|
181 |
+
[tf.zeros((ndims - 1, 2), dtype=tf.int32), tf.constant([[0, 1]])],
|
182 |
+
axis=0,
|
183 |
+
)
|
184 |
+
return tf.pad(tensor, paddings=paddings, constant_values=scalar)
|
185 |
+
|
186 |
+
|
187 |
+
def _shift_right_by_one(tensor: tf.Tensor, bos_id: int = 0) -> tf.Tensor:
|
188 |
+
"""Shift the input tensor to the right by one position without wrapping."""
|
189 |
+
|
190 |
+
if not (tensor.dtype.is_integer or tensor.dtype.is_floating):
|
191 |
+
raise ValueError(f"Only numeric types are supported. Got: {tensor.dtype}")
|
192 |
+
# tf.roll wraps around the axis.
|
193 |
+
rolled = tf.roll(tensor, shift=1, axis=0)
|
194 |
+
|
195 |
+
# Zero out the first position by multiplying with [0, 1, 1, ..., 1].
|
196 |
+
depth = tf.shape(tensor)[0]
|
197 |
+
mask = tf.one_hot(0, depth=depth, on_value=0, off_value=1, dtype=tensor.dtype)
|
198 |
+
|
199 |
+
# Expand dims of mask to broadcast to rolled.
|
200 |
+
dim_expansion = [slice(None, None)] + [None] * (len(rolled.shape) - 1)
|
201 |
+
mask = mask[dim_expansion]
|
202 |
+
return rolled * mask + (1 - mask) * bos_id
|
203 |
+
|
204 |
+
|
205 |
+
def make_autoregressive_inputs(
|
206 |
+
targets: tf.Tensor,
|
207 |
+
sequence_id: tf.Tensor = None,
|
208 |
+
output_dtype: Optional[tf.dtypes.DType] = None,
|
209 |
+
bos_id: int = 0,
|
210 |
+
) -> tf.Tensor:
|
211 |
+
"""Generate inputs for an autoregressive model, by shifting the targets.
|
212 |
+
|
213 |
+
Modified from mesh_tensorflow.transformer.transformer.autoregressive_inputs.
|
214 |
+
|
215 |
+
For the first element of each sequence, the returned input id is 0.
|
216 |
+
|
217 |
+
For a "packed" dataset, also pass the sequence_id tensor, which aligns
|
218 |
+
with the targets tensor and contains different values for different
|
219 |
+
concatenated examples.
|
220 |
+
|
221 |
+
Example for a packed dataset:
|
222 |
+
|
223 |
+
```
|
224 |
+
targets = [3, 8, 2, 9, 2, 5, 4, 2, -1, -1]
|
225 |
+
sequence_id = [1, 1, 1, 2, 2, 3, 3, 3, 0, 0]
|
226 |
+
inputs = [1, 3, 8, 1, 9, 1, 5, 4, -1, -1]
|
227 |
+
| | |
|
228 |
+
These positions are set to 0 if sequence_id is not
|
229 |
+
None.
|
230 |
+
```
|
231 |
+
|
232 |
+
Args:
|
233 |
+
targets: a tf.int32 tensor with shape [length].
|
234 |
+
sequence_id: an optional tensor with the same shape as targets.
|
235 |
+
output_dtype: an optional output data type.
|
236 |
+
bos_id: bos id.
|
237 |
+
|
238 |
+
Returns:
|
239 |
+
a tensor with dtype tf.int32 and the same shape as targets.
|
240 |
+
"""
|
241 |
+
output_dtype = output_dtype or targets.dtype
|
242 |
+
if sequence_id is not None and not sequence_id.dtype.is_integer:
|
243 |
+
raise ValueError(
|
244 |
+
"The sequence_id should be integer-valued tensors for a packed dataset."
|
245 |
+
)
|
246 |
+
if sequence_id is not None and len(targets.shape) > 1:
|
247 |
+
raise ValueError(
|
248 |
+
"Only 1-D sequences are supported with packing. Got a "
|
249 |
+
f"packed {len(targets.shape)}-D sequence."
|
250 |
+
)
|
251 |
+
|
252 |
+
inputs = _shift_right_by_one(targets, bos_id)
|
253 |
+
if inputs.dtype != output_dtype:
|
254 |
+
inputs = tf.cast(inputs, output_dtype)
|
255 |
+
|
256 |
+
# We should have a 0 at the beginning of each sequence rather than the
|
257 |
+
# shifted EOS (e.g. 1) from the previous sequence.
|
258 |
+
if sequence_id is not None:
|
259 |
+
not_first_in_sequence = tf.equal(
|
260 |
+
sequence_id, _shift_right_by_one(sequence_id)
|
261 |
+
)
|
262 |
+
not_first_in_sequence = tf.cast(not_first_in_sequence, output_dtype)
|
263 |
+
first_ids = tf.cast((1 - not_first_in_sequence) * bos_id, output_dtype)
|
264 |
+
inputs = inputs * not_first_in_sequence + first_ids
|
265 |
+
return inputs
|
266 |
+
|
267 |
+
|
268 |
+
@tf.function
|
269 |
+
def sum_except_first_axis(tensor):
|
270 |
+
# Compute the sum along all axes except the first
|
271 |
+
axes_to_sum = tuple(range(1, len(tensor.shape)))
|
272 |
+
return tf.reduce_sum(tensor, axis=axes_to_sum)
|
273 |
+
|
274 |
+
|
275 |
+
@seqio.map_over_dataset()
|
276 |
+
def add_segment_ids(ex):
|
277 |
+
ex["subsegment_ids"] = tf.zeros_like(ex["target_tokens"], dtype=tf.int32)
|
278 |
+
return ex
|
279 |
+
|
280 |
+
|
281 |
+
def trim_and_pad_dataset(
|
282 |
+
dataset: tf.data.Dataset, feature_lengths: Mapping[str, int]
|
283 |
+
) -> tf.data.Dataset:
|
284 |
+
"""Trim and pad first dimension of features to `feature_lengths`.
|
285 |
+
|
286 |
+
Args:
|
287 |
+
dataset: tf.data.Dataset, the dataset to trim/pad examples in.
|
288 |
+
feature_lengths: map from feature key to final length. Other features will
|
289 |
+
be returned unchanged.
|
290 |
+
|
291 |
+
Returns:
|
292 |
+
Trimmed/padded tf.data.Dataset.
|
293 |
+
"""
|
294 |
+
|
295 |
+
def _trim_and_pad(k: str, t: tf.Tensor) -> tf.Tensor:
|
296 |
+
"""Trim/pad to the first axis of `t` to be of size `length`."""
|
297 |
+
if k not in feature_lengths:
|
298 |
+
return t
|
299 |
+
if isinstance(t, tf.RaggedTensor):
|
300 |
+
t = t.to_tensor()
|
301 |
+
|
302 |
+
constant_values = -1
|
303 |
+
length_k = feature_lengths[k]
|
304 |
+
if isinstance(length_k, int):
|
305 |
+
t = t[:length_k]
|
306 |
+
pad_amt = length_k - tf.shape(t)[0]
|
307 |
+
padded_t = tf.pad(t, [(0, pad_amt)] + [(0, 0)] * (len(t.shape) - 1), constant_values=constant_values)
|
308 |
+
padded_t.set_shape([length_k] + t.shape.as_list()[1:])
|
309 |
+
return padded_t
|
310 |
+
|
311 |
+
slices = tuple((slice(0, limit) for limit in length_k))
|
312 |
+
t = t[slices]
|
313 |
+
pad_amt = tf.pad((length_k - tf.shape(t))[..., None], ((0, 0), (1, 0)), constant_values=constant_values)
|
314 |
+
padded_t = tf.pad(t, pad_amt, constant_values=constant_values)
|
315 |
+
padded_t.set_shape(length_k)
|
316 |
+
return padded_t
|
317 |
+
|
318 |
+
return dataset.map(
|
319 |
+
lambda x: {k: _trim_and_pad(k, t) for k, t in x.items()},
|
320 |
+
num_parallel_calls=tf.data.experimental.AUTOTUNE,
|
321 |
+
)
|
322 |
+
|
323 |
+
|
324 |
+
def get_3d_subsegments(segmented_suffix):
|
325 |
+
q_lens, text_lens = segmented_suffix.nested_row_lengths()
|
326 |
+
text_segments = tf.range(0, tf.shape(text_lens)[0], dtype=tf.int32)
|
327 |
+
question_repeat = tf.reshape(tf.stack([tf.ones_like(q_lens), q_lens-1], 1), [-1])
|
328 |
+
question_offset = tf.range(1, tf.shape(q_lens)[0]+1, dtype=tf.int32)*200
|
329 |
+
question_offset = tf.reshape(tf.stack([question_offset, question_offset-100], 1), [-1])
|
330 |
+
text_segments = text_segments + tf.repeat(question_offset, question_repeat)
|
331 |
+
segment_ids = tf.cast(tf.repeat(text_segments, text_lens), tf.int32)
|
332 |
+
return segment_ids
|
333 |
+
|
334 |
+
|
335 |
+
def assert_not_truncated(ds, keys, max_val):
|
336 |
+
def _check(ex):
|
337 |
+
for k in keys:
|
338 |
+
tf.assert_less(tf.shape(ex[k])[0], max_val+1,
|
339 |
+
message=f"Field {k} was unexpectedly truncated max_len={max_val}")
|
340 |
+
return ex
|
341 |
+
return ds.map(_check)
|
342 |
+
|
343 |
+
|
344 |
+
def apply_with_random_selector(x, func, num_cases):
|
345 |
+
"""Computes func(x, sel), with sel sampled from [0...num_cases-1].
|
346 |
+
Args:
|
347 |
+
x: input Tensor.
|
348 |
+
func: Python function to apply.
|
349 |
+
num_cases: Python int32, number of cases to sample sel from.
|
350 |
+
Returns:
|
351 |
+
The result of func(x, sel), where func receives the value of the
|
352 |
+
selector as a python integer, but sel is sampled dynamically.
|
353 |
+
"""
|
354 |
+
sel = tf.random.uniform([], maxval=num_cases, dtype=tf.int32)
|
355 |
+
# Pass the real x only to one of the func calls.
|
356 |
+
return control_flow_ops.merge([
|
357 |
+
func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
|
358 |
+
for case in range(num_cases)])[0]
|
359 |
+
|
360 |
+
|
361 |
+
def denormalize_boxes(boxes, image_shape):
|
362 |
+
"""Converts boxes normalized by [height, width] to pixel coordinates.
|
363 |
+
Args:
|
364 |
+
boxes: a tensor whose last dimension is 4 representing the coordinates of
|
365 |
+
boxes in ymin, xmin, ymax, xmax order.
|
366 |
+
image_shape: a list of two integers, a two-element vector or a tensor such
|
367 |
+
that all but the last dimensions are `broadcastable` to `boxes`. The last
|
368 |
+
dimension is 2, which represents [height, width].
|
369 |
+
Returns:
|
370 |
+
denormalized_boxes: a tensor whose shape is the same as `boxes` representing
|
371 |
+
the denormalized boxes.
|
372 |
+
Raises:
|
373 |
+
ValueError: If the last dimension of boxes is not 4.
|
374 |
+
"""
|
375 |
+
with tf.name_scope('denormalize_boxes'):
|
376 |
+
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
|
377 |
+
height, width = image_shape
|
378 |
+
height = tf.cast(height, dtype=boxes.dtype)
|
379 |
+
width = tf.cast(width, dtype=boxes.dtype)
|
380 |
+
else:
|
381 |
+
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
|
382 |
+
height, width = tf.split(image_shape, 2, axis=-1)
|
383 |
+
|
384 |
+
ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
|
385 |
+
ymin = ymin * height
|
386 |
+
xmin = xmin * width
|
387 |
+
ymax = ymax * height
|
388 |
+
xmax = xmax * width
|
389 |
+
|
390 |
+
denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
|
391 |
+
return denormalized_boxes
|
392 |
+
|
393 |
+
def pad_to_bounding_box(image, offset_height, offset_width, target_height,
|
394 |
+
target_width, value=0):
|
395 |
+
|
396 |
+
return pad_to_bounding_box_internal(
|
397 |
+
image,
|
398 |
+
offset_height,
|
399 |
+
offset_width,
|
400 |
+
target_height,
|
401 |
+
target_width,
|
402 |
+
check_dims=True,
|
403 |
+
value=value)
|
404 |
+
|
405 |
+
def pad_to_bounding_box_internal(image, offset_height, offset_width,
|
406 |
+
target_height, target_width, check_dims, value):
|
407 |
+
|
408 |
+
with ops.name_scope(None, 'pad_to_bounding_box_with_one_internal', [image]):
|
409 |
+
image = ops.convert_to_tensor(image, name='image')
|
410 |
+
|
411 |
+
is_batch = True
|
412 |
+
image_shape = image.get_shape()
|
413 |
+
if image_shape.ndims == 3:
|
414 |
+
is_batch = False
|
415 |
+
image = array_ops.expand_dims(image, 0)
|
416 |
+
elif image_shape.ndims is None:
|
417 |
+
is_batch = False
|
418 |
+
image = array_ops.expand_dims(image, 0)
|
419 |
+
image.set_shape([None] * 4)
|
420 |
+
elif image_shape.ndims != 4:
|
421 |
+
raise ValueError(
|
422 |
+
'\'image\' (shape %s) must have either 3 or 4 dimensions.' %
|
423 |
+
image_shape)
|
424 |
+
|
425 |
+
batch, height, width, depth = _ImageDimensions(image, rank=4)
|
426 |
+
|
427 |
+
after_padding_width = target_width - offset_width - width
|
428 |
+
|
429 |
+
after_padding_height = target_height - offset_height - height
|
430 |
+
|
431 |
+
if check_dims:
|
432 |
+
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
|
433 |
+
assert_ops += _assert(offset_height >= 0, ValueError,
|
434 |
+
'offset_height must be >= 0')
|
435 |
+
assert_ops += _assert(offset_width >= 0, ValueError,
|
436 |
+
'offset_width must be >= 0')
|
437 |
+
assert_ops += _assert(after_padding_width >= 0, ValueError,
|
438 |
+
'width must be <= target - offset')
|
439 |
+
assert_ops += _assert(after_padding_height >= 0, ValueError,
|
440 |
+
'height must be <= target - offset')
|
441 |
+
image = control_flow_ops.with_dependencies(assert_ops, image)
|
442 |
+
|
443 |
+
# Do not pad on the depth dimensions.
|
444 |
+
paddings = array_ops.reshape(
|
445 |
+
tf.stack([
|
446 |
+
0, 0, offset_height, after_padding_height, offset_width,
|
447 |
+
after_padding_width, 0, 0
|
448 |
+
]), [4, 2])
|
449 |
+
padded = array_ops.pad(image, paddings, constant_values=value)
|
450 |
+
|
451 |
+
padded_shape = [
|
452 |
+
None if _is_tensor(i) else i
|
453 |
+
for i in [batch, target_height, target_width, depth]
|
454 |
+
]
|
455 |
+
padded.set_shape(padded_shape)
|
456 |
+
|
457 |
+
if not is_batch:
|
458 |
+
padded = array_ops.squeeze(padded, axis=[0])
|
459 |
+
|
460 |
+
return padded
|
461 |
+
|
462 |
+
def resize_and_crop_boxes(boxes, image_scale, output_size, offset, paddings):
|
463 |
+
"""Resizes boxes to output size with scale and offset.
|
464 |
+
Args:
|
465 |
+
boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
|
466 |
+
image_scale: 2D float `Tensor` representing scale factors that apply to
|
467 |
+
[height, width] of input image.
|
468 |
+
output_size: 2D `Tensor` or `int` representing [height, width] of target
|
469 |
+
output image size.
|
470 |
+
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
|
471 |
+
boxes.
|
472 |
+
paddings: 2D `Tensor` representing top/left paddings.
|
473 |
+
Returns:
|
474 |
+
boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
|
475 |
+
"""
|
476 |
+
# Adjusts box coordinates based on image_scale, offset and paddings.
|
477 |
+
boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
|
478 |
+
boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
|
479 |
+
boxes += tf.tile(tf.expand_dims(paddings, axis=0), [1, 2])
|
480 |
+
# Clips the boxes.
|
481 |
+
boxes = clip_boxes(boxes, output_size)
|
482 |
+
return boxes
|
483 |
+
|
484 |
+
def clip_boxes(boxes, image_shape):
|
485 |
+
"""Clips boxes to image boundaries.
|
486 |
+
Args:
|
487 |
+
boxes: a tensor whose last dimension is 4 representing the coordinates of
|
488 |
+
boxes in ymin, xmin, ymax, xmax order.
|
489 |
+
image_shape: a list of two integers, a two-element vector or a tensor such
|
490 |
+
that all but the last dimensions are `broadcastable` to `boxes`. The last
|
491 |
+
dimension is 2, which represents [height, width].
|
492 |
+
Returns:
|
493 |
+
clipped_boxes: a tensor whose shape is the same as `boxes` representing the
|
494 |
+
clipped boxes.
|
495 |
+
Raises:
|
496 |
+
ValueError: If the last dimension of boxes is not 4.
|
497 |
+
"""
|
498 |
+
if boxes.shape[-1] != 4:
|
499 |
+
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
|
500 |
+
boxes.shape[-1]))
|
501 |
+
|
502 |
+
with tf.name_scope('clip_boxes'):
|
503 |
+
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
|
504 |
+
height, width = image_shape
|
505 |
+
max_length = [height, width, height, width]
|
506 |
+
else:
|
507 |
+
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
|
508 |
+
height, width = tf.unstack(image_shape, axis=-1)
|
509 |
+
max_length = tf.stack(
|
510 |
+
[height, width, height, width], axis=-1)
|
511 |
+
|
512 |
+
clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
|
513 |
+
return clipped_boxes
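# --- Illustrative sketch (editor's example, not part of the committed file) ---
# A box spilling past a 100x200 image is clamped to the image bounds.
import tensorflow as tf
example_boxes = tf.constant([[-5.0, 10.0, 120.0, 250.0]])  # ymin, xmin, ymax, xmax
example_clipped = clip_boxes(example_boxes, [100, 200])    # -> [[0., 10., 100., 200.]]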
|
514 |
+
|
515 |
+
|
516 |
+
def get_non_empty_box_indices(boxes):
|
517 |
+
"""Get indices for non-empty boxes."""
|
518 |
+
# Selects indices where both box height and width are greater than 0.
|
519 |
+
height = boxes[:, 2] - boxes[:, 0]
|
520 |
+
width = boxes[:, 3] - boxes[:, 1]
|
521 |
+
indices = tf.where(
|
522 |
+
tf.logical_and(tf.greater(height, 0), tf.greater(width, 0)))
|
523 |
+
return indices[:, 0]
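# --- Illustrative sketch (editor's example, not part of the committed file) ---
# The second box below has zero height, so only index 0 is kept.
import tensorflow as tf
example_boxes = tf.constant([[0., 0., 10., 10.], [5., 5., 5., 9.]])
example_keep = get_non_empty_box_indices(example_boxes)  # -> [0]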
|
524 |
+
|
525 |
+
|
526 |
+
def resize_and_pad(image, desired_output_size, masks=None, boxes=None, labels=None,
|
527 |
+
random_scale_min=0.1, random_scale_max=2.0, do_random_scale=False,
|
528 |
+
shrink_both_sides=True, boxes1=None, filter_box=True,
|
529 |
+
desired_target_size=None, random_scale_ratio=0.0,
|
530 |
+
resize_method=tf.image.ResizeMethod.BILINEAR, return_outputs=True,
|
531 |
+
pad_value=0, normalize=True):
|
532 |
+
desired_height, desired_width = desired_output_size
|
533 |
+
desired_height_f = tf.cast(desired_height, dtype=tf.float32)
|
534 |
+
desired_width_f = tf.cast(desired_width, dtype=tf.float32)
|
535 |
+
|
536 |
+
height = tf.cast(tf.shape(image)[0], tf.float32)
|
537 |
+
width = tf.cast(tf.shape(image)[1], tf.float32)
|
538 |
+
|
539 |
+
if boxes is not None:
|
540 |
+
# Converts boxes from normalized coordinates to pixel coordinates.
|
541 |
+
# Now the coordinates of boxes are w.r.t. the original image.
|
542 |
+
boxes = denormalize_boxes(boxes, [height, width])
|
543 |
+
|
544 |
+
if boxes1 is not None:
|
545 |
+
boxes1 = denormalize_boxes(boxes1, [height, width])
|
546 |
+
|
547 |
+
if do_random_scale:
|
548 |
+
random_scale_factor = tf.random.uniform([], random_scale_min, random_scale_max)
|
549 |
+
if not shrink_both_sides:
|
550 |
+
# Max random is where scale * W > W_desired
|
551 |
+
# scale * H > H_desired
|
552 |
+
rsf_max = tf.maximum(desired_width_f / width, desired_height_f / height)
|
553 |
+
random_scale_factor = tf.minimum(rsf_max, random_scale_factor)
|
554 |
+
|
555 |
+
scaled_y = tf.cast(random_scale_factor * desired_height_f, tf.int32)
|
556 |
+
scaled_x = tf.cast(random_scale_factor * desired_width_f, tf.int32)
|
557 |
+
|
558 |
+
# Recompute the accurate scale_factor using rounded scaled image size.
|
559 |
+
image_scale_y = tf.cast(scaled_y, tf.float32) / height
|
560 |
+
image_scale_x = tf.cast(scaled_x, tf.float32) / width
|
561 |
+
|
562 |
+
image_scale = tf.cond(tf.less(
|
563 |
+
tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32),
|
564 |
+
tf.cast(random_scale_ratio, tf.float32)),
|
565 |
+
lambda: tf.maximum(image_scale_x, image_scale_y),
|
566 |
+
lambda: tf.minimum(image_scale_x, image_scale_y))
|
567 |
+
|
568 |
+
# image_scale = tf.minimum(image_scale_x, image_scale_y)
|
569 |
+
|
570 |
+
# Conceptual Captions has some REALLY WIDE images, I believe;
|
571 |
+
# this ensures that we won't scale any side below 64 pixels
|
572 |
+
image_scale = tf.maximum(image_scale, 64.0 / tf.minimum(height, width))
|
573 |
+
|
574 |
+
# Select non-zero random offset (x, y) if scaled image is larger than
|
575 |
+
# self._output_size.
|
576 |
+
scaled_height = tf.cast(height * image_scale, tf.int32)
|
577 |
+
scaled_width = tf.cast(width * image_scale, tf.int32)
|
578 |
+
offset_y = tf.cast(scaled_height - desired_height, tf.float32)
|
579 |
+
offset_x = tf.cast(scaled_width - desired_width, tf.float32)
|
580 |
+
offset_y = tf.maximum(0.0, offset_y) * tf.random.uniform([], 0, 1)
|
581 |
+
offset_x = tf.maximum(0.0, offset_x) * tf.random.uniform([], 0, 1)
|
582 |
+
offset_y = tf.cast(offset_y, tf.int32)
|
583 |
+
offset_x = tf.cast(offset_x, tf.int32)
|
584 |
+
else:
|
585 |
+
image_scale_y = desired_height_f / height
|
586 |
+
image_scale_x = desired_width_f / width
|
587 |
+
image_scale = tf.minimum(image_scale_x, image_scale_y)
|
588 |
+
scaled_height = tf.cast(height * image_scale, tf.int32)
|
589 |
+
scaled_width = tf.cast(width * image_scale, tf.int32)
|
590 |
+
offset_y = tf.constant(0)
|
591 |
+
offset_x = tf.constant(0)
|
592 |
+
|
593 |
+
# Now resize and crop
|
594 |
+
if resize_method == 'random' and do_random_scale:
|
595 |
+
resize_methods = sorted([k for k in tf.image.ResizeMethod.__dict__.keys() if k.isupper()])
|
596 |
+
image = apply_with_random_selector(
|
597 |
+
image,
|
598 |
+
lambda x, method_idx: tf.image.resize(x, [scaled_height, scaled_width],
|
599 |
+
tf.image.ResizeMethod.__dict__[resize_methods[method_idx]],
|
600 |
+
antialias=True),
|
601 |
+
num_cases=len(resize_methods))
|
602 |
+
|
603 |
+
elif resize_method != 'random':
|
604 |
+
image = tf.image.resize(image, [scaled_height, scaled_width], method=resize_method, antialias=True)
|
605 |
+
else:
|
606 |
+
image = tf.image.resize(image, [scaled_height, scaled_width],
|
607 |
+
method=tf.image.ResizeMethod.BILINEAR, antialias=True)
|
608 |
+
|
609 |
+
image = tf.clip_by_value(image, 0.0, 1.0)
|
610 |
+
|
611 |
+
# H x W x C
|
612 |
+
image = image[offset_y:offset_y + desired_height, offset_x:offset_x + desired_width, :]
|
613 |
+
|
614 |
+
H = tf.shape(image)[0]
|
615 |
+
W = tf.shape(image)[1]
|
616 |
+
|
617 |
+
top_pad = (desired_height - H) // 2
|
618 |
+
left_pad = (desired_width - W) // 2
|
619 |
+
|
620 |
+
image_mask = pad_to_bounding_box(
|
621 |
+
tf.ones_like(image, dtype=tf.bool), top_pad, left_pad, desired_height, desired_width)[:,:,0]
|
622 |
+
|
623 |
+
image = pad_to_bounding_box(image, top_pad, left_pad, desired_height, desired_width, value=pad_value)
|
624 |
+
|
625 |
+
if isinstance(desired_height, int) and isinstance(desired_width, int):
|
626 |
+
image.set_shape([desired_height, desired_width, 3])
|
627 |
+
|
628 |
+
if masks is not None and tf.size(masks) != 0:
|
629 |
+
masks = tf.image.resize(masks, [scaled_height, scaled_width],
|
630 |
+
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
|
631 |
+
|
632 |
+
if len(masks.shape) == 3:
|
633 |
+
masks = masks[offset_y:offset_y + desired_height, offset_x:offset_x + desired_width]
|
634 |
+
else:
|
635 |
+
masks = masks[:, offset_y:offset_y + desired_height, offset_x:offset_x + desired_width]
|
636 |
+
|
637 |
+
masks = pad_to_bounding_box(masks, top_pad, left_pad, desired_height, desired_width)
|
638 |
+
masks = tf.image.resize(masks, desired_target_size,
|
639 |
+
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
|
640 |
+
|
641 |
+
indices = None
|
642 |
+
if boxes is not None:
|
643 |
+
# NOTE: the box shifting below has not been fully tested yet.
|
644 |
+
boxes = resize_and_crop_boxes(
|
645 |
+
boxes,
|
646 |
+
tf.stack([image_scale, image_scale]),
|
647 |
+
[desired_height, desired_width],
|
648 |
+
tf.cast(tf.stack([offset_y, offset_x]), dtype=tf.float32),
|
649 |
+
tf.cast(tf.stack([top_pad, left_pad]), dtype=tf.float32))
|
650 |
+
|
651 |
+
if filter_box:
|
652 |
+
indices = get_non_empty_box_indices(boxes)
|
653 |
+
else:
|
654 |
+
indices = tf.range(tf.shape(boxes)[0])
|
655 |
+
boxes = tf.gather(boxes, indices)
|
656 |
+
|
657 |
+
if labels is not None:
|
658 |
+
labels = tf.gather(labels, indices)
|
659 |
+
|
660 |
+
if boxes1 is not None:
|
661 |
+
boxes1 = resize_and_crop_boxes(
|
662 |
+
boxes1,
|
663 |
+
tf.stack([image_scale, image_scale]),
|
664 |
+
[desired_height, desired_width],
|
665 |
+
tf.cast(tf.stack([offset_y, offset_x]), dtype=tf.float32),
|
666 |
+
tf.cast(tf.stack([top_pad, left_pad]), dtype=tf.float32))
|
667 |
+
|
668 |
+
image_info = tf.stack([
|
669 |
+
tf.cast(top_pad, tf.float32),
|
670 |
+
tf.cast(left_pad, tf.float32),
|
671 |
+
1.0 / image_scale,
|
672 |
+
height,
|
673 |
+
width,
|
674 |
+
tf.cast(offset_y, dtype=tf.float32) / height,
|
675 |
+
tf.cast(offset_x, dtype=tf.float32) / width,
|
676 |
+
tf.cast(offset_y, dtype=tf.float32),
|
677 |
+
tf.cast(offset_x, dtype=tf.float32),
|
678 |
+
tf.cast(scaled_height, dtype=tf.float32),
|
679 |
+
tf.cast(scaled_width, dtype=tf.float32),
|
680 |
+
])
|
681 |
+
|
682 |
+
if boxes1 is not None:
|
683 |
+
outputs = (image_info, masks, boxes, labels, indices, boxes1)
|
684 |
+
else:
|
685 |
+
outputs = (image_info, masks, boxes, labels, indices)
|
686 |
+
|
687 |
+
if normalize:
|
688 |
+
image = normalize_image(image)
|
689 |
+
|
690 |
+
if return_outputs:
|
691 |
+
return image, image_mask, outputs
|
692 |
+
else:
|
693 |
+
return image, image_mask
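# --- Illustrative sketch (editor's example, not part of the committed file) ---
# Typical inference-style call: a float image in [0, 1] is scaled to fit 336x336,
# centre-padded, and normalized; the bool mask marks non-padding pixels.
import tensorflow as tf
example_img = tf.random.uniform([480, 640, 3])  # float32 in [0, 1)
example_out, example_mask = resize_and_pad(example_img, (336, 336), return_outputs=False)
# example_out: (336, 336, 3) float32, example_mask: (336, 336) bool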
|
694 |
+
|
695 |
+
|
696 |
+
def _remove_bars_from_frames(frames, black_bar=True, threshold=32, max_perc_to_trim=0.3):
|
697 |
+
"""
|
698 |
+
:param frames: [num_frames, height, width, 3]
|
699 |
+
:param threshold: Pixels must be at least this intense for us to not trim
|
700 |
+
:param max_perc_to_trim: Will trim at most this fraction of the image in each dimension (0.3 by default)
|
701 |
+
:return:
|
702 |
+
"""
|
703 |
+
# Detect black bars####################
|
704 |
+
frames_shape = tf.shape(frames)
|
705 |
+
h, w = frames_shape[1], frames_shape[2]
|
706 |
+
if black_bar:
|
707 |
+
has_content = tf.reduce_max(frames, axis=(0, -1)) >= threshold
|
708 |
+
else:
|
709 |
+
has_content = tf.reduce_min(frames, axis=(0, -1)) <= threshold
|
710 |
+
|
711 |
+
y_frames = tf.cast(tf.reshape(tf.where(tf.reduce_any(has_content, axis=1)), [-1]), tf.int32)
|
712 |
+
nhbars = tf.shape(y_frames)[0]
|
713 |
+
y_frames = tf.cond(nhbars > 0, lambda: y_frames, lambda: tf.expand_dims(tf.cast(h // 2, tf.int32), axis=0))
|
714 |
+
|
715 |
+
y1 = tf.minimum(y_frames[0], tf.cast(tf.cast(h, tf.float32) * max_perc_to_trim, tf.int32))
|
716 |
+
y2 = tf.maximum(y_frames[-1] + 1, tf.cast(tf.cast(h, tf.float32) * (1 - max_perc_to_trim), tf.int32))
|
717 |
+
|
718 |
+
x_frames = tf.cast(tf.reshape(tf.where(tf.reduce_any(has_content, axis=0)), [-1]), tf.int32)
|
719 |
+
nvbars = tf.shape(x_frames)[0]
|
720 |
+
x_frames = tf.cond(nvbars > 0, lambda: x_frames, lambda: tf.expand_dims(tf.cast(w // 2, tf.int32), axis=0))
|
721 |
+
|
722 |
+
x1 = tf.minimum(x_frames[0], tf.cast(tf.cast(w, tf.float32) * max_perc_to_trim, tf.int32))
|
723 |
+
x2 = tf.maximum(x_frames[-1] + 1, tf.cast(tf.cast(w, tf.float32) * (1 - max_perc_to_trim), tf.int32))
|
724 |
+
|
725 |
+
frames = frames[:, y1:y2, x1:x2]
|
726 |
+
return frames
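# --- Illustrative sketch (editor's example, not part of the committed file) ---
# Letterboxed frames (dark rows/columns shared by every frame) are trimmed away,
# but never by more than max_perc_to_trim of each dimension; values are made up.
import tensorflow as tf
example_frames = tf.random.uniform([8, 360, 640, 3], maxval=256, dtype=tf.int32)
example_trimmed = _remove_bars_from_frames(example_frames, black_bar=True, threshold=32)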
|
727 |
+
|
728 |
+
def convert_video_dtype(video, dtype):
|
729 |
+
"""
|
730 |
+
Converts tensor to dtype and scales the values.
|
731 |
+
Video equivalent of tf.convert_image_dtype: https://www.tensorflow.org/api_docs/python/tf/image/convert_image_dtype
|
732 |
+
"""
|
733 |
+
return tf.map_fn(
|
734 |
+
fn=functools.partial(
|
735 |
+
tf.image.convert_image_dtype,
|
736 |
+
dtype=dtype),
|
737 |
+
elems=video,
|
738 |
+
fn_output_signature=dtype)
|
739 |
+
|
740 |
+
|
741 |
+
def stateless_shuffle(x: tf.Tensor, seed):
|
742 |
+
if hasattr(tf.random.experimental, 'stateless_shuffle'):
|
743 |
+
return tf.random.experimental.stateless_shuffle(x, seed=seed)
|
744 |
+
else:
|
745 |
+
vals = tf.random.stateless_uniform(tf.shape(x)[:1], seed)
|
746 |
+
ixs = tf.argsort(vals)
|
747 |
+
return tf.gather(x, ixs)
|
748 |
+
|
749 |
+
|
750 |
+
def stateless_permutation(n: int, seed):
|
751 |
+
if hasattr(tf.random.experimental, 'stateless_shuffle'):
|
752 |
+
ix = tf.range(0, n, dtype=tf.int32)
|
753 |
+
return tf.random.experimental.stateless_shuffle(ix, seed=seed)
|
754 |
+
else:
|
755 |
+
vals = tf.random.stateless_uniform([n], seed)
|
756 |
+
return tf.argsort(vals)
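# --- Illustrative sketch (editor's example, not part of the committed file) ---
# With the same seed, permuting indices and shuffling tf.range(n) agree (at least
# when tf.random.experimental.stateless_shuffle is available in the installed TF).
import tensorflow as tf
example_seed = tf.constant([3, 7], dtype=tf.int32)
example_perm = stateless_permutation(5, example_seed)
example_shuf = stateless_shuffle(tf.range(5, dtype=tf.int32), example_seed)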
|
757 |
+
|
758 |
+
|
759 |
+
@seqio.map_over_dataset
|
760 |
+
def _strip_metadata(example):
|
761 |
+
return pop_metadata(example)[0]
|
762 |
+
|
763 |
+
|
764 |
+
def sample_patches(mask, n_patches, stateless=False, seeds=None):
|
765 |
+
input_sample_valid = tf.boolean_mask(tf.range(tf.shape(mask)[0]), mask)
|
766 |
+
input_sample_masked = tf.boolean_mask(tf.range(tf.shape(mask)[0]), mask == 0)
|
767 |
+
if stateless:
|
768 |
+
encoder_pos_ids = tf.concat([
|
769 |
+
stateless_shuffle(input_sample_valid, seeds[0]),
|
770 |
+
stateless_shuffle(input_sample_masked, seeds[1])], axis=0)[:n_patches]
|
771 |
+
else:
|
772 |
+
encoder_pos_ids = tf.concat([
|
773 |
+
tf.random.shuffle(input_sample_valid),
|
774 |
+
tf.random.shuffle(input_sample_masked)], axis=0)[:n_patches]
|
775 |
+
encoder_pos_ids = tf.reshape(encoder_pos_ids, (n_patches,))
|
776 |
+
encoder_pos_ids = tf.cast(encoder_pos_ids, tf.int32)
|
777 |
+
return encoder_pos_ids
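# --- Illustrative sketch (editor's example, not part of the committed file) ---
# Draws 4 patch positions, preferring positions where the mask is True and only
# falling back to masked positions if there are not enough valid ones.
import tensorflow as tf
example_mask = tf.constant([True, True, False, True, False, True])
example_pos = sample_patches(example_mask, n_patches=4)  # 4 indices, valid ones first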
|
778 |
+
|
779 |
+
|
780 |
+
@gin.configurable()
|
781 |
+
def normalize_image(image,
|
782 |
+
offset=(0.48145466, 0.4578275, 0.40821073),
|
783 |
+
scale=(0.26862954, 0.26130258, 0.27577711)):
|
784 |
+
"""Normalizes the image to zero mean and unit variance."""
|
785 |
+
offset = tf.constant(offset)
|
786 |
+
offset = tf.expand_dims(offset, axis=0)
|
787 |
+
offset = tf.expand_dims(offset, axis=0)
|
788 |
+
image -= tf.cast(offset, image.dtype)
|
789 |
+
|
790 |
+
scale = tf.constant(scale)
|
791 |
+
scale = tf.expand_dims(scale, axis=0)
|
792 |
+
scale = tf.expand_dims(scale, axis=0)
|
793 |
+
image /= tf.cast(scale, image.dtype)
|
794 |
+
return image
|
795 |
+
|
796 |
+
|
797 |
+
def unnormalize_image(image,
|
798 |
+
offset=(0.48145466, 0.4578275, 0.40821073),
|
799 |
+
scale=(0.26862954, 0.26130258, 0.27577711)):
|
800 |
+
"""Normalizes the image to zero mean and unit variance."""
|
801 |
+
scale = tf.cast(tf.expand_dims(tf.expand_dims(tf.constant(scale), axis=0), axis=0), image.dtype)
|
802 |
+
image *= scale
|
803 |
+
|
804 |
+
offset = tf.cast(tf.expand_dims(tf.expand_dims(tf.constant(offset), axis=0), axis=0), image.dtype)
|
805 |
+
image += offset
|
806 |
+
return image
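# --- Illustrative sketch (editor's example, not part of the committed file) ---
# normalize_image / unnormalize_image are inverses (CLIP mean/std by default),
# so a round trip recovers the input up to floating point error.
import tensorflow as tf
example_img = tf.random.uniform([4, 4, 3])
example_roundtrip = unnormalize_image(normalize_image(example_img))
# tf.reduce_max(tf.abs(example_roundtrip - example_img)) is ~1e-7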
|
807 |
+
|
808 |
+
|
809 |
+
def flatten_parts(ds: tf.data.Dataset, parts: List[str], add_index=False, dataset_size=None) -> tf.data.Dataset:
|
810 |
+
def _flatten(ex):
|
811 |
+
flat_key = {k: ex[k] for k in parts}
|
812 |
+
if add_index:
|
813 |
+
flat_key['index'] = tf.range(len(ex[parts[0]]))
|
814 |
+
|
815 |
+
flat_ds = tf.data.Dataset.from_tensor_slices(flat_key)
|
816 |
+
|
817 |
+
def _merge(_flat_ex):
|
818 |
+
for k, v in ex.items():
|
819 |
+
if k not in parts:
|
820 |
+
_flat_ex[k] = v
|
821 |
+
return _flat_ex
|
822 |
+
return flat_ds.map(_merge)
|
823 |
+
|
824 |
+
ds = ds.flat_map(_flatten)
|
825 |
+
if dataset_size is not None:
|
826 |
+
ds = tf.data.experimental.assert_cardinality(dataset_size)(ds)
|
827 |
+
return ds
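# --- Illustrative sketch (editor's example, not part of the committed file) ---
# Each example with a list-valued field is expanded into one example per entry,
# copying the remaining (scalar) fields; the data below is made up.
import tensorflow as tf
example_ds = tf.data.Dataset.from_tensors({"image_id": 7, "question": ["a", "b", "c"]})
example_flat = flatten_parts(example_ds, ["question"], add_index=True)
# yields {"image_id": 7, "question": b"a", "index": 0}, ... (3 examples)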
|
dataset_sizes.py
ADDED
@@ -0,0 +1,262 @@
1 |
+
DATASET_SIZES = {
|
2 |
+
("cockatoo_qa_v2", "train"): 194820,
|
3 |
+
("user_qa", "train"): 71172,
|
4 |
+
|
5 |
+
("text_vqa", "train"): 34602,
|
6 |
+
("chart_qa", "train"): 28299,
|
7 |
+
("chart_qa_prompting", "train"): 28299,
|
8 |
+
("chart_qa_weighted", "train"): 28299,
|
9 |
+
("tally_qa", "train"): 132981,
|
10 |
+
("doc_qa", "train"): 39463,
|
11 |
+
("info_qa", "train"): 23946,
|
12 |
+
("okvqa", "train"): 9009,
|
13 |
+
("gqa", "train"): 943000,
|
14 |
+
("gqa_multi", "train"): 72140,
|
15 |
+
("coco_2014_vqa", "train"): 443757, # (82783, 443757)
|
16 |
+
("coco_captioning_karpathy", "train"): 414113, # (82783, 414113)
|
17 |
+
("coco_captioning_karpathy_multi", "train"): 82783,
|
18 |
+
("coco_2014_vqa_multi", "train"): 82783,
|
19 |
+
("science_qa_img", "train"): 6218,
|
20 |
+
("ai2_diagram", "train"): 11389,
|
21 |
+
("a_okvqa_mc", "train"): 17056,
|
22 |
+
("a_okvqa_da", "train"): 17056,
|
23 |
+
("ocr_vqa", "train"): 166043,
|
24 |
+
("st_qa", "train"): 25050,
|
25 |
+
("ocr_qa", "train"): 166043,
|
26 |
+
|
27 |
+
("dv_qa", "train"): 200000,
|
28 |
+
("tabwmp_da", "train"): 23059,
|
29 |
+
("figure_qa", "train"): 100000,
|
30 |
+
("figure_qa_zero_shot", "train"): 100000,
|
31 |
+
("plot_qa", "train"): 157070,
|
32 |
+
('clocks', 'train'): 800269,
|
33 |
+
('clocks', 'validation'): 25600,
|
34 |
+
|
35 |
+
("st_qa", "test"): 4070,
|
36 |
+
('text_vqa', "test"): 5734,
|
37 |
+
('okvqa', "test"): 5046,
|
38 |
+
('chart_qa', "test"): 1250,
|
39 |
+
('doc_qa', "test"): 5188,
|
40 |
+
('info_qa', "test"): 3288,
|
41 |
+
('gqa', "test"): 95336,
|
42 |
+
('coco_captioning_karpathy', "test"): 25010,
|
43 |
+
("science_qa_img", "test"): 2017,
|
44 |
+
("ai2_diagram", "test"): 3088,
|
45 |
+
("a_okvqa_mc_eval", "test"): 6702,
|
46 |
+
("a_okvqa_da_eval", "test"): 6109,
|
47 |
+
|
48 |
+
("ai2_diagram_v2", "train"): 10950,
|
49 |
+
("ai2_diagram_v2", "validation"): 1463,
|
50 |
+
("ai2_diagram_v2", "test"): 3088,
|
51 |
+
("vqa_v2_test", "test2015"): 555187,
|
52 |
+
|
53 |
+
("ai2_diagram_v2_transparent", "train"): 10950,
|
54 |
+
("ai2_diagram_v2_transparent", "validation"): 1463,
|
55 |
+
("ai2_diagram_v2_transparent", "test"): 3088,
|
56 |
+
|
57 |
+
# splits in mix_data include both transparent + opaque boxes
|
58 |
+
("ai2_diagram_v2_mix_transparent", "train"): 15042,
|
59 |
+
("ai2_diagram_v2_mix_transparent", "validation"): 1980,
|
60 |
+
("ai2_diagram_v2_mix_transparent", "test"): 4272,
|
61 |
+
|
62 |
+
# vaia_qa
|
63 |
+
('vaia_qa', 'train'): 477052,
|
64 |
+
('vaia_qa', 'validation'): 1024,
|
65 |
+
|
66 |
+
('vaia_qa_latex_image', 'train'): 477052,
|
67 |
+
('vaia_qa_latex_image', 'validation'): 1024,
|
68 |
+
('vaia_qa_latex_image_only', 'train'): 42605,
|
69 |
+
('vaia_qa_latex_image_only', 'validation'): 1024,
|
70 |
+
('vaia_qa_latex_all_image_only', 'train'): 154266,
|
71 |
+
('vaia_qa_latex_all_image_only', 'validation'): 1024,
|
72 |
+
|
73 |
+
("vaia_qa_latex_image_math_subset_short_answer", 'train'): 198161,
|
74 |
+
("vaia_qa_latex_image_math_subset_short_answer", 'validation'): 419,
|
75 |
+
("vaia_qa_latex_image_math_subset_mc_only_short_answer", "train"): 57568,
|
76 |
+
("vaia_qa_latex_image_math_subset_mc_only_short_answer", "validation"): 118,
|
77 |
+
("vaia_qa_latex_image_math_subset_mc_only_short_answer_first", "train"): 57568,
|
78 |
+
("vaia_qa_latex_image_math_subset_mc_only_short_answer_first", "validation"): 118,
|
79 |
+
|
80 |
+
("vaia_qa_latex_image_all_image_only_short_answer", "train"): 86752,
|
81 |
+
("vaia_qa_latex_image_all_image_only_short_answer", "validation"): 92,
|
82 |
+
("vaia_qa_latex_image_all_image_only_short_answer_first", "train"): 86752,
|
83 |
+
("vaia_qa_latex_image_all_image_only_short_answer_first", "validation"): 92,
|
84 |
+
("vaia_qa_latex_image_math_subset_image_only_short_answer", "train"): 21726,
|
85 |
+
("vaia_qa_latex_image_math_subset_image_only_short_answer", "validation"): 48,
|
86 |
+
|
87 |
+
('vqa_online', 'train'): 62722,
|
88 |
+
('vqa_online', 'validation'): 1024,
|
89 |
+
('vqa_online', 'test'): 1024,
|
90 |
+
|
91 |
+
('vqa_online_gpt_longQ_longA', 'train'): 62722,
|
92 |
+
('vqa_online_gpt_longQ_longA', 'validation'): 1024,
|
93 |
+
('vqa_online_gpt_longQ_longA', 'test'): 1024,
|
94 |
+
|
95 |
+
("tally_qa", "validation"): 38589,
|
96 |
+
('text_vqa', "validation"): 5000,
|
97 |
+
('okvqa', "validation"): 5046,
|
98 |
+
('chart_qa', "validation"): 960*2,
|
99 |
+
('chart_qa_prompting_explanation', "validation"): 960*2,
|
100 |
+
('chart_qa_ex', "validation"): 960*2,
|
101 |
+
('chart_qa_human', "validation"): 960,
|
102 |
+
('chart_qa_aug', "validation"): 960,
|
103 |
+
('doc_qa', "validation"): 5349,
|
104 |
+
('info_qa', "validation"): 2801,
|
105 |
+
('coco_2014_vqa', "validation"): 214354, # 40504 images
|
106 |
+
('coco_2014_vqa_multi', "validation"): 214354,
|
107 |
+
('coco_captioning_karpathy', "validation"): 25010,
|
108 |
+
('gqa', "validation"): 132062,
|
109 |
+
("science_qa_img", "validation"): 2097,
|
110 |
+
("ai2_diagram", "validation"): 1024,
|
111 |
+
("a_okvqa_mc", "validation"): 1145,
|
112 |
+
("a_okvqa_da", "validation"): 1075,
|
113 |
+
("charxiv_descriptive", "validation"): 1000,
|
114 |
+
("charxiv_descriptive", "test"): 1320,
|
115 |
+
("charxiv_reasoning", "validation"): 1000,
|
116 |
+
("charxiv_reasoning", "test"): 1320,
|
117 |
+
("fintabnetqa", "validation"): 125,
|
118 |
+
("fintabnetqa", "test"): 250,
|
119 |
+
("vwtq", "validation"): 125,
|
120 |
+
("vwtq", "test"): 750,
|
121 |
+
("vwtq_syn", "validation"): 125,
|
122 |
+
("vwtq_syn", "test"): 250,
|
123 |
+
("vtabfact", "validation"): 125,
|
124 |
+
("vtabfact", "test"): 250,
|
125 |
+
("nutrition_fact", "validation"): 100,
|
126 |
+
("nutrition_fact", "test"): 100,
|
127 |
+
|
128 |
+
("mmmu_test", "validation"): 900,
|
129 |
+
("count_bench", "test"): 500,
|
130 |
+
("mmmu_test", "test"): 10500,
|
131 |
+
("real_world_qa_test", "test"): 765,
|
132 |
+
("real_world_qa_no_instruction", "test"): 765,
|
133 |
+
("real_world_qa_dbg", "test"): 765,
|
134 |
+
("real_world_qa_as_user_qa", "test"): 765,
|
135 |
+
|
136 |
+
("seed_bench_test", "test"): 19241,
|
137 |
+
("pope_test", "test"): 9000,
|
138 |
+
("mme_test", "test"): 2374,
|
139 |
+
("math_vista_test", "validation"): 1000,
|
140 |
+
("math_vista_demo", "validation"): 1000,
|
141 |
+
("math_vista_v2", "validation"): 1000,
|
142 |
+
|
143 |
+
("math_vista_test", "test"): 5141,
|
144 |
+
("mmbench_test", "validation"): 4329,
|
145 |
+
("mmbench_test", "test"): 6666,
|
146 |
+
("sugar_crepe_test", "test"): 15022,
|
147 |
+
("blink_test", "validation"): 1901,
|
148 |
+
("dense_caption_eval_dbg", "validation"): 1,
|
149 |
+
|
150 |
+
("refclef_unc", "train"): 17978,
|
151 |
+
("refclef_unc", "validation"): 12029,
|
152 |
+
("refcoco_unc", "train"): 16994,
|
153 |
+
("refcoco_unc", "validation"): 10834,
|
154 |
+
("refcocoplus_unc", "train"): 16992,
|
155 |
+
("refcocoplus_unc", "validation"): 10758,
|
156 |
+
("refcocog_umd", "train"): 21899,
|
157 |
+
("refcocog_umd", "validation"): 4896,
|
158 |
+
("refclef_unc", "testA"): 3449,
|
159 |
+
("refclef_unc", "testB"): 3221,
|
160 |
+
("refclef_unc", "testC"): 2664,
|
161 |
+
("refclef_unc", "testAB"): 116,
|
162 |
+
("refclef_unc", "testBC"): 86,
|
163 |
+
("refcoco_unc", "testA"): 5657,
|
164 |
+
("refcoco_unc", "testB"): 5095,
|
165 |
+
("refcocoplus_unc", "testA"): 5726,
|
166 |
+
("refcocoplus_unc", "testB"): 4889,
|
167 |
+
("refcocog_umd", "test"): 9602,
|
168 |
+
("countbench_qa_point_count", "huggingface"): 490,
|
169 |
+
('countbench_qa', 'huggingface'): 490,
|
170 |
+
|
171 |
+
('cockatoo_712k_sept6', 'train'): 712121,
|
172 |
+
('cockatoo_712k_sept6', 'validation'): 5120,
|
173 |
+
('user_qa', 'train'): 71172,
|
174 |
+
('user_qa', 'validation'): 2048,
|
175 |
+
|
176 |
+
# pointing
|
177 |
+
("pointing_test", "test"): 436,
|
178 |
+
|
179 |
+
("fast_flickr_count_qa_point_count", "train"): 36916,
|
180 |
+
("fast_flickr_count_qa_point_count", "validation"): 163,
|
181 |
+
("fast_flickr_count_qa_point_count", "test"): 540,
|
182 |
+
("fast_flickr_count_qa_pointing", "train"): 36916,
|
183 |
+
("fast_flickr_count_qa_pointing", "validation"): 163,
|
184 |
+
("fast_flickr_count_qa_pointing", "test"): 540,
|
185 |
+
('pointing', 'train'): 309216,
|
186 |
+
('point_count', 'train'): 309216,
|
187 |
+
('pointing', 'validation'): 2054,
|
188 |
+
('point_count', 'validation'): 2054,
|
189 |
+
('point_count_high_freq', 'train'): 113840,
|
190 |
+
('point_count_high_freq', 'validation'): 3969,
|
191 |
+
('pointing_high_freq', 'train'): 113840,
|
192 |
+
('pointing_high_freq', 'validation'): 3969,
|
193 |
+
('point_qa', 'train'): 27856,
|
194 |
+
('point_qa', 'validation'): 978,
|
195 |
+
("a_okvqa_da", "test"): 6109,
|
196 |
+
("a_okvqa_mc", "test"): 6702,
|
197 |
+
("user_questions_for_elo", "test"): 14851,
|
198 |
+
("user_questions_for_elo_long", "test"): 1368,
|
199 |
+
("user_questions_for_elo_9_to_12", "test"): 3000,
|
200 |
+
|
201 |
+
("sim_point_count_qa", "train"): 522611,
|
202 |
+
("sim_point_count_qa", "validation"): 800,
|
203 |
+
("sim_point_count_qa", "test"): 800,
|
204 |
+
("sim_count_qa", "train"): 522611,
|
205 |
+
("sim_count_qa", "validation"): 800,
|
206 |
+
("sim_count_qa", "test"): 800,
|
207 |
+
|
208 |
+
("scifi_charts_qa", "validation"): 1024,
|
209 |
+
("scifi_table_qa", "validation"): 1024,
|
210 |
+
("scifi_natural_qa", "validation"): 128,
|
211 |
+
("scifi_nutrition_qa", "validation"): 128,
|
212 |
+
("scifi_document_qa", "validation"): 1024,
|
213 |
+
("scifi_diagram_qa", "validation"): 1024,
|
214 |
+
("scifi_charts_qa", "train"): 233622,
|
215 |
+
("scifi_table_qa", "train"): 93036,
|
216 |
+
("scifi_document_qa", "train"): 142559,
|
217 |
+
("scifi_diagram_qa", "train"): 33102,
|
218 |
+
|
219 |
+
("scifi_charts_qa_split", "train"): 116814,
|
220 |
+
("scifi_table_qa_split", "train"): 46518,
|
221 |
+
("scifi_document_qa_split", "train"): 71282,
|
222 |
+
("scifi_diagram_qa_split", "train"): 16551,
|
223 |
+
|
224 |
+
("scifi_charts_qa_exp_split", "train"): 116814,
|
225 |
+
("scifi_table_qa_exp_split", "train"): 46518,
|
226 |
+
("scifi_document_qa_exp_split", "train"): 71282,
|
227 |
+
("scifi_diagram_qa_exp_split", "train"): 16551,
|
228 |
+
|
229 |
+
("android_control", "train"): 74714,
|
230 |
+
("android_control", "validation"): 690,
|
231 |
+
("android_control", "test"): 3897,
|
232 |
+
|
233 |
+
("synthetic_qa_v3_multi_turn", "train"): 9824,
|
234 |
+
("synthetic_qa_v3", "train"): 162855,
|
235 |
+
("synthetic_qa_v3_style_tag", "train"): 162855,
|
236 |
+
("synthetic_qa_v3_as_user_qa", "train"): 162855,
|
237 |
+
}
|
238 |
+
|
239 |
+
|
240 |
+
for (name, split), count in list(DATASET_SIZES.items()):
|
241 |
+
if name in ["chart_qa"]:
|
242 |
+
DATASET_SIZES[(name + "_scifi", split)] = count
|
243 |
+
if name in ["android_control"]:
|
244 |
+
for k in ["ll", "hl", "hl_ll", "hl_cot"]:
|
245 |
+
DATASET_SIZES[(f"{name}_{k}", split)] = count
|
246 |
+
if name in ["scifi_charts_qa" ,"scifi_table_qa", "scifi_document_qa", "scifi_diagram_qa", "scifi_datikz_qa"]:
|
247 |
+
DATASET_SIZES[(name + "_exp", split)] = count
|
248 |
+
DATASET_SIZES[(name[:-3] + "_exp", split)] = count
|
249 |
+
DATASET_SIZES[(name[:-3] + "_demo", split)] = count
|
250 |
+
if name in ["ai2_diagram_v2_mix_transparent"]:
|
251 |
+
DATASET_SIZES[("ai2_diagram_v2_mix_transparent_one_style", split)] = count
|
252 |
+
if name in ["chart_qa", "info_qa", "doc_qa", "text_vqa", "coco_2014_vqa",
|
253 |
+
"ai2_diagram_v2_mix_transparent", "countbench_qa", "chart_qa_human"]:
|
254 |
+
DATASET_SIZES[(name + "_demo", split)] = count
|
255 |
+
|
256 |
+
|
257 |
+
def get_dataset_size(name, split):
|
258 |
+
if name.endswith("_eval"):
|
259 |
+
if (name, split) in DATASET_SIZES:
|
260 |
+
return DATASET_SIZES[(name, split)]
|
261 |
+
name = name[:-len('_eval')]
|
262 |
+
return DATASET_SIZES[(name, split)]
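# Illustrative sketch (editor's example, not part of the committed file): task names
# ending in "_eval" fall back to the size registered for the base task.
assert get_dataset_size("chart_qa_eval", "test") == DATASET_SIZES[("chart_qa", "test")]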
|
exceptions.py
ADDED
@@ -0,0 +1,50 @@
1 |
+
__all__ = [
|
2 |
+
"OLMoError",
|
3 |
+
"OLMoConfigurationError",
|
4 |
+
"OLMoCliError",
|
5 |
+
"OLMoEnvironmentError",
|
6 |
+
"OLMoNetworkError",
|
7 |
+
"OLMoCheckpointError",
|
8 |
+
]
|
9 |
+
|
10 |
+
|
11 |
+
class OLMoError(Exception):
|
12 |
+
"""
|
13 |
+
Base class for all custom OLMo exceptions.
|
14 |
+
"""
|
15 |
+
|
16 |
+
|
17 |
+
class OLMoConfigurationError(OLMoError):
|
18 |
+
"""
|
19 |
+
An error with a configuration file.
|
20 |
+
"""
|
21 |
+
|
22 |
+
|
23 |
+
class OLMoCliError(OLMoError):
|
24 |
+
"""
|
25 |
+
An error from incorrect CLI usage.
|
26 |
+
"""
|
27 |
+
|
28 |
+
|
29 |
+
class OLMoEnvironmentError(OLMoError):
|
30 |
+
"""
|
31 |
+
An error from incorrect environment variables.
|
32 |
+
"""
|
33 |
+
|
34 |
+
|
35 |
+
class OLMoNetworkError(OLMoError):
|
36 |
+
"""
|
37 |
+
An error with a network request.
|
38 |
+
"""
|
39 |
+
|
40 |
+
|
41 |
+
class OLMoCheckpointError(OLMoError):
|
42 |
+
"""
|
43 |
+
An error occurred reading or writing from a checkpoint.
|
44 |
+
"""
|
45 |
+
|
46 |
+
|
47 |
+
class OLMoThreadError(Exception):
|
48 |
+
"""
|
49 |
+
Raised when a thread fails.
|
50 |
+
"""
|
iterable_dataset.py
ADDED
@@ -0,0 +1,266 @@
1 |
+
import logging
|
2 |
+
import math
|
3 |
+
import multiprocessing
|
4 |
+
import os
|
5 |
+
import pickle
|
6 |
+
import queue
|
7 |
+
import socket
|
8 |
+
import time
|
9 |
+
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
10 |
+
from multiprocessing.managers import BaseManager
|
11 |
+
from multiprocessing.shared_memory import SharedMemory
|
12 |
+
from os.path import exists
|
13 |
+
from pathlib import Path
|
14 |
+
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
|
15 |
+
|
16 |
+
import psutil
|
17 |
+
import tensorflow as tf
|
18 |
+
import numpy as np
|
19 |
+
import torch
|
20 |
+
import torch.utils.data
|
21 |
+
import clu
|
22 |
+
from clu.data.dataset_iterator import Element
|
23 |
+
|
24 |
+
|
25 |
+
from .aliases import PathOrStr
|
26 |
+
from .torch_util import barrier, get_fs_local_rank, get_global_rank, get_world_size, get_node_rank, \
|
27 |
+
get_local_world_size, get_local_rank, move_to_device
|
28 |
+
from .util import roundrobin, threaded_generator
|
29 |
+
from .data_factory import SeqioDataset
|
30 |
+
from .multimodal_preprocessor import MultiModalPreprocessor
|
31 |
+
from .preprocesssors import rename
|
32 |
+
import torch.distributed as dist
|
33 |
+
from . import tasks
|
34 |
+
|
35 |
+
__all__ = ["MMIterableDataset"]
|
36 |
+
|
37 |
+
log = logging.getLogger(__name__)
|
38 |
+
|
39 |
+
|
40 |
+
def batch_fn(batch, for_inference):
|
41 |
+
if for_inference:
|
42 |
+
out = {}
|
43 |
+
for k, v in batch.items():
|
44 |
+
if k.startswith("metadata/"):
|
45 |
+
out[k] = v
|
46 |
+
else:
|
47 |
+
out[k] = torch.from_numpy(v)
|
48 |
+
return out
|
49 |
+
else:
|
50 |
+
out = {k: torch.from_numpy(v) for k, v in batch.items() if not k.startswith("metadata/")}
|
51 |
+
out["metadata"] = [{} for _ in out["input_ids"]]
|
52 |
+
return out
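# --- Illustrative sketch (editor's example, not part of the committed file) ---
# numpy arrays become torch tensors; "metadata/..." entries pass through as-is
# when for_inference=True. The key names below are made up.
import numpy as np
example_batch = {"input_ids": np.zeros((2, 8), dtype=np.int64), "metadata/ids": ["a", "b"]}
example_out = batch_fn(example_batch, for_inference=True)  # example_out["input_ids"] is a torch.Tensor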
|
53 |
+
|
54 |
+
|
55 |
+
class PyTorchDatasetIterator(clu.data.dataset_iterator.TfDatasetIterator):
|
56 |
+
def __init__(self, dataset, *, checkpoint: bool, for_inference: bool):
|
57 |
+
self.for_inference = for_inference
|
58 |
+
super().__init__(dataset, checkpoint=checkpoint)
|
59 |
+
|
60 |
+
def __next__(self) -> Element:
|
61 |
+
batch = {k: v.numpy() for k, v in next(self.iterator).items()}
|
62 |
+
return batch_fn(batch, self.for_inference)
|
63 |
+
|
64 |
+
def __len__(self) -> int:
|
65 |
+
return len(self._dataset)
|
66 |
+
|
67 |
+
|
68 |
+
class MMIterableDataset(torch.utils.data.IterableDataset[Dict[str, Any]]):
|
69 |
+
def __init__(
|
70 |
+
self,
|
71 |
+
dataset: SeqioDataset,
|
72 |
+
preprocessor: MultiModalPreprocessor,
|
73 |
+
world_size: Optional[int] = None,
|
74 |
+
rank: Optional[int] = None,
|
75 |
+
):
|
76 |
+
self.preprocessor = preprocessor
|
77 |
+
self.rank = rank if rank is not None else get_global_rank()
|
78 |
+
self.world_size = world_size if world_size is not None else get_world_size()
|
79 |
+
self.dataset_config = dataset
|
80 |
+
|
81 |
+
data_iter = dataset.build(
|
82 |
+
self.preprocessor,
|
83 |
+
self.rank,
|
84 |
+
self.world_size,
|
85 |
+
)
|
86 |
+
|
87 |
+
data_iter: tf.data.Dataset = rename(input_ids="input_tokens", labels="target_tokens")(data_iter)
|
88 |
+
self.dataset = data_iter
|
89 |
+
self.data_iter = PyTorchDatasetIterator(
|
90 |
+
data_iter, checkpoint=True, for_inference=dataset.for_inference)
|
91 |
+
|
92 |
+
def reset(self):
|
93 |
+
self.data_iter.reset()
|
94 |
+
|
95 |
+
def save(self, filename: PathOrStr):
|
96 |
+
self.data_iter.save(filename)
|
97 |
+
|
98 |
+
def restore(self, filename: PathOrStr):
|
99 |
+
self.data_iter.restore(filename)
|
100 |
+
|
101 |
+
def __iter__(self) -> Iterator[Dict[str, Any]]:
|
102 |
+
return self.data_iter
|
103 |
+
|
104 |
+
|
105 |
+
def _split_batch(batch, n):
|
106 |
+
subbatches = [{} for _ in range(n)]
|
107 |
+
for k, v in batch.items():
|
108 |
+
assert len(v) % n == 0, f"n={n} but {k} has {len(v)}"
|
109 |
+
subatch_dim = len(v) // n
|
110 |
+
for i, subbatch in enumerate(subbatches):
|
111 |
+
subbatch[k] = v[i * subatch_dim:(i + 1) * subatch_dim]
|
112 |
+
return subbatches
|
113 |
+
|
114 |
+
|
115 |
+
def tf_to_torch_dtype(tf_dtype):
|
116 |
+
dtype_mapping = {
|
117 |
+
tf.float16: torch.float16,
|
118 |
+
tf.float32: torch.float32,
|
119 |
+
tf.float64: torch.float64,
|
120 |
+
tf.int8: torch.int8,
|
121 |
+
tf.uint8: torch.uint8,
|
122 |
+
tf.int16: torch.int16,
|
123 |
+
tf.int32: torch.int32,
|
124 |
+
tf.int64: torch.int64,
|
125 |
+
tf.bool: torch.bool,
|
126 |
+
}
|
127 |
+
return dtype_mapping[tf_dtype]
|
128 |
+
|
129 |
+
|
130 |
+
class PeerToPeer(torch.utils.data.IterableDataset[Dict[str, Any]]):
|
131 |
+
"""
|
132 |
+
This dataloader runs the tf.data.Dataset on one process per node, and then
|
133 |
+
transfers the batch to the other processes. For the 7B model this costs about a 10% performance hit
|
134 |
+
despite my attempts to make it asynchronous
|
135 |
+
|
136 |
+
The advantage is that it avoids the overhead of running multiple tf.data.Dataset
|
137 |
+
in one node
|
138 |
+
"""
|
139 |
+
|
140 |
+
def __init__(
|
141 |
+
self,
|
142 |
+
dataset: SeqioDataset,
|
143 |
+
preprocessor: MultiModalPreprocessor,
|
144 |
+
world_size: Optional[int] = None,
|
145 |
+
rank: Optional[int] = None,
|
146 |
+
device=None
|
147 |
+
):
|
148 |
+
assert get_world_size() % get_local_world_size() == 0
|
149 |
+
self.device = device
|
150 |
+
self.device_batch_size = dataset.global_batch_size // get_world_size()
|
151 |
+
|
152 |
+
self.preprocessor = preprocessor
|
153 |
+
self.seqio_dataset = dataset
|
154 |
+
|
155 |
+
lws = get_local_world_size()
|
156 |
+
|
157 |
+
if get_local_rank() == 0:
|
158 |
+
tf_dataset = dataset.build(
|
159 |
+
self.preprocessor,
|
160 |
+
get_node_rank(),
|
161 |
+
get_world_size() // lws,
|
162 |
+
)
|
163 |
+
|
164 |
+
tf_dataset = rename(input_ids="input_tokens", labels="target_tokens")(tf_dataset)
|
165 |
+
self.dataset = tf_dataset
|
166 |
+
device_spec = {k: ((v.shape[0]//lws,) + tuple(v.shape[1:]), tf_to_torch_dtype(v.dtype))
|
167 |
+
for k, v in tf_dataset.element_spec.items()}
|
168 |
+
else:
|
169 |
+
self.dataset = None
|
170 |
+
device_spec = None
|
171 |
+
|
172 |
+
broadcast = [device_spec]
|
173 |
+
torch.distributed.broadcast_object_list(broadcast)
|
174 |
+
self.device_spec = broadcast[0]
|
175 |
+
|
176 |
+
self._node_group_ranks = ranks = [(i + get_node_rank()*lws) for i in range(lws)]
|
177 |
+
if get_local_rank() == 0:
|
178 |
+
assert get_global_rank() == self._node_group_ranks[0]
|
179 |
+
self._keys = sorted(self.device_spec)
|
180 |
+
self.multithread_pin = False
|
181 |
+
|
182 |
+
def _pin(self, it, on):
|
183 |
+
batch = next(it)
|
184 |
+
batch = {k: torch.from_numpy(v) for k, v in batch.items()}
|
185 |
+
batch = _split_batch(batch, len(self._node_group_ranks))
|
186 |
+
return [{k: v.pin_memory() for k, v in subbatch.items()} for subbatch in batch]
|
187 |
+
|
188 |
+
def _send_pinned(self, batch):
|
189 |
+
requests = []
|
190 |
+
for rank_ix, rank in enumerate(self._node_group_ranks[1:], start=1):
|
191 |
+
for k in self._keys:
|
192 |
+
batch[rank_ix][k] = batch[rank_ix][k].to(self.device, non_blocking=True)
|
193 |
+
requests.append(dist.P2POp(dist.isend, batch[rank_ix][k], rank))
|
194 |
+
ops = dist.batch_isend_irecv(requests)
|
195 |
+
return batch[0], ops
|
196 |
+
|
197 |
+
def _send(self, it, on):
|
198 |
+
if get_local_rank() == 0:
|
199 |
+
try:
|
200 |
+
batch = next(it)
|
201 |
+
batch = {k: torch.from_numpy(v) for k, v in batch.items()}
|
202 |
+
batch = _split_batch(batch, len(self._node_group_ranks))
|
203 |
+
except StopIteration:
|
204 |
+
# Special batch to indicate iteration is done
|
205 |
+
batch = [
|
206 |
+
{k: torch.full(sh, -10, dtype=dtype, device=self.device)
|
207 |
+
for k, (sh, dtype) in self.device_spec.items()}
|
208 |
+
for _ in range(len(self._node_group_ranks))
|
209 |
+
]
|
210 |
+
|
211 |
+
# pin_memory so the device transfer can be non_blocking
|
212 |
+
batch = [{k: v.pin_memory() for k, v in subbatch.items()}
|
213 |
+
for subbatch in batch]
|
214 |
+
|
215 |
+
requests = []
|
216 |
+
for rank_ix, rank in enumerate(self._node_group_ranks[1:], start=1):
|
217 |
+
for k in self._keys:
|
218 |
+
batch[rank_ix][k] = batch[rank_ix][k].to(self.device, non_blocking=True)
|
219 |
+
requests.append(dist.P2POp(dist.isend, batch[rank_ix][k], rank))
|
220 |
+
ops = dist.batch_isend_irecv(requests)
|
221 |
+
batch = batch[0]
|
222 |
+
else:
|
223 |
+
batch = {k: torch.zeros(sh, dtype=dtype, device=self.device)
|
224 |
+
for k, (sh, dtype) in self.device_spec.items()}
|
225 |
+
requests = []
|
226 |
+
for k in self._keys:
|
227 |
+
requests.append(dist.P2POp(dist.irecv, batch[k], self._node_group_ranks[0]))
|
228 |
+
ops = dist.batch_isend_irecv(requests)
|
229 |
+
return batch, ops
|
230 |
+
|
231 |
+
def __iter__(self):
|
232 |
+
on = 0
|
233 |
+
if get_local_rank() == 0:
|
234 |
+
it = iter(self.dataset.as_numpy_iterator())
|
235 |
+
else:
|
236 |
+
it = None
|
237 |
+
|
238 |
+
if get_local_rank() == 0 and self.multithread_pin:
|
239 |
+
# Try to be clever and do memory pinning in a separate thread, in practice
|
240 |
+
# didn't seem to help much so off by default for now
|
241 |
+
# Currently does not support finite dataset
|
242 |
+
with ThreadPoolExecutor(max_workers=1) as pool:
|
243 |
+
_is_sending = self._send_pinned(self._pin(it, on))
|
244 |
+
_is_pinning = pool.submit(self._pin, it, on)
|
245 |
+
on += 1
|
246 |
+
while True:
|
247 |
+
result = _is_sending
|
248 |
+
_is_sending = self._send_pinned(_is_pinning.result())
|
249 |
+
_is_pinning = pool.submit(self._pin, it, on)
|
250 |
+
on += 1
|
251 |
+
for op in result[1]:
|
252 |
+
op.wait()
|
253 |
+
yield result[0]
|
254 |
+
else:
|
255 |
+
_in_flight = self._send(it, on)
|
256 |
+
on += 1
|
257 |
+
while True:
|
258 |
+
on += 1
|
259 |
+
next_batch = self._send(it, on) # queue up the next batch
|
260 |
+
for op in _in_flight[1]: # wait for the current batch
|
261 |
+
op.wait()
|
262 |
+
if _in_flight["input_ids"][0] != -10: # indicates no more data
|
263 |
+
return
|
264 |
+
yield _in_flight[0]
|
265 |
+
_in_flight = next_batch
|
266 |
+
|
modeling_molmoe.py
CHANGED
@@ -39,14 +39,14 @@ import einops
|
|
39 |
from transformers import PreTrainedModel
|
40 |
from transformers.modeling_outputs import CausalLMOutputWithPast
|
41 |
|
42 |
-
from
|
43 |
-
from
|
44 |
BeamSearch,
|
45 |
Constraint,
|
46 |
FinalSequenceScorer,
|
47 |
Sampler
|
48 |
)
|
49 |
-
from
|
50 |
ActivationType,
|
51 |
BlockType,
|
52 |
LayerNormType,
|
@@ -56,7 +56,7 @@ from olmo.config import (
|
|
56 |
AttentionType,
|
57 |
)
|
58 |
|
59 |
-
from
|
60 |
from .config_molmoe import (
|
61 |
MolmoConfig,
|
62 |
VisionBackboneConfig
|
|
|
39 |
from transformers import PreTrainedModel
|
40 |
from transformers.modeling_outputs import CausalLMOutputWithPast
|
41 |
|
42 |
+
from .aliases import PathOrStr
|
43 |
+
from .beam_search import (
|
44 |
BeamSearch,
|
45 |
Constraint,
|
46 |
FinalSequenceScorer,
|
47 |
Sampler
|
48 |
)
|
49 |
+
from .config import (
|
50 |
ActivationType,
|
51 |
BlockType,
|
52 |
LayerNormType,
|
|
|
56 |
AttentionType,
|
57 |
)
|
58 |
|
59 |
+
from .util import resource_path
|
60 |
from .config_molmoe import (
|
61 |
MolmoConfig,
|
62 |
VisionBackboneConfig
|
multimodal_preprocessor.py
ADDED
@@ -0,0 +1,1549 @@
1 |
+
import dataclasses
|
2 |
+
import logging
|
3 |
+
import re
|
4 |
+
from collections import defaultdict
|
5 |
+
from typing import Tuple, Optional, Any, Dict, List, Union, Mapping
|
6 |
+
|
7 |
+
import einops
|
8 |
+
import seqio
|
9 |
+
import numpy as np
|
10 |
+
import tensorflow as tf
|
11 |
+
|
12 |
+
from .mm_data import seqio_tokenizer
|
13 |
+
from .data_utils import pad_to_bounding_box, \
|
14 |
+
get_3d_subsegments, _append_to_innermost_axis, resize_and_pad, \
|
15 |
+
apply_with_random_selector, get_special_token_ids, make_autoregressive_inputs, \
|
16 |
+
trim_and_pad_dataset, assert_not_truncated
|
17 |
+
from .prompts import apply_keyword_prompt, STYLE_TO_GENERAL_PROMPT, GENERAL_PROMPTS_V1
|
18 |
+
from . import constants as config
|
19 |
+
|
20 |
+
|
21 |
+
def siglip_resize(src, imgsize, truncate):
|
22 |
+
"""Resize and preprocess for SigLIP ViT in the offical jax implementation"""
|
23 |
+
assert src.dtype == tf.uint8
|
24 |
+
# SigLIP removes aspect ratio by default
|
25 |
+
resized = tf.image.resize(src, imgsize, method=tf.image.ResizeMethod.BILINEAR, antialias=False)
|
26 |
+
dtype = src.dtype
|
27 |
+
tf_dtype = tf.type_spec_from_value(src).dtype
|
28 |
+
resized = tf.cast(tf.clip_by_value(resized, tf_dtype.min, tf_dtype.max), dtype)
|
29 |
+
|
30 |
+
# Normalize between -1 and 1 without using imagenet standard mean/std
|
31 |
+
vmin=-1; vmax=1; in_min=0; in_max=255.0
|
32 |
+
in_min_t = tf.constant(in_min, tf.float32)
|
33 |
+
in_max_t = tf.constant(in_max, tf.float32)
|
34 |
+
image = tf.cast(resized, tf.float32)
|
35 |
+
image = (image - in_min_t) / (in_max_t - in_min_t)
|
36 |
+
image = vmin + image * (vmax - vmin)
|
37 |
+
if truncate:
|
38 |
+
image = image[:truncate, :truncate]
|
39 |
+
return image
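# --- Illustrative sketch (editor's example, not part of the committed file) ---
# A uint8 image is resized to the target resolution and rescaled from [0, 255]
# to [-1, 1]; the 384x384 size below is just an example value.
import tensorflow as tf
example_img = tf.zeros([480, 640, 3], dtype=tf.uint8)
example_out = siglip_resize(example_img, (384, 384), truncate=None)  # float32 in [-1, 1]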
|
40 |
+
|
41 |
+
|
42 |
+
def extract_bboxes(text, image_w, image_h):
|
43 |
+
points = extract_points(text, image_w, image_h)
|
44 |
+
boxes = []
|
45 |
+
for i in range(len(points)//2):
|
46 |
+
x1, y1 = points[i*2]
|
47 |
+
x2, y2 = points[i*2 + 1]
|
48 |
+
boxes.append([x1, y1, x2, y2])
|
49 |
+
return boxes
|
50 |
+
|
51 |
+
|
52 |
+
def extract_annotated_points(caption, image_w, image_h):
|
53 |
+
points = []
|
54 |
+
for match in re.finditer("<point x=\"([0-9\\.]*)\" y=\"([0-9\\.]*)\" alt=\"([^\"]*)\">", caption):
|
55 |
+
x = float(match.group(1))
|
56 |
+
y = float(match.group(2))
|
57 |
+
points.append(([[x, y]], match.group(3)))
|
58 |
+
for match in re.finditer("<points ([^<]*) alt=\"([^\"]*)\">", caption):
|
59 |
+
loc_str = match.group(1)
|
60 |
+
locations = defaultdict(dict)
|
61 |
+
if loc_str.startswith("points="):
|
62 |
+
point_grp = []
|
63 |
+
for point_match in re.finditer(r"([0-9]+\.[0-9]),? ([0-9]+\.[0-9])", loc_str):
|
64 |
+
try:
|
65 |
+
point = [float(point_match.group(i)) for i in range(1, 3)]
|
66 |
+
point_grp.append(point)
|
67 |
+
except ValueError:
|
68 |
+
pass
|
69 |
+
else:
|
70 |
+
for val in loc_str.split():
|
71 |
+
try:
|
72 |
+
key, val = val.split("=")
|
73 |
+
locations[key[1:]][key[:1]] = float(val.strip("\""))
|
74 |
+
except ValueError:
|
75 |
+
|
76 |
+
logging.warning(f"Failed to parse {val} from {match.group(0)}")
|
77 |
+
point_grp = []
|
78 |
+
for key, coords in locations.items():
|
79 |
+
if sorted(coords) == ["x", "y"]:
|
80 |
+
point_grp.append([coords["x"], coords["y"]])
|
81 |
+
if point_grp:
|
82 |
+
points.append((point_grp, match.group(2)))
|
83 |
+
|
84 |
+
normalized = []
|
85 |
+
for point_grp, point_text in points:
|
86 |
+
normalized.append((
|
87 |
+
np.array(point_grp) / 100.0 * np.array([image_w, image_h]),
|
88 |
+
point_text,
|
89 |
+
))
|
90 |
+
return normalized
|
91 |
+
|
92 |
+
|
93 |
+
def extract_points(text, image_w, image_h):
|
94 |
+
all_points = []
|
95 |
+
for match in re.finditer(r"Click\(([0-9]+\.[0-9]), ?([0-9]+\.[0-9])\)", text):
|
96 |
+
try:
|
97 |
+
point = [float(match.group(i)) for i in range(1, 3)]
|
98 |
+
except ValueError:
|
99 |
+
pass
|
100 |
+
else:
|
101 |
+
point = np.array(point)
|
102 |
+
if np.max(point) > 100:
|
103 |
+
# Treat as an invalid output
|
104 |
+
continue
|
105 |
+
point /= 100.0
|
106 |
+
point = point * np.array([image_w, image_h])
|
107 |
+
all_points.append(point)
|
108 |
+
|
109 |
+
for match in re.finditer(r"\(([0-9]+\.[0-9]),? ?([0-9]+\.[0-9])\)", text):
|
110 |
+
try:
|
111 |
+
point = [float(match.group(i)) for i in range(1, 3)]
|
112 |
+
except ValueError:
|
113 |
+
pass
|
114 |
+
else:
|
115 |
+
point = np.array(point)
|
116 |
+
if np.max(point) > 100:
|
117 |
+
# Treat as an invalid output
|
118 |
+
continue
|
119 |
+
point /= 100.0
|
120 |
+
point = point * np.array([image_w, image_h])
|
121 |
+
all_points.append(point)
|
122 |
+
for match in re.finditer(r'x\d*="\s*([0-9]+(?:\.[0-9]+)?)"\s+y\d*="\s*([0-9]+(?:\.[0-9]+)?)"', text):
|
123 |
+
try:
|
124 |
+
point = [float(match.group(i)) for i in range(1, 3)]
|
125 |
+
except ValueError:
|
126 |
+
pass
|
127 |
+
else:
|
128 |
+
point = np.array(point)
|
129 |
+
if np.max(point) > 100:
|
130 |
+
# Treat as an invalid output
|
131 |
+
continue
|
132 |
+
point /= 100.0
|
133 |
+
point = point * np.array([image_w, image_h])
|
134 |
+
all_points.append(point)
|
135 |
+
for match in re.finditer(r'(?:\d+|p)\s*=\s*([0-9]{3})\s*,\s*([0-9]{3})', text):
|
136 |
+
try:
|
137 |
+
point = [int(match.group(i)) / 10.0 for i in range(1, 3)]
|
138 |
+
except ValueError:
|
139 |
+
pass
|
140 |
+
else:
|
141 |
+
point = np.array(point)
|
142 |
+
if np.max(point) > 100:
|
143 |
+
# Treat as an invalid output
|
144 |
+
continue
|
145 |
+
point /= 100.0
|
146 |
+
point = point * np.array([image_w, image_h])
|
147 |
+
all_points.append(point)
|
148 |
+
return all_points
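# --- Illustrative sketch (editor's example, not part of the committed file) ---
# Percent coordinates found in the text are rescaled to pixel coordinates.
example_pts = extract_points("Click(50.0, 25.0)", image_w=200, image_h=100)
# -> [array([100., 25.])]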
|
149 |
+
|
150 |
+
|
151 |
+
def extract_points_from_point_count(text, image_w, image_h):
|
152 |
+
all_points = []
|
153 |
+
points = re.findall(r"(\d+\.\d+),\s*(\d+\.\d+)", text)
|
154 |
+
|
155 |
+
for match in points:
|
156 |
+
try:
|
157 |
+
point = [float(match[0]), float(match[1])]
|
158 |
+
except ValueError:
|
159 |
+
pass
|
160 |
+
else:
|
161 |
+
point = np.array(point)
|
162 |
+
if np.max(point) > 100:
|
163 |
+
# Treat as an invalid output
|
164 |
+
continue
|
165 |
+
point = point * np.array([image_w, image_h])
|
166 |
+
all_points.append(point)
|
167 |
+
return all_points
|
168 |
+
|
169 |
+
|
170 |
+
def select_tiling(h, w, patch_size, max_num_patches):
|
171 |
+
"""Decide how best to divide in image of size [w, h] in up to max_num_patches of size patch_size"""
|
172 |
+
original_size = tf.stack([h, w])  # [2]
|
173 |
+
original_res = h * w
|
174 |
+
tilings = []
|
175 |
+
for i in range(1, max_num_patches+1):
|
176 |
+
for j in range(1, max_num_patches+1):
|
177 |
+
if i*j <= max_num_patches:
|
178 |
+
tilings.append((i, j))
|
179 |
+
# sort so argmin and argmax favour smaller tilings in the event of a tie
|
180 |
+
tilings.sort(key=lambda x: (x[0]*x[1], x[0]))
|
181 |
+
candidate_tilings = tf.constant(tilings, dtype=tf.int32) # [n_resolutions, 2]
|
182 |
+
candidate_resolutions = candidate_tilings * patch_size # [n_resolutions, 2]
|
183 |
+
|
184 |
+
# How much we would need to scale the image to fit exactly in each tiling
|
185 |
+
required_scale_d = tf.cast(candidate_resolutions, tf.float32) / tf.cast(original_size[None, :], tf.float32)
|
186 |
+
required_scale = tf.reduce_min(required_scale_d, axis=-1, keepdims=True) # [n_resolutions, 1]
|
187 |
+
if tf.reduce_all(required_scale < 1):
|
188 |
+
# We are forced to downscale, so try to minimize the amount of downscaling
|
189 |
+
ix = tf.argmax(required_scale)[0]
|
190 |
+
else:
|
191 |
+
# Pick the resolution that required the least upscaling so that it most closely fits the image
|
192 |
+
required_scale = tf.where(required_scale < 1.0, 10e9, required_scale)
|
193 |
+
ix = tf.argmin(required_scale)[0]
|
194 |
+
return candidate_tilings[ix]
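# --- Illustrative sketch (editor's example, not part of the committed file) ---
# For a 600x800 image, 336-pixel crops and at most 6 crops, the tiling whose
# resolution needs the least upscaling to cover the image is chosen ([2, 3] here).
example_tiling = select_tiling(600, 800, 336, 6)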
|
195 |
+
|
196 |
+
|
197 |
+
DEMO_STYLES = [
|
198 |
+
"point_count",
|
199 |
+
"pointing",
|
200 |
+
"user_qa",
|
201 |
+
"scifi_charts_exp",
|
202 |
+
"scifi_charts_exp",
|
203 |
+
"scifi_charts_exp",
|
204 |
+
"scifi_charts_exp",
|
205 |
+
"long_caption",
|
206 |
+
"named_entity"
|
207 |
+
]
|
208 |
+
|
209 |
+
|
@dataclasses.dataclass
class MultiModalPreprocessor:
    """Turns text/image inputs into tensors that can be input to the model"""
    tokenizer: Any

    # How to prompt the model
    prompt_templates: str = "none"  # How to template prompts for examples
    message_format: str = "none"  # How to format messages
    system_prompt: Optional[str] = None  # How to generate system prompts
    prompt_override: Optional[str] = None  # Used for setting prompt manually
    always_start_with_space: bool = False  # Always include a leading space for the first bit of text
    default_inference_len: int = 65  # Inference len for length-conditioned prompting

    # How to crop/resize images
    crop_mode: str = "resize"
    max_crops: int = 6
    overlap_margins: Tuple[int, int] = (4, 4)
    do_random_scale: Optional[bool] = False
    resize: str = "default"
    random_scale_max: float = 1.1
    random_scale_min: float = 0.9
    random_scale_ratio: float = 0.5
    use_col_tokens: bool = True

    # Data about the ViT and connector we need when deciding the crops
    base_image_input_size: Tuple[int, int] = (336, 336)
    image_token_length_w: int = 12
    image_token_length_h: int = 12
    image_patch_size: int = 14
    image_padding_mask: bool = False

    # Other settings
    loss_token_weighting: Optional[str] = None
    unconditioned: Union[bool, float] = False  # Ignore images
    fix_image_input_idx: int = 2  # backwards compatibility fix
    pad_to: Optional[int] = None  # experimental feature

    _special_tokens: Dict[str, int] = None
    split_at: Optional[int] = None

    def get_max_total_crops(self):
        if self.crop_mode == "resize":
            return 1
        elif "resize" in self.crop_mode:
            return 1 + self.max_crops
        else:
            return self.max_crops

    @property
    def image_num_patch(self):
        h, w = self.base_image_input_size
        return h//self.image_patch_size, w//self.image_patch_size

    @property
    def special_token_ids(self):
        if self._special_tokens is None:
            self._special_tokens = get_special_token_ids(self.tokenizer)
        return self._special_tokens

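    # With the defaults above (336x336 crops, 14-pixel patches), each crop is split
    # into 24x24 = 576 patches, and the assumed 2x2 pooling in the connector reduces
    # this to 12x12 = 144 image tokens, matching image_token_length_w/h = 12.
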
    def image_to_patches_and_tokens(self, image, is_training):
        """Preprocesses an image

        Args:
            image: [h, w, 3] image to preprocess
        Returns:
            crops: (n_crops, n_patches, patch_dim) individual crops, `n_crops` might
                change between images but the other dimensions are fixed
            tokens: (n_tokens,) tf.int32 tokens, pad tokens indicate where to insert the
                patch features, might include other special tokens as well
            patch_ordering: (n_crops, n_tokens_per_crop) order image features should be inserted
                into the `tokens`, negative values indicate patch features to exclude
            padding_mask: (n_crops, h, w) mask of what pixels are padding, can be None
        """
        do_random_scale = self.do_random_scale
        if do_random_scale:
            do_random_scale = is_training

        base_image_input_size = self.base_image_input_size
        if isinstance(base_image_input_size, int):
            base_image_input_size = (base_image_input_size, base_image_input_size)

        image_token_length_w, image_token_length_h = self.image_token_length_w, self.image_token_length_h
        base_image_input_d = self.image_patch_size
        tokens_per_image = image_token_length_w * image_token_length_h
        image_base_patch_w = base_image_input_size[1] // base_image_input_d
        image_base_patch_h = base_image_input_size[0] // base_image_input_d
        extra_image = False
        patch_ordering = None

        if self.resize == "default":
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
            def _resize(_image, sz):
                return resize_and_pad(
                    _image, sz,
                    do_random_scale=do_random_scale,
                    random_scale_max=self.random_scale_max,
                    random_scale_min=self.random_scale_min,
                    random_scale_ratio=self.random_scale_ratio,
                    return_outputs=False,
                    resize_method='random' if is_training else tf.image.ResizeMethod.BILINEAR)
        elif self.resize == "stretch":
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
            assert not do_random_scale

            def _resize(_image, sz):
                if not is_training:
                    img = tf.image.resize(_image, sz, antialias=True, method=tf.image.ResizeMethod.BILINEAR)
                else:
                    resize_methods = sorted([k for k in tf.image.ResizeMethod.__dict__.keys() if k.isupper()])
                    img = apply_with_random_selector(
                        _image,
                        lambda x, method_idx: tf.image.resize(x, sz,
                                                              tf.image.ResizeMethod.__dict__[resize_methods[method_idx]],
                                                              antialias=True),
                        num_cases=len(resize_methods))
                return img, tf.ones(tf.shape(img)[:2], dtype=tf.bool)
        elif self.resize in "siglip":
            assert not do_random_scale

            def _resize(_image, sz):
                img = siglip_resize(_image, sz, truncate=None)
                return img, tf.ones(tf.shape(img)[:2], dtype=tf.bool)
        else:
            raise NotImplementedError(self.resize)

        def _img_to_patches(_img, _img_mask, dy=1, dx=1):
            _img = einops.rearrange(
                _img, '(dy h dh) (dx w dw) c -> (dy dx) (h w) (dh dw c)',
                dh=base_image_input_d,
                dw=base_image_input_d,
                dy=dy,
                dx=dx,
                h=image_base_patch_h,
                w=image_base_patch_w
            )
            _img_mask = einops.rearrange(
                _img_mask, '(dy h dh) (dx w dw) -> (dy dx) (h w) (dh dw)',
                dh=base_image_input_d,
                dw=base_image_input_d,
                dy=dy,
                dx=dx,
                h=image_base_patch_h,
                w=image_base_patch_w
            )
            return _img, tf.reduce_mean(tf.cast(_img_mask, tf.float32), -1)

        mode = self.crop_mode
        if mode == "resize":
            patches, img_mask = _resize(image, base_image_input_size)
            patches, img_mask = _img_to_patches(patches, img_mask)
            image_layout_impatch_w = 1
            image_layout_impatch_h = 1
            patch_ordering = tf.range(tokens_per_image)[None, :]

        elif mode in ["overlap", "overlap-and-resize-c2"]:
            original_image_h = tf.shape(image, out_type=tf.int32)[0]
            original_image_w = tf.shape(image, out_type=tf.int32)[1]
            crop_size = base_image_input_size[0]

            # Discard this many patches from the (left/top, right/bottom) of crops
            left_margin, right_margin = self.overlap_margins
            # left_margin, right_margin = 2, 2
            assert left_margin % 2 == 0  # Required for compatibility with 2x2 pooling
            total_margin_pixels = base_image_input_d*(right_margin + left_margin)  # pixels removed per dim
            crop_patches = base_image_input_size[0] // base_image_input_d  # patches per crop dim
            crop_window_patches = crop_patches - (right_margin + left_margin)  # usable patches
            crop_window_size = crop_window_patches * base_image_input_d
            tiling = select_tiling(original_image_h - total_margin_pixels, original_image_w - total_margin_pixels,
                                   crop_window_size, self.max_crops)
            src, img_mask = _resize(
                image, [tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels])

            n_crops = tiling[0]*tiling[1]
            patches_arr = tf.TensorArray(
                tf.float32, n_crops, element_shape=[crop_size, crop_size, 3])
            mask_arr = tf.TensorArray(
                tf.bool, n_crops, element_shape=[crop_size, crop_size])
            # We assume 2x2 pooling, but can allow padding the right/bottom with extra
            # patches if the number of patches per side is not even
            assert (crop_patches+1)//2 == image_token_length_h
            assert (crop_patches+1)//2 == image_token_length_w
            patch_ordering_arr = tf.TensorArray(
                tf.int32, n_crops, element_shape=[image_token_length_h, image_token_length_w])
            on = 0
            on_patch = 0
            for i in range(tiling[0]):
                y0 = i*crop_window_size
                if i == 0:
                    crop_y0 = 0
                else:
                    crop_y0 = left_margin // 2

                crop_h = image_base_patch_h - (right_margin + left_margin)
                if i == 0:
                    crop_h += left_margin
                if i == (tiling[0]-1):
                    crop_h += right_margin
                for j in range(tiling[1]):
                    x0 = j*crop_window_size
                    if j == 0:
                        crop_x0 = 0
                    else:
                        crop_x0 = left_margin // 2

                    crop_w = image_base_patch_w - (right_margin + left_margin)
                    if j == 0:
                        crop_w += left_margin
                    if j == (tiling[1]-1):
                        crop_w += right_margin

                    pooled_w = (crop_w + 1) // 2
                    pooled_h = (crop_h + 1) // 2
                    patch_ordering_arr = patch_ordering_arr.write(
                        on_patch,
                        pad_to_bounding_box(
                            tf.reshape(tf.range(on, on+pooled_h*pooled_w, dtype=tf.int32), (pooled_h, pooled_w, 1)),
                            crop_y0, crop_x0, image_token_length_h, image_token_length_w, value=-1
                        )[:, :, 0]
                    )
                    patches_arr = patches_arr.write(on_patch, src[y0:y0+crop_size, x0:x0+crop_size])
                    mask_arr = mask_arr.write(on_patch, img_mask[y0:y0+crop_size, x0:x0+crop_size])

                    on += pooled_h*pooled_w
                    on_patch += 1
            patches = patches_arr.stack()
            patch_ordering = patch_ordering_arr.stack()
            img_mask = mask_arr.stack()

            image_layout_impatch_w, image_layout_impatch_h = tiling[0], tiling[1]
            patches = einops.rearrange(
                patches, 'p (h dh) (w dw) c -> p (h w) (dh dw c)',
                dh=base_image_input_d,
                dw=base_image_input_d,
                h=image_base_patch_h,
                w=image_base_patch_w
            )
            img_mask = einops.rearrange(
                img_mask, 'p (h dh) (w dw) -> p (h w) (dh dw)',
                dh=base_image_input_d,
                dw=base_image_input_d,
                h=image_base_patch_h,
                w=image_base_patch_w
            )
            img_mask = tf.reduce_mean(tf.cast(img_mask, tf.float32), -1)
            patch_ordering = tf.reshape(patch_ordering, [-1])
            valid = patch_ordering >= 0

            # Transpose, to get left-to-right order
            patch_ordering_rh = tf.reshape(patch_ordering,
                                           [tiling[0], tiling[1], image_token_length_h, image_token_length_w])
            patch_ordering_rh = tf.transpose(patch_ordering_rh, [0, 2, 1, 3])
            patch_ordering_rh = tf.reshape(patch_ordering_rh, [-1])

            # The transpose will screw up which patches are masked, project the
            # new order into the sparse structure of `patch_ordering` to fix this
            patch_ordering = tf.tensor_scatter_nd_update(
                patch_ordering,
                tf.where(valid),
                tf.boolean_mask(patch_ordering_rh, patch_ordering_rh >= 0),
                name="patch_order_transpose_Scatter"
            )

            h = tiling[0]*crop_window_patches + (right_margin+left_margin)
            w = tiling[1]*crop_window_patches + (right_margin+left_margin)
            special_token_ids = self.special_token_ids
            per_row = tf.fill(((w+1)//2,),
                              special_token_ids[config.DEFAULT_IMAGE_PATCH_TOKEN],)
            if self.use_col_tokens:
                per_row = tf.concat([per_row, [special_token_ids[config.DEFAULT_IM_COL_TOKEN]]], 0)

            joint = tf.tile(per_row, [(h+1)//2])
            joint = [
                [special_token_ids[config.DEFAULT_IM_START_TOKEN]],
                joint,
                [special_token_ids[config.DEFAULT_IM_END_TOKEN]]
            ]

            if "resize" in mode:
                resized, resized_mask = _resize(image, base_image_input_size)
                resized, resized_mask = _img_to_patches(resized, resized_mask)
                if 'c2' in mode:
                    patches = tf.concat([resized, patches], 0)
                    image_mask = tf.concat([resized_mask, img_mask], 0)
                else:
                    patches = tf.concat([patches, resized], 0)
                    image_mask = tf.concat([img_mask, resized_mask], 0)

                if patch_ordering is not None:
                    if 'c2' in mode:
                        patch_ordering = tf.where(
                            patch_ordering >= 0,
                            patch_ordering + tokens_per_image,
                            -1
                        )
                        patch_ordering = tf.concat([tf.range(0, tokens_per_image), patch_ordering], 0)
                    else:
                        raise ValueError()
                per_row = tf.fill((image_token_length_w,), special_token_ids[config.DEFAULT_IMAGE_PATCH_TOKEN],)
                if self.use_col_tokens:
                    per_row = tf.concat([per_row, [special_token_ids[config.DEFAULT_IM_COL_TOKEN]]], 0)
                extra_tokens = tf.tile(per_row, [image_token_length_h])
                joint = [
                    [special_token_ids[config.DEFAULT_IM_START_TOKEN]],
                    extra_tokens,
                    [special_token_ids[config.DEFAULT_IM_END_TOKEN]],
                ] + joint

            joint = tf.concat(joint, 0)
            return patches, joint, patch_ordering, img_mask

        elif mode in ["patchify", "patchify-and-resize", "patchify-v2", "patchify-v2-and-resize", "patchify-v2-and-resize-c2"]:
            original_image_w = tf.shape(image, out_type=tf.int32)[0]
            original_image_h = tf.shape(image, out_type=tf.int32)[1]
            assert base_image_input_size[0] == base_image_input_size[1]
            base_patch_size = base_image_input_size[0]
            tiling = select_tiling(original_image_w, original_image_h, base_patch_size, self.max_crops)

            patches, img_mask = _resize(
                image, [tiling[0]*base_patch_size, tiling[1]*base_patch_size])
            patches, img_mask = _img_to_patches(patches, img_mask, tiling[0], tiling[1])
            if 'v2' in mode:
                # Order patches left-to-right not crop-by-crop
                patch_ordering = tf.reshape(
                    tf.range(tokens_per_image*tiling[0]*tiling[1]),
                    [tiling[0], tiling[1], image_token_length_w, image_token_length_h])
                patch_ordering = tf.transpose(patch_ordering, [0, 2, 1, 3])
                patch_ordering = tf.reshape(patch_ordering, (-1, tokens_per_image))
            else:
                patch_ordering = None

            # Given the image size, determine the number of crops per dimension
            image_layout_impatch_w = tiling[0]
            image_layout_impatch_h = tiling[1]

            if "resize" in mode:
                extra_image = True
                resized, resized_mask = _resize(image, base_image_input_size)
                resized, resized_mask = _img_to_patches(resized, resized_mask)
                if 'c2' in mode:
                    patches = tf.concat([resized, patches], 0)
                    image_mask = tf.concat([resized_mask, img_mask], 0)
                else:
                    patches = tf.concat([patches, resized], 0)
                    image_mask = tf.concat([img_mask, resized_mask], 0)

                if patch_ordering is not None:
                    if 'c2' in mode:
                        patch_ordering = tf.concat(
                            [tf.range(0, tokens_per_image)[None, :], patch_ordering+tokens_per_image], 0)
                    else:
                        n = tf.shape(patch_ordering)[0]
                        patch_ordering = tf.concat([patch_ordering, tf.range(n, n+tokens_per_image)[None, :]], 0)
        else:
            raise NotImplementedError(mode)

        special_token_ids = self.special_token_ids

        per_row = tf.fill((image_token_length_w*image_layout_impatch_w,),
                          special_token_ids[config.DEFAULT_IMAGE_PATCH_TOKEN],)
        if self.use_col_tokens:
            per_row = tf.concat([per_row, [special_token_ids[config.DEFAULT_IM_COL_TOKEN]]], 0)

        joint = tf.tile(per_row, [image_token_length_h * image_layout_impatch_h])
        joint = [
            [special_token_ids[config.DEFAULT_IM_START_TOKEN]],
            joint,
            [special_token_ids[config.DEFAULT_IM_END_TOKEN]]
        ]
        if extra_image:
            assert not self.image_padding_mask
            per_row = tf.fill((image_token_length_w,), special_token_ids[config.DEFAULT_IMAGE_PATCH_TOKEN],)
            if self.use_col_tokens:
                per_row = tf.concat([per_row, [special_token_ids[config.DEFAULT_IM_COL_TOKEN]]], 0)
            extra_tokens = tf.tile(per_row, [image_token_length_h])
            if 'c2' in mode:
                joint = [
                    [special_token_ids[config.DEFAULT_IM_START_TOKEN]],
                    extra_tokens,
                    [special_token_ids[config.DEFAULT_IM_END_TOKEN]],
                ] + joint
            else:
                joint += [
                    [special_token_ids[config.DEFAULT_IM_START_TOKEN]],
                    extra_tokens,
                    [special_token_ids[config.DEFAULT_IM_END_TOKEN]]
                ]
        if self.pad_to is not None:
            n = [tf.shape(x)[0] for x in joint]
            assert len(joint[-1]) == 1
            to_pad = self.pad_to - tf.reduce_sum(tf.stack(n))
            joint = tf.concat(joint[:-1] + [
                tf.zeros(to_pad, dtype=tf.int32) - 1,
                joint[-1]
            ], axis=0)
        else:
            joint = tf.concat(joint, 0)
        return patches, tf.concat(joint, 0), patch_ordering, img_mask

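    # Sketch of the "overlap-and-resize-c2" layout, assuming the default 336x336 crops,
    # 14-pixel patches and (4, 4) overlap margins: each crop contributes a 16x16-patch
    # window (224 px), the tiling is chosen by select_tiling over the margin-adjusted
    # image size, and a resized "global" crop is prepended, so per-crop patch_ordering
    # values are offset by the 144 tokens of that global crop.
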
    def build_image_input_idx(self, input_tokens, patch_order, no_image=None):
        """Builds the index used to insert patch features into `input_tokens`"""
        tokens_per_image = self.image_token_length_w * self.image_token_length_h
        if no_image is not None and no_image:
            return tf.zeros((0, tokens_per_image), tf.int32)

        image_input_idx = input_tokens == self.special_token_ids[config.DEFAULT_IMAGE_PATCH_TOKEN]
        image_input_idx = tf.experimental.numpy.nonzero(image_input_idx)[0]
        image_input_idx = tf.cast(image_input_idx, tf.int32)

        if patch_order is not None:
            n_tokens = tf.shape(image_input_idx)[0]
            # Item N should have the value of image_input_index[where(patch_order == n)] if >= 0 else -1
            patch_order = tf.reshape(patch_order, [-1])
            n_patches = tf.shape(patch_order)[0]
            if n_tokens != n_patches:
                # Most complex case where some patches are dropped
                # First invert the valid tokens
                valid = patch_order >= 0
                sorted_patch_ixs = tf.scatter_nd(
                    tf.boolean_mask(patch_order, valid)[:, None],
                    tf.range(tf.reduce_sum(tf.cast(valid, tf.int32)), dtype=tf.int32),
                    [n_tokens],
                    name="valid_order_scatter"
                )

                # Project the inverted mapping into same sparse structure
                tmp = tf.fill(tf.shape(patch_order), -1)
                sorted_patch_ixs_ex = tf.tensor_scatter_nd_update(
                    tmp,
                    tf.where(valid),
                    sorted_patch_ixs,
                    name="order_with_padding_scatter"
                )

                # Do the gather and then re-masked outputs that were masked in `sorted_patch_ixs`
                valid = tf.cast(sorted_patch_ixs_ex >= 0, tf.int32)
                image_input_idx = tf.gather(image_input_idx, sorted_patch_ixs_ex*valid)
                image_input_idx = image_input_idx*valid - 100*(1 - valid)
            else:
                sorted_patch_ixs = tf.scatter_nd(patch_order[:, None], tf.range(n_patches), [n_patches])
                image_input_idx = tf.gather(tf.reshape(image_input_idx, [-1]), sorted_patch_ixs)
        image_input_idx = tf.reshape(image_input_idx, [-1, tokens_per_image])
        return image_input_idx

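    # Summary (a gloss on the method above, not in the original source):
    # build_image_input_idx maps each crop's pooled patch features to positions in the
    # token sequence; entries of -100 mark patches that were dropped and should not be
    # inserted.
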
653 |
+
def build_multimodel_features(self, tokens, mask, subsegments, images, is_training):
|
654 |
+
"""Builds input features by pre-processing `images` and modifying `tokens`
|
655 |
+
to include image col/pad/start/end tokens instead image placeholder tokens
|
656 |
+
"""
|
657 |
+
image_token_id = self.special_token_ids[config.IMAGE_PROMPT]
|
658 |
+
image_idx = tf.experimental.numpy.nonzero(tokens == image_token_id)[0]
|
659 |
+
if images is None or tf.shape(images)[0] == 0:
|
660 |
+
tf.debugging.assert_equal(image_idx, tf.cast(0, tf.int64),
|
661 |
+
"Image placeholders in input, but no images given!")
|
662 |
+
tokens_per_image = self.image_token_length_w * self.image_token_length_h
|
663 |
+
n_pixels = self.image_patch_size ** 2 * 3
|
664 |
+
image_num_patch = np.prod(self.image_num_patch)
|
665 |
+
crops = tf.zeros((0, image_num_patch, n_pixels), dtype=tf.float32)
|
666 |
+
image_idx = tf.zeros((0, tokens_per_image), tf.int32)
|
667 |
+
out = dict(
|
668 |
+
target_tokens=tokens,
|
669 |
+
images=crops,
|
670 |
+
image_input_idx=image_idx,
|
671 |
+
loss_masks=mask
|
672 |
+
)
|
673 |
+
if self.image_padding_mask:
|
674 |
+
out["image_masks"] = tf.zeros((0, image_num_patch), dtype=tf.float32)
|
675 |
+
if subsegments is not None:
|
676 |
+
out["subsegment_ids"] = subsegments
|
677 |
+
return out
|
678 |
+
elif tf.shape(image_idx)[0] == 0 and tf.shape(images)[0] > 0:
|
679 |
+
# As a special case, no image prompt means the images are all at the start
|
680 |
+
image_idx = tf.zeros([tf.shape(images)[0]], tf.int64) - 1
|
681 |
+
else:
|
682 |
+
tf.debugging.assert_equal(
|
683 |
+
tf.shape(images)[0], tf.shape(image_idx)[0],
|
684 |
+
message="Different number of images and image placeholders")
|
685 |
+
|
686 |
+
# Each image will produce a variable number of crops/tokens, so we aggregate things
|
687 |
+
# the results tensor arrays and the concat them
|
688 |
+
tokens_per_image = self.image_token_length_w * self.image_token_length_h
|
689 |
+
n_pixels = self.image_patch_size*self.image_patch_size*3
|
690 |
+
n_patches = self.image_num_patch[0]*self.image_num_patch[1]
|
691 |
+
|
692 |
+
n = tf.shape(images)[0]
|
693 |
+
all_crops = tf.TensorArray(dtype=tf.float32, size=n, infer_shape=False,
|
694 |
+
element_shape=[None, n_patches, n_pixels])
|
695 |
+
all_image_idx = tf.TensorArray(dtype=tf.int32, size=n, infer_shape=False,
|
696 |
+
element_shape=[None, tokens_per_image])
|
697 |
+
out_tokens = tf.TensorArray(dtype=tf.int32, size=n, infer_shape=False,
|
698 |
+
element_shape=[None])
|
699 |
+
out_masks = tf.TensorArray(dtype=tf.float32, size=n, infer_shape=False,
|
700 |
+
element_shape=[None])
|
701 |
+
if self.image_padding_mask:
|
702 |
+
all_crop_masks = tf.TensorArray(dtype=tf.float32, size=n, infer_shape=False,
|
703 |
+
element_shape=[None, None])
|
704 |
+
else:
|
705 |
+
# Dummy array to keep tensorflow's control analysis happy
|
706 |
+
all_crop_masks = tf.TensorArray(dtype=tf.float32, size=0, infer_shape=False,
|
707 |
+
element_shape=[None, None])
|
708 |
+
if subsegments is not None:
|
709 |
+
out_subsegments = tf.TensorArray(dtype=tf.int32, size=n, element_shape=[None])
|
710 |
+
else:
|
711 |
+
out_subsegments = tf.TensorArray(dtype=tf.int32, size=0, element_shape=[None])
|
712 |
+
|
713 |
+
image_idx = tf.cast(image_idx, tf.int32)
|
714 |
+
for ix in range(tf.shape(image_idx)[0]):
|
715 |
+
token_ix = image_idx[ix]
|
716 |
+
crops, image_tokens, patch_ordering, img_mask = self.image_to_patches_and_tokens(images[ix], is_training)
|
717 |
+
patch_idx = self.build_image_input_idx(image_tokens, patch_ordering)
|
718 |
+
|
719 |
+
if token_ix == -1: # -1 is an image inserted at the very start
|
720 |
+
start = 0
|
721 |
+
token_ix = 0
|
722 |
+
end = 0
|
723 |
+
else:
|
724 |
+
start = 0 if ix == 0 else image_idx[ix-1] + 1
|
725 |
+
end = token_ix + 1
|
726 |
+
|
727 |
+
all_image_idx = all_image_idx.write(ix, patch_idx + token_ix)
|
728 |
+
all_crops = all_crops.write(ix, crops)
|
729 |
+
image_token_mask = tf.zeros_like(image_tokens, dtype=tf.float32)
|
730 |
+
|
731 |
+
if ix == (tf.shape(images)[0] - 1):
|
732 |
+
tokens_part = tf.concat([tokens[start:token_ix], image_tokens, tokens[end:]], 0)
|
733 |
+
mask_part = tf.concat([mask[start:token_ix], image_token_mask, mask[end:]], 0)
|
734 |
+
else:
|
735 |
+
tokens_part = tf.concat([tokens[start:token_ix], image_tokens], 0)
|
736 |
+
mask_part = tf.concat([mask[start:token_ix], image_token_mask], 0)
|
737 |
+
|
738 |
+
out_tokens = out_tokens.write(ix, tokens_part)
|
739 |
+
out_masks = out_masks.write(ix, mask_part)
|
740 |
+
if self.image_padding_mask:
|
741 |
+
all_crop_masks = all_crop_masks.write(ix, img_mask)
|
742 |
+
if subsegments is not None:
|
743 |
+
parts = tf.fill([tf.shape(image_tokens)[0]], subsegments[token_ix])
|
744 |
+
if ix == (tf.shape(images)[0] - 1):
|
745 |
+
seg = tf.concat([subsegments[start:token_ix], parts, subsegments[end:]], 0)
|
746 |
+
else:
|
747 |
+
seg = tf.concat([subsegments[start:token_ix], parts], 0)
|
748 |
+
out_subsegments = out_subsegments.write(ix, seg)
|
749 |
+
|
750 |
+
out = dict(
|
751 |
+
target_tokens=out_tokens.concat(),
|
752 |
+
images=all_crops.concat(),
|
753 |
+
image_input_idx=all_image_idx.concat(),
|
754 |
+
loss_masks=out_masks.concat()
|
755 |
+
)
|
756 |
+
if self.image_padding_mask:
|
757 |
+
out["image_masks"] = all_crop_masks.concat()
|
758 |
+
if subsegments is not None:
|
759 |
+
out["subsegment_ids"] = out_subsegments.concat()
|
760 |
+
return out
|
761 |
+
|
762 |
+
def _format_message(self, args):
|
763 |
+
message, ix = args
|
764 |
+
return self.format_message(message, ix)
|
765 |
+
|
766 |
+
def format_message(self, message, ix):
|
767 |
+
"""Applies system formatting to ith message from a sequence of messages"""
|
768 |
+
# If the image placeholder text is not preceded by space it will not get tokenized
|
769 |
+
# correctly by some tokenizers, so double check it here
|
770 |
+
assert config.IMAGE_PROMPT == "<|image|>"
|
771 |
+
tf.debugging.assert_equal(
|
772 |
+
tf.strings.regex_full_match(message, r".*[^ ]<\|image\|>.*"),
|
773 |
+
False,
|
774 |
+
message="Image token must always be preceded by a space"
|
775 |
+
)
|
776 |
+
is_user = ix % 2 == 0
|
777 |
+
if self.message_format == "none" or self.message_format is None:
|
778 |
+
pass
|
779 |
+
elif self.message_format == "role":
|
780 |
+
if is_user:
|
781 |
+
# We put the "System:" prefix here since it doesn't need a loss
|
782 |
+
message = tf.strings.join(["User: ", message, " Assistant:"])
|
783 |
+
elif self.message_format == "cleanup":
|
784 |
+
if is_user:
|
785 |
+
# We put the "System:" prefix here since it doesn't need a loss
|
786 |
+
message = tf.strings.join(
|
787 |
+
[
|
788 |
+
"[[User]]: Correct the spelling and punctuation mistakes on the following transcript based on what appears in the image.\n\n{before} ",
|
789 |
+
message,
|
790 |
+
"\n[[Assistant]]: {after}"
|
791 |
+
]
|
792 |
+
)
|
793 |
+
elif self.message_format == "mistral":
|
794 |
+
if is_user:
|
795 |
+
message = tf.strings.join(["[INST] ", message, " [/INST]"])
|
796 |
+
else:
|
797 |
+
raise NotImplementedError(self.message_format)
|
798 |
+
|
799 |
+
# For now assume a space will be used to separate the messages
|
800 |
+
if not self.tokenizer.adds_space:
|
801 |
+
if ix != 0 or self.always_start_with_space:
|
802 |
+
message = tf.strings.join([" ", message])
|
803 |
+
# Else space added automatically by the tokenizer
|
804 |
+
|
805 |
+
return message
|
806 |
+
|
807 |
+
def get_multi_message_token_input(self, conversations, text_weights=None):
|
808 |
+
"""Build inputs for a ragged tensor of conversations, where each row of the tensor,
|
809 |
+
is a different conversation"""
|
810 |
+
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(
|
811 |
+
conversations.values, re.escape(config.IMAGE_PROMPT))), False, "Segmented prompts must start with the image")
|
812 |
+
|
813 |
+
n_conversation = tf.shape(conversations)[0]
|
814 |
+
ar = tf.TensorArray(dtype=tf.int32, infer_shape=False, element_shape=[None],
|
815 |
+
size=n_conversation)
|
816 |
+
n_messages_per_conversation = conversations.row_lengths()
|
817 |
+
for ix in range(n_conversation):
|
818 |
+
ar = ar.write(ix, tf.range(n_messages_per_conversation[ix], dtype=tf.int32))
|
819 |
+
message_ix = ar.concat()
|
820 |
+
messages = tf.map_fn(
|
821 |
+
self._format_message, elems=(conversations.values, message_ix), fn_output_signature=tf.string)
|
822 |
+
messages = self.tokenizer.encode_tf(messages)
|
823 |
+
|
824 |
+
# Append EOS
|
825 |
+
is_response = message_ix % 2 == 1
|
826 |
+
is_response_int = tf.cast(is_response, tf.int32)
|
827 |
+
eos = tf.RaggedTensor.from_row_lengths(
|
828 |
+
tf.fill([tf.reduce_sum(is_response_int)], self.tokenizer.eos_token_id),
|
829 |
+
tf.cast(is_response_int, messages.row_splits.dtype)
|
830 |
+
)
|
831 |
+
messages = tf.concat([messages, eos], axis=1)
|
832 |
+
|
833 |
+
# Build mask over system responses
|
834 |
+
mask = tf.ones_like(messages) * tf.cast(tf.expand_dims(is_response, axis=1), tf.int32)
|
835 |
+
decoder_loss_weights = tf.cast(mask.values, tf.float32)
|
836 |
+
|
837 |
+
# Build subsegment ids for each conversation
|
838 |
+
tokens_per_message = tf.RaggedTensor.from_row_splits(
|
839 |
+
row_splits=conversations.row_splits,
|
840 |
+
values=messages.row_lengths()
|
841 |
+
)
|
842 |
+
token_per_conversation = tf.reduce_sum(tokens_per_message, axis=1)
|
843 |
+
subsegment_ids = tf.repeat(tf.range(n_conversation, dtype=tf.int32)+1, token_per_conversation)
|
844 |
+
|
845 |
+
image_ix = self.special_token_ids[config.IMAGE_PROMPT]
|
846 |
+
messages = tf.concat([[image_ix], messages.values], axis=0)
|
847 |
+
decoder_loss_weights = tf.concat([[0], decoder_loss_weights], axis=0)
|
848 |
+
subsegment_ids = tf.concat([[10000], subsegment_ids], axis=0)
|
849 |
+
return messages, decoder_loss_weights, subsegment_ids
|
850 |
+
|
851 |
+
def get_multi_response_token_input(self, user_prompt, text, text_weights=None):
|
852 |
+
"""Build tokens for a multi-response-per-image example"""
|
853 |
+
# FIXME this could be relaxed to just having the same prefix
|
854 |
+
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(
|
855 |
+
user_prompt, re.escape(config.IMAGE_PROMPT))), False, "Segmented prompts must start with the image")
|
856 |
+
user_prompt = self.format_message(user_prompt, 0)
|
857 |
+
vocab = self.tokenizer
|
858 |
+
prompts = vocab.encode_tf(user_prompt)
|
859 |
+
response = self.format_message(text, 1)
|
860 |
+
responses = vocab.encode_tf(response)
|
861 |
+
responses = _append_to_innermost_axis(responses, vocab.eos_token_id)
|
862 |
+
response_mask = tf.ones_like(responses, dtype=tf.float32)
|
863 |
+
if text_weights is not None:
|
864 |
+
response_mask *= text_weights
|
865 |
+
image_tokens = tf.constant([self.special_token_ids[config.IMAGE_PROMPT]])
|
866 |
+
|
867 |
+
if len(responses.shape) == 3:
|
868 |
+
# Tricky case where we have multiple questions, each of which has multiple answers
|
869 |
+
assert len(prompts.shape) == 2
|
870 |
+
|
871 |
+
# Also shift the last tokens to the response segment since that tokens will
|
872 |
+
# have multiple possible target tokens to predict
|
873 |
+
last_prompt_tokens = prompts[:, -1:]
|
874 |
+
last_prompt_tokens = tf.repeat(last_prompt_tokens, responses.row_lengths())
|
875 |
+
last_prompt_tokens = tf.RaggedTensor.from_row_splits(
|
876 |
+
values=tf.RaggedTensor.from_row_lengths(
|
877 |
+
values=last_prompt_tokens,
|
878 |
+
row_lengths=tf.ones_like(last_prompt_tokens, dtype=responses.row_splits.dtype)
|
879 |
+
),
|
880 |
+
row_splits=responses.row_splits
|
881 |
+
)
|
882 |
+
responses = tf.concat([last_prompt_tokens, responses], 2)
|
883 |
+
prompts = prompts[:, :-1]
|
884 |
+
|
885 |
+
shared_prefix = image_tokens
|
886 |
+
segmented_suffix = tf.concat([tf.expand_dims(prompts, 1), responses], 1)
|
887 |
+
targets = tf.concat([shared_prefix, segmented_suffix.values.values], 0)
|
888 |
+
|
889 |
+
segmented_mask = tf.concat([
|
890 |
+
tf.zeros_like(tf.expand_dims(prompts, 1), dtype=tf.float32),
|
891 |
+
tf.concat([
|
892 |
+
tf.zeros_like(last_prompt_tokens, dtype=tf.float32),
|
893 |
+
response_mask
|
894 |
+
], 2)
|
895 |
+
], 1).values.values
|
896 |
+
decoder_loss_weights = tf.concat(
|
897 |
+
[tf.zeros_like(shared_prefix, dtype=tf.float32), segmented_mask], 0)
|
898 |
+
|
899 |
+
text_segment_ids = get_3d_subsegments(segmented_suffix)
|
900 |
+
subsegment_ids = tf.concat([
|
901 |
+
tf.zeros_like(shared_prefix) + tf.reduce_max(text_segment_ids)+1,
|
902 |
+
text_segment_ids], 0)
|
903 |
+
subsegment_ids = tf.cast(subsegment_ids, tf.int32)
|
904 |
+
else:
|
905 |
+
if len(prompts.shape) == 1:
|
906 |
+
# One prompt for all responses, we use the last token of the prompt as the
|
907 |
+
# first token of each response segment since there will be multiple targets
|
908 |
+
# for that token, the remaining targets are part of the prefix
|
909 |
+
shared_prefix = tf.concat([image_tokens, prompts[:-1]], 0)
|
910 |
+
prompts = prompts[-1:]
|
911 |
+
prompts = tf.tile(tf.expand_dims(prompts, axis=0), [tf.shape(text)[0], 1])
|
912 |
+
else:
|
913 |
+
shared_prefix = image_tokens
|
914 |
+
|
915 |
+
# Separate prompt for each response
|
916 |
+
segmented_suffix = tf.concat([prompts, responses], 1)
|
917 |
+
segmented_mask = tf.concat([tf.zeros_like(prompts, dtype=tf.float32), response_mask], 1).values
|
918 |
+
|
919 |
+
targets = tf.concat([shared_prefix, segmented_suffix.values], 0)
|
920 |
+
decoder_loss_weights = tf.concat(
|
921 |
+
[tf.zeros_like(shared_prefix, dtype=tf.float32), segmented_mask], 0)
|
922 |
+
subsegments = tf.ragged.row_splits_to_segment_ids(segmented_suffix.row_splits) + 1
|
923 |
+
subsegment_ids = tf.concat([tf.zeros_like(shared_prefix)+10000,
|
924 |
+
tf.cast(subsegments, tf.int32)], 0)
|
925 |
+
return targets, decoder_loss_weights, subsegment_ids
|
926 |
+
|
927 |
+
def get_tokens_input(self, messages, for_inference=False, text_weights=None):
|
928 |
+
"""Gets the token input for an example, using image placeholder tokens to
|
929 |
+
indicate where images features should be inserted
|
930 |
+
|
931 |
+
inputs
|
932 |
+
messages: List or tensor users/system text messages, can have image placeholder tokens
|
933 |
+
for_inference: bool, if true truncate the messages if it is a system message
|
934 |
+
text_weights: Weights per a system message
|
935 |
+
|
936 |
+
returns
|
937 |
+
tokens: [n_tokens] tf.int32 token inputs with image placeholder tokens
|
938 |
+
loss_mask: [n_tokens] tf.float32 token weights for loss
|
939 |
+
subsegment: [n_tokens] tf.int32 or None, subsegment ids used to build more complex
|
940 |
+
attention masks if needed
|
941 |
+
"""
|
942 |
+
if isinstance(messages, tf.RaggedTensor):
|
943 |
+
assert not for_inference, "Cannot have multiple target messages for inference"
|
944 |
+
return self.get_multi_message_token_input(messages, text_weights)
|
945 |
+
elif len(tf.shape(messages[-1])) > 0:
|
946 |
+
assert not for_inference, "Cannot have multiple target messages for inference"
|
947 |
+
assert len(messages) == 2
|
948 |
+
prompt = messages[0]
|
949 |
+
response = messages[1]
|
950 |
+
return self.get_multi_response_token_input(prompt, response, text_weights)
|
951 |
+
else:
|
952 |
+
messages = tf.convert_to_tensor(messages)
|
953 |
+
if for_inference:
|
954 |
+
if tf.shape(messages) % 2 == 0:
|
955 |
+
# Remove the last message since the model should predict it
|
956 |
+
messages = messages[:-1]
|
957 |
+
|
958 |
+
# Apply system formatting
|
959 |
+
ix = tf.range(tf.shape(messages)[0])
|
960 |
+
is_response = ix % 2 == 1
|
961 |
+
messages = tf.map_fn(
|
962 |
+
self._format_message, elems=(messages, ix), fn_output_signature=tf.string)
|
963 |
+
|
964 |
+
# Tokenize
|
965 |
+
messages = self.tokenizer.encode_tf(messages)
|
966 |
+
|
967 |
+
# Add EOS to system messages
|
968 |
+
is_response_int = tf.cast(is_response, tf.int32)
|
969 |
+
eos = tf.RaggedTensor.from_row_lengths(
|
970 |
+
tf.fill([tf.reduce_sum(is_response_int)], self.tokenizer.eos_token_id),
|
971 |
+
tf.cast(is_response_int, messages.row_splits.dtype)
|
972 |
+
)
|
973 |
+
messages = tf.concat([messages, eos], axis=1)
|
974 |
+
targets = messages.values
|
975 |
+
|
976 |
+
# Build mask over system responses
|
977 |
+
mask = tf.ones_like(messages) * tf.cast(tf.expand_dims(is_response, axis=1), tf.int32)
|
978 |
+
decoder_loss_weights = tf.cast(mask.values, tf.float32)
|
979 |
+
if text_weights is not None:
|
980 |
+
decoder_loss_weights = decoder_loss_weights * text_weights
|
981 |
+
return messages.values, decoder_loss_weights, None
|
982 |
+
|
983 |
+
def preprocess(self, image, input_text, is_training=False,
|
984 |
+
seq_len=None, pad_images=1, style=None, for_inference=True):
|
985 |
+
"""Get input tensors for the given image/text data
|
986 |
+
|
987 |
+
image: [h, w, 3] numpy uint8 array of image pixels
|
988 |
+
input_text: string input text, a list of text for a multi-turn conversation or dictionary
|
989 |
+
of inputs to use to build the prompt from a template
|
990 |
+
is_training: allow training-time preprocessing (e.g., image augmentation)
|
991 |
+
seq_len: pad input tokens to `seq_len`
|
992 |
+
pad_images: pad input images to `self.get_max_total_crops()`
|
993 |
+
style: Style to use for prompt templating
|
994 |
+
"""
|
995 |
+
if image is not None and len(tf.shape(image)) == 3:
|
996 |
+
image = tf.expand_dims(image, axis=0)
|
997 |
+
|
998 |
+
messages = self.get_messages(input_text, style, is_training, for_inference=for_inference, user_prompt_seed=None, system_prompt_seed=None)
|
999 |
+
targets, loss_masks, subsegments = self.get_tokens_input(messages, for_inference=for_inference)
|
1000 |
+
batch = self.build_multimodel_features(
|
1001 |
+
targets, loss_masks, subsegments, image, is_training)
|
1002 |
+
|
1003 |
+
# Optionally padding to get constant sized arrays
|
1004 |
+
if pad_images:
|
1005 |
+
max_crops = self.get_max_total_crops() * pad_images
|
1006 |
+
image = batch["images"]
|
1007 |
+
n = max_crops - tf.shape(batch["images"])[0]
|
1008 |
+
batch["images"] = tf.pad(image, [[0, n], [0, 0], [0, 0]], constant_values=-1)
|
1009 |
+
if self.image_padding_mask:
|
1010 |
+
m = max_crops - tf.shape(batch["image_masks"])[0]
|
1011 |
+
batch["image_masks"] = tf.pad(batch["image_masks"], [[0, m], [0, 0]], constant_values=-1)
|
1012 |
+
batch["image_input_idx"] = tf.pad(batch["image_input_idx"], [[0, n], [0, 0]], constant_values=-1)
|
1013 |
+
|
1014 |
+
if seq_len is not None:
|
1015 |
+
targets = batch["target_tokens"]
|
1016 |
+
if seq_len < len(targets):
|
1017 |
+
raise ValueError("Sequence length too short")
|
1018 |
+
n = seq_len - len(targets)
|
1019 |
+
batch["target_tokens"] = tf.pad(targets, [[0, n]], constant_values=-1)
|
1020 |
+
batch["loss_masks"] = tf.pad(batch["loss_masks"], [[0, n]], constant_values=-1)
|
1021 |
+
|
1022 |
+
batch = self.get_post_mixing_preprocessor(pack=False)._convert_example(batch)
|
1023 |
+
return batch
|
1024 |
+
|
1025 |
+
def get_user_prompt(self, style, example, is_training=True, for_inference=False, seed=None):
|
1026 |
+
"""Build a list of strings of what a user might type in to the model for the given example,
|
1027 |
+
and its responses, by applying a prompt template to the fields in `example`
|
1028 |
+
|
1029 |
+
Can return multiple strings for one message for multi-response examples
|
1030 |
+
"""
|
1031 |
+
if "style" in example:
|
1032 |
+
style = example["style"]
|
1033 |
+
|
1034 |
+
if "prompt" in example:
|
1035 |
+
# Examples have a complete user prompt pre-specified, usually for eval sets
|
1036 |
+
prompt = example["prompt"]
|
1037 |
+
|
1038 |
+
elif self.prompt_templates == "none":
|
1039 |
+
# Bare-bone prompt with not templating of instructions
|
1040 |
+
if "prompt" in example:
|
1041 |
+
prompt = example["prompt"]
|
1042 |
+
elif "refexp" in example:
|
1043 |
+
prompt = example["refexp"]
|
1044 |
+
elif "question" in example and "options" in example:
|
1045 |
+
prompt = tf.strings.join([example["question"], "\n", example["options"], "\n"])
|
1046 |
+
elif "question" in example:
|
1047 |
+
prompt = example["question"]
|
1048 |
+
else:
|
1049 |
+
prompt = ""
|
1050 |
+
|
1051 |
+
elif self.prompt_templates == "uber_model":
|
1052 |
+
if not isinstance(style, str):
|
1053 |
+
tf.debugging.assert_equal(tf.logical_or(
|
1054 |
+
style == "ai2_diagram_no_letter",
|
1055 |
+
style == "ai2_diagram",
|
1056 |
+
), True)
|
1057 |
+
prompt = tf.strings.join([example["question"], "\n", example["options"], "\n"])
|
1058 |
+
else:
|
1059 |
+
# We template long captions and pointing since they are "demo" tasks, and use
|
1060 |
+
# plain text for everything else
|
1061 |
+
if style == "long_caption":
|
1062 |
+
prompt = apply_keyword_prompt(GENERAL_PROMPTS_V1["long_caption"], example, seed)
|
1063 |
+
elif style == "pointing":
|
1064 |
+
prompt = apply_keyword_prompt(GENERAL_PROMPTS_V1["pointing"], example, seed)
|
1065 |
+
elif style == "point_count":
|
1066 |
+
prompt = apply_keyword_prompt(GENERAL_PROMPTS_V1["point_count"], example, seed)
|
1067 |
+
elif "prompt" in example:
|
1068 |
+
prompt = example["prompt"]
|
1069 |
+
elif "refexp" in example:
|
1070 |
+
prompt = example["refexp"]
|
1071 |
+
elif "question" in example and "options" in example:
|
1072 |
+
prompt = tf.strings.join([example["question"], "\n", example["options"], "\n"])
|
1073 |
+
elif "question" in example:
|
1074 |
+
prompt = example["question"]
|
1075 |
+
else:
|
1076 |
+
prompt = ""
|
1077 |
+
|
1078 |
+
elif self.prompt_templates == "uber_model_pointing":
|
1079 |
+
if style == "long_caption":
|
1080 |
+
long_captions = GENERAL_PROMPTS_V1["long_caption_no_pointing"]
|
1081 |
+
prompt = apply_keyword_prompt(GENERAL_PROMPTS_V1["long_caption"], example, seed)
|
1082 |
+
elif style == "pointing":
|
1083 |
+
prompt = apply_keyword_prompt(GENERAL_PROMPTS_V1["pointing"], example, seed)
|
1084 |
+
elif style in [
|
1085 |
+
"scifi_charts_explanation",
|
1086 |
+
"scifi_table_explanation",
|
1087 |
+
"scifi_document_explanation",
|
1088 |
+
"scifi_diagram_explanation",
|
1089 |
+
"user_qa",
|
1090 |
+
"long_caption",
|
1091 |
+
]:
|
1092 |
+
raise NotImplementedError()
|
1093 |
+
if style == "long_caption":
|
1094 |
+
prompts = GENERAL_PROMPTS_V1["long_caption"]
|
1095 |
+
elif "prompt" in example:
|
1096 |
+
prompts = tf.expand_dims(example["prompt"], axis=0)
|
1097 |
+
else:
|
1098 |
+
prompts = tf.expand_dims(example["question"], axis=0)
|
1099 |
+
suffixes = []
|
1100 |
+
for suffix in GENERAL_PROMPTS_V1["no_pointing_suffix"]:
|
1101 |
+
if not suffix[0].isspace():
|
1102 |
+
suffix = " " + suffix
|
1103 |
+
suffixes.append(suffix)
|
1104 |
+
no_point_prompts = tf.reshape(tf.strings.join([
|
1105 |
+
tf.tile(tf.expand_dims(suffixes, 1), [1, tf.shape(prompts)[1]]),
|
1106 |
+
tf.tile(prompts, [len(suffixes), 1]),
|
1107 |
+
]), [-1])
|
1108 |
+
# prefixes = []
|
1109 |
+
# for prefix in GENERAL_PROMPTS_V1["no_pointing_prefix"]:
|
1110 |
+
# if not prefix[0].isspace():
|
1111 |
+
# prefix = prefix + " "
|
1112 |
+
# prefixes.append(prompts + prefix)
|
1113 |
+
prompt = apply_keyword_prompt(no_point_prompts, example, seed, keywords=[])
|
1114 |
+
elif "prompt" in example:
|
1115 |
+
prompt = example["prompt"]
|
1116 |
+
elif "refexp" in example:
|
1117 |
+
prompt = example["refexp"]
|
1118 |
+
elif "question" in example and "options" in example:
|
1119 |
+
prompt = tf.strings.join([example["question"], "\n", example["options"], "\n"])
|
1120 |
+
elif "question" in example:
|
1121 |
+
prompt = example["question"]
|
1122 |
+
else:
|
1123 |
+
prompt = ""
|
1124 |
+
|
1125 |
+
elif self.prompt_templates == "general_instructions_v1":
|
1126 |
+
if isinstance(style, str):
|
1127 |
+
prompt = apply_keyword_prompt(GENERAL_PROMPTS_V1[STYLE_TO_GENERAL_PROMPT[style]], example, seed)
|
1128 |
+
elif isinstance(style, list):
|
1129 |
+
# This ia bit of hack to allow apply prompts to joint caption/transcript data
|
1130 |
+
# FIXME ideally we can apply the templating to multiple styles more generally
|
1131 |
+
def _apply(_style, ix):
|
1132 |
+
tmp = dict(example)
|
1133 |
+
# prevent apply_keyword_prompt for generating multiple templates
|
1134 |
+
tmp["text"] = tmp["text"][0]
|
1135 |
+
if _style == "long_caption":
|
1136 |
+
return apply_keyword_prompt(GENERAL_PROMPTS_V1["long_caption"], tmp, seed)
|
1137 |
+
elif _style == "transcript":
|
1138 |
+
return apply_keyword_prompt(GENERAL_PROMPTS_V1["transcript"], tmp, seed)
|
1139 |
+
else:
|
1140 |
+
raise NotImplementedError(_style)
|
1141 |
+
prompt = [_apply(x, ix) for ix, x in enumerate(style)]
|
1142 |
+
else:
|
1143 |
+
raise NotImplementedError()
|
1144 |
+
|
1145 |
+
elif self.prompt_templates == "zero_shot_v1":
|
1146 |
+
assert style is not None
|
1147 |
+
if not isinstance(style, str):
|
1148 |
+
# FIXME can we handle tensor style's in a better way?
|
1149 |
+
if style == "ai2_diagram":
|
1150 |
+
prompt = "Question: {question}\nAnswer with correct answer option letter only\nOptions: {options}\nAnswer:"
|
1151 |
+
prompt = apply_keyword_prompt([prompt], example, seed)
|
1152 |
+
elif style == "ai2_diagram_no_letter":
|
1153 |
+
prompt = "Question: {question}\nAnswer with correct answer option only\nOptions: {options}\nAnswer:"
|
1154 |
+
prompt = apply_keyword_prompt([prompt], example, seed)
|
1155 |
+
else:
|
1156 |
+
prompt = ""
|
1157 |
+
tf.debugging.assert_equal(prompt != "", True)
|
1158 |
+
else:
|
1159 |
+
general_style = STYLE_TO_GENERAL_PROMPT[style]
|
1160 |
+
if general_style == "short_answer":
|
1161 |
+
prompt = apply_keyword_prompt(["Question: {question} Answer with as few words as possible. Answer:"], example, seed)
|
1162 |
+
elif general_style == "multiple_choice":
|
1163 |
+
prompt = apply_keyword_prompt(["Question: {question}\nAnswer with correct answer option letter only\nOptions: {options}\nAnswer:"], example, seed)
|
1164 |
+
elif general_style == "count_bench":
|
1165 |
+
prompt = apply_keyword_prompt(["Question: How many {object} are there?\nRespond with only a number.\nAnswer:"], example, seed)
|
1166 |
+
else:
|
1167 |
+
raise NotImplementedError(general_style)
|
1168 |
+
|
1169 |
+
elif self.prompt_templates == "zero_shot_v2":
|
1170 |
+
assert style is not None
|
1171 |
+
|
1172 |
+
if self.prompt_override:
|
1173 |
+
prompt = apply_keyword_prompt([self.prompt_override], example, seed)
|
1174 |
+
elif not isinstance(style, str):
|
1175 |
+
if style == "ai2_diagram":
|
1176 |
+
prompt = "{question} Answer with correct answer option letter only. Options: {options}"
|
1177 |
+
prompt = apply_keyword_prompt([prompt], example, seed)
|
1178 |
+
elif style == "ai2_diagram_no_letter":
|
1179 |
+
prompt = "{question} Answer with correct answer option only. Options: {options}"
|
1180 |
+
prompt = apply_keyword_prompt([prompt], example, seed)
|
1181 |
+
else:
|
1182 |
+
prompt = ""
|
1183 |
+
tf.debugging.assert_equal(prompt != "", True)
|
1184 |
+
else:
|
1185 |
+
if style in ["vqa2", "gqa", "tally_qa", "okvqa", "a_okvqa_da"]:
|
1186 |
+
prompt = "Answer with a single word. {question}"
|
1187 |
+
elif style in ["text_vqa", "doc_qa", "info_qa", "chart_qa", "st_qa", "ocr_vqa", "dv_qa", "tabwmp_da", "figure_qa", "figure_qa_zero_shot", "plot_qa"]:
|
1188 |
+
prompt = "{question}\nRespond as concisely as possible, do not output anything other than the answer."
|
1189 |
+
elif STYLE_TO_GENERAL_PROMPT[style] == "multiple_choice":
|
1190 |
+
prompt = "{question} Answer with correct answer option letter only. Options: {options}"
|
1191 |
+
elif STYLE_TO_GENERAL_PROMPT[style] == "short_answer":
|
1192 |
+
prompt = "{question} Answer with as few words as possible."
|
1193 |
+
elif style == "vtabfact":
|
1194 |
+
prompt = "{question}"
|
1195 |
+
elif style == "count_bench":
|
1196 |
+
prompt = "How many {object} are there?\nRespond with only a number."
|
1197 |
+
else:
|
1198 |
+
raise NotImplementedError(style)
|
1199 |
+
prompt = apply_keyword_prompt([prompt], example, seed)
|
1200 |
+
else:
|
1201 |
+
raise NotImplementedError(self.prompt_templates)
|
1202 |
+
|
1203 |
+
if for_inference:
|
1204 |
+
return [prompt]
|
1205 |
+
else:
|
1206 |
+
return [prompt, example["text"]]
|
1207 |
+
|
1208 |
+
def get_system_prompt(self, style, example, for_inference,
|
1209 |
+
messages, seed=None):
|
1210 |
+
if isinstance(style, str) and style == "count_bench":
|
1211 |
+
style = "ok_vqa"
|
1212 |
+
|
1213 |
+
if self.system_prompt == "style":
|
1214 |
+
if isinstance(style, str):
|
1215 |
+
prefix = style + ":"
|
1216 |
+
else:
|
1217 |
+
prefix = tf.strings.join([style, ":"])
|
1218 |
+
|
1219 |
+
elif self.system_prompt == "demo_or_style":
|
1220 |
+
if isinstance(style, str):
|
1221 |
+
if style == "android_control" or style == "demo":
|
1222 |
+
# android is a special case since I hacked in prefix in the preprocessor
|
1223 |
+
prefix = ""
|
1224 |
+
elif style in ["scifi_demo", "synthetic_qa"] or style in DEMO_STYLES:
|
1225 |
+
if style == "scifi_demo":
|
1226 |
+
p_no_prompt = 0.2
|
1227 |
+
elif style == "synthetic_qa":
|
1228 |
+
p_no_prompt = 0.25
|
1229 |
+
else:
|
1230 |
+
p_no_prompt = 0.9
|
1231 |
+
if len(tf.shape(messages)) > 1:
|
1232 |
+
n_messages = tf.shape(messages)[1]
|
1233 |
+
style = tf.tile(tf.expand_dims(style, axis=0), [n_messages])
|
1234 |
+
r = tf.random.stateless_uniform([n_messages], seed, 0, 1)
|
1235 |
+
else:
|
1236 |
+
r = tf.random.stateless_uniform((), seed, 0, 1)
|
1237 |
+
prefix = tf.where(r < p_no_prompt, "", tf.strings.join([style + ":"]))
|
1238 |
+
else:
|
1239 |
+
prefix = style + ":"
|
1240 |
+
else:
|
1241 |
+
if tf.reduce_any(style == tf.constant(DEMO_STYLES + ["scifi_demo", "android_control", "demo"])):
|
1242 |
+
prefix = ""
|
1243 |
+
else:
|
1244 |
+
prefix = tf.strings.join([style, ":"])
|
1245 |
+
|
1246 |
+
elif self.system_prompt in ["long_caption_length_hint", "style_long_caption_length_hint"]:
|
1247 |
+
if seed is not None:
|
1248 |
+
raise NotImplementedError("Determinism")
|
1249 |
+
std = 25
|
1250 |
+
use_hint = tf.logical_or(
|
1251 |
+
tf.equal(style, "long_caption"), tf.equal(style, "transcript"))
|
1252 |
+
if self.system_prompt == "style_long_caption_length_hint":
|
1253 |
+
default = tf.strings.join([style, ": "])
|
1254 |
+
else:
|
1255 |
+
default = ""
|
1256 |
+
if for_inference:
|
1257 |
+
assert len(tf.shape(use_hint)) == 0
|
1258 |
+
if self.default_inference_len and use_hint:
|
1259 |
+
prefix = tf.strings.join([style, " ", str(self.default_inference_len), ": "])
|
1260 |
+
else:
|
1261 |
+
prefix = default
|
1262 |
+
else:
|
1263 |
+
std = 25
|
1264 |
+
n = tf.strings.length(messages[-1])
|
1265 |
+
n += tf.cast(tf.random.normal(n.shape)*std, tf.int32)
|
1266 |
+
hint = tf.strings.join([style, " ", tf.strings.as_string(n//15), ": "])
|
1267 |
+
use_hint = tf.logical_and(use_hint, tf.random.uniform(tf.shape(hint)) > 0.1)
|
1268 |
+
prefix = tf.where(use_hint, hint, default)
|
1269 |
+
|
1270 |
+
elif for_inference and self.system_prompt in ["style_and_length", "style_and_length_v2"]:
|
1271 |
+
v2 = self.system_prompt == "style_and_length_v2"
|
1272 |
+
if example.get("length_cond") is not None:
|
1273 |
+
# Examples have individual length conditioning
|
1274 |
+
n = tf.strings.as_string(example["length_cond"])
|
1275 |
+
else:
|
1276 |
+
inference_len = self.default_inference_len
|
1277 |
+
n = None if inference_len is None else str(inference_len)
|
1278 |
+
logging.warning(f"eval len: {n}")
|
1279 |
+
if n is not None and tf.strings.length(n) > 0: # allow empty string to signal unconditioned
|
1280 |
+
prefix = tf.strings.join([style, " ", n, ":"])
|
1281 |
+
else:
|
1282 |
+
prefix = tf.strings.join([style, ":" if v2 else " :"])
|
1283 |
+
elif self.system_prompt in ["style_and_length", "style_and_length_v2"]:
|
1284 |
+
v2 = self.system_prompt == "style_and_length_v2"
|
1285 |
+
std = 25
|
1286 |
+
logging.info(f"style prompt std={std}, percent=10")
|
1287 |
+
if seed is not None:
|
1288 |
+
seeds = tf.random.split(seed)
|
1289 |
+
p = tf.random.stateless_uniform((), seed=seeds[0])
|
1290 |
+
else:
|
1291 |
+
p = tf.random.uniform(())
|
1292 |
+
if p > 0.10:
|
1293 |
+
n = tf.strings.length(messages[-1])
|
1294 |
+
if seed is not None:
|
1295 |
+
n += tf.cast(tf.random.stateless_normal(n.shape, seed=seeds[1])*std, tf.int32)
|
1296 |
+
else:
|
1297 |
+
n += tf.cast(tf.random.normal(n.shape)*std, tf.int32)
|
1298 |
+
n = tf.strings.as_string(n//15)
|
1299 |
+
prefix = tf.strings.join([style, " ", n, ":"])
|
1300 |
+
else:
|
1301 |
+
prefix = tf.strings.join([style, ":" if v2 else " :"])
|
1302 |
+
else:
|
1303 |
+
raise NotImplementedError(self.system_prompt)
|
1304 |
+
|
1305 |
+
return prefix
|
1306 |
+
|
1307 |
+
def preprend_system_prompt(self, style, example, for_inference, messages, seed=None):
|
1308 |
+
prefix = self.get_system_prompt(style, example, for_inference, messages, seed=seed)
|
1309 |
+
separator = tf.where(tf.logical_and(
|
1310 |
+
tf.strings.length(prefix) > 0, tf.strings.length(messages[0]) > 0), " ", "")
|
1311 |
+
with_system_prompt = tf.strings.join([prefix, separator, messages[0]])
|
1312 |
+
if isinstance(messages, list):
|
1313 |
+
messages = [with_system_prompt] + messages[1:]
|
1314 |
+
else:
|
1315 |
+
messages = tf.concat([tf.expand_dims(with_system_prompt, 0), messages[1:]], axis=0)
|
1316 |
+
return messages
|
1317 |
+
|
1318 |
+
def get_messages(self, ex, style, is_training, for_inference, user_prompt_seed, system_prompt_seed):
|
1319 |
+
if isinstance(ex, list):
|
1320 |
+
messages = ex
|
1321 |
+
elif isinstance(ex, str):
|
1322 |
+
messages = [ex]
|
1323 |
+
elif "messages" in ex:
|
1324 |
+
messages = ex["messages"]
|
1325 |
+
else:
|
1326 |
+
# Apply a prompt template
|
1327 |
+
messages = self.get_user_prompt(style, ex, is_training, for_inference=for_inference, seed=user_prompt_seed)
|
1328 |
+
|
1329 |
+
# Maybe add a system prompt. The system prompt gets concatenated with the first user input
|
1330 |
+
if self.system_prompt and self.system_prompt != "none":
|
1331 |
+
if isinstance(ex, dict):
|
1332 |
+
style = ex.get("style", style)
|
1333 |
+
|
1334 |
+
if isinstance(messages, tf.RaggedTensor):
|
1335 |
+
n = tf.shape(messages)[0]
|
1336 |
+
message_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=(None,))
|
1337 |
+
seeds = tf.random.split(system_prompt_seed, n)
|
1338 |
+
for i in range(n):
|
1339 |
+
message_arr = message_arr.write(i, self.preprend_system_prompt(style, None, for_inference, messages[i], seed=seeds[i]))
|
1340 |
+
messages = tf.RaggedTensor.from_row_splits(
|
1341 |
+
values=message_arr.concat(), row_splits=messages.row_splits)
|
1342 |
+
else:
|
1343 |
+
messages = self.preprend_system_prompt(style, ex, for_inference, messages, seed=system_prompt_seed)
|
1344 |
+
|
1345 |
+
return messages
|
1346 |
+
|
1347 |
+
def get_preprocessor(self, is_training, for_inference, style=None, include_metadata=None):
|
1348 |
+
"""Build a preprocessing function that can be applied ot a tf.data.Dataset"""
|
1349 |
+
vocab = self.tokenizer
|
1350 |
+
include_response = not for_inference
|
1351 |
+
if include_metadata is None:
|
1352 |
+
include_metadata = for_inference
|
1353 |
+
|
1354 |
+
@seqio.map_over_dataset(num_seeds=2)
|
1355 |
+
def to_inputs_and_targets(ex, seeds):
|
1356 |
+
if "unconditioned" in ex:
|
1357 |
+
raise NotImplementedError()
|
1358 |
+
if "image" not in ex:
|
1359 |
+
image = None
|
1360 |
+
elif ex['image'].dtype == tf.string:
|
1361 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1362 |
+
else:
|
1363 |
+
image = ex['image']
|
1364 |
+
raw_image = image
|
1365 |
+
if image is not None and len(tf.shape(image)) == 3:
|
1366 |
+
image = tf.expand_dims(image, axis=0)
|
1367 |
+
|
1368 |
+
unconditioned = self.unconditioned
|
1369 |
+
if unconditioned and isinstance(unconditioned, float):
|
1370 |
+
assert image is not None
|
1371 |
+
if is_training and tf.random.uniform((), 0, 1, dtype=tf.float32) < unconditioned:
|
1372 |
+
image = image[:0]
|
1373 |
+
elif unconditioned:
|
1374 |
+
image = None
|
1375 |
+
|
1376 |
+
messages = self.get_messages(ex, style, is_training, for_inference, seeds[0], seeds[1])
|
1377 |
+
targets, loss_masks, subsegments = self.get_tokens_input(
|
1378 |
+
messages, for_inference, ex.get("text_weights"))
|
1379 |
+
# if "scifi" in style and style.endswith("_explanation"):
|
1380 |
+
# logging.warning(f"No loss on EOS for {style}")
|
1381 |
+
# loss_masks = tf.where(targets == self.tokenizer.eos_token_id, tf.zeros_like(loss_masks), loss_masks)
|
1382 |
+
out = self.build_multimodel_features(targets, loss_masks, subsegments, image, is_training)
|
1383 |
+
|
1384 |
+
if include_metadata:
|
1385 |
+
# FIXME remove these special cases
|
1386 |
+
if "text" in ex:
|
1387 |
+
if len(ex["text"].shape) > 0:
|
1388 |
+
# FIXME can this be variable lengths after all?
|
1389 |
+
out["metadata/captions"] = tf.strings.reduce_join(
|
1390 |
+
tf.strings.regex_replace(ex['text'], "\\s+", " "),
|
1391 |
+
separator="\n"
|
1392 |
+
)
|
1393 |
+
else:
|
1394 |
+
out["metadata/captions"] = ex["text"]
|
1395 |
+
|
1396 |
+
if "image_url" in ex:
|
1397 |
+
out["metadata/image_url"] = ex["image_url"]
|
1398 |
+
elif "url" in ex:
|
1399 |
+
out["metadata/image_url"] = ex["url"]
|
1400 |
+
if "image_id" in ex:
|
1401 |
+
out["metadata/image_id"] = ex["image_id"]
|
1402 |
+
for k, v in ex.items():
|
1403 |
+
if k.startswith("metadata"):
|
1404 |
+
out[k] = v
|
1405 |
+
if raw_image is not None and "metadata/image_size" not in out:
|
1406 |
+
img_h = tf.shape(raw_image)[0]
|
1407 |
+
img_w = tf.shape(raw_image)[1]
|
1408 |
+
out["metadata/image_size"] = [img_w, img_h]
|
1409 |
+
if "metadata/image_url" not in out and raw_image is not None:
|
1410 |
+
if len(ex["image"].shape) < 4:
|
1411 |
+
# For visualizations FIXME can we make this variable length
|
1412 |
+
out["metadata/image"] = tf.io.encode_jpeg(
|
1413 |
+
tf.image.convert_image_dtype(raw_image, tf.uint8))
|
1414 |
+
return out
|
1415 |
+
return to_inputs_and_targets
|
1416 |
+
|
1417 |
+
def get_post_mixing_preprocessor(self, pack=False):
|
1418 |
+
"""Build a feature conversion function that can be applied ot a tf.data.Dataset
|
1419 |
+
|
1420 |
+
This function applies a second stage of pre-processing, but unlike `self.get_preprocessor`
|
1421 |
+
this stage can be applied after mixing tf.data.Datasets into a mixture
|
1422 |
+
"""
|
1423 |
+
return MultiModalLMFeatureConverter(
|
1424 |
+
loss_token_weighting=self.loss_token_weighting,
|
1425 |
+
bos_id=self.tokenizer.bos_token_id,
|
1426 |
+
fix_image_input_idx=self.fix_image_input_idx,
|
1427 |
+
pack=pack,
|
1428 |
+
special_tokens=list(self.special_token_ids.values()),
|
1429 |
+
)
|
1430 |
+
|
1431 |
+
|
1432 |
+
class MultiModalLMFeatureConverter:
|
1433 |
+
|
1434 |
+
def __init__(
|
1435 |
+
self, pack: bool = False, loss_token_weighting: str=None, bos_id: int = 1,
|
1436 |
+
special_tokens=None, fix_image_input_idx=2
|
1437 |
+
):
|
1438 |
+
self.pack = pack
|
1439 |
+
self.bos_id = bos_id
|
1440 |
+
self.fix_image_input_idx = fix_image_input_idx
|
1441 |
+
self.special_tokens = tf.constant(special_tokens) if special_tokens else None
|
1442 |
+
self.loss_token_weighting = loss_token_weighting
|
1443 |
+
|
1444 |
+
def _convert_example(
|
1445 |
+
self, features: Mapping[str, tf.Tensor]
|
1446 |
+
) -> Mapping[str, tf.Tensor]:
|
1447 |
+
"""Convert an LM example into an example with model features."""
|
1448 |
+
# targets_segment_id is present only for a packed dataset.
|
1449 |
+
decoder_input_tokens = make_autoregressive_inputs(
|
1450 |
+
features["target_tokens"],
|
1451 |
+
sequence_id=features.get("targets_segment_ids", None),
|
1452 |
+
bos_id=self.bos_id,
|
1453 |
+
)
|
1454 |
+
|
1455 |
+
tf.assert_equal(
|
1456 |
+
True,
|
1457 |
+
tf.reduce_all(decoder_input_tokens[-1] != self.special_tokens),
|
1458 |
+
message="An input ends with an image special token",
|
1459 |
+
)
|
1460 |
+
|
1461 |
+
image_input_idx = features["image_input_idx"]
|
1462 |
+
if self.fix_image_input_idx == 2:
|
1463 |
+
# plus one since we have added BOS to the inputs
|
1464 |
+
image_input_idx = tf.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)
|
1465 |
+
else:
|
1466 |
+
# Some old models were trained like this; sometimes image_input_idx will go from -1 -> 0, which didn't
|
1467 |
+
# affect performance, but keep this code path for backwards compatibility with those checkpoints
|
1468 |
+
image_input_idx = image_input_idx + 1
|
1469 |
+
|
1470 |
+
d = {
|
1471 |
+
"target_tokens": features["target_tokens"],
|
1472 |
+
"input_tokens": decoder_input_tokens,
|
1473 |
+
"loss_masks": features["loss_masks"],
|
1474 |
+
"images": features["images"],
|
1475 |
+
"image_input_idx": image_input_idx
|
1476 |
+
}
|
1477 |
+
if "image_masks" in features:
|
1478 |
+
d["image_masks"] = features["image_masks"]
|
1479 |
+
|
1480 |
+
has_custom_text_weight = features.get("has_custom_loss_weight", False)
|
1481 |
+
|
1482 |
+
if "subsegment_ids" in features:
|
1483 |
+
subsegment_ids = make_autoregressive_inputs(
|
1484 |
+
features["subsegment_ids"],
|
1485 |
+
sequence_id=features.get("targets_segment_ids", None),
|
1486 |
+
bos_id=features["subsegment_ids"][0],
|
1487 |
+
)
|
1488 |
+
|
1489 |
+
# Subsegments get position ids based on the number of previous positions they can attend to
|
1490 |
+
position_ids = tf.zeros_like(subsegment_ids)
|
1491 |
+
unique_segments = tf.unique(subsegment_ids)[0]
|
1492 |
+
for i in unique_segments:
|
1493 |
+
segment_position_ids = tf.cumsum(tf.cast(subsegment_ids >= i, tf.int32)) - 1
|
1494 |
+
position_ids = tf.where(subsegment_ids == i, segment_position_ids, position_ids)
|
1495 |
+
|
1496 |
+
# Apply loss weighting, this is done here so it occurs after truncation
|
1497 |
+
if has_custom_text_weight:
|
1498 |
+
pass
|
1499 |
+
elif self.loss_token_weighting in ["subsegments", "root_subsegments"]:
|
1500 |
+
n_loss_segments = tf.shape(tf.unique(tf.boolean_mask(subsegment_ids, d["loss_masks"] > 0))[0])[0]
|
1501 |
+
n_loss_segments = tf.maximum(tf.cast(n_loss_segments, tf.float32), 1)
|
1502 |
+
weight = 1/n_loss_segments if self.loss_token_weighting == "subsegments" else tf.math.rsqrt(n_loss_segments)
|
1503 |
+
d["loss_masks"] = tf.where(d["loss_masks"] > 0, d["loss_masks"]*weight, d["loss_masks"])
|
1504 |
+
elif self.loss_token_weighting is not None:
|
1505 |
+
raise NotImplementedError(self.loss_token_weighting)
|
1506 |
+
|
1507 |
+
d["subsegment_ids"] = subsegment_ids
|
1508 |
+
d["position_ids"] = position_ids
|
1509 |
+
else:
|
1510 |
+
if self.loss_token_weighting not in [None, "subsegments", "root_subsegments"] and not has_custom_text_weight:
|
1511 |
+
raise NotImplementedError(self.loss_token_weighting)
|
1512 |
+
if self.pack:
|
1513 |
+
d["decoder_segment_ids"] = features["targets_segment_ids"]
|
1514 |
+
d["decoder_positions"] = features["targets_positions"]
|
1515 |
+
|
1516 |
+
for k in features:
|
1517 |
+
if k.startswith("metadata/"):
|
1518 |
+
d[k] = features[k]
|
1519 |
+
return d
|
1520 |
+
|
1521 |
+
def _pack_or_pad(self, ds, task_feature_lengths):
|
1522 |
+
if self.pack:
|
1523 |
+
raise NotImplementedError()
|
1524 |
+
else:
|
1525 |
+
return trim_and_pad_dataset(ds, task_feature_lengths)
|
1526 |
+
|
1527 |
+
def __call__(self, ds: tf.data.Dataset, task_feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
|
1528 |
+
"""Convert the dataset to be fed to a language model."""
|
1529 |
+
task_feature_lengths = dict(task_feature_lengths)
|
1530 |
+
|
1531 |
+
if "images" in ds.element_spec and "images" in task_feature_lengths:
|
1532 |
+
# Images should never be truncated
|
1533 |
+
ds = assert_not_truncated(ds, ["images", "image_input_idx"], task_feature_lengths["images"])
|
1534 |
+
|
1535 |
+
if any(x.startswith("metadata/") for x in ds.element_spec):
|
1536 |
+
# Metadata indicates the dataset is being used for inference, inference datasets
|
1537 |
+
# should not be truncated
|
1538 |
+
ds = assert_not_truncated(ds, ["target_tokens"], task_feature_lengths["target_tokens"])
|
1539 |
+
|
1540 |
+
if "image_masks" in ds.element_spec and "images" in task_feature_lengths:
|
1541 |
+
task_feature_lengths["image_masks"] = task_feature_lengths["images"]
|
1542 |
+
if "subsegment_ids" in ds.element_spec and "target_tokens" in task_feature_lengths:
|
1543 |
+
task_feature_lengths["subsegment_ids"] = task_feature_lengths["target_tokens"]
|
1544 |
+
if "loss_masks" not in task_feature_lengths and "target_tokens" in task_feature_lengths:
|
1545 |
+
task_feature_lengths["loss_masks"] = task_feature_lengths["target_tokens"]
|
1546 |
+
ds = self._pack_or_pad(ds, task_feature_lengths)
|
1547 |
+
|
1548 |
+
return ds.map(
|
1549 |
+
self._convert_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
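For reference, a minimal numpy sketch (not part of the diff) of how the subsegment-based loss re-weighting in `_convert_example` behaves; the helper name below is illustrative and the numpy ops stand in for the TF ones above.

import numpy as np

def reweight_loss_masks(loss_masks, subsegment_ids, mode="root_subsegments"):
    # Count the distinct subsegments that actually contribute loss tokens.
    active = np.unique(subsegment_ids[loss_masks > 0])
    n = max(len(active), 1)
    weight = 1.0 / n if mode == "subsegments" else 1.0 / np.sqrt(n)
    out = np.array(loss_masks, dtype=np.float64)
    out[out > 0] *= weight
    return out

# Two answer subsegments sharing one prompt: each loss token is scaled by 1/sqrt(2).
print(reweight_loss_masks(np.array([0., 1., 1., 1., 1.]), np.array([0, 1, 1, 2, 2])))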
preprocesssors.py
ADDED
@@ -0,0 +1,2472 @@
1 |
+
import hashlib
|
2 |
+
import json
|
3 |
+
import math
|
4 |
+
from functools import reduce
|
5 |
+
from typing import Mapping, Optional, Sequence
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import tensorflow as tf
|
9 |
+
import seqio
|
10 |
+
import gin
|
11 |
+
|
12 |
+
from .data_utils import flatten_parts, stateless_permutation, stateless_shuffle
|
13 |
+
from .. import config
|
14 |
+
|
15 |
+
|
16 |
+
def get_from_dict(data, keys):
|
17 |
+
"""Iterate nested dictionary"""
|
18 |
+
return reduce(dict.get, keys, data)
|
19 |
+
|
20 |
+
def get_blank_image():
|
21 |
+
image = tf.zeros([224, 224, 3], dtype=tf.uint8)
|
22 |
+
image = tf.expand_dims(image, 0)[:1]
|
23 |
+
return image
|
24 |
+
|
25 |
+
|
26 |
+
@seqio.utils.map_over_dataset
|
27 |
+
def rekey(x, key_map=None):
|
28 |
+
"""Replace the feature keys according to the mapping in `key_map`.
|
29 |
+
For example, if the dataset returns examples of the format:
|
30 |
+
{'foo': 'something', 'bar': 'something else'}
|
31 |
+
and key_map = {'boo': 'foo', 'spar': 'bar'} then this function will return
|
32 |
+
examples with the format
|
33 |
+
{'boo': 'something', 'spar': 'something else'}
|
34 |
+
If a mapping is to an empty key or None, set the new key to an empty string.
|
35 |
+
Args:
|
36 |
+
x: an example to process.
|
37 |
+
key_map: dictionary mapping new keys to original keys
|
38 |
+
Returns:
|
39 |
+
A preprocessed example with the format listed above.
|
40 |
+
"""
|
41 |
+
if key_map:
|
42 |
+
out = {}
|
43 |
+
for new_key, old_key in key_map.items():
|
44 |
+
if isinstance(old_key, list):
|
45 |
+
out[new_key] = get_from_dict(x, old_key)
|
46 |
+
else:
|
47 |
+
out[new_key] = x[old_key]
|
48 |
+
return out
|
49 |
+
return x
|
50 |
+
|
51 |
+
|
52 |
+
def rename(**kwargs):
|
53 |
+
@seqio.map_over_dataset
|
54 |
+
def _fn(x):
|
55 |
+
updates = {}
|
56 |
+
for new_key, old_key in kwargs.items():
|
57 |
+
if isinstance(old_key, list):
|
58 |
+
val = x[old_key[0]]
|
59 |
+
for k in old_key[1:-1]:
|
60 |
+
val = val[k]
|
61 |
+
updates[new_key] = val.pop(old_key[-1])
|
62 |
+
else:
|
63 |
+
updates[new_key] = x.pop(old_key)
|
64 |
+
x.update(updates)
|
65 |
+
return x
|
66 |
+
return _fn
|
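Illustrative usage of `rename` above (a sketch, run eagerly; the dataset literal and key names below are made up):

import tensorflow as tf

ds = tf.data.Dataset.from_tensors({"img": tf.zeros([2, 2, 3]), "cap": tf.constant("a photo")})
ds = rename(image="img", text="cap")(ds)
print(list(next(iter(ds)).keys()))  # old keys are popped, leaving 'image' and 'text'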
67 |
+
|
68 |
+
|
69 |
+
def extract_transcripts(ds):
|
70 |
+
ds = flatten_parts(ds, ["transcripts"])
|
71 |
+
def _map(ex):
|
72 |
+
return dict(
|
73 |
+
image=ex["image"],
|
74 |
+
text=ex["transcripts"],
|
75 |
+
url=ex["url"]
|
76 |
+
)
|
77 |
+
return ds.map(_map)
|
78 |
+
|
79 |
+
|
80 |
+
@seqio.map_over_dataset
|
81 |
+
def extract_caption_and_all_transcripts(ex):
|
82 |
+
transcripts = tf.random.shuffle(ex["transcripts"])[:3]
|
83 |
+
weight = 1.0 / tf.cast(tf.shape(transcripts)[0], tf.float32)
|
84 |
+
return dict(
|
85 |
+
image=ex["image"],
|
86 |
+
text=tf.concat([tf.expand_dims(ex["caption"], 0), transcripts], 0),
|
87 |
+
url=ex["url"],
|
88 |
+
text_weights=tf.pad(
|
89 |
+
tf.ones((1,), dtype=tf.float32), [[0, tf.shape(transcripts)[0]]],
|
90 |
+
constant_values=weight),
|
91 |
+
)
|
92 |
+
|
93 |
+
|
94 |
+
@seqio.map_over_dataset
|
95 |
+
def extract_all_transcripts(ex):
|
96 |
+
transcripts = tf.random.shuffle(ex["transcripts"])[:3]
|
97 |
+
weight = 3.0 / tf.cast(tf.shape(transcripts)[0], tf.float32)
|
98 |
+
return dict(
|
99 |
+
image=ex["image"],
|
100 |
+
text=transcripts,
|
101 |
+
url=ex["url"],
|
102 |
+
text_weights=tf.fill((tf.shape(transcripts)[0],), weight),
|
103 |
+
)
|
104 |
+
|
105 |
+
|
106 |
+
@seqio.map_over_dataset
|
107 |
+
def extract_transcript(ex):
|
108 |
+
transcripts = tf.random.shuffle(ex["transcripts"])
|
109 |
+
return dict(
|
110 |
+
image=ex["image"],
|
111 |
+
text=transcripts[0],
|
112 |
+
url=ex["url"],
|
113 |
+
)
|
114 |
+
|
115 |
+
|
116 |
+
@seqio.map_over_dataset
|
117 |
+
def extract_caption(ex):
|
118 |
+
caption = ex["caption"]
|
119 |
+
if len(caption.shape) > 0:
|
120 |
+
ex["text"] = caption[0]
|
121 |
+
else:
|
122 |
+
ex["text"] = caption
|
123 |
+
return ex
|
124 |
+
|
125 |
+
|
126 |
+
@seqio.map_over_dataset
|
127 |
+
def extract_joint_captions(ex):
|
128 |
+
caption = ex["caption"]
|
129 |
+
if len(caption.shape) > 0:
|
130 |
+
caption = caption[0]
|
131 |
+
_ix = tf.random.uniform((), 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)
|
132 |
+
_ix = _ix % tf.shape(ex["transcripts"])[0]
|
133 |
+
return dict(
|
134 |
+
image=ex["image"],
|
135 |
+
text=tf.stack([caption, ex["mistral_caption"], ex["transcripts"][_ix]], 0),
|
136 |
+
url=ex["url"]
|
137 |
+
)
|
138 |
+
|
139 |
+
|
140 |
+
@seqio.map_over_dataset(num_seeds=1)
|
141 |
+
def extract_caption_and_transcript(ex, seed):
|
142 |
+
caption = ex["caption"]
|
143 |
+
if len(caption.shape) > 0:
|
144 |
+
caption = caption[0]
|
145 |
+
_ix = tf.random.stateless_uniform((), seed, 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)
|
146 |
+
return dict(
|
147 |
+
image=ex["image"],
|
148 |
+
text=tf.stack([caption, ex["transcripts"][_ix]], 0),
|
149 |
+
url=ex["url"]
|
150 |
+
)
|
151 |
+
|
152 |
+
|
153 |
+
@seqio.map_over_dataset
|
154 |
+
def caption_transcript_augmented(ex, sequence_length):
|
155 |
+
caption = ex["caption"]
|
156 |
+
if len(caption.shape) > 0:
|
157 |
+
caption = caption[0]
|
158 |
+
image = ex["image"]
|
159 |
+
properties = []
|
160 |
+
|
161 |
+
do_augmentation = sequence_length["is_training"]
|
162 |
+
# do_augmentation = False
|
163 |
+
|
164 |
+
# Keep this off, it screws up OCR
|
165 |
+
# do_hflip = (tf.random.uniform(()) > 0.2 and do_augmentation)
|
166 |
+
do_hflip = False
|
167 |
+
if do_hflip:
|
168 |
+
image = image[:, ::-1]
|
169 |
+
|
170 |
+
# Mild color jitter
|
171 |
+
do_color = (tf.random.uniform(()) > 0.5 and do_augmentation)
|
172 |
+
if do_color:
|
173 |
+
image = tf.image.random_hue(image, max_delta=0.05)
|
174 |
+
image = tf.image.random_brightness(image, max_delta=0.2)
|
175 |
+
image = tf.image.random_saturation(image, 0.7, 1.3)
|
176 |
+
image = tf.image.random_contrast(image, 0.7, 1.3)
|
177 |
+
|
178 |
+
# Mild affine transformation
|
179 |
+
do_affine = (tf.random.uniform(()) > 0.5 and do_augmentation)
|
180 |
+
if do_affine and do_augmentation:
|
181 |
+
shift_x = tf.random.uniform((), -10, 10) * 0
|
182 |
+
shift_y = tf.random.uniform((), -10, 10) * 0
|
183 |
+
shear_x = tf.random.uniform((), -2, 2)
|
184 |
+
shear_y = tf.random.uniform((), -2, 2)
|
185 |
+
rotation = tf.random.uniform((), -6, 6)
|
186 |
+
max_scale = 1.1
|
187 |
+
scale = tf.random.uniform((), 0.8, max_scale)
|
188 |
+
center = tf.cast(tf.shape(image), tf.float32)/2
|
189 |
+
|
190 |
+
image = tf.keras.ops.image.affine_transform(
|
191 |
+
image,
|
192 |
+
tf.stack(get_affine_matrix(
|
193 |
+
[center[0], center[1]],
|
194 |
+
rotation,
|
195 |
+
[shift_x, shift_y],
|
196 |
+
1/scale,
|
197 |
+
[shear_x, shear_y]
|
198 |
+
) + [0., 0.]),
|
199 |
+
interpolation='bilinear',
|
200 |
+
fill_mode='constant',
|
201 |
+
fill_value=1.,
|
202 |
+
data_format='channels_last'
|
203 |
+
)
|
204 |
+
|
205 |
+
properties = tf.stack([
|
206 |
+
("[hflip]" if do_hflip else ""),
|
207 |
+
("[color]" if do_color else ""),
|
208 |
+
("[affine]" if do_affine else "")
|
209 |
+
])
|
210 |
+
properties = tf.boolean_mask(properties, tf.strings.length(properties) > 0)
|
211 |
+
prompt = tf.strings.reduce_join(properties, separator=" ")
|
212 |
+
ix = tf.random.uniform((), 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)
|
213 |
+
out = dict(
|
214 |
+
image=image,
|
215 |
+
text=tf.stack([caption, ex["transcripts"][ix]], 0),
|
216 |
+
url=ex["url"],
|
217 |
+
prompt=prompt,
|
218 |
+
)
|
219 |
+
# out["metadata/unaugmented_image"] = image
|
220 |
+
return out
|
221 |
+
|
222 |
+
|
223 |
+
def extract_caption_and_transcript_hflip(ds):
|
224 |
+
|
225 |
+
# Just in case they are ordered somehow in Matt's data
|
226 |
+
@seqio.map_over_dataset
|
227 |
+
def _shuffle_transcripts(_ex):
|
228 |
+
_ex["transcripts"] = tf.random.shuffle(_ex["transcripts"])
|
229 |
+
_ex["hflip"] = tf.random.uniform((), 0, 3, dtype=tf.int32)
|
230 |
+
return _ex
|
231 |
+
|
232 |
+
ds = _shuffle_transcripts(ds)
|
233 |
+
|
234 |
+
# Build a 3x long dataset with each individual transcript so we iterate through
|
235 |
+
# each transcript
|
236 |
+
@seqio.map_over_dataset
|
237 |
+
def _with_transcript(ex, _ix):
|
238 |
+
caption = ex["caption"]
|
239 |
+
if len(caption.shape) > 0:
|
240 |
+
caption = caption[0]
|
241 |
+
hflip = ex["hflip"] == _ix
|
242 |
+
if hflip:
|
243 |
+
ex["image"] = ex["image"][:, ::-1]
|
244 |
+
style = ["long_caption_flipped", "transcript_flipped"]
|
245 |
+
else:
|
246 |
+
style = ["long_caption", "transcript"]
|
247 |
+
return dict(
|
248 |
+
image=ex["image"],
|
249 |
+
text=tf.stack([caption, ex["transcripts"][_ix]], 0),
|
250 |
+
url=ex["url"],
|
251 |
+
style=style
|
252 |
+
)
|
253 |
+
|
254 |
+
joint_ds = _with_transcript(ds, 0)
|
255 |
+
for i in range(1, 3):
|
256 |
+
joint_ds = joint_ds.concatenate(_with_transcript(ds, i))
|
257 |
+
|
258 |
+
return joint_ds
|
259 |
+
|
260 |
+
|
261 |
+
@seqio.map_over_dataset
|
262 |
+
def extract_llava(ex, sequence_length, output_features):
|
263 |
+
tf.assert_equal(tf.shape(ex['conversations']['value'])[0], 2)
|
264 |
+
prompt = ex['conversations']['value'][0]
|
265 |
+
text = ex['conversations']['value'][1]
|
266 |
+
ex.pop('conversations')
|
267 |
+
ex["text"] = text
|
268 |
+
ex["prompt"] = prompt
|
269 |
+
return ex
|
270 |
+
|
271 |
+
|
272 |
+
def extract_localized_narrative(ds):
|
273 |
+
ds = ds.filter(lambda ex: tf.shape(ex["cap/cap_caption"])[0] > 0)
|
274 |
+
def _map(ex):
|
275 |
+
return dict(
|
276 |
+
image=ex["image"],
|
277 |
+
text=tf.strings.reduce_join(ex["cap/cap_caption"], separator="\n")
|
278 |
+
)
|
279 |
+
return ds.map(_map)
|
280 |
+
|
281 |
+
|
282 |
+
def float_to_text(val):
|
283 |
+
return tf.strings.as_string(tf.cast(val * 100, tf.int32))
|
284 |
+
|
285 |
+
|
286 |
+
@seqio.map_over_dataset
|
287 |
+
def extract_vqa(ex):
|
288 |
+
questions = ex["vqa"]["questions"]
|
289 |
+
answers = ex["vqa"]["answers"]
|
290 |
+
answers = tf.strings.reduce_join(answers, 1, separator="; ")
|
291 |
+
qas = tf.strings.reduce_join(tf.stack([questions, answers], 1), separator=" ")
|
292 |
+
return dict(
|
293 |
+
image=ex["image"],
|
294 |
+
text=tf.strings.reduce_join(qas, separator="\n")
|
295 |
+
)
|
296 |
+
|
297 |
+
|
298 |
+
@seqio.map_over_dataset
|
299 |
+
def coco_image_id_from_path(ex):
|
300 |
+
image_id = tf.strings.substr(ex["image/filename"], 0, tf.strings.length(ex["image/filename"])-4)
|
301 |
+
ex["image_id"] = tf.strings.to_number(image_id)
|
302 |
+
return ex
|
303 |
+
|
304 |
+
|
305 |
+
@seqio.map_over_dataset
|
306 |
+
def add_coco_url(ex):
|
307 |
+
"""Turns a COCO path into a URL, which can then be used in visualizations"""
|
308 |
+
path = ex["image/filename"]
|
309 |
+
if not tf.strings.regex_full_match(path, ".*/.*"):
|
310 |
+
prefix = tf.strings.regex_replace(path, "COCO_", "")
|
311 |
+
prefix = tf.strings.regex_replace(prefix, "_[0-9]+.jpg", "")
|
312 |
+
path = tf.strings.join([prefix, path], separator="/")
|
313 |
+
|
314 |
+
# images are hosted by the COCO website here
|
315 |
+
url = tf.strings.join(["https://s3.us-east-1.amazonaws.com/images.cocodataset.org/", path])
|
316 |
+
ex["metadata/image_url"] = url
|
317 |
+
return ex
|
318 |
+
|
319 |
+
|
320 |
+
def flatten_vqa(ds):
|
321 |
+
parts = ["questions", "answers"]
|
322 |
+
for k in ["id", "question_id"]:
|
323 |
+
if k in ds.element_spec:
|
324 |
+
parts.append(k)
|
325 |
+
return flatten_parts(ds, parts)
|
326 |
+
|
327 |
+
|
328 |
+
def format_gqa(ds, is_balanced=True, flatten=True):
|
329 |
+
if is_balanced:
|
330 |
+
ds = ds.filter(lambda x: tf.reduce_any(x["questions"]["is_balanced"]))
|
331 |
+
def _filter_qs(ex):
|
332 |
+
qs = ex["questions"]
|
333 |
+
mask = qs["is_balanced"]
|
334 |
+
qs = {k: tf.boolean_mask(v, mask) for k, v in qs.items()}
|
335 |
+
ex["questions"] = qs
|
336 |
+
return ex
|
337 |
+
ds = ds.map(_filter_qs)
|
338 |
+
|
339 |
+
if flatten:
|
340 |
+
ds = flatten_parts(ds, ["questions"])
|
341 |
+
|
342 |
+
def _rename(ex):
|
343 |
+
out = ex["questions"]
|
344 |
+
out["image"] = ex["image"]
|
345 |
+
out["image_id"] = ex["image_id"]
|
346 |
+
return out
|
347 |
+
return ds.map(_rename)
|
348 |
+
|
349 |
+
|
350 |
+
@seqio.map_over_dataset
|
351 |
+
def fix_doqa_url(x):
|
352 |
+
x["image_url"] = tf.strings.regex_replace(x["image_url"], "gs://", "")
|
353 |
+
return x
|
354 |
+
|
355 |
+
|
356 |
+
def _add_metadata(ex):
|
357 |
+
out = {}
|
358 |
+
if "id" in ex:
|
359 |
+
out["metadata/example_id"] = ex["id"]
|
360 |
+
elif "example_id" in ex:
|
361 |
+
out["metadata/example_id"] = ex["example_id"]
|
362 |
+
elif "question_id" in ex:
|
363 |
+
out["metadata/example_id"] = ex["question_id"]
|
364 |
+
if "image_url" in ex:
|
365 |
+
out["metadata/image_url"] = ex["image_url"]
|
366 |
+
for k, v in ex.items():
|
367 |
+
if k.startswith("metadata/"):
|
368 |
+
out[k] = v
|
369 |
+
return out
|
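A quick illustration of `_add_metadata` (the values below are made up): an id-like field is surfaced as `metadata/example_id`, the image URL is copied, and any existing `metadata/` keys pass through unchanged.

ex = {"question_id": 7, "image_url": "http://example.com/1.jpg", "metadata/split": "val"}
print(_add_metadata(ex))
# {'metadata/example_id': 7, 'metadata/image_url': 'http://example.com/1.jpg', 'metadata/split': 'val'}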
370 |
+
|
371 |
+
|
372 |
+
def image_only(ds):
|
373 |
+
return ds.filter(lambda x: x["has_image"])
|
374 |
+
|
375 |
+
|
376 |
+
def filter_difficult_direct_answer(ds):
|
377 |
+
return ds.filter(lambda x: not x["difficult_direct_answer"])
|
378 |
+
|
379 |
+
|
380 |
+
@seqio.map_over_dataset()
|
381 |
+
def format_ai2d(ex, variable_style=True):
|
382 |
+
abc = tf.constant(list("abcdefg".upper()))
|
383 |
+
out = dict(image=ex["image"])
|
384 |
+
out.update(_add_metadata(ex))
|
385 |
+
|
386 |
+
options = ex["choices"]
|
387 |
+
# >= n_options - 1 to allow for one "none of the above"-style answer
|
388 |
+
n_options = tf.shape(ex["option_is_abc"])[0]
|
389 |
+
if ex["abc_label"] and tf.reduce_sum(tf.cast(ex["option_is_abc"], tf.int32)) >= (n_options - 1):
|
390 |
+
# The image labels are always uppercase, so use uppercase in the answer options
|
391 |
+
options = tf.where(
|
392 |
+
ex["option_is_abc"],
|
393 |
+
tf.strings.upper(options),
|
394 |
+
options
|
395 |
+
)
|
396 |
+
short_options = options
|
397 |
+
style = "ai2_diagram_no_letter"
|
398 |
+
else:
|
399 |
+
short_options = abc[:tf.shape(options)[0]]
|
400 |
+
options = tf.stack([short_options, options,], 1)
|
401 |
+
options = tf.strings.reduce_join(options, axis=-1, separator=": ")
|
402 |
+
style = "ai2_diagram"
|
403 |
+
|
404 |
+
options = tf.strings.reduce_join(options, separator="\n")
|
405 |
+
out["question"] = ex["question"]
|
406 |
+
out["options"] = options
|
407 |
+
if variable_style:
|
408 |
+
out["style"] = style
|
409 |
+
if ex["answer_idx"] < 0:
|
410 |
+
out["text"] = "?"
|
411 |
+
else:
|
412 |
+
out["text"] = short_options[ex["answer_idx"]]
|
413 |
+
out["metadata/answer_idx"] = ex["answer_idx"]
|
414 |
+
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, ".*\|\|\|.*")), False)
|
415 |
+
out["metadata/option_names"] = tf.strings.reduce_join(short_options, separator="|||")
|
416 |
+
out["metadata/has_transparent_box"] = ex.get("has_transparent_box", tf.constant(False))
|
417 |
+
out["metadata/abc_label"] = ex["abc_label"]
|
418 |
+
return out
|
419 |
+
|
420 |
+
|
421 |
+
@gin.configurable()
|
422 |
+
@seqio.map_over_dataset()
|
423 |
+
def format_multiple_choice_qa(ex, option_format="abc"):
|
424 |
+
assert option_format == "abc"
|
425 |
+
abc = tf.constant(list("abcdefg".upper()))
|
426 |
+
out = dict(image=ex["image"])
|
427 |
+
out.update(_add_metadata(ex))
|
428 |
+
options = ex["choices"]
|
429 |
+
short_options = abc[:tf.shape(options)[0]]
|
430 |
+
options = tf.stack([short_options, options,], 1)
|
431 |
+
options = tf.strings.reduce_join(options, axis=-1, separator=": ")
|
432 |
+
options = tf.strings.reduce_join(options, separator="\n")
|
433 |
+
out["question"] = ex["question"]
|
434 |
+
out["options"] = options
|
435 |
+
if ex["answer_idx"] < 0:
|
436 |
+
out["text"] = "?"
|
437 |
+
else:
|
438 |
+
out["text"] = short_options[ex["answer_idx"]]
|
439 |
+
out["metadata/answer_idx"] = ex["answer_idx"]
|
440 |
+
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, ".*\|\|\|.*")), False)
|
441 |
+
out["metadata/option_names"] = tf.strings.reduce_join(short_options, separator="|||")
|
442 |
+
# out["metadata/option_names"] = tf.RaggedTensor.from_row_lengths(short_options, tf.shape(short_options))
|
443 |
+
# out["metadata/option_names"] = short_options
|
444 |
+
return out
|
445 |
+
|
446 |
+
|
447 |
+
@seqio.map_over_dataset()
|
448 |
+
def output_options(ex):
|
449 |
+
ex["metadata/options"] = ex["options"]
|
450 |
+
return ex
|
451 |
+
|
452 |
+
|
453 |
+
@seqio.map_over_dataset()
|
454 |
+
def extract_tally_qa(ex):
|
455 |
+
questions = ex.pop("questions")
|
456 |
+
ex["questions"] = questions["question"]
|
457 |
+
ex["answers"] = tf.strings.as_string(questions["answer"])
|
458 |
+
ex["question_id"] = questions["question_id"]
|
459 |
+
return ex
|
460 |
+
|
461 |
+
|
462 |
+
@seqio.map_over_dataset()
|
463 |
+
def count_bench_preprocessor(ex):
|
464 |
+
return {
|
465 |
+
"image": ex["image"],
|
466 |
+
"text": tf.strings.as_string(ex["number"]),
|
467 |
+
"object": ex["noun"],
|
468 |
+
"question": tf.strings.join([
|
469 |
+
"How many ", ex["noun"], " are there?"
|
470 |
+
]),
|
471 |
+
"metadata/count": ex["number"],
|
472 |
+
}
|
473 |
+
|
474 |
+
|
475 |
+
def filter_human(ds):
|
476 |
+
return ds.filter(lambda x: x["is_human"])
|
477 |
+
|
478 |
+
|
479 |
+
def filter_aug(ds):
|
480 |
+
return ds.filter(lambda x: not x["is_human"])
|
481 |
+
|
482 |
+
|
483 |
+
@seqio.map_over_dataset()
|
484 |
+
def reweight_chartqa(ex, human, aug):
|
485 |
+
is_human = ex["metadata/is_human"]
|
486 |
+
ex["text_weights"] = human if is_human else aug
|
487 |
+
return ex
|
488 |
+
|
489 |
+
|
490 |
+
@seqio.map_over_dataset()
|
491 |
+
def chartqa_prompting(ex):
|
492 |
+
question = tf.strings.join([ex["question"], " Answer:"])
|
493 |
+
return dict(
|
494 |
+
image=ex["image"],
|
495 |
+
question=question,
|
496 |
+
answer=ex["answer"]
|
497 |
+
)
|
498 |
+
|
499 |
+
|
500 |
+
@seqio.map_over_dataset()
|
501 |
+
def chartqa_explanation(ex):
|
502 |
+
question = tf.strings.join([ex["question"], " Explanation:"])
|
503 |
+
out = {
|
504 |
+
"image": ex["image"],
|
505 |
+
"question": question,
|
506 |
+
"answer": ex["answer"],
|
507 |
+
}
|
508 |
+
out.update({k: v for k, v in ex.items() if k.startswith("metadata/")})
|
509 |
+
return out
|
510 |
+
|
511 |
+
|
512 |
+
@seqio.map_over_dataset(num_seeds=1)
|
513 |
+
def _preprocess_scifi(ex, seed):
|
514 |
+
if "qa_pairs" in ex:
|
515 |
+
q = ex["qa_pairs"]
|
516 |
+
else:
|
517 |
+
q = ex["qa"]
|
518 |
+
ix = stateless_permutation(tf.shape(q["question"])[0], seed)
|
519 |
+
return dict(
|
520 |
+
image=ex["image"],
|
521 |
+
question=tf.gather(q["question"], ix),
|
522 |
+
explanation=tf.gather(q["explanation"], ix),
|
523 |
+
answer=tf.gather(q["answer"], ix),
|
524 |
+
)
|
525 |
+
|
526 |
+
@seqio.map_over_dataset
|
527 |
+
def scifi_explanation_only(ex):
|
528 |
+
return dict(
|
529 |
+
image=ex["image"],
|
530 |
+
question=ex["question"],
|
531 |
+
answer=ex["explanation"],
|
532 |
+
)
|
533 |
+
|
534 |
+
|
535 |
+
def filter_named_entity(ds):
|
536 |
+
@seqio.map_over_dataset
|
537 |
+
def _load_image(ex):
|
538 |
+
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
539 |
+
return ex
|
540 |
+
|
541 |
+
ds = _load_image(ds)
|
542 |
+
return ds.filter(lambda x: tf.reduce_min(tf.shape(x["image"])[:2]) >= 32)
|
543 |
+
|
544 |
+
|
545 |
+
@seqio.map_over_dataset()
|
546 |
+
def extract_named_entity(ex):
|
547 |
+
qs = ex["questions"]
|
548 |
+
return {
|
549 |
+
"image": ex["image"],
|
550 |
+
"metadata/image_url": ex["url"],
|
551 |
+
"metadata/entity": ex["entity"],
|
552 |
+
"questions": qs["question"],
|
553 |
+
"answers": qs["answer"],
|
554 |
+
}
|
555 |
+
|
556 |
+
@gin.configurable()
|
557 |
+
def extract_individual_vqa(ds, test=False, answer_mode="best"):
|
558 |
+
|
559 |
+
@seqio.map_over_dataset(num_seeds=1)
|
560 |
+
def _extract(ex, seed):
|
561 |
+
if "questions" in ex:
|
562 |
+
question = ex["questions"]
|
563 |
+
else:
|
564 |
+
question = ex["question"]
|
565 |
+
out = dict(
|
566 |
+
image=ex["image"],
|
567 |
+
question=question,
|
568 |
+
)
|
569 |
+
out.update(_add_metadata(ex))
|
570 |
+
out["metadata/question"] = question
|
571 |
+
if ex.get("answers") is not None:
|
572 |
+
out["metadata/references"] = tf.strings.reduce_join(ex["answers"], separator="\n")
|
573 |
+
elif ex.get("answer") is not None:
|
574 |
+
out["metadata/references"] = ex["answer"]
|
575 |
+
|
576 |
+
if not test:
|
577 |
+
if "answer" in ex:
|
578 |
+
answer = ex["answer"]
|
579 |
+
else:
|
580 |
+
answer = ex["answers"]
|
581 |
+
if answer.dtype in [tf.int32, tf.int64]:
|
582 |
+
answer = tf.strings.as_string(answer)
|
583 |
+
if len(answer.shape) == 1 and tf.shape(answer)[0] == 0:
|
584 |
+
answer = tf.expand_dims("", 0)
|
585 |
+
if len(answer.shape) == len(question.shape):
|
586 |
+
pass
|
587 |
+
# Handle questions with multiple answers
|
588 |
+
elif answer_mode == "random":
|
589 |
+
assert len(answer.shape) == 1
|
590 |
+
answer = answer[tf.random.stateless_uniform((), seed, 0, tf.shape(answer)[0], dtype=tf.int32)]
|
591 |
+
elif answer_mode == "best":
|
592 |
+
def _get_best(_answer):
|
593 |
+
vals, _, counts = tf.unique_with_counts(_answer)
|
594 |
+
count_thresh = tf.reduce_max(counts)
|
595 |
+
vals = tf.boolean_mask(vals, counts >= count_thresh)
|
596 |
+
return vals[tf.random.stateless_uniform((), seed, 0, tf.shape(vals)[0], dtype=tf.int32)]
|
597 |
+
if len(answer.shape) == 1:
|
598 |
+
answer = _get_best(answer)
|
599 |
+
elif isinstance(answer, tf.RaggedTensor):
|
600 |
+
n = tf.shape(answer)[0]
|
601 |
+
answer_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=())
|
602 |
+
for i in range(n):
|
603 |
+
answer_arr = answer_arr.write(i, _get_best(answer[i]))
|
604 |
+
answer = answer_arr.stack()
|
605 |
+
else:
|
606 |
+
answer = tf.map_fn(_get_best, answer)
|
607 |
+
elif answer_mode == "all_segments":
|
608 |
+
out["text"] = answer
|
609 |
+
elif answer_mode == "all_segments_weighted":
|
610 |
+
out["text"] = answer
|
611 |
+
out["text_weights"] = 1.0 / tf.cast(tf.shape(answer)[-1], tf.float32)
|
612 |
+
elif answer_mode == "all":
|
613 |
+
if len(answer.shape) == 1:
|
614 |
+
answer = stateless_shuffle(answer, seed)
|
615 |
+
answer = tf.strings.reduce_join(answer, separator="\n", axis=-1)
|
616 |
+
elif isinstance(answer, tf.RaggedTensor):
|
617 |
+
n = tf.shape(answer)[0]
|
618 |
+
answer_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=())
|
619 |
+
for i in range(n):
|
620 |
+
answer_arr = answer_arr.write(i, tf.strings.reduce_join(tf.random.shuffle(answer[i]), separator="\n", axis=-1))
|
621 |
+
answer = answer_arr.stack()
|
622 |
+
else:
|
623 |
+
answer = tf.map_fn(tf.random.shuffle, answer)
|
624 |
+
answer = tf.strings.reduce_join(answer, separator="\n", axis=-1)
|
625 |
+
else:
|
626 |
+
raise NotImplementedError()
|
627 |
+
out["text"] = answer
|
628 |
+
return out
|
629 |
+
return _extract(ds)
|
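A minimal numpy sketch of the "best" answer mode above: pick the most frequent reference answer and break ties at random (the TF code uses a stateless seed and also handles ragged batches of answers).

import numpy as np

def best_answer(answers, seed=0):
    vals, counts = np.unique(answers, return_counts=True)
    top = vals[counts == counts.max()]
    return np.random.default_rng(seed).choice(top)

print(best_answer(["2", "2", "two", "3"]))  # -> '2'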
630 |
+
|
631 |
+
|
632 |
+
@seqio.map_over_dataset()
|
633 |
+
def extract_khan_academy(ex):
|
634 |
+
return dict(
|
635 |
+
image=ex["image"],
|
636 |
+
image_url=ex["image_url"],
|
637 |
+
prompt="Answer this question",
|
638 |
+
text=ex["gptResponse"]
|
639 |
+
)
|
640 |
+
|
641 |
+
@seqio.map_over_dataset()
|
642 |
+
def extract_vaia_qa_latex_image(ex, add_short_answer=False, set_short_answer_first=False):
|
643 |
+
if ex["has_image"]:
|
644 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
645 |
+
image = tf.expand_dims(image, 0)[:1]
|
646 |
+
else:
|
647 |
+
# image = get_blank_image() # blank image
|
648 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
649 |
+
image = tf.expand_dims(image, 0)[:0]
|
650 |
+
img_h = tf.shape(image)[1]
|
651 |
+
img_w = tf.shape(image)[2]
|
652 |
+
|
653 |
+
if add_short_answer:
|
654 |
+
if set_short_answer_first:
|
655 |
+
answer = tf.strings.join(["Answer: ", ex["short_answer"], "\n\n", ex["answer"]])
|
656 |
+
else:
|
657 |
+
answer = tf.strings.join([ex["answer"], "\n\n", "Answer: ", ex["short_answer"]])
|
658 |
+
else:
|
659 |
+
answer = ex["answer"]
|
660 |
+
out = dict(
|
661 |
+
image=image, # 4-d tensor
|
662 |
+
text=answer,
|
663 |
+
prompt=tf.strings.join([ex["latex_question"], "\n"]),
|
664 |
+
)
|
665 |
+
out["metadata/images"] = image
|
666 |
+
out.update(_add_metadata(ex))
|
667 |
+
out["metadata/batch_id"] = ex["batch_id"]
|
668 |
+
out["metadata/image_size"] = [img_w, img_h]
|
669 |
+
return out
|
670 |
+
|
671 |
+
@seqio.map_over_dataset()
|
672 |
+
def extract_vqa_online(ex):
|
673 |
+
out = dict(
|
674 |
+
image=ex["image"],
|
675 |
+
prompt=tf.strings.join([ex["question"], "\n"]),
|
676 |
+
text=ex["answer"]
|
677 |
+
)
|
678 |
+
out.update(_add_metadata(ex))
|
679 |
+
out["metadata/row_id"] = ex["row_id"]
|
680 |
+
return out
|
681 |
+
|
682 |
+
|
683 |
+
@seqio.map_over_dataset()
|
684 |
+
def extract_scifi_joint(ex):
|
685 |
+
if "qa_pairs" in ex:
|
686 |
+
q = ex["qa_pairs"]
|
687 |
+
else:
|
688 |
+
q = ex["qa"]
|
689 |
+
prompts = tf.concat([["Describe this image in detail."], q["question"]], 0)
|
690 |
+
responses = tf.concat([ex["summary"][None], q["answer"]], 0)
|
691 |
+
return dict(
|
692 |
+
image=ex["image"],
|
693 |
+
prompt=prompts,
|
694 |
+
text=responses,
|
695 |
+
)
|
696 |
+
|
697 |
+
|
698 |
+
def remove_no_qa(ds):
|
699 |
+
def _filter(ex):
|
700 |
+
if "qa_pairs" in ex:
|
701 |
+
q = ex["qa_pairs"]
|
702 |
+
else:
|
703 |
+
q = ex["qa"]
|
704 |
+
return tf.shape(q["question"])[0] > 0
|
705 |
+
return ds.filter(_filter)
|
706 |
+
|
707 |
+
|
708 |
+
@seqio.map_over_dataset()
|
709 |
+
def extract_scifi_qa_exp(ex):
|
710 |
+
return dict(
|
711 |
+
image=ex["image"],
|
712 |
+
question=ex["question"], # Array of questions
|
713 |
+
answer=tf.strings.join([ex["explanation"], " Answer: ", ex["answer"]]),
|
714 |
+
)
|
715 |
+
|
716 |
+
|
717 |
+
@seqio.map_over_dataset(num_seeds=1)
|
718 |
+
def extract_scifi_qa_demo(ex, seed):
|
719 |
+
# if tf.random.stateless_uniform((), 0, 1) > 0.5:
|
720 |
+
answer = tf.strings.join([ex["explanation"], " Answer: ", ex["answer"]])
|
721 |
+
# else:
|
722 |
+
# answer = ex["explanation"]
|
723 |
+
return dict(
|
724 |
+
image=ex["image"],
|
725 |
+
question=ex["question"], # Array of questions
|
726 |
+
answer=answer,
|
727 |
+
)
|
728 |
+
|
729 |
+
|
730 |
+
@seqio.map_over_dataset()
|
731 |
+
def clock_bench_preprocessor(ex):
|
732 |
+
out = dict(
|
733 |
+
image=ex["image"],
|
734 |
+
prompt="What time is being shown?",
|
735 |
+
)
|
736 |
+
for k in ["hour", "minute", "second", "answerable"]:
|
737 |
+
out[f"metadata/{k}"] = ex[k]
|
738 |
+
return out
|
739 |
+
|
740 |
+
|
741 |
+
def deg2rad(x):
|
742 |
+
return x*math.pi/180.0
|
743 |
+
|
744 |
+
|
745 |
+
def get_affine_matrix(center, angle, translate, scale, shear):
|
746 |
+
# From https://github.com/pytorch/vision/blob/f96c42fca53230057b16941b078a0a9eee06e20f/torchvision/transforms/functional.py#L1006
|
747 |
+
rot = deg2rad(angle)
|
748 |
+
sx = deg2rad(shear[0])
|
749 |
+
sy = deg2rad(shear[1])
|
750 |
+
|
751 |
+
cx, cy = center
|
752 |
+
tx, ty = translate
|
753 |
+
|
754 |
+
# RSS without scaling
|
755 |
+
a = tf.cos(rot - sy) / tf.cos(sy)
|
756 |
+
b = -tf.cos(rot - sy) * tf.tan(sx) / tf.cos(sy) - tf.sin(rot)
|
757 |
+
c = tf.sin(rot - sy) / tf.cos(sy)
|
758 |
+
d = -tf.sin(rot - sy) * tf.tan(sx) / tf.cos(sy) + tf.cos(rot)
|
759 |
+
|
760 |
+
matrix = [a, b, 0.0, c, d, 0.0]
|
761 |
+
matrix = [x * scale for x in matrix]
|
762 |
+
# Apply inverse of center translation: RSS * C^-1
|
763 |
+
matrix[2] += matrix[0] * (-cx) + matrix[1] * (-cy)
|
764 |
+
matrix[5] += matrix[3] * (-cx) + matrix[4] * (-cy)
|
765 |
+
# Apply translation and center : T * C * RSS * C^-1
|
766 |
+
matrix[2] += cx + tx
|
767 |
+
matrix[5] += cy + ty
|
768 |
+
return matrix
|
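As a sanity check (illustrative only, run eagerly with the helpers above): with zero rotation, shear, and translation and unit scale, the matrix reduces to the 2x3 identity regardless of the center.

m = get_affine_matrix(center=[112., 112.], angle=0., translate=[0., 0.], scale=1., shear=[0., 0.])
print([float(v) for v in m])  # [1.0, 0.0, 0.0, 0.0, 1.0, 0.0]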
769 |
+
|
770 |
+
|
771 |
+
def quantize_point(coor, max_dim, mode="percent-precision-1"):
|
772 |
+
max_dim = tf.cast(max_dim, tf.float32)
|
773 |
+
coor = tf.cast(coor, tf.float32)
|
774 |
+
x = (coor / max_dim)
|
775 |
+
if mode == "percent-precision-1":
|
776 |
+
return tf.strings.as_string(x*100, precision=1)
|
777 |
+
elif mode == "zero_to_one":
|
778 |
+
return tf.strings.as_string(x, precision=3)
|
779 |
+
elif mode == "1k":
|
780 |
+
return tf.strings.as_string(x*1000, precision=0)
|
781 |
+
else:
|
782 |
+
raise NotImplementedError(mode)
|
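Illustrative outputs of the three quantization modes above for a point at x = 37 in a 200-pixel-wide image (run eagerly):

print(quantize_point(37, 200, "percent-precision-1").numpy())  # b'18.5'
print(quantize_point(37, 200, "zero_to_one").numpy())          # b'0.185'
print(quantize_point(37, 200, "1k").numpy())                   # b'185'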
783 |
+
|
784 |
+
|
785 |
+
def construct_pointing_format(label_text, alt_text, x_str, y_str):
|
786 |
+
if alt_text is None:
|
787 |
+
alt_text = label_text
|
788 |
+
np = tf.shape(x_str)[0]
|
789 |
+
if np == 0:
|
790 |
+
output = ""
|
791 |
+
elif np == 1:
|
792 |
+
output = tf.strings.join([
|
793 |
+
'<point x="', x_str[0], '" y="', y_str[0], '" alt="',
|
794 |
+
alt_text, '">', label_text, '</point>'
|
795 |
+
])
|
796 |
+
else:
|
797 |
+
ids = tf.strings.as_string(tf.range(1, np + 1, dtype=tf.int32))
|
798 |
+
xs = tf.strings.join(["x", ids, '="', x_str, '"'])
|
799 |
+
ys = tf.strings.join(["y", ids, '="', y_str, '"'])
|
800 |
+
points = tf.strings.reduce_join(tf.reshape(tf.stack([xs, ys], 1), [-1]), separator=' ', axis=-1)
|
801 |
+
output = tf.strings.join(
|
802 |
+
["<points ", points, ' alt="', alt_text, '">', label_text, "</points>"])
|
803 |
+
return output
|
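For reference, the single- and multi-point tag formats produced above look like this (eager-mode sketch; the labels are made up):

import tensorflow as tf

one = construct_pointing_format("dog", None, tf.constant(["10.5"]), tf.constant(["20.0"]))
many = construct_pointing_format("dogs", None, tf.constant(["10.5", "30.1"]), tf.constant(["20.0", "40.2"]))
print(one.numpy())   # b'<point x="10.5" y="20.0" alt="dog">dog</point>'
print(many.numpy())  # b'<points x1="10.5" y1="20.0" x2="30.1" y2="40.2" alt="dogs">dogs</points>'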
804 |
+
|
805 |
+
|
806 |
+
def order_points(x, y, seed, point_order):
|
807 |
+
if point_order == "natural":
|
808 |
+
return x, y
|
809 |
+
|
810 |
+
if point_order == "random":
|
811 |
+
ix = stateless_permutation(tf.shape(x)[0], seed)
|
812 |
+
elif point_order == "xy":
|
813 |
+
x_float, y_float = tf.strings.to_number(x), tf.strings.to_number(y)
|
814 |
+
ix = tf.argsort(x_float*100000 + y_float)
|
815 |
+
elif point_order == "yx":
|
816 |
+
x_float, y_float = tf.strings.to_number(x), tf.strings.to_number(y)
|
817 |
+
ix = tf.argsort(y_float*100000 + x_float)
|
818 |
+
else:
|
819 |
+
raise NotImplementedError(point_order)
|
820 |
+
return tf.gather(x, ix), tf.gather(y, ix)
|
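Illustrative behaviour of the "xy" ordering above (the seed is only consumed by the "random" mode, so a dummy value is passed here):

import tensorflow as tf

x = tf.constant(["10.0", "10.0", "5.0"])
y = tf.constant(["20.0", "10.0", "30.0"])
xs, ys = order_points(x, y, seed=tf.constant([0, 0]), point_order="xy")
print(xs.numpy(), ys.numpy())  # [b'5.0' b'10.0' b'10.0'] [b'30.0' b'10.0' b'20.0']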
821 |
+
|
822 |
+
|
823 |
+
@gin.configurable()
|
824 |
+
def points_to_text(x, y, w, h, seed, label=None, alt_text=None, point_mode="percent-precision-1",
|
825 |
+
point_order="xy", point_list_mode="tag"):
|
826 |
+
"""Returns a string encoding of a list of points"""
|
827 |
+
x = quantize_point(x, w, point_mode)
|
828 |
+
y = quantize_point(y, h, point_mode)
|
829 |
+
# Order the quantized points so the order matches what was generated; this can matter
|
830 |
+
# when points have the same quantized value, e.g. (10.001, 20) (10.002, 10) should be
|
831 |
+
# represented (10, 10), (10, 20), but if we sort before quantization we get (10, 20), (10, 10)
|
832 |
+
x, y = order_points(x, y, seed, point_order)
|
833 |
+
if point_list_mode == "tag":
|
834 |
+
return construct_pointing_format(label, alt_text, x, y)
|
835 |
+
elif point_list_mode == "paren":
|
836 |
+
n = tf.shape(x)[0]
|
837 |
+
return tf.strings.reduce_join(tf.strings.join([
|
838 |
+
"(", x, ", ", y, ")"
|
839 |
+
]), separator=", ")
|
840 |
+
# if n == 0:
|
841 |
+
# output = ""
|
842 |
+
# else:
|
843 |
+
# ids = tf.strings.as_string(tf.range(1, np + 1, dtype=tf.int32))
|
844 |
+
# xs = tf.strings.join(["x", ids, '="', x_str, '"'])
|
845 |
+
# ys = tf.strings.join(["y", ids, '="', y_str, '"'])
|
846 |
+
# points = tf.strings.reduce_join(tf.reshape(tf.stack([xs, ys], 1), [-1]), separator=' ', axis=-1)
|
847 |
+
# output = tf.strings.join(
|
848 |
+
# ["<points ", points, ' alt="', alt_text, '">', label_text, "</points>"])
|
849 |
+
# return output
|
850 |
+
else:
|
851 |
+
raise NotImplementedError(point_list_mode)
|
852 |
+
|
853 |
+
|
854 |
+
def points_to_answer(x, y, w, h, seed, label, is_counting, alt_text=None):
|
855 |
+
count = tf.shape(x)[0]
|
856 |
+
if is_counting:
|
857 |
+
if count == 0:
|
858 |
+
return "There are none."
|
859 |
+
else:
|
860 |
+
point_text = points_to_text(x, y, w, h, seed, label, alt_text)
|
861 |
+
return tf.strings.join([
|
862 |
+
"Counting the ", point_text,
|
863 |
+
" shows a total of ",
|
864 |
+
tf.strings.as_string(count),
|
865 |
+
"."
|
866 |
+
])
|
867 |
+
else:
|
868 |
+
if count == 0:
|
869 |
+
return "There are none."
|
870 |
+
else:
|
871 |
+
return points_to_text(x, y, w, h, seed, label, alt_text)
|
872 |
+
|
873 |
+
|
874 |
+
@seqio.map_over_dataset(num_seeds=2)
|
875 |
+
def extract_point_qa(ex, seeds, answer_type="y_major"):
|
876 |
+
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
877 |
+
img_h = tf.shape(ex["image"])[0]
|
878 |
+
img_w = tf.shape(ex["image"])[1]
|
879 |
+
|
880 |
+
questions = ex["questions"]
|
881 |
+
question = questions["question"]
|
882 |
+
n = tf.shape(question)[0]
|
883 |
+
answers = tf.TensorArray(tf.string, size=n, element_shape=())
|
884 |
+
point_text = questions["annotations"]["point_text"]
|
885 |
+
point_seeds = tf.RaggedTensor.from_row_splits(
|
886 |
+
row_splits=point_text.row_splits,
|
887 |
+
values=tf.random.split(seeds[0], num=tf.shape(point_text.values)[0])
|
888 |
+
)
|
889 |
+
for question_ix in range(n):
|
890 |
+
anno = questions["annotations"]
|
891 |
+
answer = questions["answer_with_placeholders"][question_ix]
|
892 |
+
n_anno = tf.shape(anno["point_text"][question_ix])[0]
|
893 |
+
for anno_ix in range(n_anno):
|
894 |
+
points = anno["points"][question_ix, anno_ix]
|
895 |
+
point_text = points_to_answer(
|
896 |
+
points[:, 0], points[:, 1], 100, 100,
|
897 |
+
point_seeds[question_ix, anno_ix],
|
898 |
+
anno["point_text"][question_ix, anno_ix],
|
899 |
+
False,
|
900 |
+
alt_text=anno["alt_text"][question_ix, anno_ix],
|
901 |
+
)
|
902 |
+
answer_split = tf.strings.split(answer, sep="<|POINT|>", maxsplit=1)
|
903 |
+
answer = tf.strings.join([answer_split[0], point_text, answer_split[1]])
|
904 |
+
# Make sure all placeholders were used
|
905 |
+
tf.debugging.assert_equal(tf.shape(tf.strings.split(answer, sep="<|POINT|>"))[0], 1)
|
906 |
+
answers = answers.write(question_ix, answer)
|
907 |
+
|
908 |
+
messages = tf.stack([question, answers.stack()], axis=1)
|
909 |
+
messages = tf.reshape(messages, [-1])
|
910 |
+
conversation_ids = tf.range(tf.shape(messages)[0] // 2, dtype=tf.int32)
|
911 |
+
conversation_ids = tf.repeat(conversation_ids, 2)
|
912 |
+
out = dict(
|
913 |
+
image=ex["image"],
|
914 |
+
messages=tf.RaggedTensor.from_value_rowids(messages, conversation_ids)
|
915 |
+
)
|
916 |
+
ix = stateless_permutation(tf.shape(messages)[0], seeds[1])
|
917 |
+
messages = tf.gather(messages, ix)
|
918 |
+
out.update(_add_metadata(ex))
|
919 |
+
out["metadata/image_size"] = [img_w, img_h]
|
920 |
+
return out
|
921 |
+
|
922 |
+
|
923 |
+
def select_point(mask):
|
924 |
+
bs = tf.shape(mask)[0]
|
925 |
+
valid = tf.cast(mask, tf.float32)
|
926 |
+
h, w = tf.shape(mask)[1], tf.shape(mask)[2]
|
927 |
+
ys = tf.range(h, dtype=tf.int32)
|
928 |
+
xs = tf.range(w, dtype=tf.int32)
|
929 |
+
|
930 |
+
n = tf.reduce_sum(valid, [1, 2])
|
931 |
+
cy = tf.reduce_sum(tf.cast(ys[None, :, None], tf.float32) * valid, [1, 2]) / n # [bs]
|
932 |
+
cx = tf.reduce_sum(tf.cast(xs[None, None, :], tf.float32) * valid, [1, 2]) / n # [bs]
|
933 |
+
|
934 |
+
dist_y = tf.square(tf.range(h, dtype=tf.float32)[None, :] - cy[:, None]) # [bs, h]
|
935 |
+
dist_x = tf.square(tf.range(w, dtype=tf.float32)[None, :] - cx[:, None]) # [bs, w]
|
936 |
+
dist = dist_y[:, :, None] + dist_x[:, None, :] # [batch, h, w]
|
937 |
+
dist = dist + (1 - valid) * 1e12
|
938 |
+
min_dist = tf.argmin(tf.reshape(dist, [bs, -1]), axis=-1) # [batch]
|
939 |
+
w = tf.cast(w, min_dist.dtype)
|
940 |
+
cy = tf.cast(min_dist // w, tf.float32)
|
941 |
+
cx = tf.cast(min_dist % w, tf.float32)
|
942 |
+
return cx, cy
|
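A small eager-mode check of `select_point`: when the mask centroid falls in a hole between two blobs, the returned point snaps to the nearest pixel that is actually inside the mask.

import tensorflow as tf

mask = tf.constant([[[1, 1, 0, 0, 0, 0, 0, 1, 1]]], dtype=tf.int32)  # [batch=1, h=1, w=9]
cx, cy = select_point(mask)
print(float(cx[0]), float(cy[0]))  # 1.0 0.0 -- inside the mask, not the raw centroid at x=4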
943 |
+
|
944 |
+
|
945 |
+
@seqio.map_over_dataset
|
946 |
+
def refexp_pointing(ex):
|
947 |
+
img_h = tf.shape(ex["image"])[0]
|
948 |
+
img_w = tf.shape(ex["image"])[1]
|
949 |
+
objects = ex["objects"]
|
950 |
+
|
951 |
+
# Shuffle objects so that which object gets truncated (if the sequence gets truncated) is randomized
|
952 |
+
refexps = objects['refexp']['raw']
|
953 |
+
bbox = objects["bbox"]
|
954 |
+
mask = tf.squeeze(objects["mask"], -1)
|
955 |
+
|
956 |
+
ix = tf.range(0, tf.shape(refexps)[0], dtype=tf.int32)
|
957 |
+
ix = tf.random.shuffle(ix)
|
958 |
+
refexps = tf.gather(refexps, ix)
|
959 |
+
bbox = tf.gather(bbox, ix)
|
960 |
+
mask = tf.gather(mask, ix)
|
961 |
+
|
962 |
+
cx, cy = select_point(mask)
|
963 |
+
answers = points_to_text(img_h, img_w, cx, cy)
|
964 |
+
|
965 |
+
out = {
|
966 |
+
"image": ex["image"],
|
967 |
+
"refexp": refexps.values,
|
968 |
+
"metadata/image_size": tf.stack([img_w, img_h,]),
|
969 |
+
"text": tf.repeat(answers, refexps.row_lengths()),
|
970 |
+
}
|
971 |
+
if "image_url" in ex:
|
972 |
+
out["metadata/image_url"] = ex["image_url"]
|
973 |
+
return out
|
974 |
+
|
975 |
+
|
976 |
+
@seqio.map_over_dataset
|
977 |
+
def refexp_pointing_inf(ex):
|
978 |
+
img_h = tf.shape(ex["image"])[0]
|
979 |
+
img_w = tf.shape(ex["image"])[1]
|
980 |
+
|
981 |
+
objects = ex["objects"]
|
982 |
+
mask = tf.squeeze(objects["mask"], -1)
|
983 |
+
cx, cy = select_point(mask)
|
984 |
+
answers = points_to_text(img_h, img_w, cx, cy)
|
985 |
+
|
986 |
+
refexps = objects["refexp"]["raw"]
|
987 |
+
|
988 |
+
# We can't use `mask` directly since it is variable size, and thus it
|
989 |
+
# will break batching. Here we serialize it instead
|
990 |
+
serialized_masks = tf.map_fn(tf.io.serialize_tensor, mask, fn_output_signature=tf.string)
|
991 |
+
out = {
|
992 |
+
"image": ex["image"],
|
993 |
+
"refexp": refexps,
|
994 |
+
"metadata/bbox": objects["bbox"],
|
995 |
+
"metadata/answer": answers,
|
996 |
+
"metadata/mask": serialized_masks,
|
997 |
+
"metadata/image_size": tf.stack([img_w, img_h]),
|
998 |
+
}
|
999 |
+
out.update({k: v for k, v in ex.items() if k.startswith("metadata/")})
|
1000 |
+
return out
|
1001 |
+
|
1002 |
+
@seqio.map_over_dataset
|
1003 |
+
def extract_andriod_control_inf(ex, mode):
|
1004 |
+
if mode == "ll":
|
1005 |
+
prompt = tf.strings.join(["low_level: ", ex["metadata/ll_instruction"]])
|
1006 |
+
elif mode == "hl_ll":
|
1007 |
+
prompt = tf.strings.join([
|
1008 |
+
"high_level: ", ex["metadata/hl_instruction"],
|
1009 |
+
" low_level: ", ex["metadata/ll_instruction"]
|
1010 |
+
])
|
1011 |
+
elif mode == "hl":
|
1012 |
+
prompt = tf.strings.join(["high_level: ", ex["metadata/hl_instruction"]])
|
1013 |
+
elif mode == "hl_cot":
|
1014 |
+
prompt = tf.strings.join(["high_level_cot: ", ex["metadata/hl_instruction"]])
|
1015 |
+
else:
|
1016 |
+
raise NotImplementedError()
|
1017 |
+
|
1018 |
+
out = dict(
|
1019 |
+
image=ex["image"],
|
1020 |
+
prompt=prompt,
|
1021 |
+
text=ex["metadata/target_action"]
|
1022 |
+
)
|
1023 |
+
out.update(_add_metadata(ex))
|
1024 |
+
return out
|
1025 |
+
|
1026 |
+
@seqio.map_over_dataset
|
1027 |
+
def extract_android_control(ex):
|
1028 |
+
# Each image has four tasks:
|
1029 |
+
# low level -> action
|
1030 |
+
# high+low level -> action
|
1031 |
+
# high level -> action
|
1032 |
+
# high level -> low level + action (CoT)
|
1033 |
+
out = dict(
|
1034 |
+
image=ex["image"],
|
1035 |
+
prompt=tf.stack([
|
1036 |
+
tf.strings.join(["low_level: ", ex["metadata/ll_instruction"]]),
|
1037 |
+
tf.strings.join([
|
1038 |
+
"high_level: ", ex["metadata/hl_instruction"],
|
1039 |
+
" low_level: ", ex["metadata/ll_instruction"]
|
1040 |
+
]),
|
1041 |
+
tf.strings.join(["high_level: ", ex["metadata/hl_instruction"]]),
|
1042 |
+
tf.strings.join(["high_level_cot: ", ex["metadata/hl_instruction"]]),
|
1043 |
+
]),
|
1044 |
+
text=tf.stack([
|
1045 |
+
ex["metadata/target_action"],
|
1046 |
+
ex["metadata/target_action"],
|
1047 |
+
ex["metadata/target_action"],
|
1048 |
+
tf.strings.join(["Plan: ", ex["metadata/ll_instruction"], " Action: ", ex["metadata/target_action"]]),
|
1049 |
+
])
|
1050 |
+
)
|
1051 |
+
# Only needed if visualizing
|
1052 |
+
# ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1053 |
+
# img_h = tf.shape(ex["image"])[0]
|
1054 |
+
# img_w = tf.shape(ex["image"])[1]
|
1055 |
+
# out["metadata/image_size"] = tf.stack([img_w, img_h,])
|
1056 |
+
out.update(_add_metadata(ex))
|
1057 |
+
return out
|
1058 |
+
|
1059 |
+
|
1060 |
+
@seqio.map_over_dataset(num_seeds=1)
|
1061 |
+
def refexp(ex, seed):
|
1062 |
+
img_h = tf.shape(ex["image"])[0]
|
1063 |
+
img_w = tf.shape(ex["image"])[1]
|
1064 |
+
objects = ex["objects"]
|
1065 |
+
|
1066 |
+
# Shuffle objects so that which object gets truncated (if the sequence gets truncated) is randomized
|
1067 |
+
refexps = objects['refexp']['raw']
|
1068 |
+
bbox = objects["bbox"]
|
1069 |
+
ix = stateless_permutation(tf.shape(refexps)[0], seed)
|
1070 |
+
refexps = tf.gather(refexps, ix)
|
1071 |
+
bbox = tf.gather(bbox, ix)
|
1072 |
+
|
1073 |
+
x2 = bbox[:, 0] + bbox[:, 2]
|
1074 |
+
y2 = bbox[:, 1] + bbox[:, 3]
|
1075 |
+
with tf.control_dependencies([
|
1076 |
+
tf.debugging.assert_equal(tf.reduce_any(x2 <= tf.cast(img_w, tf.float32)), True),
|
1077 |
+
tf.debugging.assert_equal(tf.reduce_any(y2 <= tf.cast(img_h, tf.float32)), True)
|
1078 |
+
]):
|
1079 |
+
answers = points_to_text(
|
1080 |
+
img_h, img_w,
|
1081 |
+
tf.reshape(tf.stack([bbox[:, 0], x2], 1), [-1]),
|
1082 |
+
tf.reshape(tf.stack([bbox[:, 1], y2], 1), [-1]))
|
1083 |
+
answers = tf.strings.reduce_join(tf.reshape(answers, [-1, 2]), separator=" ", axis=1)
|
1084 |
+
|
1085 |
+
out = {
|
1086 |
+
"image": ex["image"],
|
1087 |
+
"refexp": refexps.values,
|
1088 |
+
"metadata/bbox": bbox,
|
1089 |
+
"metadata/image_size": tf.stack([img_w, img_h,]),
|
1090 |
+
"text": tf.repeat(answers, refexps.row_lengths()),
|
1091 |
+
}
|
1092 |
+
|
1093 |
+
if "image_url" in ex:
|
1094 |
+
out["image_url"] = ex["image_url"]
|
1095 |
+
return out
|
1096 |
+
|
1097 |
+
|
1098 |
+
@seqio.map_over_dataset
|
1099 |
+
def refexp_inf(ex):
|
1100 |
+
img_h = tf.shape(ex["image"])[0]
|
1101 |
+
img_w = tf.shape(ex["image"])[1]
|
1102 |
+
out = {
|
1103 |
+
"image": ex["image"],
|
1104 |
+
"refexp": ex["objects"]["refexp"]["raw"],
|
1105 |
+
"metadata/bbox": ex["objects"]["bbox"],
|
1106 |
+
"metadata/image_size": tf.stack([img_w, img_h,]),
|
1107 |
+
}
|
1108 |
+
out.update({k: v for k, v in ex.items() if k.startswith("metadata/")})
|
1109 |
+
return out
|
1110 |
+
|
1111 |
+
|
1112 |
+
def point_text_interleaved(*args):
|
1113 |
+
raise NotImplementedError()
|
1114 |
+
|
1115 |
+
|
1116 |
+
@seqio.map_over_dataset
|
1117 |
+
def web_pointing_preprocessor(ex):
|
1118 |
+
img_h = tf.shape(ex["image"])[0]
|
1119 |
+
img_w = tf.shape(ex["image"])[1]
|
1120 |
+
|
1121 |
+
question = point_text_interleaved(
|
1122 |
+
img_h, img_w, ex["question"], ex["question_points"]["x"], ex["question_points"]["y"])
|
1123 |
+
answer = point_text_interleaved(
|
1124 |
+
img_h, img_w, ex["answer"], ex["answer_points"]["x"], ex["answer_points"]["y"])
|
1125 |
+
answer_points = tf.stack([ex["answer_points"]["x"], ex["answer_points"]["y"]], axis=1)
|
1126 |
+
return {
|
1127 |
+
"question": question,
|
1128 |
+
"answer": answer,
|
1129 |
+
"image": ex["image"],
|
1130 |
+
"metadata/image_size": [img_w, img_h],
|
1131 |
+
"metadata/question_type": ex["question_type"],
|
1132 |
+
"metadata/answer_points": tf.io.serialize_tensor(answer_points),
|
1133 |
+
"metadata/answer": answer,
|
1134 |
+
}
|
1135 |
+
|
1136 |
+
|
1137 |
+
def filter_pointing(ds):
|
1138 |
+
return ds.filter(lambda ex: tf.shape(ex["answer_points"]["x"])[0] >= 1)
|
1139 |
+
|
1140 |
+
|
1141 |
+
def filter_qa(ds):
|
1142 |
+
return ds.filter(lambda ex: tf.shape(ex["answer_points"]["x"])[0] == 0)
|
1143 |
+
|
1144 |
+
# vaia filtering
|
1145 |
+
def filter_image_only(ds):
|
1146 |
+
return ds.filter(lambda ex: ex["has_image"])
|
1147 |
+
|
1148 |
+
def filter_mc(ds):
|
1149 |
+
return ds.filter(lambda ex: ex["is_mc"])
|
1150 |
+
|
1151 |
+
def remove_is_long(ds):
|
1152 |
+
return ds.filter(lambda ex: not ex["is_long"])
|
1153 |
+
|
1154 |
+
def remove_has_multiple_parts(ds):
|
1155 |
+
return ds.filter(lambda ex: not ex["has_multiple_parts"])
|
1156 |
+
|
1157 |
+
|
1158 |
+
def _split(ds: tf.data.Dataset, keys, n_splits=2):
|
1159 |
+
def _map(ex):
|
1160 |
+
n = tf.shape(ex[keys[0]])[0]
|
1161 |
+
if n < n_splits:
|
1162 |
+
return tf.data.Dataset.from_tensors(ex)
|
1163 |
+
else:
|
1164 |
+
# import pdb; pdb.set_trace()
|
1165 |
+
bs = n // n_splits
|
1166 |
+
remainder = n - bs*n_splits
|
1167 |
+
lens = tf.concat([
|
1168 |
+
tf.ones([remainder], dtype=tf.int32),
|
1169 |
+
tf.zeros([n_splits-remainder], dtype=tf.int32),
|
1170 |
+
], axis=0) + bs
|
1171 |
+
tf.debugging.assert_equal(tf.reduce_sum(lens), n)
|
1172 |
+
ends = tf.cumsum(lens)
|
1173 |
+
|
1174 |
+
parts = []
|
1175 |
+
for split_ix in range(n_splits):
|
1176 |
+
part_ex = dict(ex)
|
1177 |
+
e = ends[split_ix]
|
1178 |
+
s = e - lens[split_ix]
|
1179 |
+
for k in keys:
|
1180 |
+
if isinstance(k, tuple):
|
1181 |
+
assert len(k) == 2
|
1182 |
+
part_ex[k[0]][k[1]] = ex[k[0]][k[1]][s:e]
|
1183 |
+
else:
|
1184 |
+
part_ex[k] = ex[k][s:e]
|
1185 |
+
parts.append(part_ex)
|
1186 |
+
|
1187 |
+
ds = tf.data.Dataset.from_tensors(parts[0])
|
1188 |
+
for sub_ds in parts[1:]:
|
1189 |
+
sub_ds = tf.data.Dataset.from_tensors(sub_ds)
|
1190 |
+
ds = ds.concatenate(sub_ds)
|
1191 |
+
return ds
|
1192 |
+
|
1193 |
+
return ds.flat_map(_map)
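# A small worked check of the split arithmetic in `_split` above (toy numbers, not
# from the original pipeline): 7 items over 3 splits should give part lengths [3, 2, 2].
def _example_split_lengths(n=7, n_splits=3):
    bs = n // n_splits
    remainder = n - bs * n_splits
    lens = tf.concat([
        tf.ones([remainder], dtype=tf.int32),
        tf.zeros([n_splits - remainder], dtype=tf.int32),
    ], axis=0) + bs
    return lens  # -> [3 2 2], summing back to n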
|
1194 |
+
|
1195 |
+
|
1196 |
+
|
1197 |
+
def split(ds, n=2):
|
1198 |
+
# return ds
|
1199 |
+
return _split(ds, [k for k in [
|
1200 |
+
"question",
|
1201 |
+
"label",
|
1202 |
+
"text",
|
1203 |
+
"entity",
|
1204 |
+
"messages"
|
1205 |
+
] if k in ds.element_spec], n_splits=n)
|
1206 |
+
|
1207 |
+
|
1208 |
+
def split_points(ds, max_points=50):
|
1209 |
+
label = "question" if "question" in ds.element_spec else "label"
|
1210 |
+
return _split(ds, [
|
1211 |
+
"question", label, "notInImage",
|
1212 |
+
("answer_points", "x"),
|
1213 |
+
("answer_points", "y"),
|
1214 |
+
])
|
1215 |
+
|
1216 |
+
|
1217 |
+
@seqio.map_over_dataset
|
1218 |
+
def fix_count_qa(ex):
|
1219 |
+
ex["label"] = ex["label"][::2]
|
1220 |
+
tf.debugging.assert_equal(tf.shape(ex["answer_points"]["x"])[0], tf.shape(ex["label"])[0])
|
1221 |
+
return ex
|
1222 |
+
|
1223 |
+
|
1224 |
+
def filter_points(ds, max_number=40):
|
1225 |
+
|
1226 |
+
def _add_valid(ex):
|
1227 |
+
valid = (
|
1228 |
+
tf.reduce_all(ex["answer_points"]["x"] >= 0.0, axis=-1) &
|
1229 |
+
tf.reduce_all(ex["answer_points"]["x"] <= 100.0, axis=-1) &
|
1230 |
+
tf.reduce_all(ex["answer_points"]["y"] >= 0.0, axis=-1) &
|
1231 |
+
tf.reduce_all(ex["answer_points"]["y"] <= 100.0, axis=-1) &
|
1232 |
+
(ex["answer_points"]["y"].row_lengths() <= max_number)
|
1233 |
+
)
|
1234 |
+
ex["valid"] = valid
|
1235 |
+
return ex
|
1236 |
+
ds = ds.map(_add_valid)
|
1237 |
+
ds = ds.filter(lambda ex: tf.reduce_any(ex["valid"]))
|
1238 |
+
return ds
|
1239 |
+
|
1240 |
+
|
1241 |
+
# def filter_points(ds, max_number=30):
|
1242 |
+
# n_points = ds["answer_points"]["x"].row_lengths()
|
1243 |
+
# parts = tf.TensorArray(tf.int32, size=tf.shape(n_points[0]), element_shape=tf.TensorShape([None]))
|
1244 |
+
# total = 0
|
1245 |
+
# on_row = 0
|
1246 |
+
# for i in range(n_points):
|
1247 |
+
# n = n_points[i]
|
1248 |
+
# if n > max_number:
|
1249 |
+
# continue
|
1250 |
+
# if n + total > max_number:
|
1251 |
+
#
|
1252 |
+
# return ds
|
1253 |
+
|
1254 |
+
|
1255 |
+
@seqio.map_over_dataset(num_seeds=2)
|
1256 |
+
def pointing_preprocessor(ex, sequence_length, seeds, with_count=False):
|
1257 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1258 |
+
img_h = tf.shape(image)[0]
|
1259 |
+
img_w = tf.shape(image)[1]
|
1260 |
+
|
1261 |
+
ix = tf.where(ex["valid"])[:, 0]
|
1262 |
+
ix = stateless_shuffle(ix, seeds[0])
|
1263 |
+
if "label" in ex:
|
1264 |
+
question = tf.strings.lower(ex["label"])
|
1265 |
+
else:
|
1266 |
+
question = ex["question"]
|
1267 |
+
question = tf.gather(question, ix) # [n_question]
|
1268 |
+
points_x = tf.gather(ex["answer_points"]["x"], ix) # [n_question, n_points[ragged]]]
|
1269 |
+
points_y = tf.gather(ex["answer_points"]["y"], ix)
|
1270 |
+
not_in_image = tf.gather(ex["notInImage"], ix) # [n_question]
|
1271 |
+
|
1272 |
+
n = tf.shape(points_x)[0]
|
1273 |
+
point_text = tf.TensorArray(dtype=tf.string, size=n, element_shape=()) # [n_question]
|
1274 |
+
point_seeds = tf.random.split(seeds[1], n)
|
1275 |
+
for i in range(n):
|
1276 |
+
answer = points_to_answer(points_x[i], points_y[i], 100, 100, point_seeds[i], question[i], with_count)
|
1277 |
+
point_text = point_text.write(i, answer)
|
1278 |
+
return {
|
1279 |
+
"image": image,
|
1280 |
+
"metadata/image_size": [img_w, img_h],
|
1281 |
+
"entity": question,
|
1282 |
+
"question": question,
|
1283 |
+
"text": point_text.stack(),
|
1284 |
+
}
|
1285 |
+
|
1286 |
+
|
1287 |
+
@seqio.map_over_dataset
|
1288 |
+
def pointing_inf_preprocessor(ex):
|
1289 |
+
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1290 |
+
img_h = tf.shape(ex["image"])[0]
|
1291 |
+
img_w = tf.shape(ex["image"])[1]
|
1292 |
+
|
1293 |
+
question = ex["question"]
|
1294 |
+
not_in_image = tf.shape(ex["answer_points"]["x"])[0] == 0
|
1295 |
+
|
1296 |
+
# points are stored in normalized format, de-normalize here
|
1297 |
+
points_x = ex["answer_points"]["x"] * tf.cast(img_w, tf.float32) / 100.0
|
1298 |
+
points_y = ex["answer_points"]["y"] * tf.cast(img_h, tf.float32) / 100.0
|
1299 |
+
|
1300 |
+
out = dict(
|
1301 |
+
image=ex["image"],
|
1302 |
+
question=question,
|
1303 |
+
entity=question,
|
1304 |
+
)
|
1305 |
+
out.update(_add_metadata(ex))
|
1306 |
+
out["metadata/not_in_image"] = not_in_image
|
1307 |
+
# We can't use `mask` directly since it is variable size, and thus it
|
1308 |
+
# will break batching. Here we serialize it instead
|
1309 |
+
serialized_masks = tf.map_fn(tf.io.serialize_tensor, ex["masks"], fn_output_signature=tf.string)
|
1310 |
+
serialized_masks = tf.strings.reduce_join(serialized_masks, separator="|||")
|
1311 |
+
out["metadata/mask"] = serialized_masks
|
1312 |
+
out["metadata/question"] = question
|
1313 |
+
out["metadata/answer_points"] = tf.io.serialize_tensor(tf.stack([points_x, points_y], 1))
|
1314 |
+
out["metadata/image_size"] = [img_w, img_h]
|
1315 |
+
|
1316 |
+
return out
|
1317 |
+
|
1318 |
+
|
1319 |
+
@seqio.map_over_dataset(num_seeds=1)
|
1320 |
+
def count_qa_preprocessor_inf(ex, sequence_length, seed):
|
1321 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1322 |
+
img_h = tf.shape(image)[0]
|
1323 |
+
img_w = tf.shape(image)[1]
|
1324 |
+
|
1325 |
+
entity = tf.strings.substr(
|
1326 |
+
ex["question"], len("How many "), tf.strings.length(ex["question"]) - len("How many "))
|
1327 |
+
entity = tf.strings.split(entity, sep=" are ", maxsplit=1)[0]
|
1328 |
+
entity = tf.strings.lower(entity)
|
1329 |
+
tf.debugging.assert_equal(tf.strings.length(entity) != 0, True)
|
1330 |
+
|
1331 |
+
return {
|
1332 |
+
"image": image,
|
1333 |
+
"metadata/image_size": [img_w, img_h],
|
1334 |
+
"metadata/count": tf.strings.to_number(ex["answer"]),
|
1335 |
+
"question": ex["question"],
|
1336 |
+
"entity": entity,
|
1337 |
+
}
|
1338 |
+
|
1339 |
+
|
1340 |
+
@seqio.map_over_dataset(num_seeds=1)
|
1341 |
+
def count_qa_preprocessor(ex, sequence_length, seed, with_count=False,
|
1342 |
+
for_inference=False):
|
1343 |
+
point_answer = ex["point_answer"]
|
1344 |
+
numbers_str = tf.strings.regex_replace(point_answer, r'\.$', '')
|
1345 |
+
numbers_str = tf.strings.regex_replace(numbers_str, r'[^\d\.\s]+', '')
|
1346 |
+
numbers_str = tf.strings.strip(numbers_str)
|
1347 |
+
numbers = tf.strings.split(numbers_str)
|
1348 |
+
float_numbers = tf.strings.to_number(numbers, out_type=tf.float32)
|
1349 |
+
coordinates = tf.reshape(float_numbers, (-1, 3))
|
1350 |
+
points_x = coordinates[:, 1]
|
1351 |
+
points_y = coordinates[:, 2]
|
1352 |
+
|
1353 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1354 |
+
img_h = tf.shape(image)[0]
|
1355 |
+
img_w = tf.shape(image)[1]
|
1356 |
+
entity = tf.strings.substr(
|
1357 |
+
ex["question"], len("How many "), tf.strings.length(ex["question"]) - len("How many "))
|
1358 |
+
entity = tf.strings.split(entity, sep=" are ", maxsplit=1)[0]
|
1359 |
+
entity = tf.strings.lower(entity)
|
1360 |
+
tf.debugging.assert_equal(tf.strings.length(entity) != 0, True)
|
1361 |
+
count = tf.strings.to_number(ex["answer"], out_type=tf.int32)
|
1362 |
+
if for_inference:
|
1363 |
+
return {
|
1364 |
+
"image": image,
|
1365 |
+
"metadata/image_size": [img_w, img_h],
|
1366 |
+
"metadata/count": count,
|
1367 |
+
"question": ex["question"],
|
1368 |
+
"entity": entity,
|
1369 |
+
}
|
1370 |
+
else:
|
1371 |
+
tf.debugging.assert_equal(count, tf.shape(points_x)[0])
|
1372 |
+
# points are already normalized so use w=1, h=1
|
1373 |
+
answer = points_to_answer(points_x, points_y, 1, 1, seed, entity, with_count)
|
1374 |
+
return {
|
1375 |
+
"image": image,
|
1376 |
+
"metadata/image_size": [img_w, img_h],
|
1377 |
+
"metadata/count": count,
|
1378 |
+
"question": ex["question"],
|
1379 |
+
"entity": entity,
|
1380 |
+
"text": answer,
|
1381 |
+
}
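# Worked example of the `point_answer` parsing above, on a hypothetical string of
# "index x y" triplets (the exact source format is an assumption here):
#   "1 10.5 20.0 2 30.0 40.5."  ->  x = [10.5, 30.0], y = [20.0, 40.5]
def _example_parse_points():
    s = tf.constant("1 10.5 20.0 2 30.0 40.5.")
    s = tf.strings.regex_replace(s, r'\.$', '')          # drop the trailing period
    s = tf.strings.strip(tf.strings.regex_replace(s, r'[^\d\.\s]+', ''))
    nums = tf.strings.to_number(tf.strings.split(s), out_type=tf.float32)
    coords = tf.reshape(nums, (-1, 3))                   # one (index, x, y) row per point
    return coords[:, 1], coords[:, 2]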
|
1382 |
+
|
1383 |
+
|
1384 |
+
@gin.configurable()
|
1385 |
+
@seqio.map_over_dataset
|
1386 |
+
def cleanup_preprocessor(ex, preprocess=False):
|
1387 |
+
if preprocess:
|
1388 |
+
ex["prompt"] = tf.strings.join(
|
1389 |
+
[
|
1390 |
+
"[[User]]: Correct the spelling and punctuation mistakes on the following transcript based on what appears in the image.\n\n{before} ",
|
1391 |
+
ex["prompt"],
|
1392 |
+
"\n[[Assistant]]: {after}"
|
1393 |
+
]
|
1394 |
+
)
|
1395 |
+
return ex
|
1396 |
+
else:
|
1397 |
+
return ex
|
1398 |
+
|
1399 |
+
|
1400 |
+
@gin.configurable()
|
1401 |
+
@seqio.map_over_dataset
|
1402 |
+
def random_text_preprocessor(ex, preprocess=False):
|
1403 |
+
ex["prompt"] = "What does the text say in this image?"
|
1404 |
+
if preprocess:
|
1405 |
+
ex["prompt"] = tf.strings.join(["[[User]]: ", ex["prompt"], "\n[[Assistant]]:"])
|
1406 |
+
return ex
|
1407 |
+
else:
|
1408 |
+
return ex
|
1409 |
+
|
1410 |
+
|
1411 |
+
@seqio.map_over_dataset(num_seeds=25)
|
1412 |
+
def clock_augmentation(ex, seeds):
|
1413 |
+
seeds = list(seeds)
|
1414 |
+
image = ex["image"]
|
1415 |
+
|
1416 |
+
# Apply shear, rotation, and scale through one affine matrix
|
1417 |
+
height = tf.cast(tf.shape(image)[0], tf.float32)
|
1418 |
+
width = tf.cast(tf.shape(image)[1], tf.float32)
|
1419 |
+
|
1420 |
+
_call_id = [0]
|
1421 |
+
|
1422 |
+
def _rng(_minval=0, _maxval=1, shape=(), dtype=tf.float32):
|
1423 |
+
return tf.random.stateless_uniform(shape, seeds.pop(), _minval, _maxval, dtype=dtype)
|
1424 |
+
|
1425 |
+
sel = _rng(0, 1)
|
1426 |
+
if sel < 0.1:
|
1427 |
+
# Straight on
|
1428 |
+
shear_x = 0.
|
1429 |
+
shear_y = 0.
|
1430 |
+
rotation = 0.
|
1431 |
+
elif sel < 0.5:
|
1432 |
+
# Normal looking
|
1433 |
+
shear_x = _rng(-10, 10)
|
1434 |
+
shear_y = _rng(-10, 10)
|
1435 |
+
rotation = _rng(-25, 25)
|
1436 |
+
else:
|
1437 |
+
# Allowed to be very wonky
|
1438 |
+
# if tf.random.stateless_uniform((), seeds.pop(), 0, 1) > 0.8:
|
1439 |
+
# image = image[:, ::-1]
|
1440 |
+
|
1441 |
+
if _rng() > 0.5:
|
1442 |
+
shear_x = _rng( -30, 30)
|
1443 |
+
shear_y = _rng( -30, 30)
|
1444 |
+
else:
|
1445 |
+
shear_x = _rng( -10, 10)
|
1446 |
+
shear_y = _rng( -10, 10)
|
1447 |
+
rng = _rng( 0, 1)
|
1448 |
+
if rng < 0.2:
|
1449 |
+
rotation = _rng( -25, 25)
|
1450 |
+
elif rng < 0.6:
|
1451 |
+
rotation = _rng( -80, 80)
|
1452 |
+
else:
|
1453 |
+
rotation = _rng( -180, 180)
|
1454 |
+
|
1455 |
+
if _rng() > 0.5:
|
1456 |
+
scale = _rng( 0.3, 2)
|
1457 |
+
else:
|
1458 |
+
scale = _rng( 0.3, 1)
|
1459 |
+
# Pad so upscaling/rotation will not move the image out of bounds
|
1460 |
+
pad = tf.cast(tf.maximum(height, width)*0.5, tf.int32)
|
1461 |
+
image = tf.pad(image, [[pad, pad], [pad, pad], [0, 0]], constant_values=1)
|
1462 |
+
height = tf.cast(tf.shape(image)[0], tf.float32)
|
1463 |
+
width = tf.cast(tf.shape(image)[1], tf.float32)
|
1464 |
+
|
1465 |
+
image = tf.keras.ops.image.affine_transform(
|
1466 |
+
image,
|
1467 |
+
tf.stack(get_affine_matrix(
|
1468 |
+
[height/2, width/2],
|
1469 |
+
rotation,
|
1470 |
+
[0, 0],
|
1471 |
+
1/scale,
|
1472 |
+
[shear_x, shear_y]
|
1473 |
+
) + [0., 0.]),
|
1474 |
+
interpolation='bilinear',
|
1475 |
+
fill_mode='constant',
|
1476 |
+
fill_value=1.,
|
1477 |
+
data_format='channels_last'
|
1478 |
+
)
|
1479 |
+
|
1480 |
+
# Crop to the non-white content, otherwise the clock could never end up at a corner of the padded image
|
1481 |
+
not_white = tf.logical_not(tf.reduce_all(image > 0.99, -1))
|
1482 |
+
no_white_ix = tf.where(not_white)
|
1483 |
+
top_left = tf.reduce_min(no_white_ix, axis=0)
|
1484 |
+
bottom_right = tf.reduce_max(no_white_ix, axis=0)
|
1485 |
+
image = tf.image.crop_to_bounding_box(
|
1486 |
+
image,
|
1487 |
+
offset_height=tf.cast(top_left[0], tf.int32),
|
1488 |
+
offset_width=tf.cast(top_left[1], tf.int32),
|
1489 |
+
target_height=tf.cast(bottom_right[0] - top_left[0] + 1, tf.int32),
|
1490 |
+
target_width=tf.cast(bottom_right[1] - top_left[1] + 1, tf.int32),
|
1491 |
+
)
|
1492 |
+
|
1493 |
+
# Translate
|
1494 |
+
height, width = tf.shape(image)[0], tf.shape(image)[1]
|
1495 |
+
translation_seed = _rng(0, 1)
|
1496 |
+
if translation_seed < 0.2:
|
1497 |
+
h_pad = _rng(0, height//2, (2,), dtype=tf.int32)
|
1498 |
+
w_pad = _rng(0, width//2, (2,), dtype=tf.int32)
|
1499 |
+
else:
|
1500 |
+
h_pad = _rng(0, height*2, (2,), dtype=tf.int32)
|
1501 |
+
w_pad = _rng(0, width*2, (2,), dtype=tf.int32)
|
1502 |
+
image = tf.pad(image, [[h_pad[0], w_pad[0]], [h_pad[1], w_pad[1]], [0, 0]],
|
1503 |
+
constant_values=1)
|
1504 |
+
|
1505 |
+
# Random background color
|
1506 |
+
# color_rng = tf.random.stateless_uniform((4,), seeds.pop(), 0, 1)
|
1507 |
+
# random_color = color_rng[:3]
|
1508 |
+
# valid = tf.reduce_all(tf.reduce_sum(tf.abs(random_color[None, None, :] - image), -1) > 0.03)
|
1509 |
+
# if color_rng[0] < 0.2 and valid:
|
1510 |
+
# image = tf.where(tf.reduce_all(image < 0.99, axis=-1, keepdims=True),
|
1511 |
+
# image, image * 0 + random_color[None, None, :])
|
1512 |
+
|
1513 |
+
# Mild color jitter
|
1514 |
+
image = tf.image.stateless_random_hue(image, max_delta=0.05, seed=seeds.pop())
|
1515 |
+
image = tf.image.stateless_random_brightness(image, max_delta=0.15, seed=seeds.pop())
|
1516 |
+
image = tf.image.stateless_random_saturation(image, 0.8, 1.2, seed=seeds.pop())
|
1517 |
+
image = tf.image.stateless_random_contrast(image, 0.8, 1.2, seed=seeds.pop())
|
1518 |
+
|
1519 |
+
# ex["metadata/unaugmented_image"] = ex["image"]
|
1520 |
+
ex["image"] = image
|
1521 |
+
return ex
|
1522 |
+
|
1523 |
+
|
1524 |
+
@seqio.map_over_dataset
|
1525 |
+
def clocks_preprocessor(ex):
|
1526 |
+
time_format = ex["time_format"]
|
1527 |
+
shows_seconds = ex["shows_seconds"]
|
1528 |
+
hour, minute, second = [tf.cast(ex[k], tf.int32) for k in ["hour", "minute", "second"]]
|
1529 |
+
if hour == 0: # Midnight of the previous day
|
1530 |
+
am_pm = "PM"
|
1531 |
+
hour_str = 12
|
1532 |
+
hour = 24
|
1533 |
+
elif hour > 12:
|
1534 |
+
am_pm = "PM"
|
1535 |
+
hour_str = hour - 12
|
1536 |
+
else:
|
1537 |
+
hour_str = hour
|
1538 |
+
am_pm = "AM"
|
1539 |
+
hour_str = tf.strings.as_string(hour_str)
|
1540 |
+
minute_str = tf.strings.as_string(minute)
|
1541 |
+
if tf.strings.length(minute_str) == 1:
|
1542 |
+
minute_str = tf.strings.join(["0", minute_str])
|
1543 |
+
|
1544 |
+
second_str = tf.strings.as_string(second)
|
1545 |
+
if tf.strings.length(second_str) == 1:
|
1546 |
+
second_str = tf.strings.join(["0", second_str])
|
1547 |
+
|
1548 |
+
prefix = "The time shown is "
|
1549 |
+
|
1550 |
+
if time_format == "The time is not shown":
|
1551 |
+
text = "The time is not shown in the image."
|
1552 |
+
hour, minute, second = -1, -1, -1
|
1553 |
+
else:
|
1554 |
+
if not shows_seconds:
|
1555 |
+
second = -1
|
1556 |
+
if time_format == "12 hour clock (without AM/PM)" and shows_seconds:
|
1557 |
+
if hour > 12:
|
1558 |
+
hour = hour - 12
|
1559 |
+
time = tf.strings.join([hour_str, ":", minute_str, ":", second_str])
|
1560 |
+
elif time_format == "12 hour clock (with AM/PM)" and shows_seconds:
|
1561 |
+
time = tf.strings.join([hour_str, ":", minute_str, ":", second_str, " ", am_pm])
|
1562 |
+
elif time_format == "12 hour clock (with AM/PM)" and not shows_seconds:
|
1563 |
+
time = tf.strings.join([hour_str, ":", minute_str, " ", am_pm])
|
1564 |
+
elif time_format == "12 hour clock (without AM/PM)" and not shows_seconds:
|
1565 |
+
if hour > 12:
|
1566 |
+
hour = hour - 12
|
1567 |
+
time = tf.strings.join([hour_str, ":", minute_str])
|
1568 |
+
else:
|
1569 |
+
time = "" # Should never occur, but needed for tf analysis
|
1570 |
+
tf.debugging.assert_equal(tf.strings.length(time) > 0, True)
|
1571 |
+
text = tf.strings.join(["The time shown is ", time])
|
1572 |
+
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1573 |
+
image = tf.image.convert_image_dtype(image, tf.float32)[:-120] # remove the black shadow at the bottom
|
1574 |
+
return {
|
1575 |
+
"image": image,
|
1576 |
+
"prompt": "What time is being shown?",
|
1577 |
+
"text": text,
|
1578 |
+
"metadata/time_format": time_format,
|
1579 |
+
"metadata/hour": hour,
|
1580 |
+
"metadata/minute": minute,
|
1581 |
+
"metadata/text": text,
|
1582 |
+
"metadata/second": second,
|
1583 |
+
}
|
1584 |
+
|
1585 |
+
|
1586 |
+
@seqio.map_over_dataset()
|
1587 |
+
def atlas_obscura_preprocessor(ex):
|
1588 |
+
out = dict(
|
1589 |
+
image=ex["image"],
|
1590 |
+
prompt="Where was this picture taken?",
|
1591 |
+
text=tf.strings.join([
|
1592 |
+
ex["place"],
|
1593 |
+
" in ",
|
1594 |
+
ex["city"]
|
1595 |
+
])
|
1596 |
+
)
|
1597 |
+
out["metadata/image_url"] = ex["image_url"]
|
1598 |
+
out["metadata/references"] = out["text"]
|
1599 |
+
return out
|
1600 |
+
|
1601 |
+
|
1602 |
+
@seqio.map_over_dataset()
|
1603 |
+
def famous_birthdays_preprocessor(ex):
|
1604 |
+
out = dict(
|
1605 |
+
image=ex["image"],
|
1606 |
+
image_url=ex["image_url"],
|
1607 |
+
prompt="Who is this?",
|
1608 |
+
text=ex["name"]
|
1609 |
+
)
|
1610 |
+
out["metadata/references"] = out["text"]
|
1611 |
+
return out
|
1612 |
+
|
1613 |
+
|
1614 |
+
@seqio.map_over_dataset()
|
1615 |
+
def mild_color_aug_preprocessor(ex):
|
1616 |
+
if "image_url" in ex: # URL won't show the augmentations
|
1617 |
+
del ex["image_url"]
|
1618 |
+
# ex["metadata/unaugmented_image"] = ex["image"]
|
1619 |
+
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1620 |
+
ex["image"] = mild_color_aug(ex["image"])
|
1621 |
+
return ex
|
1622 |
+
|
1623 |
+
|
1624 |
+
def build_text_with_points(text, points, img_h, img_w):
|
1625 |
+
points = points_to_text(img_h, img_w, points[:, 0], points[:, 1])
|
1626 |
+
parts = tf.strings.split(text, sep="<ANS>")
|
1627 |
+
with_points = tf.strings.reduce_join(tf.reshape(tf.stack([
|
1628 |
+
parts,
|
1629 |
+
tf.pad(points, [[0, 1]], constant_values=""),
|
1630 |
+
], 1), [-1]), separator="")
|
1631 |
+
return tf.strings.split(with_points, "\n\n")
|
1632 |
+
|
1633 |
+
|
1634 |
+
@seqio.map_over_dataset()
|
1635 |
+
def synth_count_preprocessor(example):
|
1636 |
+
image_shape = tf.shape(example["image"])
|
1637 |
+
h, w = image_shape[0], image_shape[1]
|
1638 |
+
questions = build_text_with_points(example["questions"], example["question_points"], h, w)
|
1639 |
+
answers = build_text_with_points(example["answers"], example["answer_points"], h, w)
|
1640 |
+
keep_q = tf.strings.regex_full_match(questions, "How many.*")
|
1641 |
+
keep_ans = tf.strings.regex_full_match(answers, "There are [0-9]+.*")
|
1642 |
+
keep = tf.logical_and(keep_q, keep_ans)
|
1643 |
+
questions = tf.boolean_mask(questions, keep)
|
1644 |
+
answers = tf.boolean_mask(answers, keep)
|
1645 |
+
ix = tf.range(0, tf.shape(answers)[0], dtype=tf.int32)
|
1646 |
+
ix = tf.random.shuffle(ix)
|
1647 |
+
return dict(
|
1648 |
+
image=example["image"],
|
1649 |
+
prompt=tf.gather(questions, ix),
|
1650 |
+
text=tf.gather(answers, ix),
|
1651 |
+
)
|
1652 |
+
|
1653 |
+
|
1654 |
+
def synth_count_inf_preprocessor(ds):
|
1655 |
+
|
1656 |
+
@seqio.map_over_dataset(num_seeds=1)
|
1657 |
+
def get_two(example, seed):
|
1658 |
+
image_shape = tf.shape(example["image"])
|
1659 |
+
h, w = image_shape[0], image_shape[1]
|
1660 |
+
questions = build_text_with_points(example["questions"], example["question_points"], h, w)
|
1661 |
+
answers = build_text_with_points(example["answers"], example["answer_points"], h, w)
|
1662 |
+
keep_q = tf.strings.regex_full_match(questions, "How many.*")
|
1663 |
+
keep_ans = tf.strings.regex_full_match(answers, "There are [0-9]+.*")
|
1664 |
+
keep = tf.logical_and(keep_q, keep_ans)
|
1665 |
+
questions = tf.boolean_mask(questions, keep)
|
1666 |
+
answers = tf.boolean_mask(answers, keep)
|
1667 |
+
|
1668 |
+
ix = stateless_permutation(tf.shape(answers)[0], seed)[:2]
|
1669 |
+
return {
|
1670 |
+
"image": example["image"],
|
1671 |
+
"prompt": tf.gather(questions, ix),
|
1672 |
+
"metadata/references": tf.gather(answers, ix),
|
1673 |
+
}
|
1674 |
+
|
1675 |
+
ds = get_two(ds)
|
1676 |
+
return flatten_parts(ds, ["prompt", "metadata/references"])
|
1677 |
+
|
1678 |
+
|
1679 |
+
def mild_color_aug(image):
|
1680 |
+
image = tf.image.random_hue(image, max_delta=0.05)
|
1681 |
+
image = tf.image.random_brightness(image, max_delta=0.15)
|
1682 |
+
image = tf.image.random_saturation(image, 0.7, 1.3)
|
1683 |
+
image = tf.image.random_contrast(image, 0.8, 1.2)
|
1684 |
+
return image
|
1685 |
+
|
1686 |
+
|
1687 |
+
@seqio.map_over_dataset()
|
1688 |
+
def name_entity_augmentation(ex, p_high_color=0.7):
|
1689 |
+
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
|
1690 |
+
image = ex["image"]
|
1691 |
+
image = tf.image.convert_image_dtype(image, tf.float32)
|
1692 |
+
|
1693 |
+
# Horizontal flip
|
1694 |
+
if tf.random.uniform((), 0, 1) > 0.85:
|
1695 |
+
image = image[:, ::-1]
|
1696 |
+
|
1697 |
+
# Random crop
|
1698 |
+
height = tf.cast(tf.shape(image)[0], tf.float32)
|
1699 |
+
width = tf.cast(tf.shape(image)[1], tf.float32)
|
1700 |
+
crop_rng = tf.random.uniform((), 0, 1)
|
1701 |
+
if crop_rng < 0.2:
|
1702 |
+
pass
|
1703 |
+
else:
|
1704 |
+
if crop_rng < 0.4:
|
1705 |
+
h_crop = height * 0.15
|
1706 |
+
w_crop = width * 0.15
|
1707 |
+
else:
|
1708 |
+
h_crop = height * 0.4
|
1709 |
+
w_crop = width * 0.4
|
1710 |
+
crop_h = tf.cast(tf.random.uniform((2,), 0, h_crop/2), tf.int32)
|
1711 |
+
crop_w = tf.cast(tf.random.uniform((2,), 0, w_crop/2), tf.int32)
|
1712 |
+
image = image[crop_h[0]:-crop_h[1]-1, crop_w[0]:-crop_w[1]-1]
|
1713 |
+
height = tf.cast(tf.shape(image)[0], tf.float32)
|
1714 |
+
width = tf.cast(tf.shape(image)[1], tf.float32)
|
1715 |
+
|
1716 |
+
if tf.random.uniform(()) > p_high_color:
|
1717 |
+
image = tf.image.random_hue(image, max_delta=0.05)
|
1718 |
+
image = tf.image.random_brightness(image, max_delta=0.15)
|
1719 |
+
image = tf.image.random_saturation(image, 0.7, 1.3)
|
1720 |
+
image = tf.image.random_contrast(image, 0.8, 1.2)
|
1721 |
+
else:
|
1722 |
+
image = tf.image.random_hue(image, max_delta=0.1)
|
1723 |
+
image = tf.image.random_brightness(image, max_delta=0.3)
|
1724 |
+
image = tf.image.random_saturation(image, 0.0, 2.0)
|
1725 |
+
image = tf.image.random_contrast(image, 0.2, 1.5)
|
1726 |
+
|
1727 |
+
# Apply shear, rotation, and scale through one affine matrix
|
1728 |
+
sel = tf.random.uniform((), 0, 1)
|
1729 |
+
if sel < 0.1:
|
1730 |
+
pass
|
1731 |
+
else:
|
1732 |
+
if sel < 0.15: # Scale only
|
1733 |
+
shear_x = 0
|
1734 |
+
shear_y = 0
|
1735 |
+
rotation = 0
|
1736 |
+
elif sel < 0.7: # Mild
|
1737 |
+
shear_x = tf.random.uniform((), -2, 2)
|
1738 |
+
shear_y = tf.random.uniform((), -2, 2)
|
1739 |
+
rotation = tf.random.uniform((), -5, 5)
|
1740 |
+
else: # Severe
|
1741 |
+
shear_x = tf.random.uniform((), -10, 10)
|
1742 |
+
shear_y = tf.random.uniform((), -10, 10)
|
1743 |
+
rotation = tf.random.uniform((), -20, 20)
|
1744 |
+
|
1745 |
+
max_scale = 1.2
|
1746 |
+
scale = tf.random.uniform((), 0.4, max_scale)
|
1747 |
+
|
1748 |
+
# Pad so upscaling/rotation will not move the image out of bounds
|
1749 |
+
pad = tf.cast(tf.maximum(height, width)*0.2, tf.int32)
|
1750 |
+
image = tf.pad(image, [[pad, pad], [pad, pad], [0, 0]], constant_values=1)
|
1751 |
+
|
1752 |
+
image = tf.keras.ops.image.affine_transform(
|
1753 |
+
image,
|
1754 |
+
tf.stack(get_affine_matrix(
|
1755 |
+
[height/2, width/2],
|
1756 |
+
rotation,
|
1757 |
+
[0, 0],
|
1758 |
+
1/scale,
|
1759 |
+
[shear_x, shear_y]
|
1760 |
+
) + [0., 0.]),
|
1761 |
+
interpolation='bilinear',
|
1762 |
+
fill_mode='constant',
|
1763 |
+
fill_value=1.,
|
1764 |
+
data_format='channels_last'
|
1765 |
+
)
|
1766 |
+
|
1767 |
+
# Crop to the non-white content, otherwise the content could never end up at a corner of the padded image
|
1768 |
+
not_white = tf.logical_not(tf.reduce_all(image > 0.99, -1))
|
1769 |
+
no_white_ix = tf.where(not_white)
|
1770 |
+
top_left = tf.reduce_min(no_white_ix, axis=0)
|
1771 |
+
bottom_right = tf.reduce_max(no_white_ix, axis=0)
|
1772 |
+
|
1773 |
+
# There is a small chance the crop contains nothing but white space; skip the crop in that case
|
1774 |
+
if (
|
1775 |
+
(bottom_right[0] - top_left[0]) > 1 and (bottom_right[1] - top_left[1]) > 1
|
1776 |
+
):
|
1777 |
+
image = tf.image.crop_to_bounding_box(
|
1778 |
+
image,
|
1779 |
+
offset_height=tf.cast(top_left[0], tf.int32),
|
1780 |
+
offset_width=tf.cast(top_left[1], tf.int32),
|
1781 |
+
target_height=tf.cast(bottom_right[0] - top_left[0] + 1, tf.int32),
|
1782 |
+
target_width=tf.cast(bottom_right[1] - top_left[1] + 1, tf.int32),
|
1783 |
+
)
|
1784 |
+
|
1785 |
+
# Translate
|
1786 |
+
height, width = tf.shape(image)[0], tf.shape(image)[1]
|
1787 |
+
if tf.random.uniform((), 0, 1) < 0.1:
|
1788 |
+
h_pad = tf.zeros((2,), dtype=tf.int32)
|
1789 |
+
w_pad = tf.zeros((2,), dtype=tf.int32)
|
1790 |
+
elif tf.random.uniform((), 0, 1) < 0.8:
|
1791 |
+
h_pad = tf.random.uniform((2,), 0, 50, dtype=tf.int32)
|
1792 |
+
w_pad = tf.random.uniform((2,), 0, 50, dtype=tf.int32)
|
1793 |
+
else:
|
1794 |
+
pad = tf.cast(tf.maximum(height, width), tf.int32)
|
1795 |
+
h_pad = tf.random.uniform((2,), 0, pad, dtype=tf.int32)
|
1796 |
+
w_pad = tf.random.uniform((2,), 0, pad, dtype=tf.int32)
|
1797 |
+
image = tf.pad(image, [[h_pad[0], w_pad[0]], [h_pad[1], w_pad[1]], [0, 0]],
|
1798 |
+
constant_values=1)
|
1799 |
+
|
1800 |
+
if "image_url" in ex: # URL won't show the augmentations
|
1801 |
+
del ex["image_url"]
|
1802 |
+
# ex["metadata/unaugmented_image"] = ex["image"]
|
1803 |
+
ex["image"] = image
|
1804 |
+
return ex
|
1805 |
+
|
1806 |
+
|
1807 |
+
@seqio.map_over_dataset()
|
1808 |
+
def wiki_art_preprocessor(ex):
|
1809 |
+
out = dict(
|
1810 |
+
image=ex["image"],
|
1811 |
+
prompt="What is this?",
|
1812 |
+
text=ex["question"]
|
1813 |
+
)
|
1814 |
+
out["metadata/title"] = ex["title"]
|
1815 |
+
out["metadata/gt"] = ex["question"]
|
1816 |
+
out["metadata/artist"] = ex["artist"]
|
1817 |
+
out["metadata/painting_url"] = ex["painting_url"]
|
1818 |
+
# if "metadata/unaugmented_image" in ex:
|
1819 |
+
# out["metadata/unaugmented_image"] = ex["metadata/unaugmented_image"]
|
1820 |
+
return out
|
1821 |
+
|
1822 |
+
@seqio.map_over_dataset()
|
1823 |
+
def oscar_preprocessor(ex):
|
1824 |
+
out = dict(
|
1825 |
+
image=ex["image"],
|
1826 |
+
prompt=ex["question"]
|
1827 |
+
)
|
1828 |
+
out.update(_add_metadata(ex))
|
1829 |
+
out["metadata/question"] = ex["question"]
|
1830 |
+
out["metadata/answer"] = ex["answer"]
|
1831 |
+
out["metadata/category"] = ex["category"]
|
1832 |
+
return out
|
1833 |
+
|
1834 |
+
|
1835 |
+
@seqio.map_over_dataset()
|
1836 |
+
def tulu_preprocessor(ex):
|
1837 |
+
return {
|
1838 |
+
"messages": ex["messages"]["content"],
|
1839 |
+
}
|
1840 |
+
# logging.info("Debugging tulue")
|
1841 |
+
# return {"messages": ex["messages"]["content"], "text_weights": 1e-6}
|
1842 |
+
|
1843 |
+
|
1844 |
+
WIKI_DATA_QUESTION = "What is this? Respond with just a proper name."
|
1845 |
+
|
1846 |
+
|
1847 |
+
@seqio.map_over_dataset()
|
1848 |
+
def extract_wiki_data(ex):
|
1849 |
+
return dict(
|
1850 |
+
image=ex["image"],
|
1851 |
+
image_url=ex["image_url"],
|
1852 |
+
prompt=[
|
1853 |
+
WIKI_DATA_QUESTION,
|
1854 |
+
"What is this? Respond with the proper name of the main focus of the image and a few details about it."
|
1855 |
+
],
|
1856 |
+
text=[
|
1857 |
+
tf.strings.strip(tf.strings.regex_replace(ex["question"], r"\(.*\)", "")),
|
1858 |
+
ex["gptResponse"],
|
1859 |
+
]
|
1860 |
+
)
|
1861 |
+
|
1862 |
+
|
1863 |
+
@seqio.map_over_dataset()
|
1864 |
+
def extract_wiki_data_name(ex):
|
1865 |
+
target = tf.strings.strip(tf.strings.regex_replace(ex["question"], r"\(.*\)", ""))
|
1866 |
+
out = dict(
|
1867 |
+
image=ex["image"],
|
1868 |
+
image_url=ex["image_url"],
|
1869 |
+
prompt=WIKI_DATA_QUESTION,
|
1870 |
+
text=target,
|
1871 |
+
)
|
1872 |
+
out["metadata/references"] = target
|
1873 |
+
return out
|
1874 |
+
|
1875 |
+
|
1876 |
+
@seqio.map_over_dataset()
|
1877 |
+
def extract_wiki_data_describe(ex):
|
1878 |
+
out = dict(
|
1879 |
+
image=ex["image"],
|
1880 |
+
image_url=ex["image_url"],
|
1881 |
+
prompt="What is this? Respond with the proper name of the main focus of the image and a few details about it.",
|
1882 |
+
)
|
1883 |
+
out["metadata/references"] = ex["gptResponse"]
|
1884 |
+
return out
|
1885 |
+
|
1886 |
+
|
1887 |
+
@gin.configurable()
|
1888 |
+
def format_multiple_style_qa(ds, types=['multiple_choice', 'short_answer'], styles=['ai2_diagram', 'vqa2'], default_style='vqa2',
|
1889 |
+
strip_instruction=False):
|
1890 |
+
def _extract(ex):
|
1891 |
+
prompt = ex["question"]
|
1892 |
+
out = dict(image=ex["image"])
|
1893 |
+
out.update(_add_metadata(ex))
|
1894 |
+
|
1895 |
+
out["text"] = ex["answer"]
|
1896 |
+
out["metadata/references"] = ex["answer"]
|
1897 |
+
|
1898 |
+
if ex["metadata/question_type"] == 'multiple_choice':
|
1899 |
+
style = styles[0]
|
1900 |
+
else:
|
1901 |
+
style = styles[1]
|
1902 |
+
if strip_instruction:
|
1903 |
+
if ex["metadata/question_type"] == "multiple_choice":
|
1904 |
+
# parts = tf.strings.split(prompt, "\n")
|
1905 |
+
# parts 1 is blank and part -1 is the instruction
|
1906 |
+
# prompt = tf.strings.reduce_join(tf.concat([parts[:1], parts[2:-1]], 0), separator="\n")
|
1907 |
+
prompt = prompt
|
1908 |
+
else:
|
1909 |
+
prompt = tf.strings.split(prompt, "\n")[0]
|
1910 |
+
|
1911 |
+
out["style"] = style
|
1912 |
+
out["prompt"] = prompt
|
1913 |
+
return out
|
1914 |
+
ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
1915 |
+
return ds
|
1916 |
+
|
1917 |
+
|
1918 |
+
@gin.configurable()
|
1919 |
+
def extract_mmmu(ds, types=['multiple-choice', 'open'], styles=['ai2_diagram', 'vqa2'], default_style='ai2_diagram', option_format="abc"):
|
1920 |
+
assert option_format == "abc"
|
1921 |
+
keys_tensor = tf.constant(types, dtype=tf.string)
|
1922 |
+
values_tensor = tf.constant(styles, dtype=tf.string)
|
1923 |
+
table = tf.lookup.StaticHashTable(
|
1924 |
+
tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor),
|
1925 |
+
default_value=tf.constant(default_style, dtype=tf.string),
|
1926 |
+
)
|
1927 |
+
def _extract(ex):
|
1928 |
+
out = dict(image=tf.expand_dims(ex["image_1"], 0))
|
1929 |
+
out.update(_add_metadata(ex))
|
1930 |
+
style = table.lookup(ex["metadata/question_type"])
|
1931 |
+
out["style"] = style
|
1932 |
+
out["text"] = ex["answer"]
|
1933 |
+
out["metadata/references"] = ex["answer"]
|
1934 |
+
|
1935 |
+
if style == styles[0]:
|
1936 |
+
abc = tf.constant(list("abcdefghi".upper()))
|
1937 |
+
options = ex["options"]
|
1938 |
+
num_options = tf.shape(options)[0]
|
1939 |
+
dummy_options = tf.tile(tf.constant([""], dtype=tf.string), [9 - num_options])
|
1940 |
+
out["metadata/options"] = tf.concat([options, dummy_options], axis=0)
|
1941 |
+
out["metadata/options"] = tf.ensure_shape(out["metadata/options"], [9])
|
1942 |
+
|
1943 |
+
short_options = abc[:num_options]
|
1944 |
+
options = tf.stack([short_options, options,], 1)
|
1945 |
+
options = tf.strings.reduce_join(options, axis=-1, separator=": ")
|
1946 |
+
options = tf.strings.reduce_join(options, separator="\n")
|
1947 |
+
out["prompt"] = tf.strings.join([ex["question"], "\n", options, "\n"])
|
1948 |
+
if tf.reduce_sum(tf.cast(tf.strings.regex_full_match(options, "<img='(.*?)'>"), tf.int32)) > 1:
|
1949 |
+
# Following LLaVa, don't use any images if there are multiple image paths
|
1950 |
+
# I think the rationale is that this means the images are answer options
|
1951 |
+
out["image"] = out["image"][:0]
|
1952 |
+
else:
|
1953 |
+
out["metadata/options"] = tf.constant([""] * 9, dtype=tf.string)
|
1954 |
+
out["prompt"] = ex["question"]
|
1955 |
+
out["image"] = out["image"][:0]
|
1956 |
+
return out
|
1957 |
+
ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
1958 |
+
return ds
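# Worked example (toy options, not MMMU data) of the "A: ..." option formatting used
# in `extract_mmmu` above:
def _example_format_options():
    abc = tf.constant(list("abcdefghi".upper()))
    options = tf.constant(["cat", "dog", "bird"])
    short_options = abc[:3]
    lines = tf.strings.reduce_join(
        tf.stack([short_options, options], 1), axis=-1, separator=": ")
    return tf.strings.reduce_join(lines, separator="\n")  # "A: cat\nB: dog\nC: bird"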
|
1959 |
+
|
1960 |
+
@gin.configurable()
|
1961 |
+
def extract_mmmu_cot(ds, types=['multiple-choice', 'open'], styles=['ai2_diagram', 'vqa2'], default_style='ai2_diagram', option_format="abc"):
|
1962 |
+
assert option_format == "abc"
|
1963 |
+
keys_tensor = tf.constant(types, dtype=tf.string)
|
1964 |
+
values_tensor = tf.constant(styles, dtype=tf.string)
|
1965 |
+
table = tf.lookup.StaticHashTable(
|
1966 |
+
tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor),
|
1967 |
+
default_value=tf.constant(default_style, dtype=tf.string),
|
1968 |
+
)
|
1969 |
+
def _extract(ex):
|
1970 |
+
# out = dict(image=tf.expand_dims(ex["image_with_question"], 0))
|
1971 |
+
out = dict(image=tf.expand_dims(ex["image_1"], 0))
|
1972 |
+
out.update(_add_metadata(ex))
|
1973 |
+
style = table.lookup(ex["metadata/question_type"])
|
1974 |
+
# out["style"] = style
|
1975 |
+
out["text"] = ex["answer"]
|
1976 |
+
out["metadata/question"] = ex["question"]
|
1977 |
+
out["metadata/references"] = ex["answer"]
|
1978 |
+
|
1979 |
+
if style == styles[0]:
|
1980 |
+
abc = tf.constant(list("abcdefghi".upper()))
|
1981 |
+
options = ex["options"]
|
1982 |
+
num_options = tf.shape(options)[0]
|
1983 |
+
dummy_options = tf.tile(tf.constant([""], dtype=tf.string), [9 - num_options])
|
1984 |
+
out["metadata/options"] = tf.concat([options, dummy_options], axis=0)
|
1985 |
+
out["metadata/options"] = tf.ensure_shape(out["metadata/options"], [9])
|
1986 |
+
|
1987 |
+
short_options = abc[:num_options]
|
1988 |
+
options = tf.stack([short_options, options,], 1)
|
1989 |
+
options = tf.strings.reduce_join(options, axis=-1, separator=": ")
|
1990 |
+
options = tf.strings.reduce_join(options, separator="\n")
|
1991 |
+
out["prompt"] = tf.strings.join([ex["question"], "\n", options, "\n"])
|
1992 |
+
# out["prompt"] = ex["question"]
|
1993 |
+
if tf.reduce_sum(tf.cast(tf.strings.regex_full_match(options, "<img='(.*?)'>"), tf.int32)) > 1:
|
1994 |
+
# Following LLaVa, don't use any images if there are multiple image paths
|
1995 |
+
# I think the rationale is that this means the images are answer options
|
1996 |
+
out["image"] = out["image"][:0]
|
1997 |
+
else:
|
1998 |
+
out["metadata/options"] = tf.constant([""] * 9, dtype=tf.string)
|
1999 |
+
out["prompt"] = ex["question"]
|
2000 |
+
# out["image"] = out["image"][:0]
|
2001 |
+
return out
|
2002 |
+
ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
2003 |
+
return ds
|
2004 |
+
|
2005 |
+
|
2006 |
+
@seqio.map_over_dataset
|
2007 |
+
def reformat_math_vista(ex):
|
2008 |
+
query = ex["query"]
|
2009 |
+
query = tf.strings.split(query, sep="Question:")[-1]
|
2010 |
+
query = tf.strings.strip(tf.strings.split(query, sep="Hint:")[0])
|
2011 |
+
ex["query"] = query
|
2012 |
+
return ex
|
2013 |
+
|
2014 |
+
|
2015 |
+
@seqio.map_over_dataset
|
2016 |
+
def extract_math_vista(ex, styles=['ai2_diagram', 'vqa2']):
|
2017 |
+
out = dict(image=ex["image"])
|
2018 |
+
out.update(_add_metadata(ex))
|
2019 |
+
|
2020 |
+
is_mc = ex["metadata/question_type"] == 'multi_choice'
|
2021 |
+
if is_mc:
|
2022 |
+
style = styles[0]
|
2023 |
+
abc = tf.constant(list("abcdefghi".upper()))
|
2024 |
+
options = ex["choices"]
|
2025 |
+
num_options = tf.shape(options)[0]
|
2026 |
+
dummy_options = tf.tile(tf.constant([""], dtype=tf.string), [9 - num_options])
|
2027 |
+
out["metadata/options"] = tf.concat([options, dummy_options], axis=0)
|
2028 |
+
out["metadata/options"] = tf.ensure_shape(out["metadata/options"], [9])
|
2029 |
+
|
2030 |
+
if ex["metadata/split"] != "test":
|
2031 |
+
short_options = abc[:num_options]
|
2032 |
+
answer_short_option = tf.boolean_mask(short_options, options == ex["answer"])[0]
|
2033 |
+
out["text"] = answer_short_option
|
2034 |
+
else:
|
2035 |
+
out["text"] = ex["answer"]
|
2036 |
+
else:
|
2037 |
+
style = styles[1]
|
2038 |
+
out["metadata/options"] = tf.constant([""] * 9, dtype=tf.string)
|
2039 |
+
out["text"] = ex["answer"]
|
2040 |
+
out["style"] = style
|
2041 |
+
out["prompt"] = ex["query"]
|
2042 |
+
out["metadata/query"] = ex["query"]
|
2043 |
+
out["metadata/references"] = ex["answer"]
|
2044 |
+
return out
|
2045 |
+
|
2046 |
+
|
2047 |
+
NO_POINT_PREFIX = [
|
2048 |
+
"No pointing: ",
|
2049 |
+
"No pointing: ",
|
2050 |
+
"no pointing:\n",
|
2051 |
+
"No pointing:\n",
|
2052 |
+
"Not pointing:\n",
|
2053 |
+
"No Points: ",
|
2054 |
+
"No Points: ",
|
2055 |
+
"NO POINTING\n",
|
2056 |
+
"No pontiing\n",
|
2057 |
+
"No Points:\n ",
|
2058 |
+
"No pointing\n",
|
2059 |
+
"Do not point. ",
|
2060 |
+
"Refrain from pointing. ",
|
2061 |
+
"Avoid generating points . ",
|
2062 |
+
"For this question, do not use points. ",
|
2063 |
+
"Refrain from using points:\n",
|
2064 |
+
"Don't include points in your response. ",
|
2065 |
+
"Don't point. ",
|
2066 |
+
"Don't use points. ",
|
2067 |
+
"Please don't use points.\n\n",
|
2068 |
+
"Please don't use points.\n\n",
|
2069 |
+
"Respond without using points. ",
|
2070 |
+
"Respond without pointing:\n",
|
2071 |
+
"Do not generate ponits: ",
|
2072 |
+
"Do not point. ",
|
2073 |
+
"Do not point\n",
|
2074 |
+
"no pointing\n\n",
|
2075 |
+
"Answer without points: ",
|
2076 |
+
"Answer this question without pointing: ",
|
2077 |
+
"Answer without poiints. ",
|
2078 |
+
"answer without points: ",
|
2079 |
+
"answer with text only, do not points\n"
|
2080 |
+
]
|
2081 |
+
assert all(x[-1].isspace() for x in NO_POINT_PREFIX)
|
2082 |
+
NO_POINT_PREFIX_TF = tf.constant(NO_POINT_PREFIX)
|
2083 |
+
|
2084 |
+
|
2085 |
+
def prefix_how_many(messages, seed):
|
2086 |
+
question = messages[0]
|
2087 |
+
if tf.strings.regex_full_match(tf.strings.lower(question), "how many.*"):
|
2088 |
+
ix = tf.random.stateless_uniform((), seed, 0, len(NO_POINT_PREFIX), tf.int32)
|
2089 |
+
question = tf.strings.join([NO_POINT_PREFIX_TF[ix], question])
|
2090 |
+
return tf.concat([tf.expand_dims(question, 0), messages[1:]], axis=0)
|
2091 |
+
else:
|
2092 |
+
return messages
|
2093 |
+
|
2094 |
+
|
2095 |
+
@seqio.map_over_dataset(num_seeds=1)
|
2096 |
+
def prefix_how_many_messages(ex, seed):
|
2097 |
+
messages = ex["messages"]
|
2098 |
+
n = tf.shape(messages)[0]
|
2099 |
+
seeds = tf.random.split(seed, n)
|
2100 |
+
message_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=(None,))
|
2101 |
+
for i in range(n):
|
2102 |
+
message_arr = message_arr.write(i, prefix_how_many(messages[i], seeds[i]))
|
2103 |
+
ex["messages"] = tf.RaggedTensor.from_row_splits(
|
2104 |
+
values=message_arr.concat(), row_splits=messages.row_splits)
|
2105 |
+
return ex
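# A toy sketch of the ragged reconstruction used in `prefix_how_many_messages` above:
# per-conversation rows are transformed, concatenated, and re-assembled with the
# original row_splits (the upper-casing below is illustrative only).
def _example_ragged_roundtrip():
    messages = tf.ragged.constant([["How many dogs?", "Two."], ["Hi", "Hello"]])
    values = tf.strings.upper(messages.flat_values)
    return tf.RaggedTensor.from_row_splits(values, messages.row_splits)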
|
2106 |
+
|
2107 |
+
|
2108 |
+
def filter_single_turn(ds):
|
2109 |
+
@seqio.map_over_dataset
|
2110 |
+
def _filter(ex):
|
2111 |
+
multi_turn = ex["messages"].row_lengths() > 2
|
2112 |
+
ex["messages"] = tf.ragged.boolean_mask(ex["messages"], multi_turn)
|
2113 |
+
return ex
|
2114 |
+
|
2115 |
+
ds = _filter(ds)
|
2116 |
+
ds = ds.filter(lambda x: tf.shape(x["messages"])[0] > 0)
|
2117 |
+
return ds
|
2118 |
+
|
2119 |
+
|
2120 |
+
@seqio.map_over_dataset(num_seeds=1)
|
2121 |
+
def extract_cockatoo_qa_v2(ex, seed):
|
2122 |
+
messages = tf.RaggedTensor.from_value_rowids(ex["messages"], ex["conversation_ids"])
|
2123 |
+
ix = stateless_permutation(tf.shape(messages)[0], seed)
|
2124 |
+
messages = tf.gather(messages, ix)
|
2125 |
+
out = dict(
|
2126 |
+
image=ex["image"],
|
2127 |
+
messages=messages
|
2128 |
+
)
|
2129 |
+
out.update(_add_metadata(ex))
|
2130 |
+
return out
|
2131 |
+
|
2132 |
+
|
2133 |
+
def format_mmbench(ds):
|
2134 |
+
|
2135 |
+
def _trim(ex):
|
2136 |
+
num_passes = tf.shape(ex["id"])[0]
|
2137 |
+
ex["choices"] = ex["choices"][:num_passes, :num_passes]
|
2138 |
+
ex["answer"] = ex["answer"][:num_passes]
|
2139 |
+
return ex
|
2140 |
+
|
2141 |
+
ds = ds.map(_trim)
|
2142 |
+
ds = flatten_parts(ds, ["id", "query", "choices", "answer"])
|
2143 |
+
|
2144 |
+
def _extract(ex):
|
2145 |
+
out = dict(image=ex["image"])
|
2146 |
+
out.update(_add_metadata(ex))
|
2147 |
+
out["prompt"] = ex["query"]
|
2148 |
+
out["text"] = ex["answer"]
|
2149 |
+
options = ex["choices"]
|
2150 |
+
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, ".*\|\|\|.*")), False)
|
2151 |
+
out["metadata/options"] = tf.strings.reduce_join(options, separator="|||")
|
2152 |
+
out["metadata/question"] = ex["question"]
|
2153 |
+
out["metadata/references"] = ex["answer"]
|
2154 |
+
return out
|
2155 |
+
|
2156 |
+
ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
2157 |
+
return ds
|
2158 |
+
|
2159 |
+
|
2160 |
+
@seqio.map_over_dataset
|
2161 |
+
def extract_lvis(ex, class_name_file="gs://oe-training-chrisc/cockatoo/data/lvis_class_names.json"):
|
2162 |
+
with tf.io.gfile.GFile(class_name_file) as f:
|
2163 |
+
class_names = json.load(f)
|
2164 |
+
class_names_arr = [None]*len(class_names)
|
2165 |
+
for k, v in class_names.items():
|
2166 |
+
class_names_arr[int(k)] = v
|
2167 |
+
assert all(x is not None for x in class_names_arr)
|
2168 |
+
class_names_arr = tf.constant(class_names_arr)
|
2169 |
+
|
2170 |
+
return dict(
|
2171 |
+
image=ex["image"],
|
2172 |
+
bbox=ex["objects"]["bbox"],
|
2173 |
+
label=tf.gather(class_names_arr, ex["objects"]["label"]),
|
2174 |
+
)
|
2175 |
+
|
2176 |
+
|
2177 |
+
def extract_open_images_boxes(ds):
|
2178 |
+
# ds = ds.filter(lambda ex: tf.logical_or(
|
2179 |
+
# tf.shape(ex["cap/cap_caption"])[0] > 0,
|
2180 |
+
# tf.shape(ex["detection/bbox"])[0] > 0
|
2181 |
+
# ))
|
2182 |
+
ds = ds.filter(lambda ex: tf.shape(ex["cap/cap_caption"])[0] > 0)
|
2183 |
+
|
2184 |
+
@seqio.map_over_dataset
|
2185 |
+
def _map(ex):
|
2186 |
+
bbox = tf.reshape(ex["detection/bbox"], (-1, 4))
|
2187 |
+
bbox = tf.stack([
|
2188 |
+
bbox[:, 2],
|
2189 |
+
bbox[:, 0],
|
2190 |
+
bbox[:, 3],
|
2191 |
+
bbox[:, 1]
|
2192 |
+
], 1)
|
2193 |
+
return dict(
|
2194 |
+
image=tf.image.decode_jpeg(ex["image"]),
|
2195 |
+
bbox=bbox,
|
2196 |
+
label=ex["detection/label"],
|
2197 |
+
caption=tf.strings.reduce_join(ex["cap/cap_caption"], separator="\n")
|
2198 |
+
)
|
2199 |
+
|
2200 |
+
return _map(ds)
|
2201 |
+
|
2202 |
+
|
2203 |
+
@seqio.map_over_dataset
|
2204 |
+
def region_captions_to_dense(ex):
|
2205 |
+
if "captions" in ex:
|
2206 |
+
captions = ex["captions"]["text"]
|
2207 |
+
boxes = ex["captions"]["bbox"]
|
2208 |
+
else:
|
2209 |
+
captions = ex["label"]
|
2210 |
+
boxes = ex["bbox"]
|
2211 |
+
|
2212 |
+
|
2213 |
+
sh = tf.cast(tf.shape(ex["image"])[:2], tf.float32)
|
2214 |
+
# image_h, image_w = sh[0], sh[1]
|
2215 |
+
w = boxes[:, 2] - boxes[:, 0]
|
2216 |
+
h = boxes[:, 3] - boxes[:, 1]
|
2217 |
+
|
2218 |
+
cx = tf.cast(boxes[:, 0] + w/2, tf.float32)
|
2219 |
+
cy = tf.cast(boxes[:, 1] + h/2, tf.float32)
|
2220 |
+
# w = w / image_w
|
2221 |
+
# h = h / image_h
|
2222 |
+
coor = tf.strings.reduce_join(
|
2223 |
+
float_to_text(tf.stack([cx, cy, w, h], 1)), separator=",", axis=1)
|
2224 |
+
|
2225 |
+
area = w*h
|
2226 |
+
if tf.random.uniform(()) < 0.5:
|
2227 |
+
coor_text = "before"
|
2228 |
+
captions = tf.strings.join([coor, captions], separator=": ")
|
2229 |
+
else:
|
2230 |
+
coor_text = "after"
|
2231 |
+
captions = tf.strings.join([captions, coor], separator=": ")
|
2232 |
+
|
2233 |
+
ix = tf.random.uniform((), 0, 6, tf.int32)
|
2234 |
+
center = boxes
|
2235 |
+
if ix == 0:
|
2236 |
+
order_text = "left"
|
2237 |
+
sort_by = boxes[:, 0]
|
2238 |
+
elif ix == 1:
|
2239 |
+
order_text = "right"
|
2240 |
+
sort_by = -boxes[:, 2]
|
2241 |
+
elif ix == 2:
|
2242 |
+
order_text = "top"
|
2243 |
+
sort_by = boxes[:, 1]
|
2244 |
+
elif ix == 3:
|
2245 |
+
order_text = "bottom"
|
2246 |
+
sort_by = -boxes[:, 3]
|
2247 |
+
elif ix == 4:
|
2248 |
+
order_text = "largest"
|
2249 |
+
sort_by = area
|
2250 |
+
else:
|
2251 |
+
order_text = "smallest"
|
2252 |
+
sort_by = -area
|
2253 |
+
ixs = tf.argsort(sort_by)
|
2254 |
+
captions = tf.gather(captions, ixs)
|
2255 |
+
text = tf.strings.join([
|
2256 |
+
order_text,
|
2257 |
+
coor_text,
|
2258 |
+
tf.strings.reduce_join(captions, separator="\n")
|
2259 |
+
], separator="; ")
|
2260 |
+
|
2261 |
+
if "caption" in ex:
|
2262 |
+
if tf.random.uniform(()) > 0.5:
|
2263 |
+
text = tf.strings.join([text, "\ncaption: ", ex["caption"]])
|
2264 |
+
else:
|
2265 |
+
text = tf.strings.join(["caption: ", ex["caption"], "\n", text])
|
2266 |
+
|
2267 |
+
return dict(
|
2268 |
+
image=ex["image"],
|
2269 |
+
text=text
|
2270 |
+
)
|
2271 |
+
|
2272 |
+
|
2273 |
+
@seqio.map_over_dataset()
|
2274 |
+
def join_captions(ex):
|
2275 |
+
text = tf.random.shuffle(ex['text'])
|
2276 |
+
ex["text"] = tf.strings.reduce_join(text, separator="\n")
|
2277 |
+
return ex
|
2278 |
+
|
2279 |
+
|
2280 |
+
@seqio.map_over_dataset(num_seeds=1)
|
2281 |
+
def extract_figureqa(ex, seed):
|
2282 |
+
questions = ex["questions"]
|
2283 |
+
n = stateless_permutation(tf.shape(questions["question"])[0], seed)
|
2284 |
+
return dict(
|
2285 |
+
image=ex["image"],
|
2286 |
+
questions=tf.gather(questions["question"], n),
|
2287 |
+
question_id=tf.gather(questions["question_id"], n),
|
2288 |
+
answer=tf.gather(tf.strings.as_string(questions["answer"]), n)
|
2289 |
+
)
|
2290 |
+
|
2291 |
+
|
2292 |
+
@seqio.map_over_dataset
|
2293 |
+
def convert_figureqa_answer(ex):
|
2294 |
+
keys_tensor = tf.constant(["0", "1"])
|
2295 |
+
values_tensor = tf.constant(["no", "yes"])
|
2296 |
+
table = tf.lookup.StaticHashTable(
|
2297 |
+
tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor),
|
2298 |
+
default_value=tf.constant("nan", dtype=tf.string),
|
2299 |
+
)
|
2300 |
+
answer = table.lookup(ex["answer"])
|
2301 |
+
ex["answer"] = answer
|
2302 |
+
return ex
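# Minimal sketch of the lookup-table mapping used above: "0"/"1" answers become
# "no"/"yes", anything else falls back to the default value.
def _example_figureqa_lookup():
    table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(
            tf.constant(["0", "1"]), tf.constant(["no", "yes"])),
        default_value=tf.constant("nan", dtype=tf.string),
    )
    return table.lookup(tf.constant(["1", "0", "2"]))  # -> ["yes", "no", "nan"]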
|
2303 |
+
|
2304 |
+
|
2305 |
+
@seqio.map_over_dataset()
|
2306 |
+
def build_question_with_hint(ex):
|
2307 |
+
hint = ex["hint"]
|
2308 |
+
if tf.strings.length(hint) > 0:
|
2309 |
+
ex["question"] = tf.strings.join([hint, ex["question"]], separator="\n")
|
2310 |
+
return ex
|
2311 |
+
|
2312 |
+
@seqio.map_over_dataset()
|
2313 |
+
def build_question_with_context(ex):
|
2314 |
+
context = ex["context"]
|
2315 |
+
if tf.strings.length(context) > 0:
|
2316 |
+
ex["question"] = tf.strings.join([context, ex["question"]], separator="\n")
|
2317 |
+
return ex
|
2318 |
+
|
2319 |
+
|
2320 |
+
def max_words(ds, max_words):
|
2321 |
+
return ds.filter(lambda x: x["n_words"] <= max_words)
|
2322 |
+
|
2323 |
+
|
2324 |
+
@seqio.map_over_dataset
|
2325 |
+
def format_pdfa_eng_wds(example):
|
2326 |
+
return dict(
|
2327 |
+
image=example["image"],
|
2328 |
+
text=tf.strings.reduce_join(example["lines"]["text"], separator="\n"),
|
2329 |
+
)
|
2330 |
+
|
2331 |
+
|
2332 |
+
@gin.configurable()
|
2333 |
+
def accuracy_conditioned_joint(ds, sequence_length, is_eval=False, eval_quality=17,
|
2334 |
+
transcript_quality=None):
|
2335 |
+
# v2: Transcripts no longer get a quality score
|
2336 |
+
is_training = sequence_length.get('is_training', True)
|
2337 |
+
if not is_training:
|
2338 |
+
if is_eval:
|
2339 |
+
prompt = f"quality {eval_quality}:"
|
2340 |
+
else:
|
2341 |
+
prompt = f"quality 17:"
|
2342 |
+
|
2343 |
+
@seqio.map_over_dataset
|
2344 |
+
def _with_prompt(ex):
|
2345 |
+
out = dict(
|
2346 |
+
image=ex["image"],
|
2347 |
+
url=ex["url"],
|
2348 |
+
prompt=prompt,
|
2349 |
+
)
|
2350 |
+
if "text" in ex:
|
2351 |
+
out["text"] = ex["text"]
|
2352 |
+
elif "caption" in ex:
|
2353 |
+
out["text"] = ex["caption"]
|
2354 |
+
return out
|
2355 |
+
return _with_prompt(ds)
|
2356 |
+
|
2357 |
+
elif is_eval:
|
2358 |
+
raise ValueError("is_eval=True is only supported when is_training=False")
|
2359 |
+
|
2360 |
+
# each transcript
|
2361 |
+
@seqio.map_over_dataset
|
2362 |
+
def _with_transcript(ex):
|
2363 |
+
if tf.shape(ex["edited_captions"]["caption"])[0] > 0:
|
2364 |
+
edited_caption = ex["edited_captions"]["caption"][0]
|
2365 |
+
n = ex["edited_captions"]["n_edits"][0]
|
2366 |
+
else:
|
2367 |
+
edited_caption = ""
|
2368 |
+
n = 0
|
2369 |
+
text = [
|
2370 |
+
ex["caption"],
|
2371 |
+
ex["transcripts"][tf.random.uniform((), 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)],
|
2372 |
+
edited_caption
|
2373 |
+
]
|
2374 |
+
edit_quality = 17 - n
|
2375 |
+
prompt = [
|
2376 |
+
"quality 17:",
|
2377 |
+
"" if transcript_quality is None else f"quality: {edit_quality}:",
|
2378 |
+
tf.strings.join(["quality ", tf.strings.as_string(edit_quality), ":"])
|
2379 |
+
]
|
2380 |
+
return dict(
|
2381 |
+
image=ex["image"],
|
2382 |
+
text=tf.stack(text, 0),
|
2383 |
+
url=ex["url"],
|
2384 |
+
prompt=tf.stack(prompt, 0),
|
2385 |
+
style=["long_caption", "transcript", "long_caption"]
|
2386 |
+
)
|
2387 |
+
return _with_transcript(ds)
|
2388 |
+
|
2389 |
+
|
2390 |
+
def select_dense_caption_sample(ds, samples=200):
|
2391 |
+
def compute_hash(string: str) -> str:
|
2392 |
+
return hashlib.sha256(string.encode("utf-8")).hexdigest()
|
2393 |
+
|
2394 |
+
with tf.io.gfile.GFile("gs://oe-training-chrisc/cockatoo/data/dense-caption-eval-v0-final-data.json") as f:
|
2395 |
+
data = json.load(f)
|
2396 |
+
for ex in data:
|
2397 |
+
ex["image_id"] = compute_hash(ex["image"])
|
2398 |
+
data.sort(key=lambda x: x["image_id"])
|
2399 |
+
np.random.RandomState(12312).shuffle(data)
|
2400 |
+
keep = tf.constant([x["image"] for x in data[:samples]])
|
2401 |
+
|
2402 |
+
def _keep(ex):
|
2403 |
+
return tf.reduce_any(ex["url"] == keep)
|
2404 |
+
ds = ds.filter(_keep)
|
2405 |
+
ds = tf.data.experimental.assert_cardinality(samples)(ds)
|
2406 |
+
return ds
|
2407 |
+
|
2408 |
+
@seqio.map_over_dataset()
|
2409 |
+
def charxiv_preprocessor(ex):
|
2410 |
+
question_names = ["descriptive_q1", "descriptive_q2", "descriptive_q3", "descriptive_q4", "reasoning_q"]
|
2411 |
+
answer_names = ["descriptive_a1", "descriptive_a2", "descriptive_a3", "descriptive_a4", "reasoning_a"]
|
2412 |
+
|
2413 |
+
questions = [ex[name] for name in question_names]
|
2414 |
+
answers = [ex[name] for name in answer_names]
|
2415 |
+
|
2416 |
+
return dict(
|
2417 |
+
image=ex["image"],
|
2418 |
+
question=tf.stack(questions, 0),
|
2419 |
+
answer=tf.stack(answers, 0)
|
2420 |
+
)
|
2421 |
+
|
2422 |
+
@seqio.map_over_dataset()
|
2423 |
+
def charxiv_descriptive_preprocessor(ex):
|
2424 |
+
question_names = ["descriptive_q1", "descriptive_q2", "descriptive_q3", "descriptive_q4"]
|
2425 |
+
answer_names = ["descriptive_a1", "descriptive_a2", "descriptive_a3", "descriptive_a4"]
|
2426 |
+
|
2427 |
+
questions = [ex[name] for name in question_names]
|
2428 |
+
answers = [ex[name] for name in answer_names]
|
2429 |
+
|
2430 |
+
return dict(
|
2431 |
+
image=ex["image"],
|
2432 |
+
question=tf.stack(questions, 0),
|
2433 |
+
answer=tf.stack(answers, 0)
|
2434 |
+
)
|
2435 |
+
|
2436 |
+
@seqio.map_over_dataset()
|
2437 |
+
def charxiv_reasoning_preprocessor(ex):
|
2438 |
+
return dict(
|
2439 |
+
image=ex["image"],
|
2440 |
+
question=ex["reasoning_q"],
|
2441 |
+
answer=ex["reasoning_a"]
|
2442 |
+
)
|
2443 |
+
|
2444 |
+
@seqio.map_over_dataset()
|
2445 |
+
def tablevqa_preprocessor(ex):
|
2446 |
+
return dict(
|
2447 |
+
image=ex["image"],
|
2448 |
+
question=ex["question"],
|
2449 |
+
answer=ex["gt"]
|
2450 |
+
)
|
2451 |
+
|
2452 |
+
@seqio.map_over_dataset()
|
2453 |
+
def vtabfact_preprocessor(ex):
|
2454 |
+
return dict(
|
2455 |
+
image=ex["image"],
|
2456 |
+
question=tf.strings.join([ex["question"], "Answer with yes or no."], separator="\n"),
|
2457 |
+
answer=ex["gt"]
|
2458 |
+
)
|
2459 |
+
|
2460 |
+
@seqio.map_over_dataset()
|
2461 |
+
def nutrition_fact_preprocessor(ex):
|
2462 |
+
question_names = ["descriptive_q", "reasoning_q"]
|
2463 |
+
answer_names = ["descriptive_a", "reasoning_a"]
|
2464 |
+
|
2465 |
+
questions = [ex[name] for name in question_names]
|
2466 |
+
answers = [ex[name] for name in answer_names]
|
2467 |
+
|
2468 |
+
return dict(
|
2469 |
+
image=ex["image"],
|
2470 |
+
question=tf.stack(questions, 0),
|
2471 |
+
answer=tf.stack(answers, 0)
|
2472 |
+
)
|
prompts.py
ADDED
@@ -0,0 +1,385 @@
1 |
+
import re
|
2 |
+
|
3 |
+
import tensorflow as tf
|
4 |
+
|
5 |
+
IMAGE_PROMPT = "<|image|>"
|
6 |
+
|
7 |
+
|
8 |
+
GENERAL_PROMPTS_V1 = {
|
9 |
+
"short_answer": [
|
10 |
+
"Answer this question very briefly\n{question}",
|
11 |
+
"{question} Answer with a few words",
|
12 |
+
"{question} Response very briefly",
|
13 |
+
"{question} Answer directly without any details, explanation, or elaboration",
|
14 |
+
"I have a question about this image, please answer it very briefly: {question}",
|
15 |
+
"Question: {question} Short Answer:",
|
16 |
+
"Question: {question}\nShort Answer:",
|
17 |
+
'{question}\nAnswer the question as briefly as possible.',
|
18 |
+
'Answer very briefly:\n{question}',
|
19 |
+
'The question "{question}" can be answered using the image. A short answer is',
|
20 |
+
"{question} Based on the image, respond to this question with a short answer:",
|
21 |
+
"{question} Short answer:",
|
22 |
+
"{question} A short answer to the question is",
|
23 |
+
"Give a short, matter-of-fact answer to this question: {question}",
|
24 |
+
"Give me a simple, direct answer to this question, do not elaborate or explain your answer:\n{question}"
|
25 |
+
],
|
26 |
+
"short_caption": [
|
27 |
+
'Caption the image with 1 or two sentences',
|
28 |
+
'Write a very short description of this image.',
|
29 |
+
'Briefly describe the image.',
|
30 |
+
'Look at this image, and then summarize it in a sentence or two.',
|
31 |
+
'Write a brief caption describing the image',
|
32 |
+
'Brief Caption:',
|
33 |
+
'A short image caption:',
|
34 |
+
'A short image description',
|
35 |
+
'Briefly describe the content of the image.',
|
36 |
+
'Can you give me one sentence summary of the picture?',
|
37 |
+
'How would you describe this image in a sentence or two?',
|
38 |
+
],
|
39 |
+
"long_caption": [
|
40 |
+
'Describe this image.',
|
41 |
+
'Describe this image',
|
42 |
+
'describe the image',
|
43 |
+
'Write a long description of this image.',
|
44 |
+
'caption the picture',
|
45 |
+
'Caption',
|
46 |
+
'caption',
|
47 |
+
'Construct a long caption for this image',
|
48 |
+
'Generate a caption',
|
49 |
+
'Create a detailed caption',
|
50 |
+
'Write a long caption',
|
51 |
+
'Describe this image in detail',
|
52 |
+
'Describe this',
|
53 |
+
'describe this',
|
54 |
+
'Caption this',
|
55 |
+
'What can be seen in this image?',
|
56 |
+
'What do you see in the image?',
|
57 |
+
'Look at this photo carefully and then tell me about it in detail',
|
58 |
+
'Write a long description of this image',
|
59 |
+
'Tell me about this picture.',
|
60 |
+
'Write a paragraph about this image.',
|
61 |
+
'Look at this image carefully and then describe it in detail',
|
62 |
+
'Generate a long caption about this image.'
|
63 |
+
],
|
64 |
+
"long_caption_no_pointing": [
|
65 |
+
'Describe this image in detail, but without any pointing.',
|
66 |
+
'Write a long description of this image, do not produce any points.',
|
67 |
+
'Tell me about this picture, use plain text only.',
|
68 |
+
'Generate a plain text description of this caption',
|
69 |
+
"What is in this image?\nNo pointing\nGive lots of detail"
|
70 |
+
"Write a long caption.\nDo not use image coordinates\nOutput a full paragraph"
|
71 |
+
],
|
72 |
+
"transcript": [
|
73 |
+
'Describe this image as if you are a person speaking',
|
74 |
+
'Imagine you are a person talking about this image. Generate a transcript of what you would say.',
|
75 |
+
"Generate an audio transcript of a person describing this image",
|
76 |
+
"Create a transcript of a human describing this image out load",
|
77 |
+
"Describe this in this style of a human talking",
|
78 |
+
],
|
79 |
+
"refexp": [
|
80 |
+
'What region does \"{refexp}\" refer to?',
|
81 |
+
],
|
82 |
+
"count_bench": [
|
83 |
+
'How many {object} are there?',
|
84 |
+
],
|
85 |
+
"refexp_pointing": [
|
86 |
+
'Where is the \"{refexp}\"?',
|
87 |
+
'Point to {refexp}',
|
88 |
+
'point at {refexp}',
|
89 |
+
'Find the {refexp}.',
|
90 |
+
'Which object in the image does \"{refexp}\" refer to?',
|
91 |
+
'Locate the object \"{refexp}\" refers to.',
|
92 |
+
'Point to the object that best matches the expression:\n{refexp}\n',
|
93 |
+
'What object could be described as: {refexp}.\nPoint:',
|
94 |
+
'Referring Expression: {refexp}.\nPoint:',
|
95 |
+
'Expression: {refexp}\nPoint to the refexp',
|
96 |
+
'Task: Point to the object that best matches the expression.\nExpression: {refexp}\nPoint:',
|
97 |
+
'Instruction: Locate the object that matches the expression by returning a point.\nReferring Expression: {refexp}\n',
|
98 |
+
'Help me find an object in this image by pointing to the {refexp}',
|
99 |
+
'What point of the image might the expression \'{refexp}\' refer to?',
|
100 |
+
],
|
101 |
+
"plain": ["{question}"],
|
102 |
+
"multiple_choice": [
|
103 |
+
"{question}\n{options}\nReturn only the letter of the best answer option",
|
104 |
+
"Answer this question by naming one of the provided options:\n{question}\n{options}",
|
105 |
+
"{question}\n{options}\nWhat option best answers the question?",
|
106 |
+
"{question}\n{options}\nReturn the best answer option",
|
107 |
+
"Look at the options, then return the letter of the option that best answers the question.\nQuesiton: {question}\nOptions: {options}",
|
108 |
+
"{question}? Select an answer option from:\n{options}",
|
109 |
+
"{question}\nSelect an answer option from:\n{options}\n\n",
|
110 |
+
"Question: {question}? Options: {options} Answer:",
|
111 |
+
"Answer the question by selecting an answer options\nQuestion: {question}\nOptions: {options}",
|
112 |
+
"{question}?\n{options}\nReturn only the letter of the correct answer",
|
113 |
+
"Help me answer this question: \"{question}\", by stating which of the following options is correct\n{options}."
|
114 |
+
],
|
115 |
+
"binary": ["{question}\nAnswer with 'yes' or 'no'"],
|
116 |
+
"pointing": [
|
117 |
+
"Point to {entity}\nPlease say 'This isn't in the image.' if it is not in the image.",
|
118 |
+
"Point to all occurrences of \"{entity}\"",
|
119 |
+
"Point to any {entity} in the image",
|
120 |
+
"Point to any {entity} in the image.",
|
121 |
+
"Point: Where are the {entity}",
|
122 |
+
"Show me where the {entity} are",
|
123 |
+
"Can you show me where the {entity} are?",
|
124 |
+
"Show me where the {entity} are",
|
125 |
+
"Show me where a {entity} is",
|
126 |
+
"Show me where a {entity} is.",
|
127 |
+
"If there are any {entity} in the image? Show me where they are.",
|
128 |
+
"Where are the {entity}?",
|
129 |
+
"Generate a list of points showing where the {entity} are.",
|
130 |
+
"Find the \"{entity}\".",
|
131 |
+
"Find a \"{entity}\".",
|
132 |
+
"Locate all {entity}.",
|
133 |
+
"Locate an {entity}.",
|
134 |
+
"Locate a {entity}.",
|
135 |
+
"Locate every {entity}.",
|
136 |
+
"Locate {entity}.",
|
137 |
+
"Locate the {entity}.",
|
138 |
+
"Object: {entity}\nInstruction: Point to the object.",
|
139 |
+
"find {entity}",
|
140 |
+
"find {entity}.",
|
141 |
+
"Point to every {entity}",
|
142 |
+
"find any {entity} in the picture",
|
143 |
+
"Find the {entity}",
|
144 |
+
"Find any {entity}",
|
145 |
+
"Point to a {entity}",
|
146 |
+
"Point to an {entity}",
|
147 |
+
"Look for {entity} in the image and show me where they are.",
|
148 |
+
"Help me find an object in the image by pointing to them.\nObject: {entity}.",
|
149 |
+
"I am looking for {entity}, where can they be found in the image?",
|
150 |
+
"Can you see any {entity} in the image? Point to them.",
|
151 |
+
"Point out each {entity} in the image.",
|
152 |
+
"Point out every {entity} in the image.",
|
153 |
+
"Point to the {entity} in the image.",
|
154 |
+
"Locate each {entity} in the image.",
|
155 |
+
"Can you point out all {entity} in this image?",
|
156 |
+
"Please find {entity} and show me where they are.",
|
157 |
+
"If there are any {entity} present, indicate their positions.",
|
158 |
+
"If there is a {entity} present, indicate its positions.",
|
159 |
+
"show me all visible {entity}",
|
160 |
+
],
|
161 |
+
"point_count": [
|
162 |
+
"How many {entity} are there?",
|
163 |
+
"How many {entity}?",
|
164 |
+
"How many {entity}.",
|
165 |
+
"how many {entity}.",
|
166 |
+
"how many {entity}?",
|
167 |
+
"How many {entity} are there in the image?",
|
168 |
+
"Tell me how many {entity} there are",
|
169 |
+
"Tell me how many {entity} there are and point to them.",
|
170 |
+
"how many {entity}",
|
171 |
+
"Tell me where each {entity} is.",
|
172 |
+
"Tell me how many {entity} are in the image",
|
173 |
+
"count {entity}",
|
174 |
+
"count every {entity}",
|
175 |
+
"count each {entity}",
|
176 |
+
"count {entity}.",
|
177 |
+
"Count the {entity}.",
|
178 |
+
"How many {entity} do you see?",
|
179 |
+
"How many {entity} are visible?",
|
180 |
+
"Count all the {entity}",
|
181 |
+
"how mmny {entity}?",
|
182 |
+
"Count every {entity} in the picture.",
|
183 |
+
"Count all the {entity}",
|
184 |
+
"Count each {entity}",
|
185 |
+
"Point to and count the {entity} in the picture.",
|
186 |
+
"Point and count {entity}",
|
187 |
+
"Point to every {entity}",
|
188 |
+
"Locate the {entity} and count them",
|
189 |
+
"Locate every {entity} and count them",
|
190 |
+
"Find all the {entity}. How many are there?",
|
191 |
+
"Find each {entity}. How many are there?",
|
192 |
+
"Point at {entity} and then tell me the count.",
|
193 |
+
"What is the total number of {entity} in the image?",
|
194 |
+
"In all the picture, how many {entity} are there?",
|
195 |
+
"Point at the {entity} and then count them.",
|
196 |
+
"Point to all the visible {entity} output the total count.",
|
197 |
+
"Point to all the {entity} visible and output the total count. \nPlease say 'This isn't in the image.' if it is not in the image.",
|
198 |
+
"Point to all occurrences of \"{entity}\" and output the total count.",
|
199 |
+
"Show me where the {entity} are and output the total count.",
|
200 |
+
"Where are the {entity}? How many are there?",
|
201 |
+
"Generate list of points showing where the {entity} are and output the total count.",
|
202 |
+
"Object: {entity}\nInstruction: Point to the object and output the total count.",
|
203 |
+
"find any {entity} in the picture and output the total count.",
|
204 |
+
"Can you see any {entity} in the image? Point to them and output the total count.",
|
205 |
+
"Can you point out all {entity} in this image? How many are there?",
|
206 |
+
"If there are any {entity} present, indicate their positions and output the total count.",
|
207 |
+
"How many {entity} are there in the image? Point to them and output the total count.",
|
208 |
+
"How many {entity} are there in the image?",
|
209 |
+
"Give me the count of {entity} in the image.",
|
210 |
+
"How many {entity} are visible in the image?",
|
211 |
+
"How many {entity} are there?",
|
212 |
+
"In the image, how many {entity} are there?",
|
213 |
+
"Can you count the number of {entity} in the image?",
|
214 |
+
"Can you count every {entity} in the picture?",
|
215 |
+
"Can you see any {entity} in the image? How many are there?",
|
216 |
+
"Are there any {entity} in the image? How many are there?",
|
217 |
+
"If you see any {entity} in the image, give me the count. Otherwise, say 'This isn't in the image.'",
|
218 |
+
"Object: {entity}\nInstruction: How many are there?",
|
219 |
+
],
|
220 |
+
|
221 |
+
# vaia
|
222 |
+
"detailed_solution": [
|
223 |
+
"Answer the question providing a step by step solution and answer in the end.\n"
|
224 |
+
"Provide a step-by-step solution to the question, ending with your final answer.\n",
|
225 |
+
"Please provide a step-by-step solution to the question shown in the image.\n",
|
226 |
+
"Give a detailed explanation for the question, concluding with your final answer.\n",
|
227 |
+
"Solve the problem presented in the question with a thorough explanation. Give me your final answer at the end.\n",
|
228 |
+
"Please analyze the question and provide a complete solution, finishing with your final answer.\n",
|
229 |
+
"Work through the problem, offering detailed reasoning before stating your final answer.\n",
|
230 |
+
"Interpret the question and guide me through the solution, concluding with your answer.\n",
|
231 |
+
"Review the question and deliver a well-explained solution, making sure to include your final answer.\n",
|
232 |
+
"Examine the question: provide a detailed explanation followed by your final answer.\n"
|
233 |
+
],
|
234 |
+
|
235 |
+
# vaia first answer with short_answer
|
236 |
+
"detailed_solution_answer_first": [
|
237 |
+
"Answer the question directly, then provide a step-by-step solution.\n",
|
238 |
+
"Please provide the answer first, followed by a step-by-step solution to the question shown in the image.\n",
|
239 |
+
"Give the final answer first, then provide a detailed explanation for the question.\n",
|
240 |
+
"Provide the final answer, then solve the problem presented in the question with a thorough explanation.\n",
|
241 |
+
"First, give the final answer, then analyze the question and provide a complete solution.\n",
|
242 |
+
"State the final answer first, then work through the problem, offering detailed reasoning.\n",
|
243 |
+
"Provide the final answer, then interpret the question and guide me through the solution.\n",
|
244 |
+
"Give the final answer first, then review the question and deliver a well-explained solution.\n",
|
245 |
+
"First, provide the final answer, then examine the question and give a detailed explanation.\n"
|
246 |
+
],
|
247 |
+
|
248 |
+
# vqa_online
|
249 |
+
"detailed_answer": [
|
250 |
+
"Answer the question providing a step-by-step explanation and answer in the end.\n",
|
251 |
+
"Provide a step-by-step explanation to the question, ending with your final answer.\n",
|
252 |
+
"Please provide a step-by-step explanation to the question shown in the image.\n",
|
253 |
+
"Give a detailed explanation for the question, concluding with your final answer.\n",
|
254 |
+
"Address the problem presented in the question with a thorough explanation. Give me your final answer at the end.\n",
|
255 |
+
"Please analyze the question and provide a complete explanation, finishing with your final answer.\n",
|
256 |
+
"Work through the problem, offering detailed reasoning before stating your final answer.\n",
|
257 |
+
"Interpret the question and guide me through the explanation, concluding with your answer.\n",
|
258 |
+
"Review the question and deliver a well-explained answer, making sure to include your final answer.\n",
|
259 |
+
"Examine the question: provide a detailed explanation followed by your final answer.\n"
|
260 |
+
],
|
261 |
+
}
|
262 |
+
|
263 |
+
GENERAL_PROMPTS_V1["pointing_tag"] = [txt + " Make the alt text and the inside of the tag the target label." for txt in GENERAL_PROMPTS_V1["pointing"]]
|
264 |
+
|
265 |
+
STYLE_TO_GENERAL_PROMPT = {
|
266 |
+
"vqa2": "short_answer",
|
267 |
+
"coco_captioning": "short_caption",
|
268 |
+
"gqa": "short_answer",
|
269 |
+
"ocr_vqa": "short_answer",
|
270 |
+
"tally_qa": "short_answer",
|
271 |
+
"text_vqa": "short_answer",
|
272 |
+
"okvqa": "short_answer",
|
273 |
+
"chart_qa": "short_answer",
|
274 |
+
"doc_qa": "short_answer",
|
275 |
+
"info_qa": "short_answer",
|
276 |
+
"science_qa": "multiple_choice",
|
277 |
+
"ai2_diagram": "multiple_choice",
|
278 |
+
"a_okvqa_mc": "multiple_choice",
|
279 |
+
"a_okvqa_da": "short_answer",
|
280 |
+
"long_caption": "long_caption",
|
281 |
+
"web_pointing": "plain",
|
282 |
+
"count_bench": "count_bench",
|
283 |
+
"refexp": "refexp",
|
284 |
+
"refexp_pointing": "refexp_pointing",
|
285 |
+
"vtabfact": "binary",
|
286 |
+
"vwtq": "short_answer",
|
287 |
+
"vwtq_syn": "short_answer",
|
288 |
+
"fintabnetqa": "short_answer",
|
289 |
+
"scifi_charts": "short_answer",
|
290 |
+
"scifi_charts_qa": "short_answer",
|
291 |
+
"charxiv_descriptive": "short_answer",
|
292 |
+
"charxiv_reasoning": "short_answer",
|
293 |
+
"pointing": "pointing",
|
294 |
+
"pointing_tag": "pointing_tag",
|
295 |
+
"point_count": "point_count",
|
296 |
+
"plain": "plain",
|
297 |
+
}
|
298 |
+
|
299 |
+
|
300 |
+
# def maybe_format_options(example, option_style="basic"):
|
301 |
+
# abc = tf.constant(list("abcdefg".upper()))
|
302 |
+
# if option_style == "random-v1":
|
303 |
+
# letter_option_sep = [": ", ". ", ")"]
|
304 |
+
# option_sep = ["\n", "\n", "\n", " ", ". ", ".\n", "; ", ", "]
|
305 |
+
# option_sep = tf.constant(option_sep)[tf.random.uniform((), 0, len(option_sep), tf.int32)]
|
306 |
+
# elif option_style == "basic":
|
307 |
+
# letter_option_sep = ": "
|
308 |
+
# option_sep = "\n"
|
309 |
+
# else:
|
310 |
+
# raise NotImplementedError(option_style)
|
311 |
+
#
|
312 |
+
# options = example["options"]
|
313 |
+
# short_options = abc[:tf.shape(options)[0]]
|
314 |
+
# sep = tf.constant(letter_option_sep)[tf.random.uniform((), 0, len(letter_option_sep), tf.int32)]
|
315 |
+
#
|
316 |
+
# options = tf.stack([short_options, options,], 1)
|
317 |
+
#
|
318 |
+
# options = tf.strings.reduce_join(options, axis=-1, separator=sep)
|
319 |
+
#
|
320 |
+
# options = tf.strings.reduce_join(options, separator=option_sep)
|
321 |
+
# example["options"] = options
|
322 |
+
# tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, ".*\|\|\|.*")), False)
|
323 |
+
# example["metadata/option_names"] = tf.strings.reduce_join(short_options, separator="|||")
|
324 |
+
#
|
325 |
+
# if "answer_idx" in example:
|
326 |
+
# if example["answer_idx"] < 0:
|
327 |
+
# example["text"] = "?"
|
328 |
+
# else:
|
329 |
+
# example["text"] = short_options[example["answer_idx"]]
|
330 |
+
# example["metadata/answer_idx"] = example["answer_idx"]
|
331 |
+
# return example
|
332 |
+
|
333 |
+
|
334 |
+
def apply_keyword_prompt(prompts, example, seed=None, weights=None, keywords=None):
|
335 |
+
if isinstance(prompts, list):
|
336 |
+
assert keywords is None
|
337 |
+
all_keywords = [sorted(re.findall("{([^{}]+)}", x)) for x in prompts]
|
338 |
+
keywords = all_keywords[0]
|
339 |
+
assert len(keywords) == len(set(keywords)), f"Repeated keywords in {keywords}"
|
340 |
+
assert all(keywords == x for x in all_keywords), f"Inconsistent keywords in prompts {all_keywords}"
|
341 |
+
assert not any("{" not in word[1:-1] and "}" in word[1:-1] for word in keywords)
|
342 |
+
|
343 |
+
for k in keywords:
|
344 |
+
assert k in example, f"Example missing expected field {k}, example={example}"
|
345 |
+
prompts = tf.constant(prompts)
|
346 |
+
|
347 |
+
multiple = False
|
348 |
+
if "text" in example and len(example["text"].shape) > 0:
|
349 |
+
multiple = True
|
350 |
+
|
351 |
+
if weights is not None:
|
352 |
+
weights = tf.expand_dims(tf.math.log(weights), 0)
|
353 |
+
|
354 |
+
if seed is None:
|
355 |
+
raise ValueError()
|
356 |
+
|
357 |
+
if not multiple:
|
358 |
+
if weights is None:
|
359 |
+
prompt = prompts[tf.random.stateless_uniform((), seed, 0, len(prompts), dtype=tf.int32)]
|
360 |
+
else:
|
361 |
+
prompt = prompts[tf.random.stateless_categorical(weights, 1, seed, 0, len(prompts), dtype=tf.int32)][0, 0]
|
362 |
+
for keyword in keywords:
|
363 |
+
# We use split not regex_replace because regex_replace has issues with
|
364 |
+
# value strings with backslashes
|
365 |
+
res = tf.strings.split(prompt, "{"+keyword+"}", maxsplit=2)
|
366 |
+
prompt = tf.strings.join([res[0], example[keyword], res[1]])
|
367 |
+
return prompt
|
368 |
+
else:
|
369 |
+
n_prompts = tf.shape(example["text"])[0]
|
370 |
+
if weights is None:
|
371 |
+
ix = tf.random.stateless_uniform(
|
372 |
+
(n_prompts,), seed, 0, tf.shape(prompts)[0], dtype=tf.int32)
|
373 |
+
else:
|
374 |
+
ix = tf.random.stateless_categorical(
|
375 |
+
weights, tf.shape(prompts)[0], seed, 0, len(prompts), dtype=tf.int32)[0]
|
376 |
+
prompt = tf.gather(prompts, ix)
|
377 |
+
out = tf.TensorArray(dtype=tf.string, size=n_prompts, element_shape=())
|
378 |
+
for i in range(n_prompts):
|
379 |
+
modified = prompt[i]
|
380 |
+
for keyword in keywords:
|
381 |
+
res = tf.strings.split(modified, "{"+keyword+"}", maxsplit=2)
|
382 |
+
modified = tf.strings.join([res[0], example[keyword][i], res[1]])
|
383 |
+
out = out.write(i, modified)
|
384 |
+
return out.stack()
|
385 |
+
|
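
apply_keyword_prompt picks one template with a stateless random draw and splices the example's fields into the `{keyword}` slots via `tf.strings.split`/`tf.strings.join` (rather than regex replacement, which mishandles backslashes). A short usage sketch against the `GENERAL_PROMPTS_V1["short_answer"]` templates above; the example field and seed values are made up:

import tensorflow as tf

example = {"question": tf.constant("What color is the bus?")}
prompt = apply_keyword_prompt(
    GENERAL_PROMPTS_V1["short_answer"],
    example,
    seed=tf.constant([3, 7], dtype=tf.int32),  # stateless RNG seed of shape [2]
)
print(prompt.numpy().decode())  # e.g. "Question: What color is the bus? Short Answer:"
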
seqio_tokenizer.py
ADDED
@@ -0,0 +1,659 @@
1 |
+
# Copyright 2023 The SeqIO Authors.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
"""Vocabularies."""
|
16 |
+
|
17 |
+
import abc
|
18 |
+
import dataclasses
|
19 |
+
import functools
|
20 |
+
import hashlib
|
21 |
+
import threading
|
22 |
+
from typing import Any, ClassVar, Dict, Iterable, Optional, Sequence, Union, List, Tuple
|
23 |
+
|
24 |
+
import numpy as np
|
25 |
+
from absl import logging
|
26 |
+
import tensorflow.compat.v2 as tf
|
27 |
+
|
28 |
+
from sentencepiece import sentencepiece_model_pb2
|
29 |
+
import sentencepiece as sentencepiece_processor
|
30 |
+
|
31 |
+
PAD_ID = -1 # -1 for llama tokenizer
|
32 |
+
|
33 |
+
|
34 |
+
class Vocabulary(metaclass=abc.ABCMeta):
|
35 |
+
"""Abstract class for all vocabularies.
|
36 |
+
|
37 |
+
Subclasses must implement methods for converting between strings and tokens
|
38 |
+
both in pure python (`_encode`/`_decode`) and in TensorFlow
|
39 |
+
(`_encode_tf`/`_decode_tf`).
|
40 |
+
|
41 |
+
Subclasses are responsible for reserving PAD_ID=0 as well as optionally
|
42 |
+
reserving EOS_ID and UNK_ID
|
43 |
+
|
44 |
+
`_base_vocab_size` should account for PAD, EOS, and UNK but not `extra_ids`.
|
45 |
+
"""
|
46 |
+
|
47 |
+
def __init__(self, extra_ids: int = 0):
|
48 |
+
"""Vocabulary constructor.
|
49 |
+
|
50 |
+
Args:
|
51 |
+
extra_ids: The number of extra IDs to reserve.
|
52 |
+
"""
|
53 |
+
self._extra_ids = extra_ids or 0
|
54 |
+
|
55 |
+
@property
|
56 |
+
def bos_token_id(self) -> Optional[int]:
|
57 |
+
raise NotImplementedError("need to implement bos_id")
|
58 |
+
|
59 |
+
@property
|
60 |
+
@abc.abstractmethod
|
61 |
+
def eos_token_id(self) -> Optional[int]:
|
62 |
+
raise NotImplementedError("need to implement eos_id")
|
63 |
+
|
64 |
+
@property
|
65 |
+
def pad_id(self) -> int:
|
66 |
+
return PAD_ID
|
67 |
+
|
68 |
+
@property
|
69 |
+
@abc.abstractmethod
|
70 |
+
def unk_id(self) -> Optional[int]:
|
71 |
+
raise NotImplementedError("need to implement unk_id")
|
72 |
+
|
73 |
+
@property
|
74 |
+
def extra_ids(self) -> int:
|
75 |
+
return self._extra_ids
|
76 |
+
|
77 |
+
@property
|
78 |
+
def vocab_size(self) -> int:
|
79 |
+
"""Vocabulary size, including extra ids."""
|
80 |
+
return self._base_vocab_size + self.extra_ids
|
81 |
+
|
82 |
+
@property
|
83 |
+
@abc.abstractmethod
|
84 |
+
def _base_vocab_size(self) -> int:
|
85 |
+
"""Vocabulary size, excluding extra ids but including PAD/EOS/UNK."""
|
86 |
+
# TODO(fjord): add a check that pad_id and unk_id (if present)
|
87 |
+
# are less than _base_vocab_size.
|
88 |
+
raise NotImplementedError
|
89 |
+
|
90 |
+
@abc.abstractmethod
|
91 |
+
def _encode(self, s: str) -> Sequence[int]:
|
92 |
+
raise NotImplementedError
|
93 |
+
|
94 |
+
def encode(self, s: Union[Sequence[int], str]) -> Sequence[int]:
|
95 |
+
"""Tokenizes string to an int sequence, without adding EOS."""
|
96 |
+
return self._encode(s)
|
97 |
+
|
98 |
+
@abc.abstractmethod
|
99 |
+
def _decode(self, ids):
|
100 |
+
raise NotImplementedError
|
101 |
+
|
102 |
+
def decode(self, ids: Iterable[int], truncate_at_eos=True):
|
103 |
+
"""Detokenizes int32 iterable to a string, up through first EOS."""
|
104 |
+
clean_ids = list(ids)
|
105 |
+
|
106 |
+
if self.unk_id is not None:
|
107 |
+
vocab_size = self._base_vocab_size
|
108 |
+
clean_ids = [self.unk_id if i >= vocab_size else i for i in clean_ids]
|
109 |
+
|
110 |
+
if truncate_at_eos and (self.eos_token_id is not None and self.eos_token_id in clean_ids):
|
111 |
+
clean_ids = clean_ids[: clean_ids.index(self.eos_token_id) + 1]
|
112 |
+
|
113 |
+
return self._decode(clean_ids)
|
114 |
+
|
115 |
+
@abc.abstractmethod
|
116 |
+
def _encode_tf(self, s: tf.Tensor) -> tf.Tensor:
|
117 |
+
raise NotImplementedError
|
118 |
+
|
119 |
+
def encode_tf(self, s: tf.Tensor) -> tf.Tensor:
|
120 |
+
"""Tokenizes string Scalar to an int32 Tensor, without adding EOS."""
|
121 |
+
return self._encode_tf(s)
|
122 |
+
|
123 |
+
@abc.abstractmethod
|
124 |
+
def _decode_tf(self, ids: tf.Tensor) -> tf.Tensor:
|
125 |
+
raise NotImplementedError
|
126 |
+
|
127 |
+
def decode_tf(self, ids: tf.Tensor) -> tf.Tensor:
|
128 |
+
"""Detokenizes int32 batched Tensor through first EOS."""
|
129 |
+
clean_ids = ids
|
130 |
+
|
131 |
+
if self.unk_id is not None:
|
132 |
+
base_vocab_size = self._base_vocab_size
|
133 |
+
clean_ids = tf.where(
|
134 |
+
tf.less(clean_ids, base_vocab_size), clean_ids, self.unk_id
|
135 |
+
)
|
136 |
+
|
137 |
+
if self.eos_id is not None:
|
138 |
+
# Replace everything after the first eos_id with pad_id.
|
139 |
+
after_eos = tf.cumsum(
|
140 |
+
tf.cast(tf.equal(clean_ids, self.eos_id), tf.int32),
|
141 |
+
exclusive=True,
|
142 |
+
axis=-1,
|
143 |
+
)
|
144 |
+
clean_ids = tf.where(tf.cast(after_eos, tf.bool), self.pad_id, clean_ids)
|
145 |
+
|
146 |
+
return self._decode_tf(clean_ids)
|
147 |
+
|
148 |
+
|
149 |
+
class PassThroughVocabulary(Vocabulary):
|
150 |
+
"""Vocabulary that passes through inputs unchanged."""
|
151 |
+
|
152 |
+
def __init__(self, size: int, eos_id: Optional[Any] = None):
|
153 |
+
"""PassThroughVocabulary constructor.
|
154 |
+
|
155 |
+
Args:
|
156 |
+
size: the full size of the vocabulary.
|
157 |
+
eos_id: the end-of-sequence token.
|
158 |
+
"""
|
159 |
+
self._size = size
|
160 |
+
self._eos_id = eos_id
|
161 |
+
super().__init__()
|
162 |
+
|
163 |
+
@property
|
164 |
+
def _base_vocab_size(self):
|
165 |
+
return self._size
|
166 |
+
|
167 |
+
def _encode(self, s: Sequence[Any]) -> Sequence[Any]:
|
168 |
+
return s
|
169 |
+
|
170 |
+
def _decode(self, ids: Sequence[Any]) -> Sequence[Any]:
|
171 |
+
return ids
|
172 |
+
|
173 |
+
def _encode_tf(self, s: tf.Tensor) -> tf.Tensor:
|
174 |
+
return s
|
175 |
+
|
176 |
+
def _decode_tf(self, ids: tf.Tensor) -> tf.Tensor:
|
177 |
+
return ids
|
178 |
+
|
179 |
+
@property
|
180 |
+
def eos_id(self) -> Optional[Any]:
|
181 |
+
return self._eos_id
|
182 |
+
|
183 |
+
@property
|
184 |
+
def unk_id(self) -> Optional[Any]:
|
185 |
+
return None
|
186 |
+
|
187 |
+
def __eq__(self, other):
|
188 |
+
if not isinstance(other, PassThroughVocabulary):
|
189 |
+
return False
|
190 |
+
return self._size == other._size and self.eos_id == other.eos_id
|
191 |
+
|
192 |
+
def __str__(self) -> str:
|
193 |
+
return f"PassThroughVocabulary(size={self._size}, eos_id={self.eos_id})"
|
194 |
+
|
195 |
+
|
196 |
+
class UnigramVocabulary(Vocabulary):
|
197 |
+
"""Vocabulary that does table-lookup of unigrams."""
|
198 |
+
|
199 |
+
def __init__(self, unigrams: Sequence[str]):
|
200 |
+
"""UnigramVocabulary constructor.
|
201 |
+
|
202 |
+
Args:
|
203 |
+
unigrams: the collection of in-vocabulary tokens. This collection should
|
204 |
+
not include PAD or UNK, which are automatically assigned ids and managed
|
205 |
+
as possible decode tokens.
|
206 |
+
"""
|
207 |
+
|
208 |
+
super().__init__()
|
209 |
+
unigrams_as_list = list(unigrams)
|
210 |
+
self._unigram_by_id = ["PAD"] + unigrams_as_list + ["UNK"]
|
211 |
+
self._id_by_unigram = {u: i for i, u in enumerate(self._unigram_by_id)}
|
212 |
+
initializer = tf.lookup.KeyValueTensorInitializer(
|
213 |
+
keys=tf.constant(["PAD"] + unigrams_as_list),
|
214 |
+
# One extra value because the leading 0 corresponds to PAD
|
215 |
+
values=tf.constant(range(len(unigrams) + 1), dtype=tf.int64),
|
216 |
+
)
|
217 |
+
self._id_by_unigram_tf = tf.lookup.StaticVocabularyTable(
|
218 |
+
initializer, num_oov_buckets=1
|
219 |
+
)
|
220 |
+
self._unigram_by_id_tf = tf.constant(self._unigram_by_id)
|
221 |
+
|
222 |
+
def _encode(self, s: str) -> Sequence[int]:
|
223 |
+
return [self._id_by_unigram.get(s, self.unk_id)]
|
224 |
+
|
225 |
+
def _encode_tf(self, s: tf.Tensor) -> tf.Tensor:
|
226 |
+
tf_ids = self._id_by_unigram_tf.lookup(s)
|
227 |
+
return tf.expand_dims(tf.dtypes.cast(tf_ids, tf.int32), -1)
|
228 |
+
|
229 |
+
def _decode(self, ids: Sequence[int]) -> str:
|
230 |
+
return " ".join(self._unigram_by_id[id] for id in ids)
|
231 |
+
|
232 |
+
def _decode_tf(self, ids: tf.Tensor) -> tf.Tensor:
|
233 |
+
return self._unigram_by_id_tf[ids[0]]
|
234 |
+
|
235 |
+
@property
|
236 |
+
def _base_vocab_size(self):
|
237 |
+
return len(self._unigram_by_id)
|
238 |
+
|
239 |
+
@property
|
240 |
+
def eos_id(self):
|
241 |
+
return None
|
242 |
+
|
243 |
+
@property
|
244 |
+
def unk_id(self):
|
245 |
+
return self._base_vocab_size - 1
|
246 |
+
|
247 |
+
|
248 |
+
class SentencePieceVocabulary(Vocabulary):
|
249 |
+
"""Wrapper for nlp/sentencepiece encoder.
|
250 |
+
|
251 |
+
Assumes the model was built using flags to reserve ID=0 for padding, ID=1 for
|
252 |
+
EOS, and ID=2 for UNK.
|
253 |
+
|
254 |
+
If using extra ids, you can represent them in string-form as `<extra_id_0>`,
|
255 |
+
`<extra_id_1>`, etc. They will be indexed starting from the end of the
|
256 |
+
vocabulary to match how the masking preprocessors are set up.
|
257 |
+
|
258 |
+
IMPORTANT NOTE: these placeholders only work properly when they are used at
|
259 |
+
word starts (e.g., "I like peanut butter and <extra_id_0> sandwiches." or
|
260 |
+
"I like peanut butter and <extra_id_0>ly sandwiches" are both okay, but
|
261 |
+
"I like peanut butter and jel<extra_id_0> sandwiches" is not.).
|
262 |
+
"""
|
263 |
+
|
264 |
+
@dataclasses.dataclass
|
265 |
+
class _ModelContext:
|
266 |
+
tokenizer: sentencepiece_processor.SentencePieceProcessor
|
267 |
+
sp_model: bytes
|
268 |
+
|
269 |
+
_load_model_lock: ClassVar[threading.Lock] = threading.Lock()
|
270 |
+
|
271 |
+
def __init__(
|
272 |
+
self,
|
273 |
+
sentencepiece_model_file: str,
|
274 |
+
extra_ids: int = 0,
|
275 |
+
normalizer_spec_overrides: Optional[
|
276 |
+
sentencepiece_model_pb2.NormalizerSpec
|
277 |
+
] = None,
|
278 |
+
reverse_extra_ids: bool = False,
|
279 |
+
extra_tokens: Tuple[str] = None,
|
280 |
+
hack_to_t5_start_tokens: bool = False,
|
281 |
+
):
|
282 |
+
"""Create a SentencePieceVocabulary.
|
283 |
+
|
284 |
+
Optionally, specify a number of extra ids to add to the end of the
|
285 |
+
vocabulary for use as sentinels.
|
286 |
+
|
287 |
+
Args:
|
288 |
+
sentencepiece_model_file: path of the sentence piece model.
|
289 |
+
extra_ids: number of extra ids to include.
|
290 |
+
normalizer_spec_overrides: If not None, this proto will be merged into the
|
291 |
+
model's normalizer and denormalizer specs. Thus, any options set on this
|
292 |
+
object will override the values of those options in the loaded model.
|
293 |
+
reverse_extra_ids: if True, extra_ids are numbered in descending order, so
|
294 |
+
the first extra_id has the highest number. This is done for
|
295 |
+
compatibility with span_corruption mask generation in T5.
|
296 |
+
"""
|
297 |
+
self._sentencepiece_model_file = sentencepiece_model_file
|
298 |
+
self._normalizer_spec_overrides = normalizer_spec_overrides
|
299 |
+
self._reverse_extra_ids = reverse_extra_ids
|
300 |
+
self._model: Optional[SentencePieceVocabulary._ModelContext] = None
|
301 |
+
self._extra_tokens = extra_tokens
|
302 |
+
self._hack_to_t5_start_tokens = hack_to_t5_start_tokens
|
303 |
+
super().__init__(extra_ids=extra_ids)
|
304 |
+
|
305 |
+
def __getstate__(self):
|
306 |
+
state = self.__dict__.copy()
|
307 |
+
# Gin config makes a deep copy of the keyword arguments of configurables.
|
308 |
+
# When a SentencePieceVocabulary vocabulary is used as a keyword argument
|
309 |
+
# in a Gin configurable, it must be picklable. We therefore remove
|
310 |
+
# _model; it will be initialized lazily as needed.
|
311 |
+
del state["_model"]
|
312 |
+
return state
|
313 |
+
|
314 |
+
def __setstate__(self, state):
|
315 |
+
self.__dict__.update(state)
|
316 |
+
self._model = None
|
317 |
+
|
318 |
+
def load_model(self) -> None:
|
319 |
+
_ = self._model_context()
|
320 |
+
|
321 |
+
def _model_context(
|
322 |
+
self,
|
323 |
+
) -> _ModelContext:
|
324 |
+
"""Loads model if not yet loaded and returns the model context.
|
325 |
+
|
326 |
+
Returns:
|
327 |
+
The model context as a tuple of (tokenizer, sp_model).
|
328 |
+
"""
|
329 |
+
if self._model:
|
330 |
+
return self._model
|
331 |
+
|
332 |
+
normalizer_spec_overrides_serialized = (
|
333 |
+
self._normalizer_spec_overrides.SerializeToString(deterministic=True)
|
334 |
+
if self._normalizer_spec_overrides
|
335 |
+
else None
|
336 |
+
)
|
337 |
+
|
338 |
+
self._model = self._load_model(
|
339 |
+
self._sentencepiece_model_file,
|
340 |
+
self._extra_ids,
|
341 |
+
normalizer_spec_overrides_serialized,
|
342 |
+
self._reverse_extra_ids,
|
343 |
+
extra_tokens=self._extra_tokens,
|
344 |
+
hack_to_t5_start_tokens=self._hack_to_t5_start_tokens,
|
345 |
+
)
|
346 |
+
return self._model
|
347 |
+
|
348 |
+
@classmethod
|
349 |
+
@functools.lru_cache(maxsize=None)
|
350 |
+
def _load_model(
|
351 |
+
cls,
|
352 |
+
sentencepiece_model_file: str,
|
353 |
+
extra_ids: int,
|
354 |
+
normalizer_spec_overrides_serialized: Optional[bytes] = None,
|
355 |
+
reverse_extra_ids: bool = True,
|
356 |
+
extra_tokens: Tuple[str] = None,
|
357 |
+
hack_to_t5_start_tokens=False,
|
358 |
+
) -> _ModelContext:
|
359 |
+
"""Load SPM, Python tokenizer, and cache results to the class definition."""
|
360 |
+
# SentencePieceProcessor::LoadFromSerializedProto is not thread-safe.
|
361 |
+
# Without a lock, users may randomly see SIGSEGV on
|
362 |
+
# sentencepiece::ModelInterface::pad_piece when using the vocabulary in
|
363 |
+
# SeqIO preprocessors.
|
364 |
+
with cls._load_model_lock:
|
365 |
+
# Handle cases where SP can't load the file, but gfile can.
|
366 |
+
with tf.io.gfile.GFile(sentencepiece_model_file, "rb") as f:
|
367 |
+
sp_model = f.read()
|
368 |
+
model = sentencepiece_model_pb2.ModelProto.FromString(sp_model)
|
369 |
+
|
370 |
+
if hack_to_t5_start_tokens:
|
371 |
+
# PAD token would still be 0, same as BOS, for consistency with previous behavior!
|
372 |
+
unk = model.pieces[0]
|
373 |
+
bos = model.pieces[1]
|
374 |
+
eos = model.pieces[2]
|
375 |
+
model.pieces.remove(unk)
|
376 |
+
model.pieces.remove(bos)
|
377 |
+
model.pieces.remove(eos)
|
378 |
+
model.pieces.insert(0, bos) # BOS is token 0
|
379 |
+
model.pieces.insert(1, eos) # EOS is token 1
|
380 |
+
model.pieces.insert(2, unk) # UNK is token 2
|
381 |
+
|
382 |
+
# Add placeholder strings for extra IDs.
|
383 |
+
if extra_ids:
|
384 |
+
# By default, we add them in reverse order to match span corruption.
|
385 |
+
if reverse_extra_ids:
|
386 |
+
extra_id_tokens = reversed(range(extra_ids))
|
387 |
+
else:
|
388 |
+
extra_id_tokens = range(extra_ids)
|
389 |
+
|
390 |
+
for i in extra_id_tokens:
|
391 |
+
model.pieces.add(
|
392 |
+
piece=f"▁<extra_id_{i}>",
|
393 |
+
score=0.0,
|
394 |
+
type=sentencepiece_model_pb2.ModelProto.SentencePiece.USER_DEFINED,
|
395 |
+
)
|
396 |
+
|
397 |
+
if extra_tokens:
|
398 |
+
for s in extra_tokens:
|
399 |
+
model.pieces.add(
|
400 |
+
piece=f"▁"+s,
|
401 |
+
score=0.0,
|
402 |
+
type=sentencepiece_model_pb2.ModelProto.SentencePiece.USER_DEFINED,
|
403 |
+
)
|
404 |
+
|
405 |
+
if normalizer_spec_overrides_serialized is not None:
|
406 |
+
normalizer_spec_overrides = (
|
407 |
+
sentencepiece_model_pb2.NormalizerSpec.FromString(
|
408 |
+
normalizer_spec_overrides_serialized
|
409 |
+
)
|
410 |
+
)
|
411 |
+
|
412 |
+
model.normalizer_spec.MergeFrom(normalizer_spec_overrides)
|
413 |
+
model.denormalizer_spec.MergeFrom(normalizer_spec_overrides)
|
414 |
+
sp_model = model.SerializeToString()
|
415 |
+
# Load Python tokenizer and ensure the EOS and PAD IDs are correct.
|
416 |
+
tokenizer = sentencepiece_processor.SentencePieceProcessor()
|
417 |
+
tokenizer.LoadFromSerializedProto(sp_model)
|
418 |
+
if tokenizer.pad_id() != PAD_ID:
|
419 |
+
logging.warning(
|
420 |
+
(
|
421 |
+
"T5 library uses PAD_ID=%s, which is different from the "
|
422 |
+
"sentencepiece vocabulary, which defines pad_id=%s"
|
423 |
+
),
|
424 |
+
PAD_ID,
|
425 |
+
tokenizer.pad_id(),
|
426 |
+
)
|
427 |
+
|
428 |
+
return cls._ModelContext(tokenizer=tokenizer, sp_model=sp_model)
|
429 |
+
|
430 |
+
@property
|
431 |
+
def num_extra_tokens(self):
|
432 |
+
if self._extra_tokens:
|
433 |
+
return len(self._extra_tokens)
|
434 |
+
return 0
|
435 |
+
|
436 |
+
@property
|
437 |
+
def bos_id(self) -> Optional[int]:
|
438 |
+
return self.tokenizer.bos_id()
|
439 |
+
|
440 |
+
@property
|
441 |
+
def bos_token_id(self) -> Optional[int]:
|
442 |
+
return self.tokenizer.bos_id()
|
443 |
+
|
444 |
+
@property
|
445 |
+
def eos_token_id(self) -> Optional[int]:
|
446 |
+
return self.tokenizer.eos_id()
|
447 |
+
|
448 |
+
@property
|
449 |
+
def eos_id(self) -> Optional[int]:
|
450 |
+
return self.tokenizer.eos_id()
|
451 |
+
|
452 |
+
@property
|
453 |
+
def unk_id(self) -> Optional[int]:
|
454 |
+
return self.tokenizer.unk_id()
|
455 |
+
|
456 |
+
@property
|
457 |
+
def sp_model(self) -> Optional[bytes]:
|
458 |
+
"""Retrieve the SPM."""
|
459 |
+
return self._model_context().sp_model
|
460 |
+
|
461 |
+
@property
|
462 |
+
def sentencepiece_model_file(self) -> str:
|
463 |
+
return self._sentencepiece_model_file
|
464 |
+
|
465 |
+
@property
|
466 |
+
def tokenizer(self) -> sentencepiece_processor.SentencePieceProcessor:
|
467 |
+
"""Returns the Python tokenizer."""
|
468 |
+
return self._model_context().tokenizer
|
469 |
+
|
470 |
+
@property
|
471 |
+
def tf_tokenizer(self):
|
472 |
+
"""Instantiate and return a TF tokenizer."""
|
473 |
+
import tensorflow_text as tf_text # import here to keep the dependency optional
|
474 |
+
return tf_text.SentencepieceTokenizer(model=self.sp_model)
|
475 |
+
|
476 |
+
@property
|
477 |
+
def vocab_size(self):
|
478 |
+
return self._base_vocab_size
|
479 |
+
|
480 |
+
@property
|
481 |
+
def _base_vocab_size(self):
|
482 |
+
"""Number of ids (including 0=PAD, 1=EOS, and 2=UNK).
|
483 |
+
|
484 |
+
Returns:
|
485 |
+
an integer, the vocabulary size
|
486 |
+
"""
|
487 |
+
return self.tokenizer.GetPieceSize()
|
488 |
+
|
489 |
+
def _encode(self, s):
|
490 |
+
"""Encode a python string as a list of integers.
|
491 |
+
|
492 |
+
Args:
|
493 |
+
s: a string
|
494 |
+
|
495 |
+
Returns:
|
496 |
+
a list of integers (not terminated by EOS)
|
497 |
+
"""
|
498 |
+
return self.tokenizer.EncodeAsIds(s)
|
499 |
+
|
500 |
+
def _decode(self, ids):
|
501 |
+
"""Decode a list of integers to a python string.
|
502 |
+
|
503 |
+
Args:
|
504 |
+
ids: a list of integers (not terminated by EOS)
|
505 |
+
|
506 |
+
Returns:
|
507 |
+
a string
|
508 |
+
"""
|
509 |
+
# convert all the extra ids (sentinels) to UNK=2
|
510 |
+
unk_id = self.tokenizer.unk_id()
|
511 |
+
piece_size = self.tokenizer.GetPieceSize()
|
512 |
+
ids = [unk_id if i >= piece_size else int(i) for i in ids]
|
513 |
+
return self.tokenizer.DecodeIds(ids)
|
514 |
+
|
515 |
+
def _encode_tf(self, s):
|
516 |
+
"""Encode a tf.Scalar string to a tf.Tensor.
|
517 |
+
|
518 |
+
This will be necessary for on-the-fly tokenization.
|
519 |
+
|
520 |
+
Args:
|
521 |
+
s: a tf.Scalar with dtype tf.string
|
522 |
+
|
523 |
+
Returns:
|
524 |
+
a 1d tf.Tensor with dtype tf.int32
|
525 |
+
"""
|
526 |
+
return self.tf_tokenizer.tokenize(s)
|
527 |
+
|
528 |
+
def _decode_tf(self, ids):
|
529 |
+
"""Decode in TensorFlow.
|
530 |
+
|
531 |
+
Args:
|
532 |
+
ids: a 1d or 2d tf.Tensor with dtype tf.int32
|
533 |
+
|
534 |
+
Returns:
|
535 |
+
a 1d or 2d tf.Tensor with dtype tf.string
|
536 |
+
"""
|
537 |
+
return self.tf_tokenizer.detokenize(ids)
|
538 |
+
|
539 |
+
def __eq__(self, other):
|
540 |
+
if not isinstance(other, SentencePieceVocabulary):
|
541 |
+
return False
|
542 |
+
try:
|
543 |
+
their_md5 = hashlib.md5(other.sp_model).hexdigest()
|
544 |
+
# If other has no sp_model attribute, we can't test for equality
|
545 |
+
except AttributeError:
|
546 |
+
return False
|
547 |
+
if self.sp_model is None:
|
548 |
+
return False
|
549 |
+
our_md5 = hashlib.md5(self.sp_model).hexdigest()
|
550 |
+
return our_md5 == their_md5
|
551 |
+
|
552 |
+
def __str__(self) -> str:
|
553 |
+
return (
|
554 |
+
f"SentencePieceVocabulary(file={self.sentencepiece_model_file}, "
|
555 |
+
f"extra_ids={self._extra_ids}, "
|
556 |
+
f"spm_md5={hashlib.md5(self.sp_model).hexdigest()})"
|
557 |
+
)
|
558 |
+
|
559 |
+
@property
|
560 |
+
def adds_space(self):
|
561 |
+
return True
|
562 |
+
|
563 |
+
|
564 |
+
class HfTokenizerWrapper:
|
565 |
+
def __init__(self, tokenizer, bos_token_id=None, adds_space=False):
|
566 |
+
"""
|
567 |
+
tokenizer: Tokenizer to wrap
|
568 |
+
bos_token_id: BOS token id to use if not `tokenizer.bos_token_id`
|
569 |
+
adds_space: If concatenating independently tokenized pieces of text, will the tokens
|
570 |
+
already include a separating space?
|
571 |
+
"""
|
572 |
+
self.adds_space = adds_space
|
573 |
+
self.tokenizer = tokenizer
|
574 |
+
if bos_token_id is None:
|
575 |
+
self.bos_token_id = tokenizer.bos_token_id
|
576 |
+
else:
|
577 |
+
self.bos_token_id = bos_token_id
|
578 |
+
self.eos_token_id = self.tokenizer.eos_token_id
|
579 |
+
self.pad_id = -1
|
580 |
+
|
581 |
+
def encode(self, x: str):
|
582 |
+
return self.tokenizer.encode(x, add_special_tokens=False)
|
583 |
+
|
584 |
+
def decode(self, x: List[int], truncate_at_eos=True):
|
585 |
+
x = [int(t) for t in x]
|
586 |
+
|
587 |
+
if self.eos_token_id == self.bos_token_id and (len(x) > 0 and x[0] == self.eos_token_id):
|
588 |
+
# Assume an EOS at the start is functioning as BOS
|
589 |
+
x = x[1:]
|
590 |
+
|
591 |
+
if truncate_at_eos:
|
592 |
+
# Follow seqio and automatically cut off at EOS
|
593 |
+
try:
|
594 |
+
eos_ix = x.index(self.eos_token_id)
|
595 |
+
x = x[:eos_ix]
|
596 |
+
except ValueError:
|
597 |
+
pass
|
598 |
+
return self.tokenizer.decode(x, skip_special_tokens=True)
|
599 |
+
|
600 |
+
|
601 |
+
def vocab_size(self):
|
602 |
+
return len(self.tokenizer)
|
603 |
+
|
604 |
+
def encode_tf(self, x):
|
605 |
+
if isinstance(x, str) or len(x.shape) == 0:
|
606 |
+
def _enc(_data):
|
607 |
+
_data = _data.item() if isinstance(_data, np.ndarray) else _data
|
608 |
+
return self.tokenizer.encode(_data.decode("utf-8"), add_special_tokens=False, return_tensors="np")[0].astype(np.int32)
|
609 |
+
return tf.ensure_shape(tf.numpy_function(_enc, [x], tf.int32, stateful=False), [None])
|
610 |
+
|
611 |
+
flattened = tf.reshape(x, [-1])
|
612 |
+
|
613 |
+
def _enc(_data):
|
614 |
+
tokens = [self.tokenizer.encode(x.decode("utf-8"), add_special_tokens=False, return_tensors="np")[0].astype(np.int32)
|
615 |
+
for x in _data]
|
616 |
+
if len(tokens) == 0:
|
617 |
+
return np.zeros((0,), dtype=np.int32), np.zeros((0,), dtype=np.int32)
|
618 |
+
else:
|
619 |
+
return np.concatenate(tokens, 0), np.array([len(x) for x in tokens]).astype(np.int32)
|
620 |
+
if not (isinstance(x, str) or x.dtype == tf.string):
|
621 |
+
raise ValueError("Input be a string or a string numpy array")
|
622 |
+
text, lens = tf.numpy_function(_enc, [flattened], (tf.int32, tf.int32), stateful=False)
|
623 |
+
lens = tf.ensure_shape(lens, [None])
|
624 |
+
text = tf.ensure_shape(text, [None])
|
625 |
+
if len(x.shape) == 2:
|
626 |
+
n = x.shape[1]
|
627 |
+
assert n is not None
|
628 |
+
return tf.RaggedTensor.from_nested_row_lengths(
|
629 |
+
text,
|
630 |
+
[tf.ones(tf.shape(x)[0], dtype=lens.dtype)*n, lens]
|
631 |
+
)
|
632 |
+
else:
|
633 |
+
return tf.RaggedTensor.from_row_lengths(text, lens)
|
634 |
+
|
635 |
+
|
636 |
+
class OLMoTokenizerWrapper(HfTokenizerWrapper):
|
637 |
+
|
638 |
+
def encode(self, x: str):
|
639 |
+
return self.tokenizer.encode(x, add_special_tokens=False)
|
640 |
+
|
641 |
+
def encode_tf(self, x):
|
642 |
+
if isinstance(x, str) or len(x.shape) == 0:
|
643 |
+
def _enc(_data):
|
644 |
+
return np.asarray(self.tokenizer.encode(_data.numpy().decode("utf-8"), add_special_tokens=False), dtype=np.int32)
|
645 |
+
out = tf.py_function(_enc, (x,), tf.int32)
|
646 |
+
return tf.ensure_shape(out, [None])
|
647 |
+
else:
|
648 |
+
def _enc(_data):
|
649 |
+
tokens = [self.tokenizer.encode(x.decode("utf-8"), add_special_tokens=False)
|
650 |
+
for x in _data.numpy()]
|
651 |
+
if len(tokens) == 0:
|
652 |
+
return np.zeros((0,), dtype=np.int32), np.zeros((0,), dtype=np.int32)
|
653 |
+
else:
|
654 |
+
return np.concatenate(tokens, 0), np.array([len(x) for x in tokens])
|
655 |
+
text, lens = tf.py_function(_enc, (x,), (tf.int32, tf.int32))
|
656 |
+
lens = tf.ensure_shape(lens, [None])
|
657 |
+
text = tf.ensure_shape(text, [None])
|
658 |
+
return tf.RaggedTensor.from_row_lengths(text, lens)
|
659 |
+
|
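
HfTokenizerWrapper adapts a Hugging Face tokenizer to the seqio-style interface used elsewhere in this repo: `encode_tf` routes through `tf.numpy_function`, so graph-mode `tf.data` pipelines can call the Python tokenizer and get back int32 ids (a RaggedTensor for batched string inputs). A rough usage sketch; the `transformers` dependency and the "gpt2" checkpoint are illustrative assumptions, not requirements of this module:

import tensorflow as tf
from transformers import AutoTokenizer

tok = HfTokenizerWrapper(AutoTokenizer.from_pretrained("gpt2"))

# Scalar string -> 1-D int32 token ids.
ids = tok.encode_tf(tf.constant("a cat on a mat"))

# The same call works in graph mode inside a tf.data pipeline.
ds = tf.data.Dataset.from_tensor_slices(tf.constant(["first caption", "second caption"]))
ds = ds.map(lambda s: tok.encode_tf(s))
for t in ds:
    print(tok.decode(t.numpy()))  # round-trips back to text
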
tasks.py
ADDED
@@ -0,0 +1,2548 @@
# Module that can be imported to register all tasks
import dataclasses
import functools
import logging
import os
from collections import OrderedDict
from typing import List, Dict, Any, Optional, Mapping

import seqio
from seqio import dataset_providers
import tensorflow as tf
import tensorflow_datasets as tfds

from .data_utils import _strip_metadata, build_tokenizer
from .preprocesssors import *
from .preprocesssors import _preprocess_scifi


@dataclasses.dataclass
class TaskSpec:
    name: str
    source: seqio.DataSourceInterface
    preprocessors: List
    style: str
    inference_preprocessors: List = None
    inference_only: bool = False
    decode_image: bool = False
    shuffle_after: Optional[int] = None
    ignore_errors: bool = False


MULTITASK_TFDS_DATA_DIR = "/weka/oe-training-default/mm-olmo/tensorflow_datasets"

TASKS: Dict[str, TaskSpec] = {}


def add_task(
    name,
    source: seqio.DataSourceInterface,
    preprocessors: List,
    style: str,
    inf_preprocessor=None,
    inf_only=False,
    decode_image=False,
    shuffle_after=None,
    ignore_errors=False
):
    TASKS[name] = TaskSpec(
        name, source, preprocessors, style, inf_preprocessor, inf_only, decode_image,
        shuffle_after, ignore_errors)


@seqio.map_over_dataset
def add_image_size(ex):
    if ex["image"].dtype == tf.string:
        ex["image"] = tf.image.decode_image(ex["image"], channels=3, expand_animations=False)
    img_h = tf.shape(ex["image"])[0]
    img_w = tf.shape(ex["image"])[1]
    ex["metadata/image_size"] = [img_w, img_h]
    return ex
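

# Minimal sketch (hypothetical task and TFDS names, illustration only) of how a
# dataset would be registered with the TASKS registry defined above; the real
# registrations follow further below.
#
#   add_task(
#       "my_toy_captioning",
#       source=seqio.TfdsDataSource(
#           tfds_name="my_toy_dataset:1.0.0",
#           tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
#       ),
#       preprocessors=[rename(text="caption")],
#       style="long_caption",
#   )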


@dataclasses.dataclass
class TaskDatasetBuilder:
    """tf.data.Dataset builder for a task after shuffling, sharding, and initial model
    pre-processing have been applied"""
    # This class is a simplified and customized version of seqio.Task
    #
    # The main differences are:
    # 1: Does not prefetch by default, which wastes a small amount of RAM if we are using the
    #    dataset in a mixture that can just have its own top-level prefetch
    # 2: Reduces the threshold for in-memory caching, which is way too high for image datasets
    #    by default
    # 3: Can customize when shuffling occurs to help minimize RAM usage; in general shuffling
    #    should happen before building image crops and tokenization so the shuffle and
    #    dataset checkpoint take less memory
    # 4: Doesn't decode images until after shuffling for the same reason
    # 5: Supports splitting with the "tfds_split" mode so we never have to fall back to example
    #    sharding (not the default at the moment since it's not well tested)
    # 6: Removes caching/output feature spec stuff from seqio that we don't need

    name: str
    source: Any
    preprocessors: List
    keep_metadata: bool
    shuffle_after: int
    sharding: str = "tfds_split"
    decode_image: bool = False
    ignore_errors: bool = False

    def get_dataset(
        self,  # pytype: disable=signature-mismatch  # overriding-default-value-checks
        sequence_length: Optional[Mapping[str, int]] = None,
        split: str = tfds.Split.TRAIN,
        shuffle: bool = True,
        shuffle_buffer_size: Optional[int] = 1000,
        seed: Optional[int] = None,
        shard_info: Optional[seqio.ShardInfo] = None,
        num_epochs: Optional[int] = 1,
        try_in_mem_cache: bool = True,
        trim_output_features: bool = True
    ) -> tf.data.Dataset:
        source = self.source

        if self.sharding == "seqio":
            if source.supports_arbitrary_sharding:
                shard_data_source = True
            elif shard_info:
                # Whether we should shard at the source or on the examples from the source.
                shard_data_source = (
                    len(source.list_shards(split=split)) >= shard_info.num_shards
                )
                logging.info(
                    "Sharding at the %s: %d of %d",
                    "data source" if shard_data_source else "examples",
                    shard_info.index + 1,
                    shard_info.num_shards,
                )
            else:
                # Call get_dataset on the source without a shard_info.
                shard_data_source = True
                shard_info = None

            if "image" in source.tfds_dataset.info.features:
                if not self.decode_image:
                    source.tfds_dataset._decoders = dict(image=tfds.decode.SkipDecoding())

            if shard_data_source:
                ds = source.get_dataset(
                    split=split, shuffle=shuffle, seed=seed, shard_info=shard_info)
            else:
                ds = source.get_dataset(split=split, shuffle=shuffle, seed=seed)
                ds = ds.shard(shard_info.num_shards, shard_info.index)
        elif self.sharding == "tfds_split":
            # Shard with `tfds.even_splits`, which seems to be recommended for multi-host training
            # https://github.com/tensorflow/datasets/blob/master/docs/splits.md#tfdseven_splits--multi-host-training
            assert isinstance(self.source, seqio.TfdsDataSource)
            loader: seqio.LazyTfdsLoader = self.source.tfds_dataset
            dataset, data_dir = loader.get_split_params(split)
            shard_split = loader._map_split(split)
            if shard_info and shard_info.num_shards > 1:
                shard_split = tfds.even_splits(
                    shard_split, n=shard_info.num_shards, drop_remainder=False)[shard_info.index]
            read_config = loader.read_config
            read_config.shuffle_seed = seed
            read_config.skip_prefetch = True
            read_config.input_context = None
            # Don't decode images until after shuffling to save RAM
            if "image" in loader.info.features:
                decoders = dict(image=tfds.decode.SkipDecoding())
            else:
                decoders = None
            ds = tfds.load(
                dataset,
                split=shard_split,
                data_dir=data_dir,
                shuffle_files=shuffle,
                download=True,
                try_gcs=True,
                read_config=read_config,
                decoders=decoders
            )
        else:
            raise NotImplementedError(self.sharding)

        num_shards = shard_info.num_shards if shard_info else 1
        if try_in_mem_cache and (
            source.num_input_examples(split)
            and source.num_input_examples(split) < 10000 * num_shards
        ):
            logging.info(f"Automatically caching small dataset in memory: {self.name}:{split}")
            ds = ds.cache()

        # We repeat before calling any (potentially) stochastic
        # preprocessors in order to take new samples each epoch.
        if num_epochs != 1:
            ds = ds.repeat(num_epochs)

        preprocessors = [
            seqio.add_kwargs_to_transform(
                _fn,
                sequence_length=sequence_length,
                output_features=None,
            ) for _fn in self.preprocessors
        ]

        with seqio.utils.map_seed_manager(seed):
            for fn in preprocessors[:self.shuffle_after]:
                ds = fn(ds)

            # Strip metadata before shuffling if possible so it doesn't waste space
            if not self.keep_metadata:
                ds = _strip_metadata(ds)

            if shuffle:
                if shuffle_buffer_size is None:
                    raise ValueError("Shuffle is true, but shuffle_buffer_size is None")
                else:
                    ds = ds.shuffle(shuffle_buffer_size, seed=seed)

            for fn in preprocessors[self.shuffle_after:]:
                ds = fn(ds)

        if self.ignore_errors:
            ds = ds.ignore_errors(log_warning=True)

        if trim_output_features:
            ds = seqio.trim_dataset(ds, sequence_length, sequence_length)

        return ds
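

# Illustrative sketch (not used by this module) of the `tfds.even_splits`
# sharding that the "tfds_split" mode above relies on: each host reads only
# its own contiguous slice of the split.
#
#   subsplits = tfds.even_splits("train", n=4, drop_remainder=False)
#   # subsplits[i] is the sub-split host i should load, e.g.
#   # ds = tfds.load("some_dataset", split=subsplits[i], data_dir=...)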


def get_task(preprocessor, name, is_training, for_inference,
             include_metadata=None, style_override=None) -> TaskDatasetBuilder:
    """Get a builder for task `name` that is pre-processed by `preprocessor`"""

    task_spec = TASKS[name]
    if for_inference is None:
        for_inference = task_spec.inference_only
    elif task_spec.inference_only and not for_inference:
        raise ValueError(f"Inference-only task {task_spec.name} can only be used in inference mode")

    if include_metadata is None:
        include_metadata = for_inference

    if preprocessor is not None:
        style = style_override if style_override else task_spec.style
        preprocessor = preprocessor.get_preprocessor(
            is_training, for_inference, style, include_metadata)
        preprocessor = [preprocessor]
    else:
        preprocessor = []
    task_preprocessors = task_spec.preprocessors
    if for_inference and task_spec.inference_preprocessors is not None:
        task_preprocessors = task_spec.inference_preprocessors
    if isinstance(task_spec.source, seqio.TfdsDataSource):
        from seqio.utils import _TFDS_DATA_DIR_OVERRIDE
        if _TFDS_DATA_DIR_OVERRIDE:
            # Stop annoying override warnings flooding the log
            task_spec.source.tfds_dataset._data_dir = None

    return TaskDatasetBuilder(
        task_spec.name,
        task_spec.source,
        task_preprocessors + preprocessor,
        keep_metadata=include_metadata,
        shuffle_after=(task_spec.shuffle_after if task_spec.shuffle_after
                       else len(task_spec.preprocessors)),
        sharding="seqio",
        decode_image=task_spec.decode_image,
        ignore_errors=task_spec.ignore_errors,
    )

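
def _example_usage():  # illustrative sketch only; never called by this module
    """Hedged example of building a registered task (assumes the TFDS data
    directories configured above are reachable)."""
    builder = get_task(
        preprocessor=None,  # skip model-specific preprocessing
        name="coco_captioning_karpathy",
        is_training=False,
        for_inference=True,
    )
    ds = builder.get_dataset(
        split="validation",
        shuffle=False,
        num_epochs=1,
        trim_output_features=False,  # no sequence_length is given here
    )
    for ex in ds.take(1):
        logging.info("example keys: %s", list(ex.keys()))
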
254 |
+
add_task(
|
255 |
+
"coco_caption_2017",
|
256 |
+
source=seqio.TfdsDataSource(
|
257 |
+
tfds_name="coco_all:1.0.1",
|
258 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
259 |
+
),
|
260 |
+
preprocessors=[
|
261 |
+
functools.partial(rekey, key_map={
|
262 |
+
"image/filename": ["image/filename"],
|
263 |
+
"image": ["image"],
|
264 |
+
"text": ["captions", "text"]
|
265 |
+
}),
|
266 |
+
functools.partial(flatten_parts, parts=["text"]),
|
267 |
+
],
|
268 |
+
inf_preprocessor=[
|
269 |
+
functools.partial(rekey, key_map={
|
270 |
+
"image/filename": ["image/filename"],
|
271 |
+
"image": ["image"],
|
272 |
+
"text": ["captions", "text"]
|
273 |
+
})
|
274 |
+
],
|
275 |
+
style="coco_captioning",
|
276 |
+
)
|
277 |
+
|
278 |
+
|
279 |
+
add_task(
|
280 |
+
"coco_captioning_karpathy",
|
281 |
+
source=seqio.TfdsDataSource(
|
282 |
+
tfds_name="coco_captioning_karpathy:1.0.2",
|
283 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
284 |
+
splits={"train": "train", "validation": "val", "test": "test"}
|
285 |
+
),
|
286 |
+
preprocessors=[
|
287 |
+
rename(text="captions"),
|
288 |
+
functools.partial(flatten_parts, parts=["text"]),
|
289 |
+
],
|
290 |
+
inf_preprocessor=[add_coco_url],
|
291 |
+
style="coco_captioning",
|
292 |
+
)
|
293 |
+
|
294 |
+
|
295 |
+
add_task(
|
296 |
+
"synth_counting",
|
297 |
+
source=seqio.TfdsDataSource(
|
298 |
+
tfds_name="synth_counting:0.0.3",
|
299 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
300 |
+
splits={"train": "train[5120:]", "validation": "train[:5120]"}
|
301 |
+
),
|
302 |
+
preprocessors=[synth_count_preprocessor],
|
303 |
+
inf_preprocessor=[synth_count_inf_preprocessor],
|
304 |
+
style="synth_counting",
|
305 |
+
)
|
306 |
+
|
307 |
+
|
308 |
+
add_task(
|
309 |
+
"khan_academy",
|
310 |
+
source=seqio.TfdsDataSource(
|
311 |
+
tfds_name="khan_academy:1.0.0",
|
312 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
313 |
+
splits={"train": "train[1024:]", "validation": "train[:1024]"}
|
314 |
+
),
|
315 |
+
preprocessors=[extract_khan_academy],
|
316 |
+
style="khan_academy",
|
317 |
+
)
|
318 |
+
|
319 |
+
for name, src in [
|
320 |
+
("vaia_qa_latex_image_math_subset", seqio.TfdsDataSource(
|
321 |
+
tfds_name=f"vaia_qa_latex_image_short_answer:0.1.2",
|
322 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
323 |
+
splits={"train": "train", "validation": "validation"}
|
324 |
+
)),
|
325 |
+
("vaia_qa_latex_image_all", seqio.TfdsDataSource(
|
326 |
+
tfds_name=f"vaia_qa_latex_image_short_answer:0.1.3",
|
327 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
328 |
+
splits={"train": "train", "validation": "validation"}
|
329 |
+
)),
|
330 |
+
]:
|
331 |
+
add_task(
|
332 |
+
f"{name}_short_answer",
|
333 |
+
source=src,
|
334 |
+
preprocessors=[
|
335 |
+
remove_is_long,
|
336 |
+
remove_has_multiple_parts,
|
337 |
+
functools.partial(extract_vaia_qa_latex_image, add_short_answer=True),
|
338 |
+
],
|
339 |
+
style="vaia_qa",
|
340 |
+
)
|
341 |
+
add_task(
|
342 |
+
f"{name}_short_answer_first",
|
343 |
+
source=src,
|
344 |
+
preprocessors=[
|
345 |
+
remove_is_long,
|
346 |
+
remove_has_multiple_parts,
|
347 |
+
functools.partial(extract_vaia_qa_latex_image, add_short_answer=True, set_short_answer_first=True),
|
348 |
+
],
|
349 |
+
style="vaia_qa_short_answer_first",
|
350 |
+
)
|
351 |
+
add_task(
|
352 |
+
f"{name}_mc_only_short_answer",
|
353 |
+
source=src,
|
354 |
+
preprocessors=[
|
355 |
+
remove_is_long,
|
356 |
+
remove_has_multiple_parts,
|
357 |
+
filter_mc,
|
358 |
+
functools.partial(extract_vaia_qa_latex_image, add_short_answer=True),
|
359 |
+
],
|
360 |
+
style="vaia_qa_short_answer",
|
361 |
+
)
|
362 |
+
add_task(
|
363 |
+
f"{name}_mc_only_short_answer_first",
|
364 |
+
source=src,
|
365 |
+
preprocessors=[
|
366 |
+
remove_is_long,
|
367 |
+
remove_has_multiple_parts,
|
368 |
+
filter_mc,
|
369 |
+
functools.partial(extract_vaia_qa_latex_image, add_short_answer=True, set_short_answer_first=True),
|
370 |
+
],
|
371 |
+
style="vaia_qa_short_answer_first",
|
372 |
+
)
|
373 |
+
add_task(
|
374 |
+
f"{name}_image_only_short_answer",
|
375 |
+
source=src,
|
376 |
+
preprocessors=[
|
377 |
+
image_only,
|
378 |
+
remove_is_long,
|
379 |
+
remove_has_multiple_parts,
|
380 |
+
functools.partial(extract_vaia_qa_latex_image, add_short_answer=True),
|
381 |
+
],
|
382 |
+
style="vaia_qa_short_answer",
|
383 |
+
)
|
384 |
+
add_task(
|
385 |
+
f"{name}_image_only_short_answer_first",
|
386 |
+
source=src,
|
387 |
+
preprocessors=[
|
388 |
+
image_only,
|
389 |
+
remove_is_long,
|
390 |
+
remove_has_multiple_parts,
|
391 |
+
functools.partial(extract_vaia_qa_latex_image, add_short_answer=True, set_short_answer_first=True),
|
392 |
+
],
|
393 |
+
style="vaia_qa_short_answer_first",
|
394 |
+
)
|
395 |
+
|
396 |
+
add_task(
|
397 |
+
"vqa_online",
|
398 |
+
source=seqio.TfdsDataSource(
|
399 |
+
tfds_name="vqa_online:1.0.1",
|
400 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
401 |
+
splits={"train": "train", "validation": "validation", "test": "validation"}
|
402 |
+
),
|
403 |
+
preprocessors=[
|
404 |
+
build_question_with_context,
|
405 |
+
extract_vqa_online,
|
406 |
+
],
|
407 |
+
style="vqa_online",
|
408 |
+
)
|
409 |
+
|
410 |
+
add_task(
|
411 |
+
"vqa_online_gpt_longQ_longA",
|
412 |
+
source=seqio.TfdsDataSource(
|
413 |
+
tfds_name="vqa_online_gpt_parsed:1.1.0",
|
414 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
415 |
+
splits={"train": "train", "validation": "validation", "test": "validation"}
|
416 |
+
),
|
417 |
+
preprocessors=[
|
418 |
+
rename(question="question_long", answer="answer_long"),
|
419 |
+
extract_vqa_online,
|
420 |
+
],
|
421 |
+
style="vqa_online",
|
422 |
+
)
|
423 |
+
|
424 |
+
|
425 |
+
add_task(
|
426 |
+
"famous_birthdays",
|
427 |
+
source=seqio.TfdsDataSource(
|
428 |
+
tfds_name="famous_birth_days:1.0.0",
|
429 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
430 |
+
splits={"train": "train[5120:]", "validation": "train[:5120]"}
|
431 |
+
),
|
432 |
+
preprocessors=[
|
433 |
+
famous_birthdays_preprocessor,
|
434 |
+
functools.partial(name_entity_augmentation, p_high_color=0.0),
|
435 |
+
],
|
436 |
+
style="famous_birthdays",
|
437 |
+
)
|
438 |
+
|
439 |
+
|
440 |
+
add_task(
|
441 |
+
"wiki_art",
|
442 |
+
source=seqio.TfdsDataSource(
|
443 |
+
tfds_name="wiki_art:1.0.0",
|
444 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
445 |
+
splits={"train": "train[5120:]", "validation": "train[:5120]"}
|
446 |
+
),
|
447 |
+
preprocessors=[name_entity_augmentation, wiki_art_preprocessor],
|
448 |
+
style="wiki_art",
|
449 |
+
)
|
450 |
+
|
451 |
+
add_task(
|
452 |
+
"wiki_art_no_aug",
|
453 |
+
source=seqio.TfdsDataSource(
|
454 |
+
tfds_name="wiki_art:1.0.0",
|
455 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
456 |
+
splits={"train": "train[5120:]", "validation": "train[:5120]"}
|
457 |
+
),
|
458 |
+
preprocessors=[wiki_art_preprocessor],
|
459 |
+
style="wiki_art",
|
460 |
+
)
|
461 |
+
|
462 |
+
add_task(
|
463 |
+
"atlas_obscura",
|
464 |
+
source=seqio.TfdsDataSource(
|
465 |
+
tfds_name="atlas_obscura:1.0.0",
|
466 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
467 |
+
splits={"train": "train[5120:]", "validation": "train[:5120]"}
|
468 |
+
),
|
469 |
+
preprocessors=[
|
470 |
+
atlas_obscura_preprocessor,
|
471 |
+
mild_color_aug_preprocessor
|
472 |
+
],
|
473 |
+
style="atlas_obscura",
|
474 |
+
)
|
475 |
+
|
476 |
+
|
477 |
+
add_task(
|
478 |
+
"clocks",
|
479 |
+
source=seqio.TfdsDataSource(
|
480 |
+
tfds_name="clocks:1.0.1",
|
481 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
482 |
+
),
|
483 |
+
preprocessors=[
|
484 |
+
clocks_preprocessor,
|
485 |
+
clock_augmentation
|
486 |
+
],
|
487 |
+
style="clocks",
|
488 |
+
shuffle_after=0
|
489 |
+
)
|
490 |
+
|
491 |
+
|
492 |
+
add_task(
|
493 |
+
"count_bench",
|
494 |
+
source=seqio.TfdsDataSource(
|
495 |
+
tfds_name="count_bench:1.0.0",
|
496 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
497 |
+
),
|
498 |
+
preprocessors=[
|
499 |
+
count_bench_preprocessor,
|
500 |
+
],
|
501 |
+
style="count_bench",
|
502 |
+
)
|
503 |
+
|
504 |
+
|
505 |
+
add_task(
|
506 |
+
"tulu_v2_sft",
|
507 |
+
source=seqio.TfdsDataSource(
|
508 |
+
tfds_name="allenai__tulu_v2_sft_mixture:1.0.0",
|
509 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
510 |
+
),
|
511 |
+
preprocessors=[tulu_preprocessor],
|
512 |
+
style="tulu_v2",
|
513 |
+
)
|
514 |
+
|
515 |
+
|
516 |
+
# Pointing / Point+Count datasets
|
517 |
+
for is_count in [True, False]:
|
518 |
+
if is_count:
|
519 |
+
task = "point_count"
|
520 |
+
else:
|
521 |
+
task = "pointing"
|
522 |
+
add_task(
|
523 |
+
task,
|
524 |
+
source=seqio.TfdsDataSource(
|
525 |
+
tfds_name="pointing:1.0.1",
|
526 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
527 |
+
splits={"train": "train", "validation": "validation"}
|
528 |
+
),
|
529 |
+
preprocessors=[
|
530 |
+
filter_points,
|
531 |
+
functools.partial(pointing_preprocessor, with_count=is_count),
|
532 |
+
split
|
533 |
+
],
|
534 |
+
style=task,
|
535 |
+
)
|
536 |
+
add_task(
|
537 |
+
task + "_eval", # pointing validation set
|
538 |
+
source=seqio.TfdsDataSource(
|
539 |
+
tfds_name="pointing:1.0.2",
|
540 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
541 |
+
),
|
542 |
+
preprocessors=[
|
543 |
+
filter_points,
|
544 |
+
functools.partial(pointing_preprocessor, with_count=is_count),
|
545 |
+
split
|
546 |
+
],
|
547 |
+
style=task,
|
548 |
+
)
|
549 |
+
add_task(
|
550 |
+
task + "_high_freq",
|
551 |
+
source=seqio.TfdsDataSource(
|
552 |
+
tfds_name="count_qa:0.0.2",
|
553 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
554 |
+
splits=dict(
|
555 |
+
train="train[2048:]",
|
556 |
+
validation="train[:2048]"
|
557 |
+
)
|
558 |
+
),
|
559 |
+
preprocessors=[
|
560 |
+
filter_points,
|
561 |
+
fix_count_qa,  # Fix a tfrecord bug (TODO: fix the underlying records)
|
562 |
+
functools.partial(pointing_preprocessor, with_count=is_count),
|
563 |
+
split,
|
564 |
+
],
|
565 |
+
style=task,
|
566 |
+
)
|
567 |
+
add_task(
|
568 |
+
"fast_flickr_count_qa_" + task,
|
569 |
+
source=seqio.TfdsDataSource(
|
570 |
+
tfds_name="fast_flickr_count_qa:1.0.4",
|
571 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
572 |
+
),
|
573 |
+
preprocessors=[
|
574 |
+
functools.partial(count_qa_preprocessor, with_count=is_count),
|
575 |
+
],
|
576 |
+
inf_preprocessor=[
|
577 |
+
functools.partial(count_qa_preprocessor, with_count=is_count, for_inference=True),
|
578 |
+
],
|
579 |
+
style=task,
|
580 |
+
)
|
581 |
+
|
582 |
+
|
583 |
+
add_task(
|
584 |
+
"countbench_qa",
|
585 |
+
source=seqio.TfdsDataSource(
|
586 |
+
tfds_name="countbench_qa:1.2.0",
|
587 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
588 |
+
),
|
589 |
+
inf_only=True,
|
590 |
+
preprocessors=[
|
591 |
+
count_qa_preprocessor_inf,
|
592 |
+
],
|
593 |
+
style="point_count",
|
594 |
+
)
|
595 |
+
|
596 |
+
|
597 |
+
add_task(
|
598 |
+
f"pointing_test", # pointing set with segmentation ground truths
|
599 |
+
source=seqio.TfdsDataSource(
|
600 |
+
tfds_name="pointing:1.0.3",
|
601 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
602 |
+
),
|
603 |
+
preprocessors=[
|
604 |
+
pointing_inf_preprocessor
|
605 |
+
],
|
606 |
+
style=task,
|
607 |
+
inf_only=True,
|
608 |
+
)
|
609 |
+
|
610 |
+
|
611 |
+
add_task(
|
612 |
+
"point_qa",
|
613 |
+
source=seqio.TfdsDataSource(
|
614 |
+
tfds_name="point_qa:0.0.5",
|
615 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
616 |
+
splits=dict(
|
617 |
+
train="train[512:]",
|
618 |
+
validation="train[:512]"
|
619 |
+
)
|
620 |
+
),
|
621 |
+
preprocessors=[extract_point_qa, split],
|
622 |
+
style="point_qa",
|
623 |
+
)
|
624 |
+
|
625 |
+
add_task(
|
626 |
+
"clocks_no_aug",
|
627 |
+
source=seqio.TfdsDataSource(
|
628 |
+
tfds_name="clocks:1.0.1",
|
629 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
630 |
+
),
|
631 |
+
preprocessors=[
|
632 |
+
clocks_preprocessor
|
633 |
+
],
|
634 |
+
style="clocks",
|
635 |
+
)
|
636 |
+
|
637 |
+
|
638 |
+
add_task(
|
639 |
+
"clock_bench",
|
640 |
+
source=seqio.TfdsDataSource(
|
641 |
+
tfds_name="clock_bench:1.0.0",
|
642 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
643 |
+
),
|
644 |
+
preprocessors=[
|
645 |
+
clock_bench_preprocessor
|
646 |
+
],
|
647 |
+
inf_only=True,
|
648 |
+
style="clocks",
|
649 |
+
)
|
650 |
+
|
651 |
+
add_task(
|
652 |
+
"wiki_data",
|
653 |
+
source=seqio.TfdsDataSource(
|
654 |
+
tfds_name="cockatoo_wiki:1.0.0",
|
655 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
656 |
+
splits={"train": "train[10240:]", "validation": "train[:5120]", "test": "train[5120:10240]"}
|
657 |
+
),
|
658 |
+
preprocessors=[extract_wiki_data],
|
659 |
+
style="wiki_data",
|
660 |
+
)
|
661 |
+
|
662 |
+
|
663 |
+
add_task(
|
664 |
+
"wiki_data_name",
|
665 |
+
source=seqio.TfdsDataSource(
|
666 |
+
tfds_name="cockatoo_wiki:1.0.0",
|
667 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
668 |
+
splits={"train": "train[10240:]", "validation": "train[:5120]", "test": "train[5120:10240]"}
|
669 |
+
),
|
670 |
+
preprocessors=[
|
671 |
+
extract_wiki_data_name,
|
672 |
+
mild_color_aug_preprocessor
|
673 |
+
],
|
674 |
+
style="wiki_data",
|
675 |
+
)
|
676 |
+
|
677 |
+
add_task(
|
678 |
+
"wiki_data_describe",
|
679 |
+
source=seqio.TfdsDataSource(
|
680 |
+
tfds_name="cockatoo_wiki:1.0.0",
|
681 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
682 |
+
splits={"train": "train[10240:]", "validation": "train[:5120]", "test": "train[5120:10240]"}
|
683 |
+
),
|
684 |
+
preprocessors=[extract_wiki_data_describe],
|
685 |
+
inf_only=True,
|
686 |
+
style="wiki_data",
|
687 |
+
)
|
688 |
+
|
700 |
+
|
701 |
+
|
702 |
+
for name, src in [
|
703 |
+
("scifi_charts", seqio.TfdsDataSource(
|
704 |
+
tfds_name="sci_fi_charts:1.0.6",
|
705 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
706 |
+
splits={"train": "train[1024:]", "validation": "train[:1024]"}
|
707 |
+
)),
|
708 |
+
("scifi_table", seqio.TfdsDataSource(
|
709 |
+
tfds_name="sci_fi_table:1.0.3",
|
710 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
711 |
+
splits={"train": "train[1024:]", "validation": "train[:1024]"}
|
712 |
+
)),
|
713 |
+
("scifi_document", seqio.TfdsDataSource(
|
714 |
+
tfds_name="sci_fi_document:1.0.3",
|
715 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
716 |
+
splits={"train": "train[1024:]", "validation": "train[:1024]"}
|
717 |
+
)),
|
718 |
+
("scifi_diagram", seqio.TfdsDataSource(
|
719 |
+
tfds_name="sci_fi_diagram:1.0.0",
|
720 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
721 |
+
splits={"train": "train[1024:]", "validation": "train[:1024]"}
|
722 |
+
)),
|
723 |
+
("scifi_natural", seqio.TfdsDataSource(
|
724 |
+
tfds_name="sci_fi_natural:1.0.1",
|
725 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
726 |
+
splits={"train": "train[128:]", "validation": "train[:128]"}
|
727 |
+
)),
|
728 |
+
("scifi_nutrition", seqio.TfdsDataSource(
|
729 |
+
tfds_name="sci_fi_nutrition:1.0.0",
|
730 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
731 |
+
splits={"train": "train[128:]", "validation": "train[:128]"}
|
732 |
+
))
|
733 |
+
]:
|
734 |
+
add_task(
|
735 |
+
name + "_qa",
|
736 |
+
source=src,
|
737 |
+
preprocessors=[
|
738 |
+
remove_no_qa,
|
739 |
+
_preprocess_scifi,
|
740 |
+
extract_individual_vqa,
|
741 |
+
],
|
742 |
+
inf_preprocessor=[
|
743 |
+
remove_no_qa, _preprocess_scifi,
|
744 |
+
functools.partial(flatten_parts, parts=["question", "answer"]),
|
745 |
+
extract_individual_vqa,
|
746 |
+
],
|
747 |
+
style=name,
|
748 |
+
)
|
749 |
+
add_task(
|
750 |
+
name + "_qa_split",
|
751 |
+
source=src,
|
752 |
+
preprocessors=[
|
753 |
+
remove_no_qa,
|
754 |
+
_preprocess_scifi,
|
755 |
+
extract_individual_vqa,
|
756 |
+
split
|
757 |
+
],
|
758 |
+
inf_preprocessor=[
|
759 |
+
remove_no_qa, _preprocess_scifi,
|
760 |
+
functools.partial(flatten_parts, parts=["question", "answer"]),
|
761 |
+
extract_individual_vqa,
|
762 |
+
],
|
763 |
+
style=name,
|
764 |
+
)
|
765 |
+
add_task(
|
766 |
+
name + "_qa_exp",
|
767 |
+
source=src,
|
768 |
+
preprocessors=[
|
769 |
+
remove_no_qa,
|
770 |
+
_preprocess_scifi,
|
771 |
+
extract_scifi_qa_exp,
|
772 |
+
extract_individual_vqa,
|
773 |
+
],
|
774 |
+
inf_preprocessor=[
|
775 |
+
remove_no_qa, _preprocess_scifi,
|
776 |
+
extract_scifi_qa_exp,
|
777 |
+
functools.partial(flatten_parts, parts=["question", "answer"]),
|
778 |
+
extract_individual_vqa,
|
779 |
+
],
|
780 |
+
style=name + "_qa_exp",
|
781 |
+
)
|
782 |
+
add_task(
|
783 |
+
name + "_qa_exp_split",
|
784 |
+
source=src,
|
785 |
+
preprocessors=[
|
786 |
+
remove_no_qa,
|
787 |
+
_preprocess_scifi,
|
788 |
+
extract_scifi_qa_exp,
|
789 |
+
extract_individual_vqa,
|
790 |
+
split,
|
791 |
+
],
|
792 |
+
inf_preprocessor=[
|
793 |
+
remove_no_qa, _preprocess_scifi,
|
794 |
+
extract_scifi_qa_exp,
|
795 |
+
functools.partial(flatten_parts, parts=["question", "answer"]),
|
796 |
+
extract_individual_vqa,
|
797 |
+
],
|
798 |
+
style=name + "_qa_exp",
|
799 |
+
)
|
800 |
+
add_task(
|
801 |
+
name + "_exp",
|
802 |
+
source=src,
|
803 |
+
preprocessors=[
|
804 |
+
remove_no_qa,
|
805 |
+
_preprocess_scifi,
|
806 |
+
scifi_explanation_only,
|
807 |
+
extract_individual_vqa,
|
808 |
+
split
|
809 |
+
],
|
810 |
+
style=name + "_exp"
|
811 |
+
)
|
812 |
+
add_task(
|
813 |
+
name + "_demo",
|
814 |
+
source=src,
|
815 |
+
preprocessors=[
|
816 |
+
remove_no_qa,
|
817 |
+
_preprocess_scifi,
|
818 |
+
extract_scifi_qa_demo,
|
819 |
+
extract_individual_vqa,
|
820 |
+
split
|
821 |
+
],
|
822 |
+
style="scifi_demo"
|
823 |
+
)
|
824 |
+
|
825 |
+
|
826 |
+
add_task(
|
827 |
+
"chart_qa_scifi",
|
828 |
+
source=seqio.TfdsDataSource(
|
829 |
+
tfds_name="chart_qa:1.0.2",
|
830 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
831 |
+
splits={"train": "train", "validation": "val", "test": "test"}
|
832 |
+
),
|
833 |
+
preprocessors=[
|
834 |
+
rename(question="query", answer="label", **{"metadata/is_human": "is_human"}),
|
835 |
+
extract_individual_vqa,
|
836 |
+
],
|
837 |
+
style="scifi_charts_qa_exp",
|
838 |
+
)
|
839 |
+
|
840 |
+
|
841 |
+
add_task(
|
842 |
+
"chart_qa_prompting",
|
843 |
+
source=seqio.TfdsDataSource(
|
844 |
+
tfds_name="chart_qa:1.0.2",
|
845 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
846 |
+
splits={"train": "train", "validation": "val", "test": "test"}
|
847 |
+
),
|
848 |
+
preprocessors=[
|
849 |
+
rename(question="query", answer="label", **{"metadata/is_human": "is_human"}),
|
850 |
+
chartqa_prompting,
|
851 |
+
extract_individual_vqa,
|
852 |
+
],
|
853 |
+
style="chart_qa",
|
854 |
+
)
|
855 |
+
|
856 |
+
|
857 |
+
add_task(
|
858 |
+
"chart_qa_prompting_explanation",
|
859 |
+
source=seqio.TfdsDataSource(
|
860 |
+
tfds_name="chart_qa:1.0.2",
|
861 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
862 |
+
splits={"train": "train", "validation": "val", "test": "test"}
|
863 |
+
),
|
864 |
+
preprocessors=[
|
865 |
+
rename(question="query", answer="label", **{"metadata/is_human": "is_human"}),
|
866 |
+
chartqa_explanation,
|
867 |
+
extract_individual_vqa,
|
868 |
+
],
|
869 |
+
style="chart_qa",
|
870 |
+
)
|
871 |
+
|
872 |
+
|
873 |
+
|
874 |
+
add_task(
|
875 |
+
"coco_captioning_karpathy_multi",
|
876 |
+
source=seqio.TfdsDataSource(
|
877 |
+
tfds_name="coco_captioning_karpathy:1.0.2",
|
878 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
879 |
+
splits={"train": "train", "validation": "val", "test": "test"}
|
880 |
+
),
|
881 |
+
preprocessors=[rename(text="captions")],
|
882 |
+
inf_preprocessor=[add_coco_url],
|
883 |
+
style="coco_captioning",
|
884 |
+
)
|
885 |
+
|
886 |
+
|
887 |
+
add_task(
|
888 |
+
"coco_caption_2017_grouped",
|
889 |
+
source=seqio.TfdsDataSource(
|
890 |
+
tfds_name="coco_all:1.0.1",
|
891 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
892 |
+
),
|
893 |
+
preprocessors=[
|
894 |
+
functools.partial(
|
895 |
+
rekey, key_map={
|
896 |
+
"image/filename": ["image/filename"],
|
897 |
+
"image": ["image"],
|
898 |
+
"text": ["captions", "text"]
|
899 |
+
}),
|
900 |
+
join_captions
|
901 |
+
],
|
902 |
+
style="coco_captioning_multiple",
|
903 |
+
)
|
904 |
+
|
905 |
+
|
906 |
+
add_task(
|
907 |
+
"llava_pretrain",
|
908 |
+
source=seqio.TfdsDataSource(
|
909 |
+
tfds_name="llava_pretrain:1.0.0",
|
910 |
+
tfds_data_dir="gs://mm-olmo-datasets/",
|
911 |
+
splits=dict(
|
912 |
+
train="train[4096:]",
|
913 |
+
validation="train[:4096]"
|
914 |
+
)
|
915 |
+
),
|
916 |
+
preprocessors=[extract_llava],
|
917 |
+
style="web_caption"
|
918 |
+
)
|
919 |
+
|
920 |
+
|
921 |
+
add_task(
|
922 |
+
"rohun_images",
|
923 |
+
source=seqio.TfdsDataSource(
|
924 |
+
tfds_name="rohun_images:1.0.0",
|
925 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
926 |
+
),
|
927 |
+
preprocessors=[],
|
928 |
+
style="long_caption",
|
929 |
+
inf_only=True
|
930 |
+
)
|
931 |
+
|
932 |
+
|
933 |
+
add_task(
|
934 |
+
"dense_caption_eval",
|
935 |
+
source=seqio.TfdsDataSource(
|
936 |
+
tfds_name="dense_captioning_eval:1.0.0",
|
937 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
938 |
+
splits=dict(validation="train")
|
939 |
+
),
|
940 |
+
preprocessors=[],
|
941 |
+
style="long_caption",
|
942 |
+
inf_only=True
|
943 |
+
)
|
944 |
+
|
945 |
+
|
946 |
+
add_task(
|
947 |
+
"dense_caption_eval_dbg",
|
948 |
+
source=seqio.TfdsDataSource(
|
949 |
+
tfds_name="dense_captioning_eval:1.0.0",
|
950 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
951 |
+
splits=dict(validation="train")
|
952 |
+
),
|
953 |
+
preprocessors=[
|
954 |
+
lambda ds: ds.filter(lambda x: x["url"] == "https://explore-multimodal-datasets.s3.us-west-2.amazonaws.com/eval-set/v0/eval-set/a211be07e2c9c722ef75093026a608856bd07ad935ebdedea6f2944b1f2d2b0e.jpg")
|
955 |
+
],
|
956 |
+
style="long_caption",
|
957 |
+
inf_only=True
|
958 |
+
)
|
959 |
+
|
960 |
+
|
961 |
+
add_task(
|
962 |
+
"dense_caption_sample",
|
963 |
+
source=seqio.TfdsDataSource(
|
964 |
+
tfds_name="dense_captioning_eval:1.0.0",
|
965 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
966 |
+
splits=dict(
|
967 |
+
validation="train"
|
968 |
+
)
|
969 |
+
),
|
970 |
+
preprocessors=[select_dense_caption_sample],
|
971 |
+
style="long_caption",
|
972 |
+
)
|
973 |
+
|
974 |
+
|
975 |
+
add_task(
|
976 |
+
"cockatoo_1per_caption_287k",
|
977 |
+
source=seqio.TfdsDataSource(
|
978 |
+
tfds_name="cockatoo_1per_caption_287k:1.0.5",
|
979 |
+
tfds_data_dir="gs://mm-olmo-data/",
|
980 |
+
splits=dict(
|
981 |
+
train="train[5120:]",
|
982 |
+
validation="train[:5120]"
|
983 |
+
)
|
984 |
+
),
|
985 |
+
preprocessors=[
|
986 |
+
rename(text="caption"),
|
987 |
+
],
|
988 |
+
style="long_caption"
|
989 |
+
)
|
990 |
+
|
991 |
+
|
992 |
+
def _filter_large_ratio(ds):
|
993 |
+
return ds.filter(
|
994 |
+
lambda x: tf.shape(x["image"])[0] > tf.shape(x["image"])[1]*2
|
995 |
+
)
|
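# (Keeps only examples whose image height is more than twice its width, i.e.
# strongly portrait images, for the cockatoo_dbg debugging task below.)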
996 |
+
|
997 |
+
|
998 |
+
add_task(
|
999 |
+
f"cockatoo_dbg",
|
1000 |
+
source= seqio.TfdsDataSource(
|
1001 |
+
tfds_name="cockatoo_476k:1.0.5",
|
1002 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1003 |
+
splits=dict(
|
1004 |
+
train="train[5120:]",
|
1005 |
+
validation="train[:5120]"
|
1006 |
+
)
|
1007 |
+
)
|
1008 |
+
,
|
1009 |
+
preprocessors=[
|
1010 |
+
_filter_large_ratio,
|
1011 |
+
extract_caption_and_transcript
|
1012 |
+
],
|
1013 |
+
style=["long_caption", "transcript"]
|
1014 |
+
)
|
1015 |
+
|
1016 |
+
|
1017 |
+
for name, src in [
|
1018 |
+
("712k_sept6", seqio.TfdsDataSource(
|
1019 |
+
tfds_name="cockatoo_712k_sept6:1.0.5",
|
1020 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1021 |
+
splits=dict(
|
1022 |
+
train="train[5120:]",
|
1023 |
+
validation="train[:5120]"
|
1024 |
+
)
|
1025 |
+
)),
|
1026 |
+
("476k", seqio.TfdsDataSource(
|
1027 |
+
tfds_name="cockatoo_476k:1.0.5",
|
1028 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1029 |
+
splits=dict(
|
1030 |
+
train="train[5120:]",
|
1031 |
+
validation="train[:5120]"
|
1032 |
+
)
|
1033 |
+
)),
|
1034 |
+
("476k_gpt_captions", seqio.TfdsDataSource(
|
1035 |
+
tfds_name="cockatoo_476k_gpt_captions:1.0.5",
|
1036 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1037 |
+
splits=dict(
|
1038 |
+
train="train[5120:]",
|
1039 |
+
validation="train[:5120]"
|
1040 |
+
)
|
1041 |
+
)),
|
1042 |
+
("100k_of_476k_gpt_captions", seqio.TfdsDataSource(
|
1043 |
+
tfds_name="cockatoo_476k_gpt_captions:1.0.5",
|
1044 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1045 |
+
splits=dict(
|
1046 |
+
train="train[5120:105120]",
|
1047 |
+
validation="train[:5120]"
|
1048 |
+
)
|
1049 |
+
)),
|
1050 |
+
("200k_of_476k_gpt_captions", seqio.TfdsDataSource(
|
1051 |
+
tfds_name="cockatoo_476k_gpt_captions:1.0.5",
|
1052 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1053 |
+
splits=dict(
|
1054 |
+
train="train[5120:205120]",
|
1055 |
+
validation="train[:5120]"
|
1056 |
+
)
|
1057 |
+
)),
|
1058 |
+
("300k_of_476k_gpt_captions", seqio.TfdsDataSource(
|
1059 |
+
tfds_name="cockatoo_476k_gpt_captions:1.0.5",
|
1060 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1061 |
+
splits=dict(
|
1062 |
+
train="train[5120:305120]",
|
1063 |
+
validation="train[:5120]"
|
1064 |
+
)
|
1065 |
+
)),
|
1066 |
+
("400k_of_476k_gpt_captions", seqio.TfdsDataSource(
|
1067 |
+
tfds_name="cockatoo_476k_gpt_captions:1.0.5",
|
1068 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1069 |
+
splits=dict(
|
1070 |
+
train="train[5120:405120]",
|
1071 |
+
validation="train[:5120]"
|
1072 |
+
)
|
1073 |
+
)),
|
1074 |
+
("400k_of_476k", seqio.TfdsDataSource(
|
1075 |
+
tfds_name="cockatoo_476k:1.0.5",
|
1076 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1077 |
+
splits=dict(
|
1078 |
+
train="train[5120:405120]",
|
1079 |
+
validation="train[:5120]"
|
1080 |
+
)
|
1081 |
+
)),
|
1082 |
+
("300k_of_476k", seqio.TfdsDataSource(
|
1083 |
+
tfds_name="cockatoo_476k:1.0.5",
|
1084 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1085 |
+
splits=dict(
|
1086 |
+
train="train[5120:305120]",
|
1087 |
+
validation="train[:5120]"
|
1088 |
+
)
|
1089 |
+
)),
|
1090 |
+
("200k_of_476k", seqio.TfdsDataSource(
|
1091 |
+
tfds_name="cockatoo_476k:1.0.5",
|
1092 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1093 |
+
splits=dict(
|
1094 |
+
train="train[5120:205120]",
|
1095 |
+
validation="train[:5120]"
|
1096 |
+
)
|
1097 |
+
)),
|
1098 |
+
("100k_of_476k", seqio.TfdsDataSource(
|
1099 |
+
tfds_name="cockatoo_476k:1.0.5",
|
1100 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1101 |
+
splits=dict(
|
1102 |
+
train="train[5120:105120]",
|
1103 |
+
validation="train[:5120]"
|
1104 |
+
)
|
1105 |
+
)),
|
1106 |
+
("276k", seqio.TfdsDataSource(
|
1107 |
+
tfds_name="cockatoo:1.0.5",
|
1108 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1109 |
+
splits=dict(
|
1110 |
+
train="train[5120:]",
|
1111 |
+
validation="train[:5120]"
|
1112 |
+
)
|
1113 |
+
)),
|
1114 |
+
("180k", seqio.TfdsDataSource(
|
1115 |
+
tfds_name="cockatoo:1.0.3",
|
1116 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1117 |
+
splits=dict(
|
1118 |
+
train="train[4096:]",
|
1119 |
+
validation="train[:4096]"
|
1120 |
+
)
|
1121 |
+
)),
|
1122 |
+
("84k_claude_captions", seqio.TfdsDataSource(
|
1123 |
+
tfds_name="cockatoo_84k_claude_captions:1.0.0",
|
1124 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1125 |
+
splits=dict(
|
1126 |
+
train="train[1000:]",
|
1127 |
+
validation="train[:1000]"
|
1128 |
+
)
|
1129 |
+
)),
|
1130 |
+
]:
|
1131 |
+
add_task(
|
1132 |
+
f"cockatoo_{name}",
|
1133 |
+
source=src,
|
1134 |
+
preprocessors=[extract_caption],
|
1135 |
+
style="long_caption"
|
1136 |
+
)
|
1137 |
+
|
1138 |
+
add_task(
|
1139 |
+
f"cockatoo_and_transcript_{name}",
|
1140 |
+
source=src,
|
1141 |
+
preprocessors=[extract_caption_and_transcript],
|
1142 |
+
style=["long_caption", "transcript"]
|
1143 |
+
)
|
1144 |
+
|
1145 |
+
add_task(
|
1146 |
+
f"cockatoo_and_transcript_stratified_{name}",
|
1147 |
+
source=src,
|
1148 |
+
preprocessors=[
|
1149 |
+
extract_caption_and_transcript,
|
1150 |
+
# put this here to hack seqio into repeating the dataset after
|
1151 |
+
# `extract_caption_and_transcript` which will properly stratify the transcripts
|
1152 |
+
seqio.CacheDatasetPlaceholder(),
|
1153 |
+
],
|
1154 |
+
style=["long_caption", "transcript"]
|
1155 |
+
)
|
1156 |
+
add_task(
|
1157 |
+
f"cockatoo_and_all_transcripts_{name}",
|
1158 |
+
source=src,
|
1159 |
+
preprocessors=[extract_caption_and_all_transcripts],
|
1160 |
+
style=["long_caption", "transcript", "transcript", "transcript"]
|
1161 |
+
)
|
1162 |
+
|
1163 |
+
add_task(
|
1164 |
+
f"cockatoo_all_transcripts_{name}",
|
1165 |
+
source=src,
|
1166 |
+
preprocessors=[extract_all_transcripts],
|
1167 |
+
style="transcript"
|
1168 |
+
)
|
1169 |
+
add_task(
|
1170 |
+
f"cockatoo_transcripts_{name}",
|
1171 |
+
source=src,
|
1172 |
+
preprocessors=[extract_transcript],
|
1173 |
+
style="transcript"
|
1174 |
+
)
|
1175 |
+
|
1176 |
+
|
1177 |
+
TFRECORD_IMAGE_TEXT_FEATURES = {
|
1178 |
+
'image': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
|
1179 |
+
'text':tf.io.FixedLenFeature(shape=(), dtype=tf.string),
|
1180 |
+
}
|
1181 |
+
|
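# Sketch (illustration only): a record matching TFRECORD_IMAGE_TEXT_FEATURES is
# parsed from a serialized tf.train.Example into encoded image bytes plus a
# text string, e.g.
#
#   parsed = tf.io.parse_single_example(serialized_example,
#                                       TFRECORD_IMAGE_TEXT_FEATURES)
#   # parsed["image"] -> encoded image bytes, parsed["text"] -> caption bytes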
1182 |
+
|
1183 |
+
add_task(
|
1184 |
+
"laion400m",
|
1185 |
+
source=seqio.TFExampleDataSource(
|
1186 |
+
split_to_filepattern={
|
1187 |
+
"train": os.path.join("gs://unified-io-2-us-east/", "pretrain-datasets", "laion400m", "1.0.0", "laion400m-train*"),
|
1188 |
+
},
|
1189 |
+
feature_description=TFRECORD_IMAGE_TEXT_FEATURES,
|
1190 |
+
),
|
1191 |
+
preprocessors=[
|
1192 |
+
functools.partial(rekey, key_map={
|
1193 |
+
"image": ["image"],
|
1194 |
+
"text": ["text"]
|
1195 |
+
}),
|
1196 |
+
],
|
1197 |
+
style="laion",
|
1198 |
+
)
|
1199 |
+
|
1200 |
+
|
1201 |
+
add_task(
|
1202 |
+
"laion_2B",
|
1203 |
+
source=seqio.TFExampleDataSource(
|
1204 |
+
split_to_filepattern={
|
1205 |
+
"train": os.path.join(MULTITASK_TFDS_DATA_DIR, "laion2b_en", "1.0.0", "laion2b_en-train*"),
|
1206 |
+
},
|
1207 |
+
feature_description=TFRECORD_IMAGE_TEXT_FEATURES,
|
1208 |
+
),
|
1209 |
+
preprocessors=[
|
1210 |
+
functools.partial(rekey, key_map={
|
1211 |
+
"image": ["image"],
|
1212 |
+
"text": ["text"]
|
1213 |
+
}),
|
1214 |
+
],
|
1215 |
+
style="laion",
|
1216 |
+
)
|
1217 |
+
|
1218 |
+
|
1219 |
+
add_task(
|
1220 |
+
"region_caption_vg",
|
1221 |
+
source=seqio.TfdsDataSource(
|
1222 |
+
tfds_name="vg:1.0.1",
|
1223 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1224 |
+
),
|
1225 |
+
preprocessors=[region_captions_to_dense],
|
1226 |
+
style="region_captions",
|
1227 |
+
)
|
1228 |
+
|
1229 |
+
|
1230 |
+
add_task(
|
1231 |
+
"pdfa_eng_wds",
|
1232 |
+
source=seqio.TfdsDataSource(
|
1233 |
+
tfds_name="pdfa_eng_wds:1.0.0",
|
1234 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1235 |
+
),
|
1236 |
+
preprocessors=[
|
1237 |
+
functools.partial(max_words, max_words=400),
|
1238 |
+
format_pdfa_eng_wds
|
1239 |
+
],
|
1240 |
+
style="pdfa_eng_wds",
|
1241 |
+
)
|
1242 |
+
|
1243 |
+
|
1244 |
+
add_task(
|
1245 |
+
"idl_words",
|
1246 |
+
source=seqio.TfdsDataSource(
|
1247 |
+
tfds_name="idl_words:1.0.0",
|
1248 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1249 |
+
),
|
1250 |
+
preprocessors=[],
|
1251 |
+
style="idl_words",
|
1252 |
+
)
|
1253 |
+
|
1254 |
+
|
1255 |
+
|
1256 |
+
open_image_v6_keys_to_features = {
|
1257 |
+
'image': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
|
1258 |
+
'image_id': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
|
1259 |
+
'detection/label':tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1260 |
+
'detection/bbox':tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.float32, allow_missing=True),
|
1261 |
+
'detection/num':tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
|
1262 |
+
'vrd/sub_label': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1263 |
+
'vrd/obj_label': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1264 |
+
'vrd/sub_bbox':tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.float32, allow_missing=True),
|
1265 |
+
'vrd/obj_bbox':tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.float32, allow_missing=True),
|
1266 |
+
'vrd/relation': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1267 |
+
'vrd/num':tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
|
1268 |
+
'cap/cap_caption': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1269 |
+
'cap/num':tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
|
1270 |
+
'seg/masks': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1271 |
+
'seg/num':tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
|
1272 |
+
'seg/label': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.string, allow_missing=True),
|
1273 |
+
'seg/bbox': tf.io.FixedLenSequenceFeature(shape=(), dtype=tf.float32, allow_missing=True),
|
1274 |
+
}
|
1275 |
+
|
1276 |
+
|
1277 |
+
add_task(
|
1278 |
+
"localized_narratives_v6",
|
1279 |
+
source=seqio.TFExampleDataSource(
|
1280 |
+
split_to_filepattern={
|
1281 |
+
"train": os.path.join(MULTITASK_TFDS_DATA_DIR, "open_image_v6", "1.0.0", "open_image_v6-train*"),
|
1282 |
+
},
|
1283 |
+
feature_description=open_image_v6_keys_to_features,
|
1284 |
+
),
|
1285 |
+
preprocessors=[extract_localized_narrative],
|
1286 |
+
style="localized_narratives",
|
1287 |
+
)
|
1288 |
+
|
1289 |
+
|
1290 |
+
add_task(
|
1291 |
+
"lvis_objects",
|
1292 |
+
# A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
|
1293 |
+
source=seqio.TfdsDataSource(
|
1294 |
+
tfds_name="lvis:1.2.0",
|
1295 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1296 |
+
),
|
1297 |
+
preprocessors=[
|
1298 |
+
extract_lvis,
|
1299 |
+
region_captions_to_dense,
|
1300 |
+
],
|
1301 |
+
style="lvis_objects",
|
1302 |
+
)
|
1303 |
+
|
1304 |
+
|
1305 |
+
add_task(
|
1306 |
+
"open_images_with_objects",
|
1307 |
+
# A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
|
1308 |
+
source=seqio.TFExampleDataSource(
|
1309 |
+
split_to_filepattern={
|
1310 |
+
"train": os.path.join(MULTITASK_TFDS_DATA_DIR, "open_image_v6", "1.0.0", "open_image_v6-train*"),
|
1311 |
+
},
|
1312 |
+
feature_description=open_image_v6_keys_to_features,
|
1313 |
+
),
|
1314 |
+
preprocessors=[
|
1315 |
+
extract_open_images_boxes,
|
1316 |
+
region_captions_to_dense,
|
1317 |
+
],
|
1318 |
+
style="visual_narratives_with_objects",
|
1319 |
+
)
|
1320 |
+
|
1321 |
+
|
1322 |
+
add_task(
|
1323 |
+
"cockatoo_with_acc_476k_gpt_captions",
|
1324 |
+
source=seqio.TfdsDataSource(
|
1325 |
+
tfds_name="cockatoo_with_acc_476k_gpt_captions:1.0.0",
|
1326 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1327 |
+
splits=dict(
|
1328 |
+
train="train[5120:]",
|
1329 |
+
validation="train[:5120]"
|
1330 |
+
)
|
1331 |
+
),
|
1332 |
+
preprocessors=[accuracy_conditioned_joint],
|
1333 |
+
inf_preprocessor=[functools.partial(accuracy_conditioned_joint, is_eval=True)],
|
1334 |
+
style=None
|
1335 |
+
)
|
1336 |
+
|
1337 |
+
|
1338 |
+
add_task(
|
1339 |
+
"dense_caption_eval_with_acc",
|
1340 |
+
source=seqio.TfdsDataSource(
|
1341 |
+
tfds_name="dense_captioning_eval:1.0.0",
|
1342 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1343 |
+
splits=dict(validation="train")
|
1344 |
+
),
|
1345 |
+
preprocessors=[functools.partial(accuracy_conditioned_joint, is_eval=True)],
|
1346 |
+
style="long_caption",
|
1347 |
+
inf_only=True
|
1348 |
+
)
|
1349 |
+
|
1350 |
+
# ************************
|
1351 |
+
# VQA Datasets
|
1352 |
+
# ************************
|
1353 |
+
|
1354 |
+
add_task(
|
1355 |
+
"science_qa_img",
|
1356 |
+
source=seqio.TfdsDataSource(
|
1357 |
+
tfds_name="science_qa:1.0.0",
|
1358 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1359 |
+
splits={"train": "train", "validation": "val", "test": "test"}
|
1360 |
+
),
|
1361 |
+
preprocessors=[
|
1362 |
+
image_only,
|
1363 |
+
rename(answer_idx="answer"),
|
1364 |
+
build_question_with_hint,
|
1365 |
+
format_multiple_choice_qa
|
1366 |
+
],
|
1367 |
+
style="science_qa",
|
1368 |
+
)
|
1369 |
+
|
1370 |
+
|
1371 |
+
add_task(
|
1372 |
+
"tabwmp_da",
|
1373 |
+
source=seqio.TfdsDataSource(
|
1374 |
+
tfds_name="tab_mwp:1.0.0",
|
1375 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1376 |
+
splits={"train": "train", "validation": "dev", "test": "test"}
|
1377 |
+
),
|
1378 |
+
preprocessors=[
|
1379 |
+
rename(text="answer")
|
1380 |
+
],
|
1381 |
+
style="tabwmp_da",
|
1382 |
+
)
|
1383 |
+
|
1384 |
+
|
1385 |
+
add_task(
|
1386 |
+
"figure_qa",
|
1387 |
+
source=seqio.TfdsDataSource(
|
1388 |
+
tfds_name="figure_qa:1.0.2",
|
1389 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1390 |
+
splits={"train": "train1", "validation": "validation1", "test": "no_annot_test1"}
|
1391 |
+
),
|
1392 |
+
preprocessors=[extract_figureqa, extract_individual_vqa],
|
1393 |
+
style="figure_qa",
|
1394 |
+
)
|
1395 |
+
|
1396 |
+
add_task(
|
1397 |
+
"figure_qa_zero_shot",
|
1398 |
+
source=seqio.TfdsDataSource(
|
1399 |
+
tfds_name="figure_qa:1.0.2",
|
1400 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1401 |
+
splits={"train": "train1", "validation": "validation1", "test": "no_annot_test1"}
|
1402 |
+
),
|
1403 |
+
preprocessors=[extract_figureqa, convert_figureqa_answer, extract_individual_vqa],
|
1404 |
+
style="figure_qa",
|
1405 |
+
)
|
1406 |
+
|
1407 |
+
|
1408 |
+
add_task(
|
1409 |
+
"plot_qa",
|
1410 |
+
source=seqio.TfdsDataSource(
|
1411 |
+
tfds_name="plot_qa:1.0.0",
|
1412 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1413 |
+
),
|
1414 |
+
preprocessors=[extract_figureqa, extract_individual_vqa],
|
1415 |
+
inf_preprocessor=[
|
1416 |
+
extract_figureqa,
|
1417 |
+
functools.partial(flatten_parts, parts=["questions", "answer", "question_id"]),
|
1418 |
+
extract_individual_vqa
|
1419 |
+
],
|
1420 |
+
style="plot_qa",
|
1421 |
+
)
|
1422 |
+
|
1423 |
+
|
1424 |
+
add_task(
|
1425 |
+
"ai2_diagram",
|
1426 |
+
source=seqio.TfdsDataSource(
|
1427 |
+
tfds_name="ai2_diagram:1.0.2",
|
1428 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1429 |
+
splits={"train": "train[1024:]", "validation": "train[:1024]", "test": "test"}
|
1430 |
+
),
|
1431 |
+
preprocessors=[
|
1432 |
+
rename(choices="answer_texts", answer_idx="correct_answer"),
|
1433 |
+
format_multiple_choice_qa
|
1434 |
+
],
|
1435 |
+
style="ai2_diagram",
|
1436 |
+
)
|
1437 |
+
|
1438 |
+
|
1439 |
+
add_task(
|
1440 |
+
"ai2_diagram_v2",
|
1441 |
+
source=seqio.TfdsDataSource(
|
1442 |
+
tfds_name="ai2_diagram_v2:1.0.1",
|
1443 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1444 |
+
),
|
1445 |
+
preprocessors=[
|
1446 |
+
rename(choices="answer_texts", answer_idx="correct_answer"),
|
1447 |
+
format_ai2d
|
1448 |
+
],
|
1449 |
+
style="ai2_diagram",
|
1450 |
+
)
|
1451 |
+
|
1452 |
+
|
1453 |
+
add_task(
|
1454 |
+
"ai2_diagram_v2_transparent",
|
1455 |
+
source=seqio.TfdsDataSource(
|
1456 |
+
tfds_name="ai2_diagram_v2_transparent:1.0.5",
|
1457 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1458 |
+
),
|
1459 |
+
preprocessors=[
|
1460 |
+
rename(choices="answer_texts", answer_idx="correct_answer"),
|
1461 |
+
format_ai2d
|
1462 |
+
],
|
1463 |
+
style="ai2_diagram",
|
1464 |
+
)
|
1465 |
+
|
1466 |
+
# ai2_diagram_v2 mixed with additional abc label questions with a transparent box.
|
1467 |
+
# Shares the same image split as ai2_diagram_v2.
|
1468 |
+
add_task(
|
1469 |
+
"ai2_diagram_v2_mix_transparent",
|
1470 |
+
source=seqio.TfdsDataSource(
|
1471 |
+
tfds_name="ai2_diagram_v2_mix_transparent:1.0.6",
|
1472 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1473 |
+
splits={
|
1474 |
+
"train": "train_mix",
|
1475 |
+
"validation": "validation_mix",
|
1476 |
+
"test": "test_mix", # test should only use either transparent or opaque
|
1477 |
+
# "test": "test_opaque",
|
1478 |
+
}
|
1479 |
+
),
|
1480 |
+
preprocessors=[
|
1481 |
+
rename(choices="answer_texts", answer_idx="correct_answer"),
|
1482 |
+
format_ai2d
|
1483 |
+
],
|
1484 |
+
style="ai2_diagram",
|
1485 |
+
)
|
1486 |
+
|
1487 |
+
add_task(
|
1488 |
+
"ai2_diagram_v2_mix_transparent_one_style",
|
1489 |
+
source=seqio.TfdsDataSource(
|
1490 |
+
tfds_name="ai2_diagram_v2_mix_transparent:1.0.6",
|
1491 |
+
tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
|
1492 |
+
splits={
|
1493 |
+
"train": "train_mix",
|
1494 |
+
"validation": "validation_mix",
|
1495 |
+
"test": "test_mix", # test should only use either transparent or opaque
|
1496 |
+
# "test": "test_opaque",
|
1497 |
+
}
|
1498 |
+
),
|
1499 |
+
preprocessors=[
|
1500 |
+
rename(choices="answer_texts", answer_idx="correct_answer"),
|
1501 |
+
functools.partial(format_ai2d, variable_style=False),
|
1502 |
+
],
|
1503 |
+
style="ai2_diagram",
|
1504 |
+
)
|
1505 |
+
|
1506 |
+
|
1507 |
+
for src, test_sets in [
|
1508 |
+
["refclef_unc", ["testA", "testB", "testC", "testAB", "testBC"]],
|
1509 |
+
["refcoco_unc", ["testA", "testB"]],
|
1510 |
+
["refcocoplus_unc", ["testA", "testB"]],
|
    ["refcocog_umd", ["test"]],
]:
    if "coco" in src:
        add_url = [add_coco_url]
    else:
        add_url = []
    splits = {x: x for x in test_sets}
    splits.update({"train": "train", "validation": "val"})
    add_task(
        src,
        source=seqio.TfdsDataSource(
            tfds_name=f"{src}:1.0.2",
            tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
            splits=splits
        ),
        preprocessors=[refexp],
        inf_preprocessor=add_url + [
            refexp_inf,
            # Flatten objects
            functools.partial(flatten_parts, parts=["refexp", "metadata/bbox"]),
            # Flatten expressions
            functools.partial(flatten_parts, parts=["refexp"])
        ],
        style="refexp",
        decode_image=True,
    )
    add_task(
        src + "_pointing",
        source=seqio.TfdsDataSource(
            tfds_name=f"{src}:1.0.2",
            tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
            splits=splits
        ),
        preprocessors=[refexp_pointing],
        inf_preprocessor=add_url + [
            refexp_pointing_inf,
            functools.partial(flatten_parts, parts=["refexp", "metadata/bbox", "metadata/mask", "metadata/answer"]),
            functools.partial(flatten_parts, parts=["refexp"])
        ],
        decode_image=True,
        style="refexp_pointing",
    )


# FIXME
add_task(
    "ai2_diagram_test",
    source=seqio.TfdsDataSource(
        tfds_name="ai2_diagram:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train[1024:]", "validation": "train[:1024]", "test": "test"}
    ),
    preprocessors=[
        rename(choices="answer_texts", answer_idx="correct_answer"),
        format_multiple_choice_qa
    ],
    style="ai2_diagram",
)


add_task(
    "gqa",
    source=seqio.TfdsDataSource(
        tfds_name="gqa:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        functools.partial(format_gqa, is_balanced=True),
        extract_individual_vqa,
    ],
    inf_preprocessor=[
        functools.partial(format_gqa, is_balanced=True),
        extract_individual_vqa,
    ],
    style="gqa",
)


add_task(
    "gqa_multi",
    source=seqio.TfdsDataSource(
        tfds_name="gqa:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        functools.partial(format_gqa, is_balanced=True, flatten=False),
        extract_individual_vqa,
    ],
    inf_preprocessor=[
        functools.partial(format_gqa, is_balanced=True, flatten=False),
        extract_individual_vqa,
    ],
    style="gqa",
)


add_task(
    "text_vqa",
    source=seqio.TfdsDataSource(
        tfds_name="text_vqa:1.0.3",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(
            rekey, key_map={
                "image": ["image"],
                "questions": ["question"],
                "answers": ["answers"],
                "id": ["question_id"]
            }),
        extract_individual_vqa,
    ],
    style="text_vqa",
)


add_task(
    "okvqa",
    source=seqio.TfdsDataSource(
        tfds_name="ok_vqa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        rename(example_id="question_id"),
        add_coco_url,
        extract_individual_vqa,
    ],
    style="okvqa",
)

add_task(
    "a_okvqa_da",
    source=seqio.TfdsDataSource(
        tfds_name="a_ok_vqa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(**{
            "example_id": "question_id",
            "answers": "direct_answers",
            "metadata/difficult_direct_answer": "difficult_direct_answer"
        }),
        extract_individual_vqa,
    ],
    inf_preprocessor=[
        filter_difficult_direct_answer,
        rename(**{
            "example_id": "question_id",
            "answers": "direct_answers",
            "metadata/difficult_direct_answer": "difficult_direct_answer"
        }),
        add_coco_url,
        extract_individual_vqa,
    ],
    style="a_okvqa_da",
)


add_task(
    "a_okvqa_mc",
    source=seqio.TfdsDataSource(
        tfds_name="a_ok_vqa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(**{
            "example_id": "question_id",
            "metadata/difficult_direct_answer": "difficult_direct_answer",
            "answer_idx": "correct_choice_idx"
        }),
        add_coco_url,
        format_multiple_choice_qa,
    ],
    style="a_okvqa_mc",
)


add_task(
    "dv_qa",
    source=seqio.TfdsDataSource(
        tfds_name="dv_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val_easy"}
    ),
    preprocessors=[
        extract_figureqa,
        extract_individual_vqa,
    ],
    inf_preprocessor=[
        extract_figureqa,
        flatten_vqa,
        extract_individual_vqa
    ],
    style="dv_qa",
)


@seqio.map_over_dataset
def add_image_question_example_id(ex):
    key = tf.strings.join([ex["question"], "\n\n", ex["image"]])
    ex["metadata/example_id"] = tf.strings.to_hash_bucket(key, 2**30)
    return ex


add_task(
    "chart_qa",
    source=seqio.TfdsDataSource(
        tfds_name="chart_qa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(question="query", answer="label", **{"metadata/is_human": "is_human"}),
        add_image_question_example_id,
        extract_individual_vqa,
    ],
    style="chart_qa",
)


add_task(
    "chart_qa_ex",
    source=seqio.TfdsDataSource(
        tfds_name="chart_qa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(question="query", answer="label", **{"metadata/is_human": "is_human"}),
        extract_individual_vqa,
    ],
    style="scifi_charts_qa_exp",
)


add_task(
    "chart_qa_weighted",
    source=seqio.TfdsDataSource(
        tfds_name="chart_qa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(question="query", answer="label", **{"metadata/is_human": "is_human"}),
        extract_individual_vqa,
        functools.partial(reweight_chartqa, human=2*20901/(20901+7398), aug=2*7398/(20901+7398)),
    ],
    style="chart_qa",
)


add_task(
    "chart_qa_human",
    source=seqio.TfdsDataSource(
        tfds_name="chart_qa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(question="query", answer="label"),
        add_image_question_example_id,
        filter_human,
        extract_individual_vqa,
    ],
    style="chart_qa",
)


add_task(
    "chart_qa_aug",
    source=seqio.TfdsDataSource(
        tfds_name="chart_qa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[
        rename(question="query", answer="label"),
        filter_aug,
        extract_individual_vqa,
    ],
    style="chart_qa",
)


add_task(
    "doc_qa",
    source=seqio.TfdsDataSource(
        tfds_name="doc_qa:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[fix_doqa_url, extract_individual_vqa],
    style="doc_qa",
)


add_task(
    "ocr_qa",
    source=seqio.TfdsDataSource(
        tfds_name="ocr_vqa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[extract_individual_vqa],
    inf_preprocessor=[flatten_vqa, extract_individual_vqa],
    style="ocr_vqa",
)


add_task(
    "st_qa",
    source=seqio.TfdsDataSource(
        tfds_name="st_vqa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train[1024:]", "validation": "train[:1024]", "test": "test"}
    ),
    preprocessors=[extract_individual_vqa],
    inf_preprocessor=[extract_individual_vqa],
    style="st_qa",
)


add_task(
    "tally_qa",
    source=seqio.TfdsDataSource(
        tfds_name="tally_qa:1.0.2",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "test"}
    ),
    preprocessors=[
        extract_tally_qa,
        extract_individual_vqa
    ],
    inf_preprocessor=[
        extract_tally_qa,
        flatten_vqa,
        extract_individual_vqa
    ],
    style="tally_qa",
)


add_task(
    "info_qa",
    source=seqio.TfdsDataSource(
        tfds_name="info_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[extract_individual_vqa],
    style="info_qa",
)

add_task(
    "android_control",
    source=seqio.TfdsDataSource(
        tfds_name="android_control:2.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "train", "validation": "val", "test": "test"}
    ),
    preprocessors=[extract_android_control],
    style="android_control",
)

for mode in ["ll", "hl", "hl_ll", "hl_cot"]:
    add_task(
        f"android_control_{mode}",
        source=seqio.TfdsDataSource(
            tfds_name="android_control:2.0.0",
            tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
            splits={"train": "train", "validation": "val", "test": "test"}
        ),
        preprocessors=[functools.partial(extract_andriod_control_inf, mode=mode)],
        style="android_control",
    )


map_coco_vqa = functools.partial(rekey, key_map={
    "image": ["image"],
    "questions": ["vqa", "questions"],
    "answers": ["vqa", "answers"],
    "id": ["vqa", "id"],
    "metadata/image_url": ["metadata/image_url"],
})


add_task(
    "coco_2017_vqa",
    source=seqio.TfdsDataSource(
        tfds_name="coco_all:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        add_coco_url,
        map_coco_vqa,
        flatten_vqa,
        extract_individual_vqa
    ],
    style="vqa2",
)


add_task(
    "cockatoo_qa",
    source=seqio.TfdsDataSource(
        tfds_name="cockatoo_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits=dict(
            train="train[5120:]",
            validation="train[:5120]"
        )
    ),
    preprocessors=[rename(text="answer")],
    style=None,
)


add_task(
    "synthetic_qa_v3",
    source=seqio.TfdsDataSource(
        tfds_name="synthetic_qa_v3:0.0.4",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits=dict(
            train="train[2048:]",
            validation="train[:2048]"
        )
    ),
    preprocessors=[extract_cockatoo_qa_v2, prefix_how_many_messages],
    style="synthetic_qa",
)


add_task(
    "synthetic_qa_v3_style_tag",
    source=seqio.TfdsDataSource(
        tfds_name="synthetic_qa_v3:0.0.4",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits=dict(
            train="train[2048:]",
            validation="train[:2048]"
        )
    ),
    preprocessors=[extract_cockatoo_qa_v2, prefix_how_many_messages],
    style="llm_qa",
)


add_task(
    "synthetic_qa_v3_as_user_qa",
    source=seqio.TfdsDataSource(
        tfds_name="synthetic_qa_v3:0.0.4",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits=dict(
            train="train[2048:]",
            validation="train[:2048]"
        )
    ),
    preprocessors=[extract_cockatoo_qa_v2, prefix_how_many_messages],
    style="user_qa",
)


add_task(
    "synthetic_qa_v3_multi_turn",
    source=seqio.TfdsDataSource(
        tfds_name="synthetic_qa_v3:0.0.4",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits=dict(
            train="train[2048:]",
            validation="train[:2048]"
        )
    ),
    preprocessors=[extract_cockatoo_qa_v2, filter_single_turn, prefix_how_many_messages],
    style="synthetic_qa",
)


NE_SHARDS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

for i in NE_SHARDS:
    add_task(
        f"named_entity{i}",
        source=seqio.TfdsDataSource(
            tfds_name=f"named_entities_qa_{i}_of_18:1.0.0",
            tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
            splits=dict(
                train="train[1024:]",
                validation="train[:1024]"
            )
        ),
        preprocessors=[filter_named_entity, extract_named_entity, extract_individual_vqa],
        inf_preprocessor=[
            filter_named_entity,
            extract_named_entity,
            flatten_vqa,
            extract_individual_vqa
        ],
        style="named_entity",
        ignore_errors=True
    )


add_task(
    "user_qa",
    source=seqio.TfdsDataSource(
        tfds_name="user_qa:0.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits=dict(
            train="train[2048:]",
            validation="train[:2048]"
        )
    ),
    preprocessors=[extract_cockatoo_qa_v2, prefix_how_many_messages],
    style="user_qa",
)

add_task(
    "user_questions_for_elo",
    source=seqio.TfdsDataSource(
        tfds_name="user_questions_for_elo:0.0.3",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[functools.partial(extract_individual_vqa, test=True)],
    inf_only=True,
    style="demo",
)


def _filter_by_id(ds, prediction_file, max_seq_len):
    with open(prediction_file) as f:
        predictions = json.load(f)
    is_long = []
    lens = []
    tokenizer = build_tokenizer("hf-Qwen/Qwen2-7B")
    for pred in predictions:
        n_tokens = len(tokenizer.encode(pred["prediction"]))
        lens.append(n_tokens)
        if n_tokens >= max_seq_len:
            is_long.append(pred["example_id"])
    is_long = tf.constant(is_long)
    logging.info(f"Filtering for {len(is_long)} ids")
    return ds.filter(lambda ex: tf.reduce_any(ex["example_id"] == is_long))


add_task(
    "user_questions_for_elo",
    source=seqio.TfdsDataSource(
        tfds_name="user_questions_for_elo:0.0.3",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[functools.partial(extract_individual_vqa, test=True)],
    inf_only=True,
    style="demo",
)


add_task(
    "user_questions_for_elo_long",
    source=seqio.TfdsDataSource(
        tfds_name="user_questions_for_elo:0.0.3",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(_filter_by_id, prediction_file="/weka/oe-training-default/chrisc/cockatoo/models/uber-model-v11/70b-335-30k-3.2-resume8k-noopt/predictions-ck20000-user_questions_for_elo-test/predictions.json", max_seq_len=230),
        functools.partial(extract_individual_vqa, test=True)
    ],
    inf_only=True,
    style="demo",
)


add_task(
    "coco_2014_vqa",
    source=seqio.TfdsDataSource(
        tfds_name="coco_2014_all:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        add_coco_url,
        map_coco_vqa,
        flatten_vqa,
        extract_individual_vqa
    ],
    inf_preprocessor=[
        add_coco_url,
        map_coco_vqa,
        flatten_vqa,
        extract_individual_vqa
    ],
    style="vqa2",
)


add_task(
    "coco_2014_vqa_multi",
    source=seqio.TfdsDataSource(
        tfds_name="coco_2014_all:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        add_coco_url,
        map_coco_vqa,
        extract_individual_vqa
    ],
    inf_preprocessor=[
        add_coco_url,
        map_coco_vqa,
        flatten_vqa,
        extract_individual_vqa
    ],
    style="vqa2",
)


add_task(
    "coco_2017_vqa_multi",
    source=seqio.TfdsDataSource(
        tfds_name="coco_all:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        add_coco_url,
        map_coco_vqa,
        extract_individual_vqa
    ],
    inf_preprocessor=[
        add_coco_url,
        map_coco_vqa,
        flatten_vqa,
        extract_individual_vqa
    ],
    style="vqa2",
)


add_task(
    "vqa_v2_test",
    source=seqio.TfdsDataSource(
        tfds_name="coco_test_all:1.0.1",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(rekey, key_map={
            "image": ["image"],
            "questions": ["vqa", "questions"],
            "answers": ["vqa", "answers"],
            "id": ["vqa", "id"],
        }),
        flatten_vqa,
        functools.partial(extract_individual_vqa, test=True)
    ],
    style="vqa2",
    inf_only=True
)

# ************************
# Eval-only Datasets
# ************************

add_task(
    "seed_bench_test",
    source=seqio.TfdsDataSource(
        tfds_name="seed_bench:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        format_multiple_choice_qa,
    ],
    style="a_okvqa_mc",
    inf_only=True
)


add_task(
    "pope_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="pope:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        add_coco_url,
        extract_individual_vqa
    ],
    style="vqa2",
    inf_only=True
)


MME_SOURCE = seqio.TfdsDataSource(
    tfds_name="mme:1.0.0",
    tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
)


add_task(
    "mme_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=MME_SOURCE,
    preprocessors=[
        functools.partial(flatten_parts, parts=["questions", "answers"]),
        rename(question="questions", answer="answers"),
        extract_individual_vqa,
    ],
    style="vqa2",
    inf_only=True
)

add_task(
    "real_world_qa_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="real_world_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(
            format_multiple_style_qa,
            types=['multiple_choice', 'short_answer'],
            styles=['a_okvqa_mc', 'vqa2'],
            default_style="a_okvqa_mc",
        ),
    ],
    style=None,
    inf_only=True
)

add_task(
    "real_world_qa_no_instruction",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="real_world_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(
            functools.partial(format_multiple_style_qa, strip_instruction=True),
            types=['multiple_choice', 'short_answer'],
            styles=['a_okvqa_mc', 'vqa2'],
            default_style="a_okvqa_mc",
        ),
    ],
    style=None,
    inf_only=True
)

add_task(
    "real_world_qa_dbg",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="real_world_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(
            format_multiple_style_qa,
            types=['multiple_choice', 'short_answer'],
            styles=['user_qa', 'user_qa'],
            default_style="user_qa",
        ),
    ],
    style=None,
    inf_only=True
)


add_task(
    "mmmu",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="mmmu:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"train": "dev"},
    ),
    preprocessors=[
        rename(img_type="metadata/img_type"),
        functools.partial(
            extract_mmmu,
            types=['multiple-choice', 'open'],
            styles=['a_okvqa_mc', 'vqa2'],
            default_style="a_okvqa_mc",
        ),
    ],
    style=None,
)


add_task(
    "mmmu_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="mmmu:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "validation", "test": "test"},
    ),
    preprocessors=[
        rename(img_type="metadata/img_type"),
        extract_mmmu,
    ],
    style=None,
    inf_only=True
)

for style in ["vaia_qa", "vaia_qa_short_answer_first", "vqa_online"]:
    add_task(
        f"mmmu_test_{style}",
        # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
        source=seqio.TfdsDataSource(
            tfds_name="mmmu:1.0.0",
            # tfds_name="mmmu_khan_academy:1.0.1",
            tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
            splits={"validation": "validation", "test": "test", "dev": "dev"},
        ),
        preprocessors=[
            rename(img_type="metadata/img_type"),
            extract_mmmu_cot,
        ],
        style=style,
        inf_only=True
    )


add_task(
    "math_vista_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="math_vista:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "testmini", "test": "test"},
    ),
    preprocessors=[
        functools.partial(rekey, key_map={
            "id": ["id"],
            "query": ["query"],
            "image": ["image"],
            "choices": ["choices"],
            "answer": ["answer"],
            "metadata/question_type": ["question_type"],
            "metadata/answer_type": ["answer_type"],
            "metadata/precision": ["precision"],
            "metadata/split": ["metadata/split"],
        }),
        functools.partial(extract_math_vista, styles=['a_okvqa_mc', 'vqa2']),
    ],
    style=None,
    inf_only=True
)


add_task(
    "math_vista_v2",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="math_vista:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "testmini", "test": "test"},
    ),
    preprocessors=[
        functools.partial(rekey, key_map={
            "id": ["id"],
            "query": ["query"],
            "image": ["image"],
            "choices": ["choices"],
            "answer": ["answer"],
            "metadata/question_type": ["question_type"],
            "metadata/answer_type": ["answer_type"],
            "metadata/precision": ["precision"],
            "metadata/split": ["metadata/split"],
        }),
        reformat_math_vista,
        functools.partial(
            extract_math_vista,
            styles=['a_okvqa_mc', 'vqa2'],
        ),
    ],
    style=None,
    inf_only=True
)


MM_BENCH_SRC = seqio.TfdsDataSource(
    tfds_name="mmbench:1.0.0",
    tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    splits={"validation": "dev", "test": "test"},
)

add_task(
    "mmbench_test",
    source=MM_BENCH_SRC,
    preprocessors=[format_mmbench],
    style="a_okvqa_mc",
    inf_only=True
)


add_task(
    "sugar_crepe_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="sugar_crepe:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        add_coco_url,
        functools.partial(flatten_parts, parts=["choices", "answer_idx", "metadata/answer_type"]),
        format_multiple_choice_qa,
    ],
    style="a_okvqa_mc",
    inf_only=True
)


add_task(
    "blink_test",
    # A TfdsTask takes in a TFDS name instead of a tf.data.Dataset function.
    source=seqio.TfdsDataSource(
        tfds_name="blink:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
    ),
    preprocessors=[
        functools.partial(rekey, key_map={
            "id": ["id"],
            "question": ["prompt"],
            "image": ["image_concat"],
            "choices": ["choices"],
            "answer_idx": ["answer_idx"],
            "metadata/subtask": ["metadata/subtask"],
            "metadata/question": ["question"],
        }),
        format_multiple_choice_qa,
        output_options,
    ],
    style="a_okvqa_mc",
    inf_only=True
)

add_task(
    "oscarbench_qa",
    source=seqio.TfdsDataSource(
        tfds_name="oscarbench_qa:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "val"}
    ),
    preprocessors=[oscar_preprocessor],
    style="oscarbench_qa"
)

add_task(
    "charxiv",
    source=seqio.TfdsDataSource(
        tfds_name="charxiv:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "validation", "test": "test"}
    ),
    preprocessors=[charxiv_preprocessor, extract_individual_vqa],
    inf_preprocessor=[
        charxiv_preprocessor,
        functools.partial(flatten_parts, parts=["question", "answer"]),
        extract_individual_vqa,
    ],
    style="charxiv",
)

add_task(
    "charxiv_descriptive",
    source=seqio.TfdsDataSource(
        tfds_name="charxiv:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "validation", "test": "test"}
    ),
    preprocessors=[charxiv_descriptive_preprocessor, extract_individual_vqa],
    inf_preprocessor=[
        charxiv_descriptive_preprocessor,
        functools.partial(flatten_parts, parts=["question", "answer"]),
        extract_individual_vqa,
    ],
    style="charxiv_descriptive",
)

add_task(
    "charxiv_reasoning",
    source=seqio.TfdsDataSource(
        tfds_name="charxiv:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "validation", "test": "test"}
    ),
    preprocessors=[charxiv_reasoning_preprocessor, extract_individual_vqa],
    style="charxiv_reasoning",
)

for tablevqa_name in ["fintabnetqa", "vwtq", "vwtq_syn"]:
    add_task(
        tablevqa_name,
        source=seqio.TfdsDataSource(
            tfds_name=f"{tablevqa_name}:1.0.0",
            tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
            splits={"validation": "test[:125]", "test": "test"}
        ),
        preprocessors=[tablevqa_preprocessor, extract_individual_vqa],
        style=tablevqa_name,
    )

add_task(
    "vtabfact",
    source=seqio.TfdsDataSource(
        tfds_name="vtabfact:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "test[:125]", "test": "test"}
    ),
    preprocessors=[vtabfact_preprocessor, extract_individual_vqa],
    style="vtabfact",
)

add_task(
    "nutrition_fact",
    source=seqio.TfdsDataSource(
        tfds_name="nutrition_fact:1.0.0",
        tfds_data_dir=MULTITASK_TFDS_DATA_DIR,
        splits={"validation": "test", "test": "test"}
    ),
    preprocessors=[nutrition_fact_preprocessor, extract_individual_vqa],
    inf_preprocessor=[
        nutrition_fact_preprocessor,
        functools.partial(flatten_parts, parts=["question", "answer"]),
        extract_individual_vqa,
    ],
    style="nutrition_fact",
    inf_only=True
)

for k in ["chart_qa", "info_qa", "doc_qa", "text_vqa", "coco_2014_vqa",
          "ai2_diagram_v2_mix_transparent", "chart_qa_human"]:
    TASKS[k + "_demo"] = dataclasses.replace(TASKS[k], style="demo")
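The final loop above clones existing task specs into "_demo" variants by overriding only the `style` field; this works because `TASKS` maps task names to dataclass instances, so `dataclasses.replace` copies an instance with selected fields swapped out. A small self-contained sketch of that pattern, using a hypothetical `TaskSpec` stand-in (the real spec class is defined earlier in tasks.py and carries more fields):

import dataclasses

@dataclasses.dataclass
class TaskSpec:  # hypothetical stand-in for the real task spec class
    name: str
    style: str
    inf_only: bool = False

TASKS = {"chart_qa": TaskSpec(name="chart_qa", style="chart_qa")}

# Same pattern as the loop above: copy the spec, overriding only `style`.
TASKS["chart_qa_demo"] = dataclasses.replace(TASKS["chart_qa"], style="demo")

print(TASKS["chart_qa_demo"])  # TaskSpec(name='chart_qa', style='demo', inf_only=False)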
torch_util.py
ADDED
@@ -0,0 +1,183 @@
import gc
import os
import logging
from typing import Optional, TypeVar, List, Tuple

import torch
import torch.distributed as dist

T = TypeVar("T")


log = logging.getLogger(__name__)


def seed_all(seed: int):
    """Seed all rng objects."""
    import random

    import numpy as np

    if seed < 0 or seed > 2**32 - 1:
        raise ValueError(f"Seed {seed} is invalid. It must be on [0; 2^32 - 1]")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # torch.manual_seed may call manual_seed_all but calling it again here
    # to make sure it gets called at least once
    torch.cuda.manual_seed_all(seed)


def is_distributed() -> bool:
    return dist.is_available() and dist.is_initialized()


def get_node_rank() -> int:
    return int(os.environ.get("NODE_RANK") or (get_global_rank() - get_local_rank()) // get_local_world_size())


def get_world_size() -> int:
    if is_distributed():
        return dist.get_world_size()
    else:
        return 1


def get_local_world_size() -> int:
    return int(os.environ.get("LOCAL_WORLD_SIZE") or 1)


def get_global_rank() -> int:
    if is_distributed():
        return int(os.environ.get("RANK") or dist.get_rank())
    else:
        return 0


def get_local_rank() -> int:
    return int(os.environ.get("LOCAL_RANK") or 0)


def get_fs_local_rank() -> int:
    """Get the local rank per filesystem, meaning that, regardless of the number of nodes,
    if all ranks share the same filesystem then `get_fs_local_rank()` will be equivalent to `get_global_rank()`,
    but if nodes do not share the same filesystem then `get_fs_local_rank()` will be equivalent to `get_local_rank()`.
    """
    if os.environ.get("OLMO_SHARED_FS"):
        return int(os.environ.get("FS_LOCAL_RANK") or get_global_rank())
    else:
        return int(os.environ.get("FS_LOCAL_RANK") or get_local_rank())


def move_to_device(o: T, device: torch.device) -> T:
    if isinstance(o, torch.Tensor):
        return o.to(device)  # type: ignore[return-value]
    elif isinstance(o, dict):
        return {k: move_to_device(v, device) for k, v in o.items()}  # type: ignore[return-value]
    elif isinstance(o, list):
        return [move_to_device(x, device) for x in o]  # type: ignore[return-value]
    elif isinstance(o, tuple):
        return tuple((move_to_device(x, device) for x in o))  # type: ignore[return-value]
    else:
        return o


def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False):
    """
    Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf``
    is ``True`` and to replace ``float("inf")`` with the maximum value of the dtype when ``check_pos_inf`` is ``True``.
    """
    if check_neg_inf:
        x.masked_fill_(x == float("-inf"), torch.finfo(x.dtype).min)
    if check_pos_inf:
        x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)


def get_default_device() -> torch.device:
    if torch.cuda.is_available() and torch.cuda.is_initialized():
        return torch.device("cuda")
    else:
        return torch.device("cpu")


def barrier() -> None:
    if is_distributed():
        dist.barrier()


def peak_gpu_memory(reset: bool = False) -> Optional[float]:
    """
    Get the peak GPU memory usage in MB across all ranks.
    Only rank 0 will get the final result.
    """
    if not torch.cuda.is_available():
        return None

    device = torch.device("cuda")
    peak_mb = torch.cuda.max_memory_allocated(device) / 1000000
    if is_distributed():
        peak_mb_tensor = torch.tensor(peak_mb, device=device)
        dist.reduce(peak_mb_tensor, 0, dist.ReduceOp.MAX)
        peak_mb = peak_mb_tensor.item()

    if reset:
        # Reset peak stats.
        torch.cuda.reset_max_memory_allocated(device)

    return peak_mb


V = TypeVar("V", bool, int, float)


def synchronize_value(value: V, device: torch.device) -> V:
    if dist.is_available() and dist.is_initialized():
        value_tensor = torch.tensor(value, device=device)
        dist.broadcast(value_tensor, 0)
        return value_tensor.item()  # type: ignore
    else:
        return value


def synchronize_flag(flag: bool, device: torch.device) -> bool:
    return synchronize_value(flag, device)


def gc_cuda():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def listinstr(lst, s, delimiter=None):
    assert isinstance(lst, list)
    for item in lst:
        if delimiter:
            if all(x in s for x in item.split(delimiter)):
                return True
        else:
            if item in s:
                return True
    return False


def freeze_module(module: torch.nn.Module, exclude_params: Optional[List[str]] = None):
    for name, param in module.named_parameters():
        if exclude_params is not None and listinstr(exclude_params, name):
            continue
        param.requires_grad = False


def freeze_parameters_by_name(model: torch.nn.Module, freeze_names: Tuple[str]):
    for name in freeze_names:
        try:
            module_or_param = model.get_submodule(name)
        except AttributeError:
            try:
                module_or_param = model.get_parameter(name)
            except AttributeError:
                log.warning(f"Could not find module or parameter with name {name}")
                # Skip names that resolve to neither a submodule nor a parameter.
                continue
        if isinstance(module_or_param, torch.nn.Module):
            freeze_module(module_or_param)
        else:
            module_or_param.requires_grad = False
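torch_util.py is a small collection of device and distributed helpers. A minimal usage sketch follows; the flat `from torch_util import ...` path is an assumption for illustration (inside the package these are imported as `from .torch_util import ...`):

import torch
from torch_util import seed_all, get_default_device, move_to_device, freeze_module, peak_gpu_memory

seed_all(0)
device = get_default_device()

model = torch.nn.Linear(8, 2).to(device)
freeze_module(model, exclude_params=["bias"])   # freeze everything except the bias

batch = {"inputs": torch.randn(4, 8), "masks": [torch.ones(4, 8)]}
batch = move_to_device(batch, device)           # recurses through dicts/lists/tuples

print(peak_gpu_memory())                        # None on CPU-only machines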
util.py
CHANGED
@@ -33,7 +33,7 @@ from .exceptions import (
     OLMoNetworkError,
     OLMoThreadError,
 )
-from .torch_util import get_global_rank, get_local_rank, get_node_rank, is_distributed
+# from .torch_util import get_global_rank, get_local_rank, get_node_rank, is_distributed
 
 try:
     from functools import cache
utils.py
ADDED
@@ -0,0 +1,195 @@
import dataclasses
import hashlib
import sys
import typing
import warnings
import socket
from typing import Optional, Any, Dict
import os
import logging
import absl.flags
from flax.traverse_util import flatten_dict

from ml_collections import ConfigDict, config_flags
from ml_collections.config_dict import placeholder
from mlxu import function_args_to_config

_log_extra_fields: Dict[str, Any] = {}


def is_float_printable(x):
    try:
        f"{x:0.2f}"
        return True
    except (ValueError, TypeError):
        return False


def compute_hash(string: str) -> str:
    """Computes the hash of a string."""
    return hashlib.sha256(string.encode("utf-8")).hexdigest()


def pop_metadata(data):
    meta = {k: data.pop(k) for k in list(data) if k.startswith("metadata")}
    return data, meta


def setup_logging():
    handler: logging.Handler
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter(
        "[%(levelname)-.1s %(asctime)s %(filename)s:%(lineno)s] %(message)s",
        datefmt="%H:%M:%S"
    )
    handler.setFormatter(formatter)
    logging.basicConfig(handlers=[handler], level=logging.INFO)

    logging.captureWarnings(True)
    logging.getLogger("urllib3").setLevel(logging.ERROR)


def get_maybe_optional_type(field_type):
    if type(None) in typing.get_args(field_type):
        # Handle optional type
        args = [x for x in typing.get_args(field_type) if x != type(None)]
        assert len(args) == 1
        field_type = args[0]
    return field_type


def config_from_dataclass(dataclass, defaults_to_none=False) -> ConfigDict:
    """Build a `ConfigDict` matching the possibly nested dataclass

    dataclass: A dataclass instance or a dataclass type; if an instance, defaults
        will be set to the values in the instance, if a class, defaults will be
        set to the field defaults, or None if the field is required
    defaults_to_none: Make all defaults None
    """
    out = {}
    fields = dataclasses.fields(dataclass)
    for field in fields:
        if not field.init:
            continue

        if defaults_to_none:
            default = None
        elif hasattr(dataclass, field.name):
            default = getattr(dataclass, field.name)
        elif field.default is dataclasses.MISSING:
            default = None
        else:
            default = field.default

        field_type = get_maybe_optional_type(field.type)

        if hasattr(field_type, "__dataclass_fields__"):
            if not defaults_to_none and default is None:
                pass
            else:
                out[field.name] = config_from_dataclass(
                    default or field.type, defaults_to_none=defaults_to_none)
        else:
            if default is None:
                assert not field_type == typing.Any
                origin = getattr(field_type, "__origin__", None)
                if origin is not None:
                    field_type = origin
                out[field.name] = placeholder(field_type)
            else:
                out[field.name] = default
    return ConfigDict(out)


def dataclass_with_none(cls):
    """Build an instance of possibly nested dataclass `cls` with all attributes None"""
    fields = dataclasses.fields(cls)
    args = {}
    for field in fields:
        if not field.init:
            pass
        elif dataclasses.is_dataclass(field.type):
            args[field.name] = dataclass_with_none(field.type)
        else:
            args[field.name] = None
    return cls(**args)


def dataclass_from_config(cls, config: Dict):
    """Build an instance of `cls` with attributes from `config`"""
    fields = dataclasses.fields(cls)
    args = set(x.name for x in fields)
    for k in config.keys():
        if k not in args:
            raise ValueError(f"Config has unknown arg {k} for {cls}")
    args = {}
    for field in fields:
        if not field.init:
            continue

        field_type = get_maybe_optional_type(field.type)
        if hasattr(field_type, "__dataclass_fields__"):
            if config.get(field.name) is None:
                args[field.name] = None
            elif hasattr(field_type, "from_dict"):
                src = config[field.name]
                if isinstance(src, ConfigDict):
                    src = src.to_dict()
                args[field.name] = field_type.from_dict(src)
            else:
                args[field.name] = dataclass_from_config(field_type, config[field.name])
        elif field.name in config:
            if isinstance(config[field.name], ConfigDict):
                args[field.name] = config[field.name].to_dict()
            else:
                args[field.name] = config[field.name]
    return cls(**args)


def update_dataclass(obj, updates):
    """Sets attributes in `obj` to match non-None fields in `updates`"""
    fields = dataclasses.fields(obj)
    for field in fields:
        if not field.init:
            continue
        update = updates.get(field.name)
        if update is None:
            continue
        current_value = getattr(obj, field.name)
        if dataclasses.is_dataclass(current_value):
            update_dataclass(current_value, update)
        else:
            if isinstance(update, (ConfigDict, dict)):
                assert all(x is None for x in flatten_dict(update).values())
            else:
                setattr(obj, field.name, update)


def log_metrics_to_console(prefix: str, metrics: Dict[str, float]):
    # Stolen from the OLMo codebase
    def format_value(value: float) -> str:
        if isinstance(value, str):
            return value
        if value < 0.0001:
            return str(value)  # scientific notation
        elif value > 1000:
            return f"{int(value):,d}"
        elif value > 100:
            return f"{value:.1f}"
        elif value > 10:
            return f"{value:.2f}"
        elif value > 1:
            return f"{value:.3f}"
        else:
            return f"{value:.4f}"

    logging.info(
        f"{prefix}\n"
        + "\n".join(
            [
                f"  {name}={format_value(value)}"
                for name, value in metrics.items()
                if not name.startswith("optim/")  # there's too many optimizer metrics
            ]
        )
    )
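utils.py mainly bridges dataclass-based configs and ml_collections `ConfigDict`s. A small round-trip sketch of `config_from_dataclass` and `dataclass_from_config`, using hypothetical `OptimizerConfig`/`TrainConfig` dataclasses and an assumed flat `from utils import ...` path:

import dataclasses
from utils import config_from_dataclass, dataclass_from_config  # import path is an assumption

@dataclasses.dataclass
class OptimizerConfig:            # hypothetical example config
    learning_rate: float = 1e-4
    weight_decay: float = 0.0

@dataclasses.dataclass
class TrainConfig:                # hypothetical example config
    optimizer: OptimizerConfig = dataclasses.field(default_factory=OptimizerConfig)
    seed: int = 0

# Dataclass instance -> ConfigDict, with the dataclass values as defaults.
cfg = config_from_dataclass(TrainConfig())
cfg.optimizer.learning_rate = 3e-4

# ConfigDict -> dataclass instance, picking up the override.
train_cfg = dataclass_from_config(TrainConfig, cfg)
print(train_cfg.optimizer.learning_rate)  # 0.0003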