# molmo-dense-captioner-v22-qwen2 / modeling_molmo.py
import math
from copy import deepcopy
from dataclasses import fields, dataclass, replace
from enum import Enum
from typing import List, Optional, Tuple, Union, Dict, Any, Sequence, Callable, cast, MutableMapping
import torch
from transformers import PreTrainedModel, GenerationConfig, add_start_docstrings
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache
from transformers.modeling_flash_attention_utils import _flash_attention_forward
from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput
from transformers.models.auto import AutoModelForCausalLM
from torch import nn
from transformers.utils import logging
from .config_molmo import MolmoConfig, MolmoVisionConfig
from torch.nn import functional as F
logger = logging.get_logger(__name__)
MOLMO_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
    heads, etc.).
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
    usage and behavior.

    Parameters:
        config ([`MolmoConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Molmo Model outputting raw hidden-states without any specific head on top.",
MOLMO_START_DOCSTRING,
)
class MolmoPreTrainedModel(PreTrainedModel):
config_class = MolmoConfig
base_model_prefix = "model"
_no_split_modules = ["MolmoBlock", "MolmoeBlock", "MolmoVisionBlock"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_sdpa = True
# supports_gradient_checkpointing = True
# _supports_cache_class = True
# _supports_static_cache = False
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, (nn.Linear,)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
class MolmoRotaryEmbedding(nn.Module):
"""
[Rotary positional embeddings (RoPE)](https://arxiv.org/abs/2104.09864).
"""
def __init__(self, dim, max_position_embeddings=2048, rope_theta=10000, full_precision=True, device=None):
super().__init__()
self.dim = dim
self.rope_theta = rope_theta
self.full_precision = full_precision
self.max_position_embeddings = max_position_embeddings
# Cache sin/cos embeddings
dim = self.dim
inv_freq = 1.0 / (self.rope_theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim))
seq = torch.arange(self.max_position_embeddings, device=device, dtype=torch.float)
freqs = torch.einsum("i , j -> i j", seq, inv_freq)
positions = torch.cat((freqs, freqs), dim=-1)
pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :]
self.register_buffer("rope_pos_sin", pos_sin, persistent=False)
self.register_buffer("rope_pos_cos", pos_cos, persistent=False)
def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
B, nh, T, hs = x.size()
x = x.view(B, nh, T, 2, hs // 2)
x1, x2 = x.unbind(dim=-2)
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(self, pos_sin: torch.Tensor, pos_cos: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
return (t * pos_cos) + (self.rotate_half(t) * pos_sin)
def forward(
self,
q: torch.Tensor,
k: torch.Tensor,
position_ids: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
if self.full_precision:
q_, k_ = q.float(), k.float()
else:
q_, k_ = q, k
with torch.autocast(q.device.type, enabled=False):
batch_size = q_.shape[0]
query_len, key_len = q_.shape[-2], k_.shape[-2] # could be different if layer_past not None
if position_ids is not None:
freqs_cis_len = self.max_position_embeddings
else:
freqs_cis_len = key_len
# self.get_rotary_embedding(freqs_cis_len, q_.device)
pos_sin = self.rope_pos_sin[:, :, :freqs_cis_len, :].type_as(q_)
pos_cos = self.rope_pos_cos[:, :, :freqs_cis_len, :].type_as(q_)
if position_ids is not None:
assert query_len == key_len, "Query and key lengths must be equal when using position IDs."
pos_sin = pos_sin[0, 0][position_ids].view(
(batch_size, 1, key_len, pos_sin.shape[-1])
)
pos_cos = pos_cos[0, 0][position_ids].view(
(batch_size, 1, key_len, pos_cos.shape[-1])
)
q_ = self.apply_rotary_pos_emb(
pos_sin[:, :, key_len - query_len : key_len, :],
pos_cos[:, :, key_len - query_len : key_len, :],
q_,
)
k_ = self.apply_rotary_pos_emb(pos_sin, pos_cos, k_)
return q_.type_as(q), k_.type_as(k)
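# Shape sketch for the rotary embedding above (hedged, illustrative only): queries and keys of
# shape (batch, n_heads, seq_len, head_dim) come back with the same shape, rotated per position.
#
#     rope = MolmoRotaryEmbedding(dim=64, max_position_embeddings=128)
#     q = torch.randn(2, 8, 16, 64)
#     k = torch.randn(2, 8, 16, 64)
#     q_rot, k_rot = rope(q, k)   # both keep shape (2, 8, 16, 64)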
class MolmoAttention(nn.Module):
def __init__(
self,
config: MolmoConfig,
device=None
):
super().__init__()
self.config = config
self.rotary_emb = MolmoRotaryEmbedding(
config.hidden_size // config.num_attention_heads,
config.max_position_embeddings,
config.rope_theta, device=device)
self.k_norm: Optional[nn.Module] = None
self.q_norm: Optional[nn.Module] = None
self.hidden_size = config.intermediate_size
if config.qk_layer_norm:
if config.num_key_value_heads is None:
config.num_key_value_heads = config.num_attention_heads
self.q_norm = MolmoRmsLayerNorm(
config,
size=config.hidden_size,
eps=config.layer_norm_eps
)
self.k_norm = MolmoRmsLayerNorm(
config,
size=config.hidden_size,
eps=config.layer_norm_eps
)
# Attention output projection.
input_dim = config.hidden_size
head_dim = config.hidden_size // config.num_attention_heads
self.fused_dims = (
config.hidden_size,
config.num_key_value_heads * head_dim,
config.num_key_value_heads * head_dim,
)
self.att_proj = nn.Linear(
config.hidden_size, sum(self.fused_dims),
bias=config.qkv_bias,
)
self.attn_out = nn.Linear(
input_dim, config.hidden_size,
bias=False,
)
def attention(self,
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
drop_mask: Optional[torch.Tensor] = None,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
B, T, C = q.size() # batch size, sequence length, hidden_size
dtype = k.dtype
# Optionally apply layer norm to keys and queries.
if self.q_norm is not None and self.k_norm is not None:
q = self.q_norm(q).to(dtype=dtype)
k = self.k_norm(k).to(dtype=dtype)
# Move head forward to be next to the batch dim.
# shape: (B, nh, T, hs)
q = q.view(B, T, self.config.num_attention_heads, C // self.config.num_attention_heads).transpose(1, 2)
# shape: (B, n_kv_h, T, hs)
k = k.view(B, T, self.config.num_key_value_heads, C // self.config.num_attention_heads).transpose(1, 2)
# shape: (B, n_kv_h, T, hs)
v = v.view(B, T, self.config.num_key_value_heads, C // self.config.num_attention_heads).transpose(1, 2)
# Apply rotary embeddings
q, k = self.rotary_emb(q, k, position_ids=position_ids)
if layer_past is not None:
past_key, past_value = layer_past
k = torch.cat((past_key.to(k.device), k), dim=-2)
v = torch.cat((past_value.to(v.device), v), dim=-2)
present = (k, v) if use_cache else None
query_len, key_len = q.shape[-2], k.shape[-2] # could be different if layer_past not None
        if attention_mask is not None and attention_mask.dim() == 4:
            # Slice the 4D bias down to the current query window; the flash path passes a
            # 2D padding mask instead, which needs no slicing.
            attention_mask = attention_mask[:, :, key_len - query_len: key_len, :key_len]
# if attention_bias is not None:
# attention_bias = self._cast_attn_bias(
# attention_bias[:, :, key_len - query_len : key_len, :key_len], dtype)
# Get the attention scores.
# shape: (B, nh, T, hs)
att = self._scaled_dot_product_attention(
q,
k,
v,
attention_mask=attention_mask,
dropout_p=0.0 if not self.training else self.config.attention_dropout,
is_causal=attention_mask is None,
)
# Re-assemble all head outputs side-by-side.
att = att.transpose(1, 2).contiguous().view(B, T, C)
# Apply output projection.
return self.attn_out(att), present
def _scaled_dot_product_attention(
self,
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
dropout_p: float = 0.0,
is_causal: bool = False,
) -> torch.Tensor:
if attention_mask is not None:
attention_mask = attention_mask.to(q.device)
if self.config.attention_type == "sdpa":
assert k.size(1) == v.size(1)
num_kv_heads = k.size(1)
num_q_heads = q.size(1)
if num_q_heads != num_kv_heads:
assert num_q_heads % num_kv_heads == 0
k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
return F.scaled_dot_product_attention(
q,
k,
v,
attn_mask=attention_mask,
dropout_p=dropout_p,
is_causal=is_causal,
)
        elif self.config.attention_type == "flash":
            if attention_mask is not None and attention_mask.dim() == 4:
                # Collapse a [B, 1, N, N] bias down to a [B, N] padding mask for flash attention.
                valid_mask = attention_mask.any(dim=-1)[:, 0]
            else:
                valid_mask = attention_mask
            # Downcast in case we are running with fp32 hidden states; flash attention expects
            # (batch, seq_len, num_heads, head_dim) inputs and returns the same layout.
            attn_output = _flash_attention_forward(
                q.transpose(1, 2).to(torch.bfloat16),
                k.transpose(1, 2).to(torch.bfloat16),
                v.transpose(1, 2).to(torch.bfloat16),
                attention_mask=valid_mask,
                query_length=q.shape[2],
                is_causal=True,
            )
            # Move the heads back next to the batch dim so the caller's reshape sees (B, nh, T, hs).
            return attn_output.transpose(1, 2).to(q.dtype)
        else:
            raise NotImplementedError(self.config.attention_type)
def forward(
self,
x,
attention_mask,
position_ids,
layer_past,
use_cache
):
qkv = self.att_proj(x)
q, k, v = qkv.split(self.fused_dims, dim=-1)
# Get attention scores.
att, cache = self.attention(
q, k, v,
attention_mask,
position_ids=position_ids,
layer_past=layer_past,
use_cache=use_cache
)
return att, cache
class MolmoMlp(nn.Module):
def __init__(self, input_dim, hidden_size, activation_fn, include_bias=False):
super().__init__()
self.ff_proj = nn.Linear(input_dim, hidden_size, bias=include_bias)
self.ff_out = nn.Linear(hidden_size//2, input_dim, bias=include_bias)
self.act = ACT2FN[activation_fn]
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
x = self.ff_proj(x)
x, gate = x.chunk(2, dim=-1)
x = self.act(gate) * x
x = self.ff_out(x)
return x
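# Shape sketch for MolmoMlp (hedged, illustrative only): `ff_proj` expands to `hidden_size`
# features, which are chunked into a value half and a gate half (SwiGLU-style), so `ff_out`
# maps `hidden_size // 2` features back to `input_dim`.
#
#     mlp = MolmoMlp(input_dim=1024, hidden_size=8192, activation_fn="silu")
#     y = mlp(torch.randn(2, 16, 1024))   # y.shape == (2, 16, 1024)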
class MolmoBlock(nn.Module):
def __init__(self, config: MolmoConfig, device=None):
super().__init__()
self.config = config
self.hidden_size = config.intermediate_size
self.dropout = nn.Dropout(config.residual_dropout)
self.attn = MolmoAttention(config)
self.attn_norm = MolmoRmsLayerNorm(config, size=config.hidden_size, eps=config.layer_norm_eps)
self.mlp = MolmoMlp(config.hidden_size, config.intermediate_size, config.activation_type)
self.ff_norm = MolmoRmsLayerNorm(config)
def forward(
self,
x: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
if not self.config.norm_after:
atten_in = self.attn_norm(x)
else:
atten_in = x
att, cache = self.attn(
atten_in,
attention_mask=attention_mask,
position_ids=position_ids,
layer_past=layer_past,
use_cache=use_cache
)
if self.config.norm_after:
att = self.attn_norm(att)
x = x + self.dropout(att)
og_x = x
if not self.config.norm_after:
x = self.ff_norm(x)
x = self.mlp(x)
if self.config.norm_after:
x = self.ff_norm(x)
x = self.dropout(x)
x = og_x + x
return x, cache
class MolmoeMLP(nn.Module):
def __init__(self, input_dim, hidden_size, activation):
super().__init__()
self.gate_proj = nn.Linear(input_dim, hidden_size, bias=False)
self.up_proj = nn.Linear(input_dim, hidden_size, bias=False)
self.down_proj = nn.Linear(hidden_size, input_dim, bias=False)
self.act_fn = ACT2FN[activation]
def forward(self, x):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
class MolmoeMlpExpert(nn.Module):
def __init__(self, config):
super().__init__()
self.num_experts = config.moe_num_experts
self.top_k = config.moe_top_k
self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
self.experts = nn.ModuleList([MolmoeMLP(config.hidden_size, config.intermediate_size // 2, config.activation_type)
for _ in range(self.num_experts)])
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# hidden_states = self.ff_norm(hidden_states)
batch_size, sequence_length, hidden_dim = hidden_states.shape
hidden_states = hidden_states.view(-1, hidden_dim)
# router_logits: (batch * sequence_length, n_experts)
router_logits = self.gate(hidden_states)
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
# we cast back to the input dtype
routing_weights = routing_weights.to(hidden_states.dtype)
final_hidden_states = torch.zeros(
(batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
)
# One hot encode the selected experts to create an expert mask
# this will be used to easily index which expert is going to be selected
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
# Loop over all available experts in the model and perform the computation on each expert
for expert_idx in range(self.num_experts):
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx])
# Index the correct hidden states and compute the expert hidden state for
# the current expert. We need to make sure to multiply the output hidden
# states by `routing_weights` on the corresponding tokens (top-1 and top-2)
current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
# However `index_add_` only support torch tensors for indexing so we'll use
# the `top_x` tensor here.
final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
return final_hidden_states, router_logits
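# Routing sketch for the mixture-of-experts layer above (hedged, illustrative shapes only): with a
# config where moe_num_experts=8, moe_top_k=2 and hidden_size=1024, each token is scored by the
# gate, its two highest-probability experts are run, and their outputs are summed using the
# corresponding softmax probabilities as weights.
#
#     moe = MolmoeMlpExpert(config)                          # config as assumed above
#     out, router_logits = moe(torch.randn(2, 16, 1024))     # out: (2, 16, 1024)
#                                                            # router_logits: (2 * 16, 8)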
class MolmoeBlock(nn.Module):
def __init__(self, config: MolmoConfig):
super().__init__()
self.attn = MolmoAttention(config)
self.attn_norm = MolmoRmsLayerNorm(config, size=config.hidden_size, eps=config.layer_norm_eps)
assert config.moe_num_experts > 0
self.ff_norm = MolmoRmsLayerNorm(config, size=config.hidden_size, eps=config.layer_norm_eps)
self.mlp = MolmoeMlpExpert(config)
self.config = config
self.hidden_size = config.intermediate_size
self.dropout = nn.Dropout(config.residual_dropout)
def forward(
self,
x: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.Tensor] = None,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
if not self.config.norm_after:
atten_in = self.attn_norm(x)
else:
atten_in = x
att, cache = self.attn(
atten_in,
attention_mask=attention_mask,
position_ids=position_ids,
layer_past=layer_past,
use_cache=use_cache
)
if self.config.norm_after:
att = self.attn_norm(att)
x = x + self.dropout(att)
og_x = x
if not self.config.norm_after:
x = self.ff_norm(x)
x, _ = self.mlp(x)
if self.config.norm_after:
x = self.ff_norm(x)
x = self.dropout(x)
x = og_x + x
return x, cache
class Embedding(nn.Module):
def __init__(
self,
num_embeddings: int,
num_new_embeddings: int,
features: int,
device: Union[str, torch.device] = None,
initializer_range: float = 0.02,
new_embed_initializer_range: float = 0.02,
):
super().__init__()
self.initializer_range = initializer_range
self.new_embed_initializer_range = new_embed_initializer_range
self.embedding = nn.Parameter(
torch.zeros(num_embeddings, features, device=device),
)
        # We keep the special-token embeddings separate from the LM's embeddings so we can
        # apply a separate learning rate to them during training.
self.new_embedding = nn.Parameter(torch.zeros(num_new_embeddings, features, device=device))
def reset_parameters(self):
nn.init.normal_(self.embedding, std=self.initializer_range)
nn.init.normal_(self.new_embedding, std=self.new_embed_initializer_range)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return F.embedding(x, torch.cat([self.embedding, self.new_embedding], dim=0))
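# Indexing sketch for Embedding (hedged, illustrative only): token ids below `num_embeddings` hit
# the base table, while ids at or above it fall into the `new_embedding` table concatenated after it.
#
#     emb = Embedding(num_embeddings=100, num_new_embeddings=4, features=32)
#     emb(torch.tensor([0, 99, 100, 103])).shape   # torch.Size([4, 32])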
def _expand_token(token, batch_size: int):
return token.view(1, 1, -1).expand(batch_size, -1, -1)
class VisionMlp(nn.Module):
def __init__(self, dim: int, hidden_dim: int, hidden_act: str, device=None):
super().__init__()
self.w1 = nn.Linear(dim, hidden_dim, bias=True, device=device)
self.act = ACT2FN[hidden_act]
self.w2 = nn.Linear(hidden_dim, dim, bias=True, device=device)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.w2(self.act(self.w1(x)))
class MolmoVisionBlock(nn.Module):
def __init__(self, config: MolmoVisionConfig, attention_type, device=None):
super().__init__()
self.attention = VisionAttention(config, device=device, attention_type=attention_type)
self.feed_forward = VisionMlp(
config.image_emb_dim, config.image_mlp_dim, config.image_mlp_activations, device)
self.attention_norm = nn.LayerNorm(
config.image_emb_dim,
eps=config.image_norm_eps,
device=device,
)
self.ffn_norm = nn.LayerNorm(
config.image_emb_dim,
eps=config.image_norm_eps,
device=device,
)
def reset_parameters(self):
self.attention.reset_parameters()
self.feed_forward.reset_parameters()
self.attention_norm.reset_parameters()
self.ffn_norm.reset_parameters()
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = x + self.attention(self.attention_norm(x))
x = x + self.feed_forward(self.ffn_norm(x))
return x
class VisionPreLayerNorm(nn.LayerNorm):
def forward(self, x: torch.Tensor) -> torch.Tensor:
orig_type = x.dtype
x = F.layer_norm(x.to(torch.float32), self.normalized_shape, self.weight.to(torch.float32),
self.bias.to(torch.float32), self.eps)
return x.to(orig_type)
class VisionTransformer(nn.Module):
def __init__(self, config: MolmoVisionConfig, attention_type, device=None):
super().__init__()
self.config = config
# class embeddings and positional embeddings
self.scale = config.image_emb_dim ** -0.5
self.class_embedding = nn.Parameter(
torch.zeros(config.image_emb_dim, device=device))
self.positional_embedding = nn.Parameter(
torch.zeros(config.image_num_pos, config.image_emb_dim, device=device))
image_patch_size = config.image_patch_size
self.patch_embedding = nn.Linear(
image_patch_size * image_patch_size * 3,
config.image_emb_dim,
bias=False,
device=device
)
self.pre_ln = VisionPreLayerNorm(
config.image_emb_dim,
eps=config.image_norm_eps,
)
self.blocks = nn.ModuleList([
MolmoVisionBlock(config, attention_type=attention_type, device=device)
for _ in range(config.image_num_layers)
])
    def add_pos_emb(self, x: torch.Tensor, patch_num: Tuple[int, int]) -> torch.Tensor:
cls_emb = self.positional_embedding[0:1]
pos_emb = self.positional_embedding[1:]
pos_emb = pos_emb.reshape(
(int(math.sqrt(pos_emb.shape[0])), int(math.sqrt(pos_emb.shape[0])), pos_emb.shape[1])
)
(patch_num_0, patch_num_1) = patch_num
if pos_emb.shape[0] != patch_num_0 or pos_emb.shape[1] != patch_num_1:
            # Derived from https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
# antialias: default True in jax.image.resize
pos_emb = pos_emb.unsqueeze(0).permute(0, 3, 1, 2)
pos_emb = F.interpolate(
pos_emb, size=(patch_num_0, patch_num_1), mode="bicubic", align_corners=False, antialias=True,
)
pos_emb = pos_emb.permute(0, 2, 3, 1).squeeze(0)
pos_emb = pos_emb.reshape(-1, pos_emb.shape[-1])
x = x + torch.cat([cls_emb[None, :, :], pos_emb[None, :, :]], dim=1).to(x.dtype)
return x
    def forward(self, x: torch.Tensor, patch_num: Optional[Tuple[int, int]] = None) -> List[torch.Tensor]:
if patch_num is None:
patch_num = self.config.image_num_patch
B, N, D = x.shape
x = self.patch_embedding(x)
# class embeddings and positional embeddings
x = torch.cat([_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], dim=1)
x = self.add_pos_emb(x, patch_num)
x = self.pre_ln(x)
hidden_states = []
for r in self.blocks:
x = r(x)
hidden_states.append(x)
return hidden_states
class VisionAttention(nn.Module):
    def __init__(self, config: MolmoVisionConfig, use_bias: bool = True,
                 embed_dim: Optional[int] = None, device=None, attention_type: str = "sdpa"):
super().__init__()
self.config = config
self.embed_dim = config.image_emb_dim
self.num_heads = config.image_num_heads
self.head_dim = config.image_head_dim
self.num_key_value_heads = config.image_num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.initializer_range = config.initializer_range
self.attention_type = attention_type
embed_dim = embed_dim if embed_dim else config.image_emb_dim
self.wq = nn.Linear(
embed_dim,
self.num_heads * self.head_dim,
bias=use_bias,
device=device,
)
self.wk = nn.Linear(
embed_dim,
self.num_key_value_heads * self.head_dim,
bias=use_bias,
device=device,
)
self.wv = nn.Linear(
embed_dim,
self.num_key_value_heads * self.head_dim,
bias=use_bias,
device=device,
)
self.wo = nn.Linear(
self.num_heads * self.head_dim,
self.embed_dim,
bias=use_bias,
device=device,
)
self.residual_dropout = nn.Dropout(config.residual_dropout)
def _split_heads(self, hidden_states, num_heads) -> torch.Tensor:
return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim))
def _merge_heads(self, hidden_states) -> torch.Tensor:
return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
def forward(self, inputs_q: torch.Tensor, inputs_kv: Optional[torch.Tensor] = None) -> torch.Tensor:
if inputs_kv is not None:
inputs_k = inputs_kv
inputs_v = inputs_kv
else:
inputs_k = inputs_q
inputs_v = inputs_q
xq, xk, xv = self.wq(inputs_q), self.wk(inputs_k), self.wv(inputs_v)
xq = self._split_heads(xq, self.num_heads)
xk = self._split_heads(xk, self.num_key_value_heads)
xv = self._split_heads(xv, self.num_key_value_heads)
if self.num_heads != self.num_key_value_heads:
xk = xk.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
xv = xv.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
og_dtype = xq.dtype
if self.config.float32_attention:
xq = xq.to(torch.float)
xk = xk.to(torch.float)
if self.attention_type == "direct":
attn_weights = torch.einsum("...qhd,...khd->...hqk", xq / math.sqrt(xq.size(-1)), xk)
attn_weights = F.softmax(attn_weights, dim=-1)
attn_output = torch.einsum("...hqk,...khd->...qhd", attn_weights.to(xv.dtype), xv)
elif self.attention_type == "sdpa":
if self.config.float32_attention and not torch.is_autocast_enabled():
xv = xv.to(torch.float32)
attn_output = F.scaled_dot_product_attention(
xq.transpose(1, 2).contiguous(),
xk.transpose(1, 2).contiguous(),
xv.transpose(1, 2).contiguous(),
is_causal=False,
).transpose(1, 2)
elif self.attention_type == "flash":
assert not self.config.float32_attention
# Downcast in case we are running with fp32 hidden states
attn_output = _flash_attention_forward(
xq.transpose(1, 2).to(torch.bfloat16),
xk.transpose(1, 2).to(torch.bfloat16),
xv.transpose(1, 2).to(torch.bfloat16),
attention_mask=None,
query_length=inputs_q.shape[1],
is_causal=False,
)
else:
raise NotImplementedError(self.attention_type)
attn_output = attn_output.to(og_dtype)
attn_output = self._merge_heads(attn_output)
attn_output = self.wo(attn_output)
attn_output = self.residual_dropout(attn_output)
return attn_output
class MolmoImageProjector(nn.Module):
def __init__(self, input_dim: int, hidden_dim, output_dim, act_fn="silu", device=None):
super().__init__()
self.w1 = nn.Linear(input_dim, hidden_dim, bias=False, device=device)
self.w2 = nn.Linear(hidden_dim, output_dim, bias=False, device=device)
self.w3 = nn.Linear(input_dim, hidden_dim, bias=False, device=device)
self.act_fn = ACT2FN[act_fn]
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.w2(self.act_fn(self.w1(x))*self.w3(x))
class OLMoVisionBackbone(nn.Module):
def __init__(self, config: MolmoConfig):
super().__init__()
self.config = config
self.image_vit = VisionTransformer(config.vision_config, config.attention_type)
self.image_pooling_2d = VisionAttention(
config.vision_config,
embed_dim=len(config.vit_layers)*config.vision_config.image_emb_dim,
attention_type=config.attention_type
)
        # `MLP` assumes the activation takes two inputs, so it must be a 'llama' version
if config.activation_type == "swiglu":
mlp_config = replace(config, activation_type="llama_swiglu")
elif config.activation_type == "gelu":
raise NotImplementedError()
else:
mlp_config = config
self.image_projector = MolmoImageProjector(
config.vision_config.image_emb_dim,
            config.intermediate_size // 2,  # halved because `intermediate_size` counts both the gate and value halves
config.hidden_size,
act_fn=config.activation_type
)
self.image_feature_dropout = nn.Dropout(config.image_feature_dropout)
self.num_prefix_tokens = 1
self.pad_embed = None
if config.image_padding_embed:
image_dim = config.vision_config.image_emb_dim*len(self.config.vit_layers)
if config.image_padding_embed == "pad_and_partial_pad":
self.pad_embed = nn.Parameter(torch.zeros((2, image_dim)))
else:
raise ValueError(config.image_padding_embed)
def encode_image(self, images: torch.Tensor) -> torch.Tensor:
cfg = self.config
v_cfg = self.config.vision_config
B, T, N, D = images.shape
mask = ~torch.all(images.view(B * T, N, D) == -1, dim=(1, 2), keepdim=True)
# Output all hidden states
# n_layers x (batch_num_crops, (1+)n_tokens, image_emb_dim)
images = images.view(B * T, N, D)
image_features = self.image_vit(images)
if cfg.vit_layers is not None:
features = []
for layer in cfg.vit_layers:
features.append(image_features[layer])
image_features = torch.cat(features, dim=-1)
else:
image_features = image_features[-1]
        cls_embed: Optional[torch.Tensor] = None
if self.num_prefix_tokens > 0:
cls_embed = image_features[:, 0]
image_features = image_features[:, 1:]
image_features = image_features * mask
image_features = image_features.view(B, T, N, -1)
cls_embed = cls_embed.view(B, T, -1) if cls_embed is not None else None
return image_features, cls_embed
def forward(self, images: torch.Tensor, image_masks: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
cfg = self.config
        # image_features: (batch_size, num_crops(=num_image), num_patch, n_layers * image_emb_dim)
batch_size, num_image = images.shape[:2]
image_features, cls_embed = self.encode_image(images)
if cfg.image_padding_embed:
assert image_masks is not None
if cfg.image_padding_embed == "pad_embed":
all_pad = (image_masks == 0).to(dtype=torch.float32)
pad_embed = self.pad_embed[None, None, None, :]
image_features = image_features + pad_embed * torch.unsqueeze(all_pad, -1)
elif cfg.image_padding_embed == "regress":
pad_embed = self.pad_embed[None, None, None, :]
image_features = image_features + pad_embed * torch.unsqueeze(torch.maximum(image_masks, torch.zeros_like(image_masks)), -1)
elif cfg.image_padding_embed == "pad_and_partial_pad":
pad_embed = self.pad_embed[:, None, None, None, :]
all_pad = image_masks == 0
partial_pad = torch.logical_and(image_masks < 1, torch.logical_not(all_pad)).to(dtype=image_features.dtype)
all_pad = all_pad.to(dtype=image_features.dtype)
image_features = image_features + pad_embed[0] * torch.unsqueeze(all_pad, -1)
image_features = image_features + pad_embed[1] * torch.unsqueeze(partial_pad, -1)
else:
raise ValueError(cfg.image_padding_embed)
image_features = self.image_feature_dropout(image_features)
if cls_embed is not None:
cls_embed = self.image_feature_dropout(cls_embed)
image_features = image_features.reshape(
(batch_size, num_image) + cfg.image_num_patch + (-1,))
# transpose to get 2x2 feature squares [n_patches, 4, n_features]
batch, n_crops, h, w, c = image_features.shape
image_features = torch.reshape(image_features, [batch*n_crops, h//2, 2, w//2, 2, c])
image_features = torch.permute(image_features, [0, 1, 3, 2, 4, 5])
image_features = torch.reshape(image_features, [batch*n_crops*h//2*w//2, 2*2, c])
query = image_features.mean(-2, keepdim=True)
image_features = self.image_pooling_2d(query, image_features)
h = self.config.vision_config.image_num_patch[0]//2
w = self.config.vision_config.image_num_patch[1]//2
image_features = image_features.reshape(batch_size, num_image, h * w, -1)
# MLP layer to map the feature.
image_features = self.image_projector(image_features)
# image_features: (batch_size, num_image, num_patch, hidden_size)
# cls_embed: (batch_size, num_image, hidden_size)
return image_features, cls_embed
def causal_attention_bias(seq_len: int, device: torch.device) -> torch.FloatTensor:
att_bias = torch.triu(
torch.ones(seq_len, seq_len, device=device, dtype=torch.float),
diagonal=1,
)
att_bias.masked_fill_(att_bias == 1, torch.finfo(att_bias.dtype).min)
return att_bias.view(1, 1, seq_len, seq_len) # type: ignore
class MolmoRmsLayerNorm(nn.Module):
"""
RMS layer norm, a simplified :class:`LayerNorm` implementation
"""
def __init__(
self,
config: MolmoConfig,
size: Optional[int] = None,
elementwise_affine: Optional[bool] = None,
eps: float = 1e-5,
):
super().__init__()
self.config = config
self.eps = self.config.layer_norm_eps or eps
self.normalized_shape = (size or config.hidden_size,)
if elementwise_affine or (elementwise_affine is None):
self.weight = nn.Parameter(torch.ones(self.normalized_shape))
use_bias = self.config.bias_for_layer_norm
if use_bias:
self.bias = nn.Parameter(torch.zeros(self.normalized_shape))
else:
self.register_parameter("bias", None)
else:
self.register_parameter("bias", None)
self.register_parameter("weight", None)
def forward(self, x: torch.Tensor) -> torch.Tensor:
with torch.autocast(enabled=False, device_type=x.device.type):
og_dtype = x.dtype
x = x.to(torch.float32)
variance = x.pow(2).mean(-1, keepdim=True)
x = x * torch.rsqrt(variance + self.eps)
x = x.to(og_dtype)
if self.weight is not None:
if self.bias is not None:
return self.weight * x + self.bias
else:
return self.weight * x
else:
return x
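# For reference (hedged summary of the module above), MolmoRmsLayerNorm computes, in float32,
#
#     y = x / sqrt(mean(x ** 2, dim=-1) + eps) * weight (+ bias, when configured)
#
# and casts the result back to the input dtype before applying the affine parameters.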
class MolmoModel(MolmoPreTrainedModel):
def __init__(self, config: MolmoConfig, init_params: bool = True):
super().__init__(config)
if self.config.additional_vocab_size is not None:
wte = Embedding(
config.vocab_size,
config.additional_vocab_size,
config.hidden_size,
)
else:
wte = nn.Embedding(config.vocab_size, config.hidden_size)
self.transformer = nn.ModuleDict(
dict(
wte=wte,
emb_drop=nn.Dropout(config.embedding_dropout),
ln_f=MolmoRmsLayerNorm(config),
)
)
if config.moe_num_experts > 0:
blocks = [MolmoeBlock(config) for i in range(config.num_hidden_layers)]
else:
blocks = [MolmoBlock(config) for i in range(config.num_hidden_layers)]
self.transformer.update({"blocks": nn.ModuleList(blocks)})
if not config.weight_tying:
self.transformer.update(
{
"ff_out": nn.Linear(
config.hidden_size,
config.vocab_size,
bias=False,
)
}
)
self.vision_backbone: Optional[OLMoVisionBackbone] = None
if config.vision_config is not None:
self.vision_backbone = OLMoVisionBackbone(config)
def reset_parameters(self):
if self.vision_backbone is not None:
self.vision_backbone.reset_parameters()
self.reset_non_vision_parameters()
def reset_non_vision_parameters(self):
self.transformer.wte.reset_parameters()
if hasattr(self.transformer.wte, "new_embedding"):
nn.init.normal_(self.transformer.wte.new_embedding, std=self.config.new_embedding_init_range)
if hasattr(self.transformer, "wpe"):
nn.init.normal_(self.transformer.wpe, mean=0.0, std=1.0)
self.transformer.ln_f.reset_parameters() # type: ignore
if hasattr(self.transformer, "ff_out"):
            nn.init.normal_(self.transformer.ff_out.weight, mean=0.0, std=0.02)
for block in self.transformer.blocks:
block.reset_parameters()
def forward(
self,
input_ids: torch.LongTensor,
input_embeddings: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
images: Optional[torch.Tensor] = None,
image_masks: Optional[torch.Tensor] = None,
image_input_idx: Optional[torch.Tensor] = None,
subsegment_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[Sequence[Tuple[torch.Tensor, torch.Tensor]]] = None,
use_cache: bool = False,
last_logits_only: bool = False,
output_hidden_states: Optional[bool] = None,
append_last_valid_logits: Optional[torch.Tensor] = None,
) -> ModelOutput:
"""
:param input_ids: A tensor of shape `(batch_size, seq_len)`.
:param input_embeddings: A tensor of shape `(batch_size, seq_len, hidden_size)` with input
embeddings. When provided, it is treated as the output of the input embedding layer.
:param attention_mask: A tensor of shape `(batch_size, seq_len)` that indicates
which input IDs are masked. A `1` value in the mask means that
the corresponding input ID should *not* be ignored. A `0` means
that the corresponding input ID is masked.
This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
library.
        :param images: A tensor of image crops of shape `(batch_size, num_image, num_patch, patch_dim)`
            to be encoded by the vision backbone, where `patch_dim` is the flattened patch pixels.
        :param image_masks: A tensor of shape `(batch_size, num_image, num_patch)` giving, per patch,
            the fraction of the crop that is real image rather than padding; used to add the padding
            embeddings.
        :param image_input_idx: A tensor of shape `(batch_size, num_image, num_patch)` giving, for each
            image patch, the position in `input_ids` whose embedding the patch feature is added to.
            Negative indices mark patches that are dropped.
        :param position_ids: Optional explicit position IDs of shape `(batch_size, seq_len)`. When omitted,
            they are computed as a cumulative sum over the non-masked positions of `attention_mask`.
:param past_key_values: Pre-computed keys and values for each attention block.
Can be used to speed up sequential decoding. The `input_ids` which have
their past given to this model should not be passed as `input_ids` as they have already been computed.
:param use_cache: If `True`, return key and value tensors for each block.
:param last_logits_only: If `True`, only compute the logits for the last token of each sequence.
This can speed up decoding when you only care about the next token.
"""
output_hidden_states = output_hidden_states if output_hidden_states is not None else False
if past_key_values:
assert len(past_key_values) == self.config.num_hidden_layers
has_image = images is not None
assert not (has_image and input_embeddings is not None), "Cannot provide both images and input embeddings."
assert not (has_image and past_key_values is not None), "Cached key and values should not be used with images."
batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2]
if past_key_values is None:
past_length = 0
else:
past_length = past_key_values[0][0].size(-2)
if attention_mask is None:
attention_mask = input_ids != -1
if subsegment_ids is not None:
raise NotImplementedError()
else:
if position_ids is None:
position_ids = torch.clamp(
torch.cumsum(attention_mask.to(torch.int32), dim=-1) - 1,
min=0,
).broadcast_to((batch_size, attention_mask.shape[-1]))
# Get embeddings of input.
# shape: (batch_size, seq_len, hidden_size)
if input_ids is not None:
input_ids = input_ids * (input_ids != -1).to(input_ids.dtype)
x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings # type: ignore
num_image: Optional[int] = None
if images is not None:
# shape: (batch_size, num_image, num_patch, hidden_size)
# cls_embed: (batch_size, num_image, hidden_size)
image_features, cls_embed = self.vision_backbone(images, image_masks)
num_image, num_patch = image_features.shape[1:3]
assert image_input_idx.shape == (batch_size, num_image, num_patch)
            # Insert the image features into the input embeddings.
image_features = image_features.view(batch_size, num_image * num_patch, -1)
image_input_idx = image_input_idx.view(batch_size, num_image * num_patch)
valid = image_input_idx >= 0
batch_idx = torch.arange(batch_size, device=x.device)
batch_idx = torch.tile(batch_idx[:, None], [1, image_features.shape[1]])
# For hf demo/endpoint
image_features = image_features.to(x.device)
x[batch_idx[valid], image_input_idx[valid]] += image_features[valid]
# Add input + positional embeddings and apply dropout.
# shape: (batch_size, seq_len, hidden_size)
x = self.transformer.emb_drop(x) # type: ignore
# normalized
if self.config.normalize_input_embeds:
x = x * (self.config.hidden_size ** 0.5)
# Merge attention mask with attention bias.
# FIXME we are ignoring the attention mask input parameter
if self.config.attention_type == "flash":
attention_mask = input_ids != -1
elif (
attention_mask is not None
or past_key_values is not None
):
total_len = (past_length + seq_len)
attention_mask = torch.tril(torch.ones(total_len, total_len, device=x.device, dtype=torch.bool))
attention_mask = attention_mask.view(1, 1, total_len, total_len)
attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None
# decoder layers
all_hidden_states = []
# Apply blocks one-by-one.
for block_idx, block in enumerate(self.transformer.blocks):
if output_hidden_states:
# add hidden states
all_hidden_states.append(x)
layer_past = None if past_key_values is None else past_key_values[block_idx]
x, cache = block(x, attention_mask=attention_mask, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache)
if attn_key_values is not None:
assert cache is not None
attn_key_values.append(cache)
if last_logits_only:
# shape: (batch_size, 1, hidden_size)
if append_last_valid_logits is not None:
last_valid_output = x[
torch.arange(x.shape[0], device=x.device), append_last_valid_logits.to(x.device)]
x = last_valid_output.unsqueeze(1)
else:
x = x[:, -1, :].unsqueeze(1)
# Apply final layer norm.
# shape: (batch_size, seq_len or 1, hidden_size)
x = self.transformer.ln_f(x) # type: ignore
if output_hidden_states:
# add final hidden state post-final-layernorm, following HuggingFace's convention
all_hidden_states.append(x)
# Get logits.
# shape: (batch_size, seq_len or 1, vocab_size)
if self.config.weight_tying:
logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore
else:
logits = self.transformer.ff_out(x) # type: ignore
if self.config.scale_logits:
logits.mul_(1 / math.sqrt(self.config.hidden_size))
if not last_logits_only and append_last_valid_logits is not None:
last_valid_logit = logits[
torch.arange(logits.shape[0], device=logits.device), append_last_valid_logits]
logits = torch.cat([logits[:, :-1], last_valid_logit[:, None]], dim=1)
return ModelOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type]
class MolmoForCausalLM(MolmoPreTrainedModel):
def __init__(self, config: MolmoConfig, model: Optional[MolmoModel] = None, init_params: bool = False):
super().__init__(config)
if not model:
self.model = MolmoModel(config, init_params=init_params)
else:
self.model = model
self.post_init()
def get_input_embeddings(self) -> torch.nn.Module:
return self.model.transformer.wte
def get_output_embeddings(self):
if self.config.weight_tying:
return self.model.transformer.wte
else:
return self.model.transformer.ff_out
def forward(
self,
input_ids: torch.LongTensor = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
attention_bias: Optional[torch.Tensor] = None,
response_mask: Optional[torch.Tensor] = None,
images: Optional[torch.Tensor] = None,
image_masks: Optional[torch.Tensor] = None,
image_input_idx: Optional[torch.Tensor] = None,
subsegment_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
labels: Optional[torch.LongTensor] = None,
loss_masks: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
last_logits_only: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
append_last_valid_logits: Optional[torch.Tensor] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[
Cache
] = None, # This is a hack mitigation of an issue in transformers `4.39.x` https://github.com/huggingface/transformers/issues/29426
) -> Union[Tuple, CausalLMOutputWithPast]:
if use_cache is None:
use_cache = self.config.use_cache
if output_attentions:
raise ValueError("output_attentions is not yet supported in Molmo")
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model.forward(
input_ids=input_ids,
input_embeddings=inputs_embeds,
attention_mask=attention_mask,
images=images,
image_masks=image_masks,
image_input_idx=image_input_idx,
subsegment_ids=subsegment_ids,
position_ids=position_ids,
past_key_values=past_key_values,
use_cache=use_cache,
last_logits_only=last_logits_only,
output_hidden_states=output_hidden_states,
append_last_valid_logits=append_last_valid_logits,
)
logits = outputs.logits
hidden_states = outputs.hidden_states
loss = None
if labels is not None:
if loss_masks is not None:
loss_masks = loss_masks * (loss_masks > 0)
batch_size_in_tokens = max(loss_masks.sum().item(), 1)
labels = labels.long()
labels.masked_fill_(~(loss_masks > 0), -100)
labels = labels.view(-1)
logits_for_loss = logits.to(torch.float32).view(-1, logits.size(-1))
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
loss = loss_fct(logits_for_loss, labels)
loss = loss.view(input_ids.shape[0], -1)
loss = loss * loss_masks
loss = loss.sum() / batch_size_in_tokens
use_zloss = getattr(self.config, "softmax_auxiliary_loss", False)
if use_zloss:
z_squared = logits_for_loss.logsumexp(-1).pow(2)
z_loss = self.config.softmax_auxiliary_loss_scale * z_squared
z_loss = z_loss.view(input_ids.shape[0], -1)
z_loss = z_loss * loss_masks
z_loss = z_loss.sum() / batch_size_in_tokens
loss += z_loss
else:
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = torch.nn.CrossEntropyLoss()
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
# Enable model parallelism
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.attn_key_values,
hidden_states=hidden_states,
)
def can_generate(self) -> bool:
return True
@torch.no_grad()
def generate_from_batch(
self,
batch: Dict[str, Any],
generation_config: Optional[GenerationConfig] = None,
**kwargs,
):
if generation_config is not None:
assert generation_config.use_cache
images = batch.get("images")
image_masks = batch.get("image_masks")
image_input_idx = batch.get("image_input_idx")
# Validate inputs.
input_ids = batch["input_ids"]
batch_size, seq_len = input_ids.shape
attention_mask = batch.get("attention_mask", None)
max_new_tokens = generation_config.max_new_tokens
assert max_new_tokens is not None
mask_len = seq_len + max_new_tokens
position_ids: Optional[torch.Tensor] = None
append_last_valid_logits: Optional[torch.Tensor] = None
if attention_mask is None:
attention_mask = input_ids != -1
position_ids = torch.clamp(
torch.cumsum(attention_mask.to(torch.int32), dim=-1) - 1,
min=0
)
append_last_valid_logits = attention_mask.long().sum(dim=-1) - 1
attention_mask = torch.cat(
[attention_mask, attention_mask.new_ones((batch_size, max_new_tokens))],
dim=1,
)
if attention_mask is not None:
assert attention_mask.shape == (batch_size, mask_len)
out = super().generate(
batch["input_ids"],
generation_config,
attention_mask=attention_mask,
images=images,
image_masks=image_masks,
image_input_idx=image_input_idx,
position_ids=position_ids,
append_last_valid_logits=append_last_valid_logits,
**kwargs,
)
return out
def prepare_inputs_for_generation(
self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
):
if past_key_values:
# This is because we want the model to only process the last generated token.
input_ids = input_ids[:, -1:]
attention_mask = kwargs.get("attention_mask")
images = kwargs.get("images")
image_masks = kwargs.get("image_masks")
image_input_idx = kwargs.get("image_input_idx")
position_ids = kwargs.get("position_ids")
append_last_valid_logits = kwargs.get("append_last_valid_logits")
model_inputs = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": True,
"last_logits_only": True,
}
if past_key_values is None:
model_inputs["images"] = images
model_inputs["image_masks"] = image_masks
model_inputs["image_input_idx"] = image_input_idx
model_inputs["append_last_valid_logits"] = append_last_valid_logits
return model_inputs
def _update_model_kwargs_for_generation(
self,
outputs: ModelOutput,
model_kwargs: Dict[str, Any],
is_encoder_decoder: bool = False,
num_new_tokens: int = 1,
) -> Dict[str, Any]:
model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
if "append_last_valid_logits" in model_kwargs:
del model_kwargs["append_last_valid_logits"]
if "images" in model_kwargs:
del model_kwargs["images"]
del model_kwargs["image_masks"]
del model_kwargs["image_input_idx"]
cache_name, cache = super()._extract_past_from_model_output(outputs)
model_kwargs[cache_name] = cache
model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
return model_kwargs
# Always register for multi-modal features
AutoModelForCausalLM.register(MolmoConfig, MolmoForCausalLM)
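# Example usage (a minimal sketch, not exercised when this module is loaded as remote code).
# The repo id and the `processor.process(...)` call follow the standard Molmo release and are
# assumptions here; adjust them to whatever preprocessor ships with this checkpoint.
if __name__ == "__main__":
    from PIL import Image
    from transformers import AutoProcessor

    repo_id = "cydhsieh01/molmo-dense-captioner-v22-qwen2"  # assumed repo id
    processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True, torch_dtype="auto")

    inputs = processor.process(images=[Image.open("example.jpg")], text="Describe this image.")
    # `generate_from_batch` expects batched tensors, so add a leading batch dimension.
    batch = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
    output = model.generate_from_batch(
        batch,
        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer,
    )
    generated_tokens = output[0, batch["input_ids"].shape[1]:]
    print(processor.tokenizer.decode(generated_tokens, skip_special_tokens=True))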