Aria / modeling_aria.py

first version

0531a03 about 1 month ago

25.2 kB

	# Copyright 2024 Rhymes AI. All rights reserved.
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	from dataclasses import dataclass
	from typing import List, Optional, Tuple, Union

	import torch
	import torch.nn as nn
	from torch import nn
	from transformers import PreTrainedModel
	from transformers.cache_utils import Cache
	from transformers.modeling_outputs import ModelOutput
	from transformers.utils import logging

	from .configuration_aria import AriaConfig
	from .moe_lm import AriaMoELMForCausalLM
	from .projector import AriaProjector
	from .vision_encoder import AriaVisionModel

	logger = logging.get_logger(__name__)


	class AriaPretrainedModel(PreTrainedModel):
	    """
	    Common base class for the Aria models defined in this file.

	    It only declares class-level settings (the configuration class, the
	    base-model prefix and a set of capability flags) and defers the
	    scaled-dot-product-attention capability check to the wrapped language
	    model. The flags are presumably consumed by the `transformers`
	    loading/generation machinery inherited from `PreTrainedModel`.
	    """

	    # --- configuration / checkpoint layout ---
	    config_class = AriaConfig
	    base_model_prefix = "model"
	    _no_split_modules = []

	    # --- capability flags ---
	    supports_gradient_checkpointing = True
	    _skip_keys_device_placement = "past_key_values"
	    _supports_flash_attn_2 = True
	    _supports_cache_class = True

	    @property
	    def _supports_sdpa(self):
	        """
	        Report SDPA (Scaled Dot Product Attention) support.

	        The answer is read from the `language_model` attribute, which
	        subclasses are expected to define; this base class does not set it.
	        """
	        language_model = self.language_model
	        return language_model._supports_sdpa


	@dataclass
	# Adapted from transformers.models.llava.modeling_llava.LlavaCausalLMOutputWithPast with Llava->Aria
	class AriaCausalLMOutputWithPast(ModelOutput):
	    """
	    Output container for Aria causal (autoregressive) language modeling.

	    Every field defaults to `None`, so all of them are annotated as `Optional`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
	            Language modeling loss (for next-token prediction).
	        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
	            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
	        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
	            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

	            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used
	            (see `past_key_values` input) to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding
	            layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape
	            `(batch_size, num_heads, sequence_length, sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
	            Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape
	            `(batch_size, num_images, sequence_length, hidden_size)`.

	            Image hidden states of the model produced by the vision encoder, and optionally by the perceiver.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[List[torch.FloatTensor]] = None
	    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
	    attentions: Optional[Tuple[torch.FloatTensor]] = None
	    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


	def build_mm_projector(config: AriaConfig):
	    """
	    Create the multi-modal projector described by `config`.

	    The projector's embedding and key/value widths, as well as its number of
	    attention heads, are taken from `config.vision_config`; its feed-forward
	    and output widths are taken from `config.text_config.hidden_size`.

	    Args:
	        config (AriaConfig): Model configuration providing `vision_config`,
	            `text_config` and `projector_patch_to_query_dict`.

	    Returns:
	        AriaProjector: The newly constructed projector.
	    """
	    patch_to_query = config.projector_patch_to_query_dict
	    vision_cfg = config.vision_config
	    vision_width = vision_cfg.hidden_size
	    head_count = vision_cfg.num_attention_heads
	    text_width = config.text_config.hidden_size

	    projector = AriaProjector(
	        patch_to_query_dict=patch_to_query,
	        embed_dim=vision_width,
	        num_heads=head_count,
	        kv_dim=vision_width,
	        ff_dim=text_width,
	        output_dim=text_width,
	    )
	    return projector


	# adapted from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration
	class AriaForConditionalGeneration(AriaPretrainedModel):
	    """
	    Aria model for conditional generation tasks.

	    The model is assembled in `__init__` from three sub-modules: a vision tower
	    (`AriaVisionModel`), a multi-modal projector (built by `build_mm_projector`)
	    and a language model (`AriaMoELMForCausalLM`). In `forward`, projected image
	    features are spliced into the text embeddings at the positions of the
	    image placeholder tokens before the language model is run.
	    """

	    def __init__(self, config: AriaConfig):
	        """
	        Build the sub-modules from `config` and finish with `post_init()`.

	        Args:
	            config (AriaConfig): Configuration providing `vision_config`,
	                `text_config` and `pad_token_id`.
	        """
	        super().__init__(config)

	        self.vision_tower = AriaVisionModel(config.vision_config)
	        self.multi_modal_projector = build_mm_projector(config)
	        self.vocab_size = config.text_config.vocab_size
	        self.language_model = AriaMoELMForCausalLM(config.text_config)
	        # -1 acts as a sentinel when the configuration defines no pad token.
	        self.pad_token_id = (
	            self.config.pad_token_id if self.config.pad_token_id is not None else -1
	        )
	        self.post_init()

	    def freeze_vit(self):
	        """Freeze the parameters of the vision tower."""
	        for param in self.vision_tower.parameters():
	            param.requires_grad = False

	    def freeze_projector(self):
	        """Freeze the parameters of the multi-modal projector."""
	        for param in self.multi_modal_projector.parameters():
	            param.requires_grad = False

	    def freeze_llm(self):
	        """Freeze the parameters of the language model."""
	        for param in self.language_model.parameters():
	            param.requires_grad = False

	    def get_input_embeddings(self) -> nn.Module:
	        """Retrieve the input embeddings from the language model."""
	        return self.language_model.get_input_embeddings()

	    def set_input_embeddings(self, value):
	        """Set the input embeddings for the language model."""
	        self.language_model.set_input_embeddings(value)

	    def set_moe_z_loss_coeff(self, value):
	        """
	        Set the z-loss coefficient for Mixture of Experts (MoE) models.

	        The value is forwarded to `language_model.set_z_loss_coeff`.

	        Args:
	            value: The z-loss coefficient value to set.
	        """
	        self.language_model.set_z_loss_coeff(value)

	    def set_moe_aux_loss_coeff(self, value):
	        """
	        Set the auxiliary loss coefficient for Mixture of Experts (MoE) models.

	        The value is forwarded to `language_model.set_aux_loss_coeff`.

	        Args:
	            value: The auxiliary loss coefficient value to set.
	        """
	        self.language_model.set_aux_loss_coeff(value)

	    # adapted from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration
	    def _merge_input_ids_with_image_features(
	        self, image_features, inputs_embeds, input_ids, attention_mask, labels
	    ):
	        """
	        Merge text embeddings with image features into one combined sequence.

	        Every image placeholder token (`config.image_token_index`) in
	        `input_ids` is expanded to `num_image_patches` positions that are
	        filled with image features; text embeddings, the attention mask and
	        the labels are shifted accordingly.

	        Args:
	            image_features (torch.Tensor): Projected image features of shape
	                `(num_images, num_image_patches, embed_dim)`.
	            inputs_embeds (torch.Tensor): Text input embeddings, indexed as
	                `[batch, position]`.
	            input_ids (torch.Tensor): Input token IDs of shape
	                `(batch_size, sequence_length)`.
	            attention_mask (torch.Tensor, optional): Attention mask for the
	                input tokens. When `None`, every input token is treated as
	                attended to.
	            labels (torch.Tensor, optional): Labels for language modeling.

	        Returns:
	            tuple: `(final_embedding, final_attention_mask, final_labels,
	            position_ids)`; `final_labels` is `None` when `labels` is `None`.

	        Raises:
	            ValueError: If the number of positions reserved for images does
	                not match the number of image feature vectors provided.
	        """
	        num_images, num_image_patches, embed_dim = image_features.shape
	        batch_size, sequence_length = input_ids.shape
	        if attention_mask is None:
	            # `forward` declares the mask optional; without one, attend everywhere.
	            attention_mask = torch.ones_like(input_ids)
	        # Left padding is assumed unless some row ends with the pad token.
	        left_padding = not torch.sum(
	            input_ids[:, -1] == torch.tensor(self.pad_token_id)
	        )
	        # 1. Create a mask to know where special image tokens are
	        special_image_token_mask = input_ids == self.config.image_token_index
	        num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
	        # Merged sequence length of the row holding the most image tokens:
	        # each image token grows from 1 to `num_image_patches` positions.
	        max_embed_dim = (
	            num_special_image_tokens.max() * (num_image_patches - 1)
	        ) + sequence_length
	        batch_indices, non_image_indices = torch.where(
	            input_ids != self.config.image_token_index
	        )

	        # 2. Compute the positions where text should be written
	        # Each image token occupies `num_image_patches` positions in the merged
	        # sequence, so it pushes every following token `num_image_patches - 1`
	        # positions further. `torch.cumsum` accumulates these shifts; the
	        # trailing `- 1` converts the running count into a zero-based index.
	        new_token_positions = (
	            torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1)
	            - 1
	        )
	        nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
	        if left_padding:
	            new_token_positions += nb_image_pad[:, None]  # offset for left padding
	        text_to_overwrite = new_token_positions[batch_indices, non_image_indices]

	        # 3. Create the full embedding, already padded to the maximum position
	        final_embedding = torch.zeros(
	            batch_size,
	            max_embed_dim,
	            embed_dim,
	            dtype=inputs_embeds.dtype,
	            device=inputs_embeds.device,
	        )
	        final_attention_mask = torch.zeros(
	            batch_size,
	            max_embed_dim,
	            dtype=attention_mask.dtype,
	            device=inputs_embeds.device,
	        )
	        final_labels = None
	        if labels is not None:
	            final_labels = torch.full(
	                (batch_size, max_embed_dim),
	                self.config.ignore_index,
	                dtype=input_ids.dtype,
	                device=input_ids.device,
	            )
	        # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
	        # set the corresponding tensors into their correct target device.
	        target_device = inputs_embeds.device
	        batch_indices, non_image_indices, text_to_overwrite = (
	            batch_indices.to(target_device),
	            non_image_indices.to(target_device),
	            text_to_overwrite.to(target_device),
	        )
	        attention_mask = attention_mask.to(target_device)

	        # 4. Copy the text embeddings (and their mask / labels) to their shifted
	        # positions; the gaps left in between are filled with image features below.
	        final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[
	            batch_indices, non_image_indices
	        ]
	        final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[
	            batch_indices, non_image_indices
	        ]
	        if labels is not None:
	            final_labels[batch_indices, text_to_overwrite] = labels[
	                batch_indices, non_image_indices
	            ]

	        # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835)
	        image_to_overwrite = torch.full(
	            (batch_size, max_embed_dim),
	            True,
	            dtype=torch.bool,
	            device=inputs_embeds.device,
	        )
	        image_to_overwrite[batch_indices, text_to_overwrite] = False
	        # Skip the first `nb_image_pad` free slots of each row: they are padding, not image slots.
	        image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[
	            :, None
	        ].to(target_device)

	        if image_to_overwrite.sum() != image_features.shape[:-1].numel():
	            raise ValueError(
	                f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
	                f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
	            )

	        final_embedding[image_to_overwrite] = (
	            image_features.contiguous().reshape(-1, embed_dim).to(target_device)
	        )
	        final_attention_mask |= image_to_overwrite
	        # Positions count attended tokens only; masked-out slots get the dummy position 1.
	        position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_(
	            (final_attention_mask == 0), 1
	        )

	        # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens.
	        batch_indices, pad_indices = torch.where(input_ids == self.pad_token_id)
	        indices_to_mask = new_token_positions[batch_indices, pad_indices]

	        final_embedding[batch_indices, indices_to_mask] = 0

	        return final_embedding, final_attention_mask, final_labels, position_ids

	    def forward(
	        self,
	        input_ids: torch.LongTensor = None,
	        pixel_values: torch.FloatTensor = None,
	        pixel_mask: torch.LongTensor = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[List[torch.FloatTensor]] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[Tuple, AriaCausalLMOutputWithPast]:
	        """
	        Forward pass of the AriaForConditionalGeneration model.

	        When `inputs_embeds` is not given, the text is embedded and then one of
	        two things happens if `pixel_values` is provided:

	        * more than one input token: the images are encoded, projected and
	          merged into the text embeddings (attention mask, labels and position
	          ids are replaced by their merged counterparts);
	        * exactly one input token together with `past_key_values`: the
	          attention mask is rebuilt from the cache so that positions whose
	          cached first-layer key is zero are not attended to.

	        Args:
	            input_ids (torch.LongTensor, optional): Input token ids. Required when
	                `inputs_embeds` is not provided.
	            pixel_values (torch.FloatTensor, optional): Pixel values of the images.
	            pixel_mask (torch.LongTensor, optional): Mask for the pixel values.
	            attention_mask (torch.Tensor, optional): Attention mask. When omitted on
	                the image paths, an all-ones mask is used.
	            position_ids (torch.LongTensor, optional): Position ids.
	            past_key_values (List[torch.FloatTensor], optional): Past key values for efficient processing.
	            inputs_embeds (torch.FloatTensor, optional): Input embeddings.
	            labels (torch.LongTensor, optional): Labels for computing the language modeling loss.
	            use_cache (bool, optional): Whether to use the model's cache mechanism.
	            output_attentions (bool, optional): Whether to output attention weights.
	            output_hidden_states (bool, optional): Whether to output hidden states.
	            return_dict (bool, optional): Whether to return a ModelOutput object.

	        Returns:
	            Union[Tuple, AriaCausalLMOutputWithPast]: Model outputs. With
	            `return_dict=False` a tuple `(loss, logits, ...)` is returned, where
	            `loss` is only present when `labels` were given.
	        """
	        output_attentions = (
	            output_attentions
	            if output_attentions is not None
	            else self.config.output_attentions
	        )
	        output_hidden_states = (
	            output_hidden_states
	            if output_hidden_states is not None
	            else self.config.output_hidden_states
	        )
	        return_dict = (
	            return_dict if return_dict is not None else self.config.use_return_dict
	        )

	        if inputs_embeds is None:
	            # 1. Extract the input embeddings
	            inputs_embeds = self.get_input_embeddings()(input_ids)

	            # 2. Merge text and images
	            if pixel_values is not None and input_ids.shape[1] != 1:
	                image_outputs, image_attn_mask = self.vision_tower(
	                    pixel_values,
	                    pixel_mask=pixel_mask,
	                )
	                selected_image_feature = image_outputs.last_hidden_state

	                image_features = self.multi_modal_projector(
	                    selected_image_feature, attn_mask=image_attn_mask
	                )

	                inputs_embeds = inputs_embeds.to(image_features.dtype)
	                (
	                    inputs_embeds,
	                    attention_mask,
	                    labels,
	                    position_ids,
	                ) = self._merge_input_ids_with_image_features(
	                    image_features, inputs_embeds, input_ids, attention_mask, labels
	                )

	            # In case input_ids.shape[1] == 1 & pixel_values != None & past_key_values != None, we are in the case of
	            # generation with cache
	            elif (
	                past_key_values is not None
	                and pixel_values is not None
	                and input_ids.shape[1] == 1
	            ):
	                if attention_mask is None:
	                    # The mask is optional in the signature; default to attending the new token.
	                    attention_mask = torch.ones_like(input_ids)

	                # Take channel 0 of the first layer's cached keys to find the
	                # positions whose embedding was zeroed out during merging.
	                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]

	                # Sum over dim -2 (rather than looking at a single slice) to avoid random errors
	                # such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
	                batch_index, non_attended_tokens = torch.where(
	                    first_layer_past_key_value.float().sum(-2) == 0
	                )

	                # Get the target length
	                target_length = input_ids.shape[1]
	                past_length = first_layer_past_key_value.shape[-1]

	                extended_attention_mask = torch.ones(
	                    (attention_mask.shape[0], past_length),
	                    dtype=attention_mask.dtype,
	                    device=attention_mask.device,
	                )

	                # Filter out only the tokens that can be un-attended, this can happen
	                # if one uses Llava + Fused modules where the cache on the
	                # first iteration is already big enough, or if one passes custom cache
	                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
	                new_batch_index = batch_index[valid_indices]
	                new_non_attended_tokens = non_attended_tokens[valid_indices]

	                # Zero-out the places where we don't need to attend
	                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0

	                attention_mask = torch.cat(
	                    (extended_attention_mask, attention_mask[:, -target_length:]), dim=1
	                )
	                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1

	        outputs = self.language_model(
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	        logits = outputs[0]

	        loss = None
	        if labels is not None:
	            # Shift so that tokens < n predict n
	            if attention_mask is not None:
	                # Only positions that are attended to contribute to the loss.
	                shift_attention_mask = attention_mask[..., 1:]
	                shift_logits = logits[..., :-1, :][
	                    shift_attention_mask.to(logits.device) != 0
	                ].contiguous()
	                shift_labels = labels[..., 1:][
	                    shift_attention_mask.to(labels.device) != 0
	                ].contiguous()
	            else:
	                shift_logits = logits[..., :-1, :].contiguous()
	                shift_labels = labels[..., 1:].contiguous()
	            # Flatten the tokens
	            loss_fct = nn.CrossEntropyLoss()
	            loss = loss_fct(
	                shift_logits.view(-1, shift_logits.size(-1)),
	                shift_labels.view(-1).to(shift_logits.device),
	            )

	        if not return_dict:
	            output = (logits,) + outputs[1:]
	            return (loss,) + output if loss is not None else output

	        return AriaCausalLMOutputWithPast(
	            loss=loss,
	            logits=logits,
	            past_key_values=outputs.past_key_values,
	            hidden_states=outputs.hidden_states,
	            attentions=outputs.attentions,
	        )

	    def prepare_inputs_for_generation(
	        self,
	        input_ids,
	        past_key_values=None,
	        inputs_embeds=None,
	        pixel_values=None,
	        pixel_mask=None,
	        attention_mask=None,
	        **kwargs,
	    ):
	        """
	        Prepare inputs for generation step.

	        Trims `input_ids` (and, for size-limited caches, `attention_mask`) down
	        to the tokens that have not been processed yet, derives `position_ids`
	        from the attention mask when none are supplied, and packs everything
	        into the keyword arguments expected by `forward`.

	        Args:
	            input_ids (torch.LongTensor): Input token ids.
	            past_key_values (Cache or List[torch.FloatTensor], optional): Past key values for efficient processing.
	            inputs_embeds (torch.FloatTensor, optional): Input embeddings. Only used
	                when there is no cache yet, i.e. on the first generation step.
	            pixel_values (torch.FloatTensor, optional): Pixel values of the images.
	            pixel_mask (torch.LongTensor, optional): Mask for the pixel values.
	            attention_mask (torch.Tensor, optional): Attention mask.
	            **kwargs: Additional keyword arguments; `position_ids` and `use_cache` are read from here.

	        Returns:
	            dict: A dictionary containing the prepared inputs for the generation step.
	        """
	        if past_key_values is not None:
	            if isinstance(past_key_values, Cache):
	                cache_length = past_key_values.get_seq_length()
	                # `seen_tokens` is a deprecated `Cache` attribute; when it is not
	                # available, fall back to the current cache length.
	                past_length = getattr(past_key_values, "seen_tokens", cache_length)
	            else:
	                cache_length = past_length = past_key_values[0][0].shape[2]

	            # Keep only the unprocessed tokens:
	            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
	            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
	            # input)
	            if (
	                attention_mask is not None
	                and attention_mask.shape[1] > input_ids.shape[1]
	            ):
	                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
	            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
	            # input_ids based on the past_length.
	            elif past_length < input_ids.shape[1]:
	                input_ids = input_ids[:, past_length:]
	            # 3 - Otherwise (past_length >= input_ids.shape[1]): if the prompt contains image tokens, the cache is
	            # longer than input_ids because of the merged image features, so only the last token is new.
	            elif self.config.image_token_index in input_ids:
	                input_ids = input_ids[:, input_ids.shape[1] - 1 :]
	            # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
	            # older attention values, as their corresponding values are not part of the input.
	            if cache_length < past_length and attention_mask is not None:
	                attention_mask = attention_mask[
	                    :, -(cache_length + input_ids.shape[1]) :
	                ]

	        position_ids = kwargs.get("position_ids", None)
	        if attention_mask is not None and position_ids is None:
	            # create position_ids on the fly for batch generation
	            position_ids = attention_mask.long().cumsum(-1) - 1
	            position_ids.masked_fill_(attention_mask == 0, 1)
	            if past_key_values:
	                position_ids = position_ids[:, -input_ids.shape[1] :]

	        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
	        if inputs_embeds is not None and past_key_values is None:
	            model_inputs = {"inputs_embeds": inputs_embeds}
	        else:
	            model_inputs = {"input_ids": input_ids}

	        model_inputs.update(
	            {
	                "position_ids": position_ids,
	                "past_key_values": past_key_values,
	                "use_cache": kwargs.get("use_cache"),
	                "attention_mask": attention_mask,
	                "pixel_values": pixel_values,
	                "pixel_mask": pixel_mask,
	            }
	        )
	        return model_inputs