AudioLlama

Running on Zero

AudioLlama / mmaudio /ext /synchformer /video_model_builder.py

Rex Cheng

initial commit

dbac20f 10 days ago

11.3 kB

	#!/usr/bin/env python3
	# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
	# Copyright 2020 Ross Wightman
	# Modified Model definition

	from collections import OrderedDict
	from functools import partial

	import torch
	import torch.nn as nn
	from timm.layers import trunc_normal_

	from mmaudio.ext.synchformer import vit_helper


	class VisionTransformer(nn.Module):
	    """Vision Transformer encoder over 3D-patch-embedded input, configured from ``cfg``.

	    The constructor reads its hyper-parameters from ``cfg.DATA``, ``cfg.TRAIN``,
	    ``cfg.MODEL`` and ``cfg.VIT``. Besides the transformer blocks it also builds a
	    final norm layer, an optional MLP ``pre_logits`` stage and one or several
	    linear classifier heads; none of those is applied by ``forward_features``,
	    which returns the raw token sequence.
	    """

	    def __init__(self, cfg):
	        """Build all sub-modules and learnable embeddings from ``cfg``.

	        Args:
	            cfg: config object exposing attribute-style access to the
	                ``DATA``, ``TRAIN``, ``MODEL`` and ``VIT`` sections read below.
	        """
	        super().__init__()
	        self.cfg = cfg

	        # --- Plain hyper-parameters copied from the config ---
	        self.img_size = cfg.DATA.TRAIN_CROP_SIZE
	        self.patch_size = cfg.VIT.PATCH_SIZE
	        self.in_chans = cfg.VIT.CHANNELS
	        if cfg.TRAIN.DATASET == "Epickitchens":
	            # Two classification heads with 97 and 300 outputs respectively.
	            self.num_classes = [97, 300]
	        else:
	            self.num_classes = cfg.MODEL.NUM_CLASSES
	        self.embed_dim = cfg.VIT.EMBED_DIM
	        self.depth = cfg.VIT.DEPTH
	        self.num_heads = cfg.VIT.NUM_HEADS
	        self.mlp_ratio = cfg.VIT.MLP_RATIO
	        self.qkv_bias = cfg.VIT.QKV_BIAS
	        self.drop_rate = cfg.VIT.DROP
	        self.drop_path_rate = cfg.VIT.DROP_PATH
	        self.head_dropout = cfg.VIT.HEAD_DROPOUT
	        self.video_input = cfg.VIT.VIDEO_INPUT
	        self.temporal_resolution = cfg.VIT.TEMPORAL_RESOLUTION
	        self.use_mlp = cfg.VIT.USE_MLP
	        self.num_features = self.embed_dim
	        self.attn_drop_rate = cfg.VIT.ATTN_DROPOUT
	        self.head_act = cfg.VIT.HEAD_ACT
	        norm_layer = partial(nn.LayerNorm, eps=1e-6)

	        # --- Patch embeddings ---
	        # The 2D embedding is built with a fixed 224 image size; within this class
	        # it is only consulted for ``num_patches`` (tokenization uses the 3D one).
	        self.patch_embed = vit_helper.PatchEmbed(
	            img_size=224,
	            patch_size=self.patch_size,
	            in_chans=self.in_chans,
	            embed_dim=self.embed_dim,
	        )
	        self.patch_embed_3d = vit_helper.PatchEmbed3D(
	            img_size=self.img_size,
	            temporal_resolution=self.temporal_resolution,
	            patch_size=self.patch_size,
	            in_chans=self.in_chans,
	            embed_dim=self.embed_dim,
	            z_block_size=self.cfg.VIT.PATCH_SIZE_TEMP,
	        )
	        # Start the 3D projection from all-zero weights. ``_init_weights`` only
	        # touches nn.Linear / nn.LayerNorm, so this survives the final ``apply``
	        # unless ``proj`` is one of those types (defined in vit_helper -- confirm).
	        self.patch_embed_3d.proj.weight.data = torch.zeros_like(
	            self.patch_embed_3d.proj.weight.data)

	        # --- Token count (per frame, times the number of frames for video) ---
	        num_patches = self.patch_embed.num_patches
	        if self.video_input:
	            num_patches = num_patches * self.temporal_resolution
	        self.num_patches = num_patches

	        # --- CLS token ---
	        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
	        trunc_normal_(self.cls_token, std=.02)

	        # --- Positional embeddings ---
	        # Spatial embedding: one slot per 2D patch plus one for the CLS token.
	        self.pos_embed = nn.Parameter(
	            torch.zeros(1, self.patch_embed.num_patches + 1, self.embed_dim))
	        self.pos_drop = nn.Dropout(p=cfg.VIT.POS_DROPOUT)
	        trunc_normal_(self.pos_embed, std=.02)

	        if self.cfg.VIT.POS_EMBED == "joint":
	            # A single space-time embedding covering every token (+ CLS).
	            self.st_embed = nn.Parameter(torch.zeros(1, num_patches + 1, self.embed_dim))
	            trunc_normal_(self.st_embed, std=.02)
	        elif self.cfg.VIT.POS_EMBED == "separate":
	            # Per-frame temporal embedding, left zero-initialised.
	            self.temp_embed = nn.Parameter(
	                torch.zeros(1, self.temporal_resolution, self.embed_dim))

	        # --- Transformer blocks and final norm ---
	        self.blocks = self._build_blocks(norm_layer)
	        self.norm = norm_layer(self.embed_dim)

	        # --- Heads ---
	        self.pre_logits = self._build_pre_logits()
	        self.head_drop = nn.Dropout(p=self.head_dropout)
	        self._build_heads()

	        # Re-initialise every nn.Linear / nn.LayerNorm created above.
	        self.apply(self._init_weights)

	    def _build_blocks(self, norm_layer):
	        """Return the ``nn.ModuleList`` of ``self.depth`` transformer blocks."""
	        # Drop-path rate grows linearly from 0 to ``drop_path_rate`` over the depth.
	        drop_path_rates = [
	            rate.item() for rate in torch.linspace(0, self.drop_path_rate, self.depth)
	        ]
	        block_kwargs = dict(
	            attn_type=self.cfg.VIT.ATTN_LAYER,
	            dim=self.embed_dim,
	            num_heads=self.num_heads,
	            mlp_ratio=self.mlp_ratio,
	            qkv_bias=self.qkv_bias,
	            drop=self.drop_rate,
	            attn_drop=self.attn_drop_rate,
	            norm_layer=norm_layer,
	        )
	        if self.cfg.VIT.ATTN_LAYER == "divided":
	            block_cls = vit_helper.DividedSpaceTimeBlock
	        else:
	            block_cls = vit_helper.Block
	            # Only the generic block takes this flag, so only read it here.
	            block_kwargs["use_original_code"] = self.cfg.VIT.USE_ORIGINAL_TRAJ_ATTN_CODE
	        return nn.ModuleList(
	            [block_cls(drop_path=rate, **block_kwargs) for rate in drop_path_rates])

	    def _build_pre_logits(self):
	        """Return the optional ``fc`` + activation stage, or ``nn.Identity``."""
	        if not self.use_mlp:
	            return nn.Identity()
	        hidden_dim = self.embed_dim
	        # Any ``head_act`` other than 'tanh' / 'gelu' falls back to ReLU.
	        act_cls = {'tanh': nn.Tanh, 'gelu': nn.GELU}.get(self.head_act, nn.ReLU)
	        return nn.Sequential(
	            OrderedDict([
	                ('fc', nn.Linear(self.embed_dim, hidden_dim)),
	                ('act', act_cls()),
	            ]))

	    def _has_multiple_heads(self):
	        """Return True when ``num_classes`` is a list with more than one entry."""
	        return isinstance(self.num_classes, list) and len(self.num_classes) > 1

	    def _build_heads(self):
	        """Create ``head0..headN`` (multi-head) or a single ``head`` attribute."""
	        if self._has_multiple_heads():
	            for idx, n_cls in enumerate(self.num_classes):
	                setattr(self, f"head{idx}", nn.Linear(self.embed_dim, n_cls))
	        elif self.num_classes > 0:
	            self.head = nn.Linear(self.embed_dim, self.num_classes)
	        else:
	            self.head = nn.Identity()

	    def _init_weights(self, m):
	        """Initialise one sub-module; meant to be passed to ``nn.Module.apply``.

	        Linear layers get truncated-normal weights (std 0.02) and zero bias;
	        LayerNorm layers get zero bias and unit weight. Other modules are skipped.
	        """
	        if isinstance(m, nn.Linear):
	            trunc_normal_(m.weight, std=.02)
	            if m.bias is not None:
	                nn.init.constant_(m.bias, 0)
	        elif isinstance(m, nn.LayerNorm):
	            nn.init.constant_(m.bias, 0)
	            nn.init.constant_(m.weight, 1.0)

	    @torch.jit.ignore
	    def no_weight_decay(self):
	        """Return the set of parameter names listed as exempt from weight decay."""
	        if self.cfg.VIT.POS_EMBED == "joint":
	            return {'pos_embed', 'cls_token', 'st_embed'}
	        return {'pos_embed', 'cls_token', 'temp_embed'}

	    def get_classifier(self):
	        """Return the classifier head(s).

	        Returns:
	            ``self.head`` in the single-head case, or the list
	            ``[head0, head1, ...]`` when ``num_classes`` is a multi-entry list
	            (in that configuration no ``head`` attribute is created).
	        """
	        if self._has_multiple_heads():
	            return [getattr(self, f"head{idx}") for idx in range(len(self.num_classes))]
	        return self.head

	    def reset_classifier(self, num_classes, global_pool=''):
	        """Replace ``self.head`` with a fresh single classifier.

	        Args:
	            num_classes: number of output classes; values <= 0 install
	                ``nn.Identity`` instead of a linear layer.
	            global_pool: accepted but not used by this method.
	        """
	        self.num_classes = num_classes
	        if num_classes > 0:
	            self.head = nn.Linear(self.embed_dim, num_classes)
	        else:
	            self.head = nn.Identity()

	    def forward_features(self, x):
	        """Tokenize ``x``, add positional embeddings and run the transformer blocks.

	        The final ``self.norm`` / ``self.pre_logits`` are NOT applied here
	        (v-iashin moved them out of this method); callers receive the raw
	        output of the last block, CLS token included.

	        Args:
	            x: batched input accepted by ``self.patch_embed_3d``; its first
	                dimension is taken as the batch size.

	        Returns:
	            Tuple ``(tokens, tok_mask)``. ``tok_mask`` is always ``None`` in this
	            implementation and is returned only to keep the two-value interface.
	        """
	        batch_size = x.shape[0]

	        # 3D tokenization; no token mask is produced on this path.
	        x = self.patch_embed_3d(x)
	        tok_mask = None

	        # Prepend the CLS token to every sequence in the batch.
	        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
	        x = torch.cat((cls_tokens, x), dim=1)

	        # The positional embedding is used as-is (no interpolation is performed).
	        npatch = self.patch_embed.num_patches

	        # Add positional embeddings to the input.
	        if self.video_input:
	            pos_mode = self.cfg.VIT.POS_EMBED
	            if pos_mode == "separate":
	                # Spatial embedding tiled over frames + temporal embedding
	                # repeated over the patches of each frame; CLS keeps its own slot.
	                cls_embed = self.pos_embed[:, 0, :].unsqueeze(1)
	                spatial = self.pos_embed[:, 1:, :].repeat(1, self.temporal_resolution, 1)
	                temporal = self.temp_embed.repeat_interleave(npatch, 1)
	                x = x + torch.cat([cls_embed, spatial + temporal], dim=1)
	            elif pos_mode == "joint":
	                x = x + self.st_embed
	            # Any other mode: no positional embedding is added for video input.
	        else:
	            # Image input.
	            x = x + self.pos_embed

	        # Apply positional dropout.
	        x = self.pos_drop(x)

	        # Encode with the transformer layers.
	        for blk in self.blocks:
	            x = blk(x,
	                    seq_len=npatch,
	                    num_frames=self.temporal_resolution,
	                    approx=self.cfg.VIT.APPROX_ATTN_TYPE,
	                    num_landmarks=self.cfg.VIT.APPROX_ATTN_DIM,
	                    tok_mask=tok_mask)

	        return x, tok_mask

	# def forward(self, x):
	# x = self.forward_features(x)
	# ### v-iashin: here. This should leave the same forward output as before
	# x = self.norm(x)[:, 0]
	# x = self.pre_logits(x)
	# ###
	# x = self.head_drop(x)
	# if isinstance(self.num_classes, (list, )) and len(self.num_classes) > 1:
	# output = []
	# for head in range(len(self.num_classes)):
	# x_out = getattr(self, "head%d" % head)(x)
	# if not self.training:
	# x_out = torch.nn.functional.softmax(x_out, dim=-1)
	# output.append(x_out)
	# return output
	# else:
	# x = self.head(x)
	# if not self.training:
	# x = torch.nn.functional.softmax(x, dim=-1)
	# return x