import decord
import random
import numpy as np
from PIL import Image
import torch
from torchvision.transforms import Normalize, Compose, InterpolationMode, ToTensor, Resize


def _convert_to_rgb(image):
    """Return ``image.convert('RGB')``.

    Used as a plain callable step inside the transform pipeline built by
    ``image_transform``. Presumably ``image`` is a ``PIL.Image.Image`` (the
    only requirement visible here is a ``convert`` method) -- TODO confirm.
    """
    return image.convert('RGB')


def image_transform(image_size: int) -> Compose:
    """Build the image preprocessing pipeline.

    The returned pipeline applies, in order:

    1. ``Resize`` to a square of ``image_size`` x ``image_size`` using
       bicubic interpolation (aspect ratio is not preserved, since both
       target dimensions are given explicitly).
    2. ``_convert_to_rgb``.
    3. ``ToTensor``.
    4. ``Normalize`` with the per-channel ``mean`` / ``std`` defined below.

    Args:
        image_size: Target height and width passed to ``Resize``.

    Returns:
        A ``torchvision.transforms.Compose`` wrapping the four steps above.
    """
    # NOTE(review): these values look like the per-channel RGB statistics
    # commonly used with OpenAI CLIP image encoders -- confirm against the
    # vision backbone this pipeline feeds.
    mean = (0.48145466, 0.4578275, 0.40821073)
    std = (0.26862954, 0.26130258, 0.27577711)
    normalize = Normalize(mean=mean, std=std)
    # Order matters: the RGB conversion runs after the resize and before
    # ToTensor, so Normalize receives a tensor.
    transforms = [
        Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        _convert_to_rgb,
        ToTensor(),
        normalize,
    ]
    return Compose(transforms)


def preprocess_multimodal(sources, num_segments):
    for source in sources:
        for sentence in source:
            X_token = '