import decord
import random
import numpy as np
from PIL import Image
import torch
from torchvision.transforms import Normalize, Compose, InterpolationMode, ToTensor, Resize

def _convert_to_rgb(image):
    return image.convert('RGB')

def image_transform(image_size: int):
    # CLIP-style normalization statistics.
    mean = (0.48145466, 0.4578275, 0.40821073)
    std = (0.26862954, 0.26130258, 0.27577711)
    normalize = Normalize(mean=mean, std=std)
    transforms = [
        Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        _convert_to_rgb,
        ToTensor(),
        normalize,
    ]
    return Compose(transforms)

def preprocess_multimodal(sources, num_segments):
    # Expand each '<video>' placeholder into the frame-level token layout:
    # <vi_start> (<image><eof>) * (num_segments // 2 - 1) <image><eov> <vi_end>
    for source in sources:
        for sentence in source:
            X_token = '<video>'
            if X_token in sentence['content']:
                replace_token = ""
                ns = num_segments // 2 - 1
                for _ in range(ns):
                    replace_token += "<image>"
                    replace_token += "<eof>"
                replace_token += "<image>"
                replace_token += "<eov>"
                replace_token = '<vi_start>' + replace_token + '<vi_end>'
                sentence["content"] = sentence["content"].replace(X_token, replace_token)
    return sources

def preprocess(
    sources,
    tokenizer,
    s_id=None,
):
    en_qa_templates = [
        "Review the given video and answer the question associated with its visual elements.",
        "Watch the provided video and offer an accurate response to the related question.",
        "Scrutinize the video carefully, identifying relevant details in order to address the linked question.",
        "Take a close look at the presented visuals and deliver a precise answer to the corresponding question.",
        "Observe the video attentively and accurately respond to the associated question.",
        "View the video attentively and provide a suitable answer to the posed question.",
        "Examine the video and approach the connected question with an informed response.",
        "Assess the displayed video and answer the subsequent question with accuracy.",
        "Consider the video content and deliver a relevant answer to the corresponding question.",
        "Go through the video, taking into account key aspects, and respond to the question."
    ]
    ch_qa_templates = [
        "审阅所提供的视频,并回答与其视觉元素相关的问题。",
        "观看所提供的视频,对相关问题给出准确的回答。",
        "仔细审查视频,识别相关的细节,回答与之相关的问题。",
        "仔细观察所展示的视觉内容,并对相应的问题给出精确的回答。",
        "认真观察视频并准确回答相关的问题。",
        "详细观看视频,并且对提出的问题给出合适的回答。",
        "观察视频并用有依据的回答来解答相关的问题。",
        "评估展示的视频,并准确地回答随后的问题。",
        "根据视频内容,对相应的问题给出合理的答案。",
        "浏览视频,根据其中的关键内容回答问题。",
    ]
    # Pick a matching English/Chinese template pair: the given index, or a random one.
    if s_id is not None:
        index = s_id
    else:
        index = random.choice(range(len(en_qa_templates)))
    system_prompt = f"""You are a helpful assistant, {en_qa_templates[index]} 你是一个乐于助人的助手,{ch_qa_templates[index]}"""
    # Prepend the system prompt to every conversation and tokenize with the chat template.
    messages = []
    for source in sources:
        message = [{'role': 'system', 'content': system_prompt}]
        for sentence in source:
            message.append(sentence)
        messages.append(message)
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors='pt')
    return input_ids

def get_index(fps, max_frame, num_segments):
    # Sample num_segments frame indices: cycle through the available frames if
    # the clip is short, otherwise spread indices evenly across the clip.
    # Also return each sampled frame's timestamp in seconds.
    num_frames = max_frame
    if num_frames <= num_segments:
        out_indices = np.array([(idx % num_frames) for idx in range(num_segments)])
        out_indices = np.sort(out_indices)
    else:
        out_indices = np.linspace(0, num_frames - 1, num_segments)
    durations = [idx.item() / fps for idx in out_indices]
    return out_indices.astype(np.int64), durations

def read_video(video_path, num_segments):
    # Decode the sampled frames and stack them into a (num_segments, 3, 448, 448) tensor.
    image_processor = image_transform(image_size=448)
    vr = decord.VideoReader(video_path)
    fps = float(vr.get_avg_fps())
    frame_indices, durations = get_index(fps, len(vr) - 1, num_segments)
    video = []
    for frame_index in frame_indices:
        image = Image.fromarray(vr[frame_index].asnumpy())
        video.append(image_processor(image).unsqueeze(0))
    video = torch.concat(video)
    return video, torch.Tensor(durations)

def get_input(video_path, num_segments, question, history, tokenizer, s_id):
    # Build model inputs for a new turn. On the first turn the question is
    # prefixed with the '<video>' placeholder; on follow-up turns the video
    # tokens are already present in the history.
    video, durations = read_video(video_path, num_segments)
    if history is None:
        conversations = []
        conversations.append({'role': 'user', 'content': f'<video>\n{question}'})
    else:
        conversations = history
        conversations.append({'role': 'user', 'content': question})
    sources = [conversations]
    sources = preprocess_multimodal(sources, video.shape[0])
    input_ids = preprocess(sources, tokenizer, s_id=s_id)
    return video, durations, input_ids, conversations

def add_pred_to_history(history, pred):
    history.append({'role': 'assistant', 'content': pred})
    return history
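
# A minimal usage sketch, not part of the original utilities: it assumes a chat-style
# checkpoint whose tokenizer defines the special video tokens used above. The checkpoint
# path, video file, and segment count are illustrative placeholders only.
if __name__ == '__main__':
    from transformers import AutoTokenizer

    # Hypothetical checkpoint path; the real tokenizer must know <image>, <eof>, <eov>, etc.
    tokenizer = AutoTokenizer.from_pretrained('path/to/video-chat-checkpoint', trust_remote_code=True)
    video, durations, input_ids, conversations = get_input(
        'example.mp4',          # hypothetical local video file
        num_segments=8,
        question='What happens in this video?',
        history=None,
        tokenizer=tokenizer,
        s_id=0,                 # fix the system-prompt template for reproducibility
    )
    print(video.shape, durations.shape, input_ids.shape)
    # A full pipeline would feed `video` and `input_ids` to the model's generate() call,
    # then record the reply with add_pred_to_history(conversations, pred) for the next turn.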