add data_utils
Browse files- data_utils.py +120 -0
data_utils.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import decord
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
from PIL import Image
|
5 |
+
import random
|
6 |
+
|
7 |
+
from eva_clip.transform import image_transform
|
8 |
+
# Shared frame preprocessor, built once at import time with a 448 image size
# and is_train=False. read_video() applies it to every decoded PIL frame.
image_processor = image_transform(image_size=448, is_train=False)
|
9 |
+
|
10 |
+
def preprocess_multimodal(sources, num_segments):
    """Expand every ``<video>`` placeholder into per-frame image tokens.

    The placeholder is replaced, in place, by
    ``<vi_start>`` + (``num_segments // 2 - 1``) x ``<image><eof>`` +
    ``<image><eov>`` + ``<vi_end>``.

    Args:
        sources: Iterable of conversations; each conversation is an iterable
            of dicts with a ``'content'`` string.
        num_segments: Number of sampled frames; determines how many
            ``<image>`` tokens are emitted.

    Returns:
        The same ``sources`` object, with matching ``'content'`` values
        rewritten in place.
    """
    video_token = '<video>'
    # One "<image><eof>" per leading slot, then a closing "<image><eov>".
    # A non-positive repeat count yields an empty string, so only the closing
    # pair is emitted when num_segments < 4.
    leading_slots = num_segments // 2 - 1
    expanded = (
        '<vi_start>'
        + "<image><eof>" * leading_slots
        + "<image><eov>"
        + '<vi_end>'
    )
    for conversation in sources:
        for turn in conversation:
            text = turn['content']
            if video_token not in text:
                continue
            turn["content"] = text.replace(video_token, expanded)
    return sources
|
28 |
+
|
29 |
+
def preprocess(
    sources,
    tokenizer,
    s_id=None,
):
    """Prepend a bilingual system prompt and apply the tokenizer's chat template.

    Args:
        sources: Iterable of conversations; each conversation is an iterable
            of message dicts (``{'role': ..., 'content': ...}``).
        tokenizer: Object exposing ``apply_chat_template``. No explicit
            template is passed, so the tokenizer's own chat template is used.
        s_id: Index of the instruction template to use. When ``None`` a
            template is chosen at random. The same index selects both the
            English and the Chinese sentence.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns when called with
        ``add_generation_prompt=True`` and ``return_tensors='pt'``.

    Raises:
        IndexError: If ``s_id`` is outside the range of the template lists.
    """
    en_qa_templates = [
        "Review the given video and answer the question associated with its visual elements.",
        "Watch the provided video and offer an accurate response to the related question.",
        "Scrutinize the video carefully, identifying relevant details in order to address the linked question.",
        "Take a close look at the presented visuals and deliver a precise answer to the corresponding question.",
        "Observe the video attentively and accurately respond to the associated question.",
        "View the video attentively and provide a suitable answer to the posed question.",
        "Examine the video and approach the connected question with an informed response.",
        "Assess the displayed video and answer the subsequent question with accuracy.",
        "Consider the video content and deliver a relevant answer to the corresponding question.",
        "Go through the video, taking into account key aspects, and respond to the question."
    ]
    ch_qa_templates = [
        "审阅所提供的视频,并回答与其视觉元素相关的问题。",
        "观看所提供的视频,对相关问题给出准确的回答。",
        "仔细审查视频,识别相关的细节,回答与之相关的问题。",
        "仔细观察所展示的视觉内容,并对相应的问题给出精确的回答。",
        "认真观察视频并准确回答相关的问题。",
        "详细观看视频,并且对提出的问题给出合适的回答。",
        "观察视频并用有依据的回答来解答相关的问题。",
        "评估展示的视频,并准确地回答随后的问题。",
        "根据视频内容,对相应的问题给出合理的答案。",
        "浏览视频,根据其中的关键内容回答问题。",
    ]
    # Identity test so that s_id == 0 is honoured as an explicit choice.
    if s_id is not None:
        index = s_id
    else:
        index = random.choice(range(len(en_qa_templates)))
    system_prompt = f"""You are a helpful assistant, {en_qa_templates[index]} 你是一个乐于助人的助手,{ch_qa_templates[index]}"""

    # Each conversation gets its own fresh system message followed by the
    # caller's message dicts (the dicts themselves are not copied).
    messages = [
        [{'role': 'system', 'content': system_prompt}, *source]
        for source in sources
    ]

    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors='pt')
    return input_ids
|
76 |
+
|
77 |
+
def get_index(fps, max_frame, num_segments):
    """Pick ``num_segments`` frame indices and their timestamps.

    When there are no more frames than requested segments, the available
    indices are repeated cyclically and sorted; otherwise indices are spaced
    evenly between 0 and ``max_frame - 1``.

    Args:
        fps: Frames per second, used to convert an index into seconds.
        max_frame: Number of frames to sample from (used as the frame count).
        num_segments: Number of indices to return.

    Returns:
        A tuple ``(indices, durations)`` where ``indices`` is an ``int64``
        NumPy array of length ``num_segments`` and ``durations`` is a list of
        ``index / fps`` values. In the evenly spaced case the durations are
        computed from the fractional positions, before truncation to int.
    """
    # Clamp to at least one frame so the modulo below cannot divide by zero
    # (e.g. a single-frame clip whose caller passes len - 1 == 0).
    num_frames = max(max_frame, 1)
    if num_frames <= num_segments:
        # Sampling always starts at frame 0; short clips repeat their frames.
        out_indices = np.sort(np.arange(num_segments) % num_frames)
    else:
        out_indices = np.linspace(0, num_frames - 1, num_segments)

    durations = [idx.item() / fps for idx in out_indices]
    return out_indices.astype(np.int64), durations
|
87 |
+
|
88 |
+
def read_video(video_path, num_segments):
    """Decode sampled frames from a video and preprocess them into one tensor.

    Args:
        video_path: Path handed to ``decord.VideoReader``.
        num_segments: Number of frames to sample via ``get_index``.

    Returns:
        A tuple ``(video, durations, total_duration)``:
        ``video`` is the concatenation along dim 0 of each processed frame
        (each given a leading axis first); ``durations`` is a ``torch.Tensor``
        of the per-frame timestamps from ``get_index``; ``total_duration`` is
        ``len(reader) / fps``.
    """
    reader = decord.VideoReader(video_path)
    frame_count = len(reader)
    avg_fps = float(reader.get_avg_fps())

    clip_seconds = frame_count / avg_fps
    # get_index receives the last valid index (frame_count - 1) as its
    # max_frame argument.
    frame_indices, durations = get_index(avg_fps, frame_count - 1, num_segments)

    processed = []
    for position in frame_indices:
        pil_frame = Image.fromarray(reader[position].asnumpy())
        # Leading axis added so frames can be concatenated into one batch.
        processed.append(image_processor(pil_frame).unsqueeze(0))

    return torch.concat(processed), torch.Tensor(durations), clip_seconds
|
101 |
+
|
102 |
+
def get_input(video_path, num_segments, question, history, tokenizer, s_id):
    """Assemble the video tensor and chat input ids for one user question.

    Args:
        video_path: Path of the video, forwarded to ``read_video``.
        num_segments: Number of frames to sample.
        question: Text of the user's question for this turn.
        history: Prior conversation (list of message dicts), or ``None`` for
            a fresh conversation. A provided list is extended in place.
        tokenizer: Tokenizer forwarded to ``preprocess``.
        s_id: Instruction-template index forwarded to ``preprocess``
            (``None`` selects one at random).

    Returns:
        A tuple ``(video, durations, input_ids, conversations)`` where
        ``conversations`` is the updated history including this turn.
    """
    video, durations, _ = read_video(video_path, num_segments)

    # Reuse the caller's list when one is given so the history keeps growing
    # in place, exactly as before.
    conversations = [] if history is None else history
    if not conversations:
        # First turn (no history, or an empty one): the question must carry
        # the <video> placeholder, otherwise the frames are never referenced
        # in the prompt.
        conversations.append({'role': 'user', 'content': f'<video>\n{question}'})
    else:
        conversations.append({'role': 'user', 'content': question})

    sources = [conversations]
    # The placeholder is expanded using the number of frames actually loaded.
    sources = preprocess_multimodal(sources, video.shape[0])
    input_ids = preprocess(sources, tokenizer, s_id=s_id)

    return video, durations, input_ids, conversations
|
115 |
+
|
116 |
+
def add_pred_to_history(history, pred):
    """Record the model's reply as an assistant turn.

    Args:
        history: Conversation list to extend; modified in place.
        pred: Reply text stored under ``'content'``.

    Returns:
        The same ``history`` object, with the new turn appended.
    """
    assistant_turn = dict(role='assistant', content=pred)
    history.append(assistant_turn)
    return history
|
119 |
+
|
120 |
+
|