WEBing committed
Commit 5fe5ca4
1 parent: cfcb247

add data_utils

Files changed (1)
  1. data_utils.py +120 -0
data_utils.py ADDED
@@ -0,0 +1,120 @@
+ import decord
+ import numpy as np
+ import torch
+ from PIL import Image
+ import random
+
+ from eva_clip.transform import image_transform
+ image_processor = image_transform(image_size=448, is_train=False)
+
+ def preprocess_multimodal(sources, num_segments):
+     # Expand each <video> placeholder into the frame-token layout the model expects:
+     # <vi_start> ("<image><eof>" repeated num_segments//2 - 1 times) <image><eov> <vi_end>
+     for source in sources:
+         for sentence in source:
+             X_token = '<video>'
+             if X_token in sentence['content']:
+                 replace_token = ""
+
+                 ns = num_segments // 2 - 1
+                 for _ in range(ns):
+                     replace_token += "<image>"
+                     replace_token += "<eof>"
+                 replace_token += "<image>"
+                 replace_token += "<eov>"
+
+                 replace_token = '<vi_start>' + replace_token + '<vi_end>'
+                 sentence["content"] = sentence["content"].replace(X_token, replace_token)
+     return sources
+
+ def preprocess(
+     sources,
+     tokenizer,
+     s_id=None,
+ ):
+     # Paired English/Chinese instruction paraphrases; s_id pins a template,
+     # otherwise one is chosen at random.
+     en_qa_templates = [
+         "Review the given video and answer the question associated with its visual elements.",
+         "Watch the provided video and offer an accurate response to the related question.",
+         "Scrutinize the video carefully, identifying relevant details in order to address the linked question.",
+         "Take a close look at the presented visuals and deliver a precise answer to the corresponding question.",
+         "Observe the video attentively and accurately respond to the associated question.",
+         "View the video attentively and provide a suitable answer to the posed question.",
+         "Examine the video and approach the connected question with an informed response.",
+         "Assess the displayed video and answer the subsequent question with accuracy.",
+         "Consider the video content and deliver a relevant answer to the corresponding question.",
+         "Go through the video, taking into account key aspects, and respond to the question."
+     ]
+     ch_qa_templates = [
+         "审阅所提供的视频,并回答与其视觉元素相关的问题。",
+         "观看所提供的视频,对相关问题给出准确的回答。",
+         "仔细审查视频,识别相关的细节,回答与之相关的问题。",
+         "仔细观察所展示的视觉内容,并对相应的问题给出精确的回答。",
+         "认真观察视频并准确回答相关的问题。",
+         "详细观看视频,并且对提出的问题给出合适的回答。",
+         "观察视频并用有依据的回答来解答相关的问题。",
+         "评估展示的视频,并准确地回答随后的问题。",
+         "根据视频内容,对相应的问题给出合理的答案。",
+         "浏览视频,根据其中的关键内容回答问题。",
+     ]
+     if s_id is not None:
+         index = s_id
+     else:
+         index = random.choice(range(len(en_qa_templates)))
+     system_prompt = f"""You are a helpful assistant, {en_qa_templates[index]} 你是一个乐于助人的助手,{ch_qa_templates[index]}"""
+     chat_template = """{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>'
+ + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}
+ {% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}"""
+     messages = []
+     for source in sources:
+         message = [{'role': 'system', 'content': system_prompt}]
+         for sentence in source:
+             message.append(sentence)
+         messages.append(message)
+
+     # input_ids = tokenizer.apply_chat_template(messages, chat_template, add_generation_prompt=True, return_tensors='pt')
+     input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors='pt')
+     return input_ids
+
+ def get_index(fps, max_frame, num_segments):
+     # Choose num_segments frame indices plus their timestamps in seconds.
+     num_frames = max_frame
+     if num_frames <= num_segments:
+         # Short clip: cycle through the available frames to fill every segment.
+         out_indices = np.array([(idx % num_frames) for idx in range(num_segments)])
+         out_indices = np.sort(out_indices)
+     else:
+         # Sample indices uniformly across the clip.
+         out_indices = np.linspace(0, num_frames - 1, num_segments)
+
+     durations = [idx.item() / fps for idx in out_indices]
+     return out_indices.astype(np.int64), durations
+
+ def read_video(video_path, num_segments):
+     # Decode the video, sample num_segments frames, and stack the preprocessed
+     # frames into a (num_segments, C, H, W) tensor.
+     vr = decord.VideoReader(video_path)
+     max_frame = len(vr) - 1
+     fps = float(vr.get_avg_fps())
+
+     total_duration = len(vr) / fps
+     frame_indices, durations = get_index(fps, max_frame, num_segments)
+     video = []
+     for frame_index in frame_indices:
+         image = Image.fromarray(vr[frame_index].asnumpy())
+         video.append(image_processor(image).unsqueeze(0))
+     video = torch.cat(video)
+     return video, torch.Tensor(durations), total_duration
+
+ def get_input(video_path, num_segments, question, history, tokenizer, s_id):
+     # Build model inputs for one conversational turn. On the first turn the
+     # <video> placeholder is prepended to the question; later turns reuse history.
+     video, durations, total_duration = read_video(video_path, num_segments)
+     if history is None:
+         conversations = []
+         conversations.append({'role': 'user', 'content': f'<video>\n{question}'})
+     else:
+         conversations = history
+         conversations.append({'role': 'user', 'content': question})
+     sources = [conversations]
+     sources = preprocess_multimodal(sources, video.shape[0])
+     input_ids = preprocess(sources, tokenizer, s_id=s_id)
+
+     return video, durations, input_ids, conversations
+
+ def add_pred_to_history(history, pred):
+     # Append the model's reply so the next get_input call sees the full dialogue.
+     history.append({'role': 'assistant', 'content': pred})
+     return history
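
For orientation, a minimal sketch of how these helpers might chain together in a multi-turn loop. Everything outside get_input and add_pred_to_history is an assumption for illustration (the checkpoint path, the tokenizer shipping a chat template, and the generation step), not part of this commit:

    # Hypothetical usage sketch -- not part of this commit. The checkpoint path
    # and the generation step are placeholders; only get_input and
    # add_pred_to_history come from data_utils.py.
    from transformers import AutoTokenizer
    from data_utils import get_input, add_pred_to_history

    tokenizer = AutoTokenizer.from_pretrained("path/to/checkpoint")  # assumed path

    # First turn: the <video> placeholder is injected automatically.
    video, durations, input_ids, history = get_input(
        "sample.mp4", 16,
        question="What happens in this video?",
        history=None, tokenizer=tokenizer, s_id=0,
    )
    pred = "..."  # placeholder for a model-specific generate + decode step
    history = add_pred_to_history(history, pred)

    # Follow-up turn: pass the accumulated history back in.
    video, durations, input_ids, history = get_input(
        "sample.mp4", 16,
        question="And what happens after that?",
        history=history, tokenizer=tokenizer, s_id=0,
    )

Note that get_input re-runs preprocess_multimodal over the whole history each turn; this is safe because the <video> placeholder appears only in the first user message and has already been expanded there.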