Update README.md
README.md CHANGED
```diff
@@ -94,9 +94,6 @@ def _read_video_pyav(
             frames.append(frame)
     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
 
-## in VideoScore-v1.1, we support 48 frames in inference
-MAX_NUM_FRAMES=48
-
 ROUND_DIGIT=3
 REGRESSION_QUERY_PROMPT = """
 Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
```
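Only the tail of `_read_video_pyav` is visible in this hunk's context lines. For reference, a minimal sketch of such a PyAV frame reader, consistent with those two lines (the body here is an assumption; the README's actual definition may differ):

```python
import av
import numpy as np

def _read_video_pyav(container, indices):
    # Decode the video stream and keep only the frames whose index is in `indices`.
    frames = []
    container.seek(0)
    start_index, end_index = indices[0], indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
```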
```diff
@@ -119,13 +116,24 @@ factual consistency: 1.8
 For this video, the text prompt is "{text_prompt}",
 all the frames of video are as follows:
 """
+
+# MAX_NUM_FRAMES=16
+# model_name="TIGER-Lab/VideoScore"
+
+# =======================================
+# we support 48 frames in VideoScore-v1.1
+# =======================================
+MAX_NUM_FRAMES=48
 model_name="TIGER-Lab/VideoScore-v1.1"
+
 video_path="video1.mp4"
 video_prompt="Near the Elephant Gate village, they approach the haunted house at night. Rajiv feels anxious, but Bhavesh encourages him. As they reach the house, a mysterious sound in the air adds to the suspense."
+
 processor = AutoProcessor.from_pretrained(model_name,torch_dtype=torch.bfloat16)
 model = Idefics2ForSequenceClassification.from_pretrained(model_name,torch_dtype=torch.bfloat16).eval()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
+
 # uniformly sample up to MAX_NUM_FRAMES frames from the video
 container = av.open(video_path)
 total_frames = container.streams.video[0].frames
```
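The old v1.0 settings (`TIGER-Lab/VideoScore` with 16 frames) are kept as comments beside the active v1.1 values. If you switch between checkpoints often, one hypothetical way to keep the checkpoint and its frame budget paired (the `VIDEOSCORE_CONFIGS` mapping is illustrative, not part of the README):

```python
# Illustrative only: pair each checkpoint with its supported frame budget
# so the two settings cannot drift apart when switching versions.
VIDEOSCORE_CONFIGS = {
    "v1.0": ("TIGER-Lab/VideoScore", 16),
    "v1.1": ("TIGER-Lab/VideoScore-v1.1", 48),
}
model_name, MAX_NUM_FRAMES = VIDEOSCORE_CONFIGS["v1.1"]
```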
```diff
@@ -133,6 +141,7 @@ if total_frames > MAX_NUM_FRAMES:
     indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
 else:
     indices = np.arange(total_frames)
+
 frames = [Image.fromarray(x) for x in _read_video_pyav(container, indices)]
 eval_prompt = REGRESSION_QUERY_PROMPT.format(text_prompt=video_prompt)
 num_image_token = eval_prompt.count("<image>")
```
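The index computation steps through the video at `total_frames / MAX_NUM_FRAMES` and truncates to integers, so long videos are capped at 48 evenly spaced frames while short ones are used in full. A quick check with assumed numbers:

```python
import numpy as np

# Assumed example: a 120-frame video under the v1.1 budget of 48 frames
total_frames, MAX_NUM_FRAMES = 120, 48
indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
print(len(indices))  # 48
print(indices[:6])   # [ 0  2  5  7 10 12]
```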
```diff
@@ -144,9 +153,11 @@ for x in [frames]:
         flatten_images.extend(x)
     else:
         flatten_images.append(x)
+
 flatten_images = [Image.open(x) if isinstance(x, str) else x for x in flatten_images]
 inputs = processor(text=eval_prompt, images=flatten_images, return_tensors="pt")
 inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
 with torch.no_grad():
     outputs = model(**inputs)
     logits = outputs.logits
```
````diff
@@ -154,10 +165,15 @@ num_aspects = logits.shape[-1]
 aspect_scores = []
 for i in range(num_aspects):
     aspect_scores.append(round(logits[0, i].item(),ROUND_DIGIT))
+
 print(aspect_scores)
 """
 model output on visual quality, temporal consistency, dynamic degree,
 text-to-video alignment, factual consistency, respectively
+VideoScore:
+[2.297, 2.469, 2.906, 2.766, 2.516]
+
+VideoScore-v1.1:
 [2.328, 2.484, 2.562, 1.969, 2.594]
 """
 ```
````
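The five logits follow the fixed aspect order given in the closing comment, so a small hypothetical post-processing step (not in the README) makes the printout self-describing:

```python
# Illustrative only: label each score with its aspect, in the order
# stated in the README comment above.
aspect_scores = [2.328, 2.484, 2.562, 1.969, 2.594]  # VideoScore-v1.1 output above
aspects = ["visual quality", "temporal consistency", "dynamic degree",
           "text-to-video alignment", "factual consistency"]
for name, score in zip(aspects, aspect_scores):
    print(f"{name}: {score}")
```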