hexuan21 committed
Commit f183e8f
Parent: bf11f42

Update README.md

Files changed (1)
  1. README.md +19 -3
README.md CHANGED
@@ -94,9 +94,6 @@ def _read_video_pyav(
             frames.append(frame)
     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
 
-## in VideoScore-v1.1, we support 48 frames in inference
-MAX_NUM_FRAMES=48
-
 ROUND_DIGIT=3
 REGRESSION_QUERY_PROMPT = """
 Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
@@ -119,13 +116,24 @@ factual consistency: 1.8
 For this video, the text prompt is "{text_prompt}",
 all the frames of video are as follows:
 """
+
+# MAX_NUM_FRAMES=16
+# model_name="TIGER-Lab/VideoScore"
+
+# =======================================
+# we support 48 frames in VideoScore-v1.1
+# =======================================
+MAX_NUM_FRAMES=48
 model_name="TIGER-Lab/VideoScore-v1.1"
+
 video_path="video1.mp4"
 video_prompt="Near the Elephant Gate village, they approach the haunted house at night. Rajiv feels anxious, but Bhavesh encourages him. As they reach the house, a mysterious sound in the air adds to the suspense."
+
 processor = AutoProcessor.from_pretrained(model_name,torch_dtype=torch.bfloat16)
 model = Idefics2ForSequenceClassification.from_pretrained(model_name,torch_dtype=torch.bfloat16).eval()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
+
 # sample up to MAX_NUM_FRAMES frames uniformly from the video
 container = av.open(video_path)
 total_frames = container.streams.video[0].frames
@@ -133,6 +141,7 @@ if total_frames > MAX_NUM_FRAMES:
     indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
 else:
     indices = np.arange(total_frames)
+
 frames = [Image.fromarray(x) for x in _read_video_pyav(container, indices)]
 eval_prompt = REGRESSION_QUERY_PROMPT.format(text_prompt=video_prompt)
 num_image_token = eval_prompt.count("<image>")
@@ -144,9 +153,11 @@ for x in [frames]:
         flatten_images.extend(x)
     else:
         flatten_images.append(x)
+
 flatten_images = [Image.open(x) if isinstance(x, str) else x for x in flatten_images]
 inputs = processor(text=eval_prompt, images=flatten_images, return_tensors="pt")
 inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
 with torch.no_grad():
     outputs = model(**inputs)
     logits = outputs.logits
@@ -154,10 +165,15 @@ num_aspects = logits.shape[-1]
 aspect_scores = []
 for i in range(num_aspects):
     aspect_scores.append(round(logits[0, i].item(),ROUND_DIGIT))
+
 print(aspect_scores)
 """
 model output on visual quality, temporal consistency, dynamic degree,
 text-to-video alignment, factual consistency, respectively
+VideoScore:
+[2.297, 2.469, 2.906, 2.766, 2.516]
+
+VideoScore-v1.1:
 [2.328, 2.484, 2.562, 1.969, 2.594]
 """
 ```
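The excerpt shows only the tail of `_read_video_pyav`; the full import block (torch, PIL, AutoProcessor, and the Idefics2ForSequenceClassification class) also sits above it. For reference, here is a minimal sketch of the whole helper, assuming the standard PyAV uniform-decoding pattern used in similar model cards; everything above the two visible lines is an assumption, not the card's verbatim code:

```python
import av
import numpy as np

def _read_video_pyav(container, indices):
    # Decode the stream once and keep only the frames whose index
    # appears in `indices` (assumed sorted ascending).
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    # Stack into a (num_frames, height, width, 3) uint8 RGB array.
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
```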
 
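The sampling branch in the snippet picks at most MAX_NUM_FRAMES evenly spaced frame indices. A quick sanity check of the arithmetic, using a hypothetical 120-frame clip (the clip length is made up for illustration):

```python
import numpy as np

MAX_NUM_FRAMES = 48
total_frames = 120  # hypothetical clip length, not from the card

if total_frames > MAX_NUM_FRAMES:
    # step of 2.5 -> indices 0, 2, 5, 7, 10, ... spread over the whole clip
    indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
else:
    indices = np.arange(total_frames)

print(len(indices))  # 48
print(indices[:5])   # [ 0  2  5  7 10]
```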
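Per the snippet's closing docstring, the five logits correspond, in order, to visual quality, temporal consistency, dynamic degree, text-to-video alignment, and factual consistency. A small illustrative pairing of names and scores (the ASPECTS list and loop are not part of the card; the scores are the v1.1 example output quoted above):

```python
ASPECTS = [
    "visual quality",
    "temporal consistency",
    "dynamic degree",
    "text-to-video alignment",
    "factual consistency",
]

# Example VideoScore-v1.1 output from the card
aspect_scores = [2.328, 2.484, 2.562, 1.969, 2.594]

for name, score in zip(ASPECTS, aspect_scores):
    print(f"{name}: {score}")
```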