File size: 11,952 Bytes
b213d84 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 |
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
import csv
import logging
import numpy as np
from typing import Any, Callable, Dict, List, Optional, Union
import av
import torch
from torch.utils.data.dataset import Dataset
from detectron2.utils.file_io import PathManager
from ..utils import maybe_prepend_base_path
from .frame_selector import FrameSelector, FrameTsList
FrameList = List[av.frame.Frame] # pyre-ignore[16]
FrameTransform = Callable[[torch.Tensor], torch.Tensor]
def list_keyframes(video_fpath: str, video_stream_idx: int = 0) -> FrameTsList:
"""
Traverses all keyframes of a video file. Returns a list of keyframe
timestamps. Timestamps are counts in timebase units.
Args:
video_fpath (str): Video file path
video_stream_idx (int): Video stream index (default: 0)
Returns:
List[int]: list of keyframe timestaps (timestamp is a count in timebase
units)
"""
try:
with PathManager.open(video_fpath, "rb") as io:
container = av.open(io, mode="r")
stream = container.streams.video[video_stream_idx]
keyframes = []
pts = -1
# Note: even though we request forward seeks for keyframes, sometimes
# a keyframe in backwards direction is returned. We introduce tolerance
# as a max count of ignored backward seeks
tolerance_backward_seeks = 2
while True:
try:
container.seek(pts + 1, backward=False, any_frame=False, stream=stream)
except av.AVError as e:
# the exception occurs when the video length is exceeded,
# we then return whatever data we've already collected
logger = logging.getLogger(__name__)
logger.debug(
f"List keyframes: Error seeking video file {video_fpath}, "
f"video stream {video_stream_idx}, pts {pts + 1}, AV error: {e}"
)
return keyframes
except OSError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"List keyframes: Error seeking video file {video_fpath}, "
f"video stream {video_stream_idx}, pts {pts + 1}, OS error: {e}"
)
return []
packet = next(container.demux(video=video_stream_idx))
if packet.pts is not None and packet.pts <= pts:
logger = logging.getLogger(__name__)
logger.warning(
f"Video file {video_fpath}, stream {video_stream_idx}: "
f"bad seek for packet {pts + 1} (got packet {packet.pts}), "
f"tolerance {tolerance_backward_seeks}."
)
tolerance_backward_seeks -= 1
if tolerance_backward_seeks == 0:
return []
pts += 1
continue
tolerance_backward_seeks = 2
pts = packet.pts
if pts is None:
return keyframes
if packet.is_keyframe:
keyframes.append(pts)
return keyframes
except OSError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"List keyframes: Error opening video file container {video_fpath}, " f"OS error: {e}"
)
except RuntimeError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"List keyframes: Error opening video file container {video_fpath}, "
f"Runtime error: {e}"
)
return []
def read_keyframes(
video_fpath: str, keyframes: FrameTsList, video_stream_idx: int = 0
) -> FrameList: # pyre-ignore[11]
"""
Reads keyframe data from a video file.
Args:
video_fpath (str): Video file path
keyframes (List[int]): List of keyframe timestamps (as counts in
timebase units to be used in container seek operations)
video_stream_idx (int): Video stream index (default: 0)
Returns:
List[Frame]: list of frames that correspond to the specified timestamps
"""
try:
with PathManager.open(video_fpath, "rb") as io:
container = av.open(io)
stream = container.streams.video[video_stream_idx]
frames = []
for pts in keyframes:
try:
container.seek(pts, any_frame=False, stream=stream)
frame = next(container.decode(video=0))
frames.append(frame)
except av.AVError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"Read keyframes: Error seeking video file {video_fpath}, "
f"video stream {video_stream_idx}, pts {pts}, AV error: {e}"
)
container.close()
return frames
except OSError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"Read keyframes: Error seeking video file {video_fpath}, "
f"video stream {video_stream_idx}, pts {pts}, OS error: {e}"
)
container.close()
return frames
except StopIteration:
logger = logging.getLogger(__name__)
logger.warning(
f"Read keyframes: Error decoding frame from {video_fpath}, "
f"video stream {video_stream_idx}, pts {pts}"
)
container.close()
return frames
container.close()
return frames
except OSError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"Read keyframes: Error opening video file container {video_fpath}, OS error: {e}"
)
except RuntimeError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"Read keyframes: Error opening video file container {video_fpath}, Runtime error: {e}"
)
return []
def video_list_from_file(video_list_fpath: str, base_path: Optional[str] = None):
"""
Create a list of paths to video files from a text file.
Args:
video_list_fpath (str): path to a plain text file with the list of videos
base_path (str): base path for entries from the video list (default: None)
"""
video_list = []
with PathManager.open(video_list_fpath, "r") as io:
for line in io:
video_list.append(maybe_prepend_base_path(base_path, str(line.strip())))
return video_list
def read_keyframe_helper_data(fpath: str):
"""
Read keyframe data from a file in CSV format: the header should contain
"video_id" and "keyframes" fields. Value specifications are:
video_id: int
keyframes: list(int)
Example of contents:
video_id,keyframes
2,"[1,11,21,31,41,51,61,71,81]"
Args:
fpath (str): File containing keyframe data
Return:
video_id_to_keyframes (dict: int -> list(int)): for a given video ID it
contains a list of keyframes for that video
"""
video_id_to_keyframes = {}
try:
with PathManager.open(fpath, "r") as io:
csv_reader = csv.reader(io)
header = next(csv_reader)
video_id_idx = header.index("video_id")
keyframes_idx = header.index("keyframes")
for row in csv_reader:
video_id = int(row[video_id_idx])
assert (
video_id not in video_id_to_keyframes
), f"Duplicate keyframes entry for video {fpath}"
video_id_to_keyframes[video_id] = (
[int(v) for v in row[keyframes_idx][1:-1].split(",")]
if len(row[keyframes_idx]) > 2
else []
)
except Exception as e:
logger = logging.getLogger(__name__)
logger.warning(f"Error reading keyframe helper data from {fpath}: {e}")
return video_id_to_keyframes
class VideoKeyframeDataset(Dataset):
"""
Dataset that provides keyframes for a set of videos.
"""
_EMPTY_FRAMES = torch.empty((0, 3, 1, 1))
def __init__(
self,
video_list: List[str],
category_list: Union[str, List[str], None] = None,
frame_selector: Optional[FrameSelector] = None,
transform: Optional[FrameTransform] = None,
keyframe_helper_fpath: Optional[str] = None,
):
"""
Dataset constructor
Args:
video_list (List[str]): list of paths to video files
category_list (Union[str, List[str], None]): list of animal categories for each
video file. If it is a string, or None, this applies to all videos
frame_selector (Callable: KeyFrameList -> KeyFrameList):
selects keyframes to process, keyframes are given by
packet timestamps in timebase counts. If None, all keyframes
are selected (default: None)
transform (Callable: torch.Tensor -> torch.Tensor):
transforms a batch of RGB images (tensors of size [B, 3, H, W]),
returns a tensor of the same size. If None, no transform is
applied (default: None)
"""
if type(category_list) == list:
self.category_list = category_list
else:
self.category_list = [category_list] * len(video_list)
assert len(video_list) == len(
self.category_list
), "length of video and category lists must be equal"
self.video_list = video_list
self.frame_selector = frame_selector
self.transform = transform
self.keyframe_helper_data = (
read_keyframe_helper_data(keyframe_helper_fpath)
if keyframe_helper_fpath is not None
else None
)
def __getitem__(self, idx: int) -> Dict[str, Any]:
"""
Gets selected keyframes from a given video
Args:
idx (int): video index in the video list file
Returns:
A dictionary containing two keys:
images (torch.Tensor): tensor of size [N, H, W, 3] or of size
defined by the transform that contains keyframes data
categories (List[str]): categories of the frames
"""
categories = [self.category_list[idx]]
fpath = self.video_list[idx]
keyframes = (
list_keyframes(fpath)
if self.keyframe_helper_data is None or idx not in self.keyframe_helper_data
else self.keyframe_helper_data[idx]
)
transform = self.transform
frame_selector = self.frame_selector
if not keyframes:
return {"images": self._EMPTY_FRAMES, "categories": []}
if frame_selector is not None:
keyframes = frame_selector(keyframes)
frames = read_keyframes(fpath, keyframes)
if not frames:
return {"images": self._EMPTY_FRAMES, "categories": []}
frames = np.stack([frame.to_rgb().to_ndarray() for frame in frames])
frames = torch.as_tensor(frames, device=torch.device("cpu"))
frames = frames[..., [2, 1, 0]] # RGB -> BGR
frames = frames.permute(0, 3, 1, 2).float() # NHWC -> NCHW
if transform is not None:
frames = transform(frames)
return {"images": frames, "categories": categories}
def __len__(self):
return len(self.video_list)
|