import os
import random
import torch
import torchvision.transforms as transforms
from PIL import Image


def recalculate_box_and_verify_if_valid(x, y, w, h, image_size, original_image_size, min_box_size):
    """Map a COCO-style (x, y, w, h) box from the original image into the
    resized-and-center-cropped image, and reject boxes that end up too small.
    Returns (is_valid, (x0, y0, x1, y1)) in cropped-image pixel coordinates."""
    # The shorter side is resized to `image_size`, then center-cropped to a square.
    scale = image_size / min(original_image_size)
    crop_y = (original_image_size[1] * scale - image_size) // 2
    crop_x = (original_image_size[0] * scale - image_size) // 2
    # Rescale the box, shift it by the crop offset, and clamp it to the crop.
    x0 = max(x * scale - crop_x, 0)
    y0 = max(y * scale - crop_y, 0)
    x1 = min((x + w) * scale - crop_x, image_size)
    y1 = min((y + h) * scale - crop_y, image_size)
    # Discard boxes whose area falls below `min_box_size` of the cropped image area.
    if (x1 - x0) * (y1 - y0) / (image_size * image_size) < min_box_size:
        return False, (None, None, None, None)
    return True, (x0, y0, x1, y1)
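

# Worked example (hypothetical numbers, for illustration only): with
# original_image_size=(640, 480) and image_size=512, the shorter side is scaled
# by 512 / 480, so the resized image is about 683 x 512 and roughly 85 pixels are
# trimmed from each horizontal edge by the center crop. A COCO box x=100, y=50,
# w=200, h=150 then maps to approximately (21.7, 53.3, 235.0, 213.3), covering
# about 13% of the 512 x 512 crop, well above the default min_box_size of 0.01:
#
#   valid, box = recalculate_box_and_verify_if_valid(
#       100, 50, 200, 150, image_size=512, original_image_size=(640, 480), min_box_size=0.01
#   )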


class COCODataset(torch.utils.data.Dataset):
    def __init__(
        self,
        data_path,
        image_path,
        image_size=512,
        min_box_size=0.01,
        max_boxes_per_data=8,
        tokenizer=None,
    ):
        super().__init__()
        self.min_box_size = min_box_size
        self.max_boxes_per_data = max_boxes_per_data
        self.image_size = image_size
        self.image_path = image_path
        self.tokenizer = tokenizer
        # Resize the shorter side to `image_size`, center-crop to a square,
        # and normalize pixel values to [-1, 1].
        self.transforms = transforms.Compose(
            [
                transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.CenterCrop(image_size),
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
            ]
        )
        # Preprocessed annotation list saved with torch.save; each entry holds
        # "file_path", "captions", and per-box "annos".
        self.data_list = torch.load(data_path, map_location="cpu")

    def __getitem__(self, index):
        if self.max_boxes_per_data > 99:
            assert False, "Are you sure you want to set such a large number of boxes per image?"
        out = {}

        data = self.data_list[index]
        image = Image.open(os.path.join(self.image_path, data["file_path"])).convert("RGB")
        original_image_size = image.size
        out["pixel_values"] = self.transforms(image)

        annos = data["annos"]
        areas, valid_annos = [], []
        for anno in annos:
            # The stored bbox is (x0, y0, x1, y1); convert it to (x, y, w, h)
            # before remapping it into the resized-and-cropped image.
            x0, y0, x1, y1 = anno["bbox"]
            x, y, w, h = x0, y0, x1 - x0, y1 - y0
            valid, (x0, y0, x1, y1) = recalculate_box_and_verify_if_valid(
                x, y, w, h, self.image_size, original_image_size, self.min_box_size
            )
            if valid:
                anno["bbox"] = [x0, y0, x1, y1]
                areas.append((x1 - x0) * (y1 - y0))
                valid_annos.append(anno)

        # Sort by area and keep at most the largest `max_boxes_per_data` objects.
        wanted_idxs = torch.tensor(areas).sort(descending=True)[1]
        wanted_idxs = wanted_idxs[: self.max_boxes_per_data]
        valid_annos = [valid_annos[i] for i in wanted_idxs]
out["boxes"] = torch.zeros(self.max_boxes_per_data, 4)
out["masks"] = torch.zeros(self.max_boxes_per_data)
out["text_embeddings_before_projection"] = torch.zeros(self.max_boxes_per_data, 768)
for i, anno in enumerate(valid_annos):
out["boxes"][i] = torch.tensor(anno["bbox"]) / self.image_size
out["masks"][i] = 1
out["text_embeddings_before_projection"][i] = anno["text_embeddings_before_projection"]
prob_drop_boxes = 0.1
if random.random() < prob_drop_boxes:
out["masks"][:] = 0
caption = random.choice(data["captions"])
prob_drop_captions = 0.5
if random.random() < prob_drop_captions:
caption = ""
caption = self.tokenizer(
caption,
max_length=self.tokenizer.model_max_length,
padding="max_length",
truncation=True,
return_tensors="pt",
)
out["caption"] = caption
return out
def __len__(self):
return len(self.data_list)
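

# Minimal usage sketch (illustrative only): the paths, tokenizer choice, and batch
# size below are assumptions, not part of this file. `data_path` is expected to be
# a torch.save'd list of dicts whose "annos" entries already carry precomputed
# 768-dim "text_embeddings_before_projection" tensors, as consumed by __getitem__.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from transformers import CLIPTokenizer  # assumed tokenizer; any compatible text tokenizer works

    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    dataset = COCODataset(
        data_path="data/coco_train_annotations.pth",  # hypothetical preprocessed annotation file
        image_path="data/coco/train2017",             # hypothetical COCO image directory
        image_size=512,
        tokenizer=tokenizer,
    )
    loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)
    batch = next(iter(loader))
    print(batch["pixel_values"].shape)  # torch.Size([4, 3, 512, 512])
    print(batch["boxes"].shape)         # torch.Size([4, 8, 4]) with the default max_boxes_per_data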