# Spaces: Running on Zero
# Copyright (c) Meta Platforms, Inc. and affiliates. | |
# All rights reserved. | |
# | |
# This source code is licensed under the license found in the | |
# LICENSE file in the root directory of this source tree. | |
import argparse | |
import tarfile | |
from itertools import repeat | |
from multiprocessing.pool import ThreadPool | |
from pathlib import Path | |
from tarfile import TarFile | |
from zipfile import ZipFile | |
import torch | |
from mmengine.utils.path import mkdir_or_exist | |
def parse_args():
    """Build the command-line interface and parse ``sys.argv``.

    Returns:
        argparse.Namespace: Parsed options exposing ``dataset_name``,
        ``save_dir``, ``unzip``, ``delete`` and ``threads``.
    """
    cli = argparse.ArgumentParser(description='Download datasets for training')

    # Which dataset to fetch and where to store it.
    cli.add_argument(
        '--dataset-name', default='coco2017', type=str, help='dataset name')
    cli.add_argument(
        '--save-dir',
        default='data/coco',
        type=str,
        help='the dir to save dataset')

    # Post-download switches; both are off unless given on the command line.
    switches = (
        ('--unzip',
         'whether unzip dataset or not, zipped files will be saved'),
        ('--delete', 'delete the download zipped files'),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action='store_true', help=help_text)

    cli.add_argument(
        '--threads', default=4, type=int, help='number of threading')

    return cli.parse_args()
def download(url, dir, unzip=True, delete=False, threads=1):
    """Place one or more files into ``dir`` and optionally unpack them.

    Args:
        url (str | Path | Iterable[str | Path]): A single source or a
            collection of sources. A source that is an existing local file
            is moved into ``dir``; any other source is fetched with
            ``torch.hub.download_url_to_file`` unless the target file is
            already present.
        dir (str | Path): Destination directory. It is not created here.
        unzip (bool): Extract ``.zip`` and ``.tar`` files into ``dir``.
        delete (bool): Remove an archive after it has been extracted. Files
            that were not extracted are never deleted.
        threads (int): Number of worker threads. Values of 1 or less
            process the sources one after another.

    Raises:
        Exception: Whatever the move, download or extraction raises. With
            ``threads > 1`` the first worker failure is re-raised after all
            sources have been attempted.
    """

    def download_one(url, dir):
        f = dir / Path(url).name
        if Path(url).is_file():
            # Local source: move it instead of downloading.
            Path(url).rename(f)
        elif not f.exists():
            print(f'Downloading {url} to {f}')
            torch.hub.download_url_to_file(url, f, progress=True)
        if unzip and f.suffix in ('.zip', '.tar'):
            print(f'Unzipping {f.name}')
            # Context managers close the archive before a possible unlink();
            # deleting a file that is still open fails on Windows.
            if f.suffix == '.zip':
                with ZipFile(f) as archive:
                    archive.extractall(path=dir)
            elif f.suffix == '.tar':
                with tarfile.open(f) as archive:
                    archive.extractall(path=dir)
            if delete:
                f.unlink()
                print(f'Delete {f}')

    dir = Path(dir)
    # A bare str/Path is one source, not an iterable of characters.
    urls = [url] if isinstance(url, (str, Path)) else list(url)
    if threads > 1:
        with ThreadPool(threads) as pool:
            # map() blocks until every task has finished and then re-raises
            # the first worker exception instead of dropping it.
            # chunksize=1 keeps one failing source from skipping others
            # that would otherwise share its chunk.
            pool.map(lambda u: download_one(u, dir), urls, chunksize=1)
    else:
        for u in urls:
            download_one(u, dir)
def download_objects365v2(url, dir, unzip=True, delete=False, threads=1):
    """Download the Objects365-V2 files and optionally unpack them.

    Entries of ``url`` naming ``zhiyuan_objv2_train.tar.gz`` or
    ``zhiyuan_objv2_val.json`` are used as they are. Any other entry is
    treated as a URL root and expanded to ``patch{i}.tar.gz`` names:
    indices 0-50 for a root containing ``train``, 0-15 for
    ``val/images/v1`` and 16-43 for ``val/images/v2``.

    Sources containing ``train`` are saved to ``dir/train_zip`` and
    extracted to ``dir/train``. Sources containing ``val`` are saved to and
    extracted in ``dir/val``. Both directories are created on demand.

    Args:
        url (str | Path | Iterable[str]): Annotation files and URL roots.
        dir (str | Path): Root directory of the dataset.
        unzip (bool): Extract files whose name ends in ``.tar.gz``.
        delete (bool): Remove an archive after it has been extracted. Files
            that were not extracted are never deleted.
        threads (int): Number of worker threads. Values of 1 or less
            process the sources one after another.

    Raises:
        NotImplementedError: If an entry matches none of the known patterns.
        Exception: Whatever the move, download or extraction raises. With
            ``threads > 1`` the first worker failure is re-raised after all
            sources have been attempted.
    """

    def download_single(url, dir):
        # Routing relies on a substring of the source, so 'train' is tested
        # before 'val'.
        if 'train' in url:
            saving_dir = dir / 'train_zip'
            unzip_dir = dir / 'train'
        elif 'val' in url:
            saving_dir = dir / 'val'
            unzip_dir = dir / 'val'
        else:
            raise NotImplementedError(
                f'Cannot tell whether {url} belongs to train or val')
        saving_dir.mkdir(parents=True, exist_ok=True)
        unzip_dir.mkdir(parents=True, exist_ok=True)
        f = saving_dir / Path(url).name

        if Path(url).is_file():
            # Local source: move it instead of downloading.
            Path(url).rename(f)
        elif not f.exists():
            print(f'Downloading {url} to {f}')
            torch.hub.download_url_to_file(url, f, progress=True)

        if unzip and str(f).endswith('.tar.gz'):
            print(f'Unzipping {f.name}')
            # Close the archive before a possible unlink(); deleting a file
            # that is still open fails on Windows.
            with tarfile.open(f) as tar:
                tar.extractall(path=unzip_dir)
            if delete:
                f.unlink()
                print(f'Delete {f}')

    # A bare str/Path is one entry, not an iterable of characters.
    entries = [str(url)] if isinstance(url, (str, Path)) else list(url)

    # process annotations
    full_url = []
    for _url in entries:
        if 'zhiyuan_objv2_train.tar.gz' in _url or \
                'zhiyuan_objv2_val.json' in _url:
            full_url.append(_url)
        elif 'train' in _url:
            full_url.extend(f'{_url}patch{i}.tar.gz' for i in range(51))
        elif 'val/images/v1' in _url:
            full_url.extend(f'{_url}patch{i}.tar.gz' for i in range(16))
        elif 'val/images/v2' in _url:
            full_url.extend(f'{_url}patch{i}.tar.gz' for i in range(16, 44))
        else:
            raise NotImplementedError(
                f'Unrecognized Objects365v2 source: {_url}')

    dir = Path(dir)
    if threads > 1:
        with ThreadPool(threads) as pool:
            # map() blocks until every task has finished and then re-raises
            # the first worker exception instead of dropping it.
            # chunksize=1 keeps one failing source from skipping others
            # that would otherwise share its chunk.
            pool.map(lambda u: download_single(u, dir), full_url, chunksize=1)
    else:
        for u in full_url:
            download_single(u, dir)
def main():
    """Look up the URLs of the requested dataset and download them.

    The command line is read with ``parse_args``. The save directory is
    created if it does not exist. ``objects365v2`` is handled by
    ``download_objects365v2``; every other known dataset goes through
    ``download``. An unknown dataset name prints a message and returns
    without downloading anything.
    """
    args = parse_args()
    path = Path(args.save_dir)
    if not path.exists():
        path.mkdir(parents=True, exist_ok=True)
    data2url = dict(
        # TODO: Support for downloading Panoptic Segmentation of COCO
        coco2017=[
            'http://images.cocodataset.org/zips/train2017.zip',
            'http://images.cocodataset.org/zips/val2017.zip',
            'http://images.cocodataset.org/zips/test2017.zip',
            'http://images.cocodataset.org/zips/unlabeled2017.zip',
            'http://images.cocodataset.org/annotations/annotations_trainval2017.zip',  # noqa
            'http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip',  # noqa
            'http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip',  # noqa
            'http://images.cocodataset.org/annotations/image_info_test2017.zip',  # noqa
            'http://images.cocodataset.org/annotations/image_info_unlabeled2017.zip',  # noqa
        ],
        coco2014=[
            'http://images.cocodataset.org/zips/train2014.zip',
            'http://images.cocodataset.org/zips/val2014.zip',
            'http://images.cocodataset.org/zips/test2014.zip',
            'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',  # noqa
            'http://images.cocodataset.org/annotations/image_info_test2014.zip'  # noqa
        ],
        lvis=[
            # The train URL used to be listed twice, so the validation
            # annotations were never fetched.
            'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_v1_train.json.zip',  # noqa
            'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_v1_val.json.zip',  # noqa
        ],
        voc2007=[
            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar',  # noqa
            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar',  # noqa
            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar',  # noqa
        ],
        voc2012=[
            'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar',  # noqa
        ],
        balloon=[
            # src link: https://github.com/matterport/Mask_RCNN/releases/download/v2.1/balloon_dataset.zip # noqa
            'https://download.openmmlab.com/mmyolo/data/balloon_dataset.zip'
        ],
        # Note: There is no download link for Objects365-V1 right now. If you
        # would like to download Objects365-V1, please visit
        # http://www.objects365.org/ to contact the author.
        objects365v2=[
            # training annotations
            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/train/zhiyuan_objv2_train.tar.gz',  # noqa
            # validation annotations
            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/val/zhiyuan_objv2_val.json',  # noqa
            # training url root
            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/train/',  # noqa
            # validation url root_1
            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/val/images/v1/',  # noqa
            # validation url root_2
            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/val/images/v2/'  # noqa
        ],
        ade20k_2016=[
            # training images and semantic segmentation annotations
            'http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip',  # noqa
            # instance segmentation annotations
            'http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar',  # noqa
            # img categories ids
            'https://raw.githubusercontent.com/CSAILVision/placeschallenge/master/instancesegmentation/imgCatIds.json',  # noqa
            # category mapping
            'https://raw.githubusercontent.com/CSAILVision/placeschallenge/master/instancesegmentation/categoryMapping.txt'  # noqa
        ],
        refcoco=[
            # images
            'http://images.cocodataset.org/zips/train2014.zip',
            # refcoco annotations
            'https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip',
            # refcoco+ annotations
            'https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip',
            # refcocog annotations
            'https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog.zip'
        ])
    url = data2url.get(args.dataset_name, None)
    if url is None:
        print('Only support ADE20K, COCO, RefCOCO, VOC, LVIS, '
              'balloon, and Objects365v2 now!')
        return
    # Objects365v2 needs its URL roots expanded into per-patch archives, so
    # it has a dedicated downloader.
    if args.dataset_name == 'objects365v2':
        download_objects365v2(
            url,
            dir=path,
            unzip=args.unzip,
            delete=args.delete,
            threads=args.threads)
    else:
        download(
            url,
            dir=path,
            unzip=args.unzip,
            delete=args.delete,
            threads=args.threads)
# Run the downloader only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()