Spaces:
Running
on
Zero
Running
on
Zero
# MIT License | |
# Copyright (c) 2022 Intelligent Systems Lab Org | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
# File author: Shariq Farooq Bhat | |
from zoedepth.utils.misc import count_parameters, parallelize | |
from zoedepth.utils.config import get_config | |
from zoedepth.utils.arg_utils import parse_unknown | |
from zoedepth.trainers.builder import get_trainer | |
from zoedepth.models.builder import build_model | |
from zoedepth.data.data_mono import DepthDataLoader | |
import torch.utils.data.distributed | |
import torch.multiprocessing as mp | |
import torch | |
import numpy as np | |
from pprint import pprint | |
import argparse | |
import os | |
os.environ["PYOPENGL_PLATFORM"] = "egl" | |
os.environ["WANDB_START_METHOD"] = "thread" | |
def fix_random_seed(seed: int): | |
import random | |
import numpy | |
import torch | |
random.seed(seed) | |
numpy.random.seed(seed) | |
torch.manual_seed(seed) | |
torch.cuda.manual_seed(seed) | |
torch.cuda.manual_seed_all(seed) | |
torch.backends.cudnn.deterministic = True | |
torch.backends.cudnn.benchmark = True | |
def load_ckpt(config, model, checkpoint_dir="./checkpoints", ckpt_type="best"): | |
import glob | |
import os | |
from zoedepth.models.model_io import load_wts | |
if hasattr(config, "checkpoint"): | |
checkpoint = config.checkpoint | |
elif hasattr(config, "ckpt_pattern"): | |
pattern = config.ckpt_pattern | |
matches = glob.glob(os.path.join( | |
checkpoint_dir, f"*{pattern}*{ckpt_type}*")) | |
if not (len(matches) > 0): | |
raise ValueError(f"No matches found for the pattern {pattern}") | |
checkpoint = matches[0] | |
else: | |
return model | |
model = load_wts(model, checkpoint) | |
print("Loaded weights from {0}".format(checkpoint)) | |
return model | |
def main_worker(gpu, ngpus_per_node, config): | |
try: | |
seed = config.seed if 'seed' in config and config.seed else 43 | |
fix_random_seed(seed) | |
config.gpu = gpu | |
model = build_model(config) | |
model = load_ckpt(config, model) | |
model = parallelize(config, model) | |
total_params = f"{round(count_parameters(model)/1e6,2)}M" | |
config.total_params = total_params | |
print(f"Total parameters : {total_params}") | |
train_loader = DepthDataLoader(config, "train").data | |
test_loader = DepthDataLoader(config, "online_eval").data | |
trainer = get_trainer(config)( | |
config, model, train_loader, test_loader, device=config.gpu) | |
trainer.train() | |
finally: | |
import wandb | |
wandb.finish() | |
if __name__ == '__main__': | |
mp.set_start_method('forkserver') | |
parser = argparse.ArgumentParser() | |
parser.add_argument("-m", "--model", type=str, default="synunet") | |
parser.add_argument("-d", "--dataset", type=str, default='nyu') | |
parser.add_argument("--trainer", type=str, default=None) | |
args, unknown_args = parser.parse_known_args() | |
overwrite_kwargs = parse_unknown(unknown_args) | |
overwrite_kwargs["model"] = args.model | |
if args.trainer is not None: | |
overwrite_kwargs["trainer"] = args.trainer | |
config = get_config(args.model, "train", args.dataset, **overwrite_kwargs) | |
# git_commit() | |
if config.use_shared_dict: | |
shared_dict = mp.Manager().dict() | |
else: | |
shared_dict = None | |
config.shared_dict = shared_dict | |
config.batch_size = config.bs | |
config.mode = 'train' | |
if config.root != "." and not os.path.isdir(config.root): | |
os.makedirs(config.root) | |
try: | |
node_str = os.environ['SLURM_JOB_NODELIST'].replace( | |
'[', '').replace(']', '') | |
nodes = node_str.split(',') | |
config.world_size = len(nodes) | |
config.rank = int(os.environ['SLURM_PROCID']) | |
# config.save_dir = "/ibex/scratch/bhatsf/videodepth/checkpoints" | |
except KeyError as e: | |
# We are NOT using SLURM | |
config.world_size = 1 | |
config.rank = 0 | |
nodes = ["127.0.0.1"] | |
if config.distributed: | |
print(config.rank) | |
port = np.random.randint(15000, 15025) | |
config.dist_url = 'tcp://{}:{}'.format(nodes[0], port) | |
print(config.dist_url) | |
config.dist_backend = 'nccl' | |
config.gpu = None | |
ngpus_per_node = torch.cuda.device_count() | |
config.num_workers = config.workers | |
config.ngpus_per_node = ngpus_per_node | |
print("Config:") | |
pprint(config) | |
if config.distributed: | |
config.world_size = ngpus_per_node * config.world_size | |
mp.spawn(main_worker, nprocs=ngpus_per_node, | |
args=(ngpus_per_node, config)) | |
else: | |
if ngpus_per_node == 1: | |
config.gpu = 0 | |
main_worker(config.gpu, ngpus_per_node, config) | |